# 1. Data Cleaning

### Webscraping, data cleaning, saving to pickle


In [1]:
import get_transcripts

# URLs til transcripts
urls = ['https://millercenter.org/the-presidency/presidential-speeches/february-5-2019-state-union-address',
        'https://millercenter.org/the-presidency/presidential-speeches/january-12-2016-2016-state-union-address',
        'https://millercenter.org/the-presidency/presidential-speeches/january-28-2008-state-union-address',
        'https://millercenter.org/the-presidency/presidential-speeches/january-27-2000-state-union-address',
        'https://millercenter.org/the-presidency/presidential-speeches/january-28-1992-state-union-address',
        'https://millercenter.org/the-presidency/presidential-speeches/january-25-1988-state-union-address',
        'https://millercenter.org/the-presidency/presidential-speeches/january-23-1980-state-union-address',
        'https://millercenter.org/the-presidency/presidential-speeches/january-30-1974-state-union-address',
        'https://millercenter.org/the-presidency/presidential-speeches/january-14-1969-state-union-address',
        'https://millercenter.org/the-presidency/presidential-speeches/january-14-1963-state-union-address']

# Her hentes transcripts og lægges i en liste
transcripts = [get_transcripts.url_to_transcript(u) for u in urls]

# Præsidenternes navne
presidents = ['trump', 'obama', 'bushjunior', 'clinton', 'bushsenior', 'reagan', 'carter', 'nixon', 'johnson', 'kennedy']

print(transcripts[0])

https://millercenter.org/the-presidency/presidential-speeches/february-5-2019-state-union-address
https://millercenter.org/the-presidency/presidential-speeches/january-12-2016-2016-state-union-address
https://millercenter.org/the-presidency/presidential-speeches/january-28-2008-state-union-address
https://millercenter.org/the-presidency/presidential-speeches/january-27-2000-state-union-address
https://millercenter.org/the-presidency/presidential-speeches/january-28-1992-state-union-address
https://millercenter.org/the-presidency/presidential-speeches/january-25-1988-state-union-address
https://millercenter.org/the-presidency/presidential-speeches/january-23-1980-state-union-address
https://millercenter.org/the-presidency/presidential-speeches/january-30-1974-state-union-address
https://millercenter.org/the-presidency/presidential-speeches/january-14-1969-state-union-address
https://millercenter.org/the-presidency/presidential-speeches/january-14-1963-state-union-address
['Madam Speaker

In [2]:
import pickle
# Ny folder til at indeholde transcripts
# !mkdir transcripts

for i, p in enumerate(presidents):
    with open("transcripts/" + p + ".txt", "wb") as file:
        pickle.dump(transcripts[i], file)

In [3]:
# Her hentes de picklede filer, og sættes ind i en dictionary med key: præsidentnavn, value: transcript
data = {}
for i, p in enumerate(presidents):
    with open("transcripts/" + p + ".txt", "rb") as file:
        data[p] = pickle.load(file)

In [4]:
# Checker at keys ser rigtigt ud
data.keys()

dict_keys(['trump', 'obama', 'bushjunior', 'clinton', 'bushsenior', 'reagan', 'carter', 'nixon', 'johnson', 'kennedy'])

In [5]:
# Her ser vi de første fire elementer af obamas tale
# I øjeblikket er transcripts'ene repræsenteret som en liste af strings
data['obama'][:4]

['Mr. Speaker, Mr. Vice President, Members of Congress, my fellow Americans:',
 "Tonight marks the eighth year that I’ve come here to report on the State of the Union.\xa0And for this final one, I’m going to try to make it a little shorter. (Applause.) I know some of you are antsy to get back to Iowa. (Laughter.) I've been there. I'll be shaking hands afterwards if you want some tips. (Laughter.)",
 'And I understand that because it’s an election season, expectations for what we will achieve this year are low.\xa0But, Mr. Speaker, I appreciate the constructive approach that you and the other leaderstook at the end of last year to\xa0pass a budget\xa0and\xa0make\xa0tax cuts permanent\xa0for working families. So I hope we can work together this year on some bipartisan priorities like\xa0criminal justice reform\xa0-- (applause) -- and helping people who are battling prescription drug abuse and heroin abuse. (Applause.) So, who knows, we might surprise the cynics again.',
 "But tonight, I 

In [6]:
# Vi omdanner denne liste til én lang string med denne funktion
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ' '.join(list_of_text)
    return combined_text

In [7]:
# Kalder combine_text for hver value i vores dictionary
data_combined = {key: [combine_text(value)] for (key, value) in data.items()}
# Og så har vi en dictionary med key, value (præsident, transcript), hvor transcript er én string. Fx Trump:
data_combined['trump']

['Madam Speaker, Mr. Vice President, Members of Congress, the\xa0First Lady of the United States, and my fellow Americans: We meet tonight at a moment of unlimited potential.\xa0As we begin a new Congress, I stand here ready to work with you to achieve historic breakthroughs for all Americans. Millions of our fellow citizens are watching us now, gathered in this great chamber, hoping that we will govern not as two parties but as one Nation. The agenda I will lay out this evening is not a Republican agenda or a Democrat agenda.\xa0It is the agenda of the American people. Many of us campaigned on the same core promises:\xa0to defend American jobs and demand fair trade for American workers; to rebuild and revitalize our Nation’s infrastructure; to reduce the price of healthcare and prescription drugs; to create an immigration system that is safe, lawful, modern, and secure; and to pursue a foreign policy that puts America’s interests first. There is a new opportunity in American politics,

In [8]:
# Her laver vi vores dictionary (data_combined) om til en dataframe med pandas
import pandas as pd
pd.set_option('max_colwidth',150)

data_df = pd.DataFrame.from_dict(data_combined).transpose()
data_df.columns = ['transcript']
data_df = data_df.sort_index()
data_df

Unnamed: 0,transcript
bushjunior,"Madam Speaker, Vice President Cheney, members of Congress, distinguished guests, and fellow citizens: Seven years have passed since I first stood ..."
bushsenior,"Mr. Speaker and Mr. President, distinguished members of Congress, honored guests, and fellow citizens: Thank you very much for that warm reception..."
carter,"Mr. President, Mr. Speaker, members of the 96th Congress, fellow citizens: This last few months has not been an easy time for any of us. As we mee..."
clinton,"Mr. Speaker, Mr. Vice President, members of Congress, honored guests, my fellow Americans: We are fortunate to be alive at this moment in history...."
johnson,"Mr. Speaker, Mr. President, Members of the Congress and my fellow Americans For the sixth and the last time, I present to the Congress my assess..."
kennedy,"Mr. Vice President, Mr. Speaker, Members of the 88th Congress: I congratulate you all--not merely on your electoral victory but on your selected r..."
nixon,"Mr. Speaker, Mr. President, my colleagues in the Congress, our distinguished guests, my fellow Americans:\r\nWe meet here tonight at a time of gre..."
obama,"Mr. Speaker, Mr. Vice President, Members of Congress, my fellow Americans: Tonight marks the eighth year that I’ve come here to report on the Stat..."
reagan,"Mr. Speaker, Mr. President, and distinguished Members of the House and Senate: When we first met here seven years ago-many of us for the first tim..."
trump,"Madam Speaker, Mr. Vice President, Members of Congress, the First Lady of the United States, and my fellow Americans: We meet tonight at a moment ..."


In [9]:
# Vi ser på et af vores transcripts
data_df.transcript.loc['trump']

'Madam Speaker, Mr. Vice President, Members of Congress, the\xa0First Lady of the United States, and my fellow Americans: We meet tonight at a moment of unlimited potential.\xa0As we begin a new Congress, I stand here ready to work with you to achieve historic breakthroughs for all Americans. Millions of our fellow citizens are watching us now, gathered in this great chamber, hoping that we will govern not as two parties but as one Nation. The agenda I will lay out this evening is not a Republican agenda or a Democrat agenda.\xa0It is the agenda of the American people. Many of us campaigned on the same core promises:\xa0to defend American jobs and demand fair trade for American workers; to rebuild and revitalize our Nation’s infrastructure; to reduce the price of healthcare and prescription drugs; to create an immigration system that is safe, lawful, modern, and secure; and to pursue a foreign policy that puts America’s interests first. There is a new opportunity in American politics, 

In [10]:
# Data cleaning første runde
import re
import string

def clean_text_round1(text):
    # lowercase:
    text = text.lower()
    # Fjerner tekst i square brackets
    text = re.sub('\[.*?\]', '', text)
    # Fjerner tekst i parentes
    text = re.sub('\(.*?\)', '', text)
    # Fjerner punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Fjerner ord med tal i
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [11]:
# På hvert transcript i vores dataframe, bliver round1 anvendt:
data_clean = pd.DataFrame(data_df.transcript.apply(round1))
# Og vi ser hvordan et transcript ser ud efter første cleaning:
data_clean.transcript.loc['trump']

'madam speaker mr vice president members of congress the\xa0first lady of the united states and my fellow americans we meet tonight at a moment of unlimited potential\xa0as we begin a new congress i stand here ready to work with you to achieve historic breakthroughs for all americans millions of our fellow citizens are watching us now gathered in this great chamber hoping that we will govern not as two parties but as one nation the agenda i will lay out this evening is not a republican agenda or a democrat agenda\xa0it is the agenda of the american people many of us campaigned on the same core promises\xa0to defend american jobs and demand fair trade for american workers to rebuild and revitalize our nation’s infrastructure to reduce the price of healthcare and prescription drugs to create an immigration system that is safe lawful modern and secure and to pursue a foreign policy that puts america’s interests first there is a new opportunity in american politics if only we have the cour

In [12]:
# Endnu en runde cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('yearsand', 'years and', text)
    # fjerner ord med bindestreg
    text = re.sub('\w*\-\w*', '', text)
    # fjerner ekstra mellemrum
    text = re.sub('  ', ' ', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [13]:
# Og vi kigger på et transcript efter denne runde
data_clean = pd.DataFrame(data_clean.transcript.apply(round2))
data_clean.transcript.loc['trump']

'madam speaker mr vice president members of congress the\xa0first lady of the united states and my fellow americans we meet tonight at a moment of unlimited potential\xa0as we begin a new congress i stand here ready to work with you to achieve historic breakthroughs for all americans millions of our fellow citizens are watching us now gathered in this great chamber hoping that we will govern not as two parties but as one nation the agenda i will lay out this evening is not a republican agenda or a democrat agenda\xa0it is the agenda of the american people many of us campaigned on the same core promises\xa0to defend american jobs and demand fair trade for american workers to rebuild and revitalize our nations infrastructure to reduce the price of healthcare and prescription drugs to create an immigration system that is safe lawful modern and secure and to pursue a foreign policy that puts americas interests first there is a new opportunity in american politics if only we have the courag

In [14]:
# Tredje runde cleaning
def clean_text_round3(text):
    # Fjerner \xa0
    text = re.sub(r'[^\x00-\x7F]+',' ', text)
    return text

round3 = lambda x: clean_text_round3(x)

In [15]:
# Og igen et transcript - nu ser det pænt ud
data_clean = pd.DataFrame(data_clean.transcript.apply(round3))
data_clean.transcript.loc['carter']

'mr president mr speaker members of the congress fellow citizens this last few months has not been an easy time for any of us as we meet tonight it has never been more clear that the state of our union depends on the state of the world and tonight as throughout our own generation freedom and peace in the world depend on the state of our union the have been born in turmoil strife and change this is a time of challenge to our interests and our values and its a time that tests our wisdom and our skills at this time in iran americans are still held captive innocent victims of terrorism and anarchy also at this moment massive soviet troops are attempting to subjugate the fiercely independent and deeply religious people of afghanistan these two acts one of international terrorism and one of military aggression present a serious challenge to the united states of america and indeed to all the nations of the world together we will meet these threats to peace im determined that the united states

In [16]:
data_clean

Unnamed: 0,transcript
bushjunior,madam speaker vice president cheney members of congress distinguished guests and fellow citizens seven years have passed since i first stood befor...
bushsenior,mr speaker and mr president distinguished members of congress honored guests and fellow citizens thank you very much for that warm reception you k...
carter,mr president mr speaker members of the congress fellow citizens this last few months has not been an easy time for any of us as we meet tonight it...
clinton,mr speaker mr vice president members of congress honored guests my fellow americans we are fortunate to be alive at this moment in history never b...
johnson,mr speaker mr president members of the congress and my fellow americans for the sixth and the last time i present to the congress my assessment ...
kennedy,mr vice president mr speaker members of the congress i congratulate you allnot merely on your electoral victory but on your selected role in histo...
nixon,mr speaker mr president my colleagues in the congress our distinguished guests my fellow americans\rwe meet here tonight at a time of great challe...
obama,mr speaker mr vice president members of congress my fellow americans tonight marks the eighth year that ive come here to report on the state of th...
reagan,mr speaker mr president and distinguished members of the house and senate when we first met here seven years agomany of us for the first time it w...
trump,madam speaker mr vice president members of congress the first lady of the united states and my fellow americans we meet tonight at a moment of unl...


In [17]:
data_df

Unnamed: 0,transcript
bushjunior,"Madam Speaker, Vice President Cheney, members of Congress, distinguished guests, and fellow citizens: Seven years have passed since I first stood ..."
bushsenior,"Mr. Speaker and Mr. President, distinguished members of Congress, honored guests, and fellow citizens: Thank you very much for that warm reception..."
carter,"Mr. President, Mr. Speaker, members of the 96th Congress, fellow citizens: This last few months has not been an easy time for any of us. As we mee..."
clinton,"Mr. Speaker, Mr. Vice President, members of Congress, honored guests, my fellow Americans: We are fortunate to be alive at this moment in history...."
johnson,"Mr. Speaker, Mr. President, Members of the Congress and my fellow Americans For the sixth and the last time, I present to the Congress my assess..."
kennedy,"Mr. Vice President, Mr. Speaker, Members of the 88th Congress: I congratulate you all--not merely on your electoral victory but on your selected r..."
nixon,"Mr. Speaker, Mr. President, my colleagues in the Congress, our distinguished guests, my fellow Americans:\r\nWe meet here tonight at a time of gre..."
obama,"Mr. Speaker, Mr. Vice President, Members of Congress, my fellow Americans: Tonight marks the eighth year that I’ve come here to report on the Stat..."
reagan,"Mr. Speaker, Mr. President, and distinguished Members of the House and Senate: When we first met here seven years ago-many of us for the first tim..."
trump,"Madam Speaker, Mr. Vice President, Members of Congress, the First Lady of the United States, and my fellow Americans: We meet tonight at a moment ..."


In [18]:
# Let's add the presidents' full names as well
full_names = ['George W. Bush', 'George H.W. Bush', 'Jimmy Carter', 'Bill Clinton', 'Lyndon B. Johnson', 'John F. Kennedy', 'Richard Nixon', 'Barack Obama', 'Ronald Reagan', 'Donald Trump']
run_times = [53, 52, 33, 90, 44, 45, 44, 61, 44, 52]
data_df['full_name'] = full_names
data_df['speech_time'] = run_times
data_df

Unnamed: 0,transcript,full_name,speech_time
bushjunior,"Madam Speaker, Vice President Cheney, members of Congress, distinguished guests, and fellow citizens: Seven years have passed since I first stood ...",George W. Bush,53
bushsenior,"Mr. Speaker and Mr. President, distinguished members of Congress, honored guests, and fellow citizens: Thank you very much for that warm reception...",George H.W. Bush,52
carter,"Mr. President, Mr. Speaker, members of the 96th Congress, fellow citizens: This last few months has not been an easy time for any of us. As we mee...",Jimmy Carter,33
clinton,"Mr. Speaker, Mr. Vice President, members of Congress, honored guests, my fellow Americans: We are fortunate to be alive at this moment in history....",Bill Clinton,90
johnson,"Mr. Speaker, Mr. President, Members of the Congress and my fellow Americans For the sixth and the last time, I present to the Congress my assess...",Lyndon B. Johnson,44
kennedy,"Mr. Vice President, Mr. Speaker, Members of the 88th Congress: I congratulate you all--not merely on your electoral victory but on your selected r...",John F. Kennedy,45
nixon,"Mr. Speaker, Mr. President, my colleagues in the Congress, our distinguished guests, my fellow Americans:\r\nWe meet here tonight at a time of gre...",Richard Nixon,44
obama,"Mr. Speaker, Mr. Vice President, Members of Congress, my fellow Americans: Tonight marks the eighth year that I’ve come here to report on the Stat...",Barack Obama,61
reagan,"Mr. Speaker, Mr. President, and distinguished Members of the House and Senate: When we first met here seven years ago-many of us for the first tim...",Ronald Reagan,44
trump,"Madam Speaker, Mr. Vice President, Members of Congress, the First Lady of the United States, and my fellow Americans: We meet tonight at a moment ...",Donald Trump,52


In [19]:
# Let's pickle it for later use
data_df.to_pickle("corpus.pkl")

In [20]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.transcript)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data_clean.index
data_dtm

Unnamed: 0,aaron,abandon,abandoned,abandoning,abandonment,abhorrent,ability,abject,able,ablebodied,...,young,youre,youth,youthful,youve,zeitchik,zero,zimbabwe,zone,zones
bushjunior,0,1,0,0,0,0,4,0,0,0,...,2,0,0,0,0,0,0,1,0,0
bushsenior,0,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0
carter,0,1,1,0,0,1,2,0,0,0,...,2,0,1,0,0,0,0,0,0,0
clinton,2,0,0,0,0,0,1,0,4,0,...,9,1,0,0,0,0,0,0,0,1
johnson,0,0,0,0,0,0,0,0,10,0,...,2,0,0,0,0,0,0,0,0,0
kennedy,0,1,0,0,1,0,4,0,0,0,...,3,0,2,1,0,0,0,0,0,0
nixon,0,0,0,1,0,0,1,0,4,0,...,2,0,1,0,0,0,1,0,0,0
obama,0,0,0,0,0,0,0,0,1,0,...,3,0,0,0,1,0,0,0,0,0
reagan,0,0,0,0,0,0,0,0,0,0,...,5,0,0,0,0,0,0,0,0,0
trump,0,0,0,0,0,0,0,1,3,0,...,5,0,0,0,0,2,0,0,0,0


In [21]:
# Let's pickle it for later use
data_dtm.to_pickle("dtm.pkl")

In [22]:
# Let's also pickle the cleaned data (before we put it in document-term matrix format) and the CountVectorizer object
data_clean.to_pickle('data_clean.pkl')
pickle.dump(cv, open("cv.pkl", "wb"))

