## Data Cleaning

In [1]:
# Web scraping, pickle imports
import requests
from bs4 import BeautifulSoup
import pickle

# Scrapes transcript data from millercenter.org
def url_to_transcript(url, timeout=30):
    '''Return the paragraphs of a speech transcript page on millercenter.org.

    Parameters:
        url: address of the transcript page to download.
        timeout: seconds to wait for the server before giving up.

    Returns:
        A list of strings, one per <p> element found inside the element
        with class "transcript-inner".

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page has no "transcript-inner" element.
    '''
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a transcript
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    container = soup.find(class_="transcript-inner")
    if container is None:
        raise ValueError("No 'transcript-inner' element found at " + url)
    text = [p.text for p in container.find_all('p')]
    # Progress indicator: one line per downloaded page
    print(url)
    return text

# URLs of transcripts in scope.
# Order matters: entry i of `urls` corresponds to entry i of `presidents` below,
# because transcripts are later saved by pairing the two lists by position.
urls = ['https://millercenter.org/the-presidency/presidential-speeches/february-5-2019-state-union-address',
       'https://millercenter.org/the-presidency/presidential-speeches/january-12-2016-2016-state-union-address',
        'https://millercenter.org/the-presidency/presidential-speeches/january-28-2008-state-union-address',
        'https://millercenter.org/the-presidency/presidential-speeches/january-27-2000-state-union-address',
        'https://millercenter.org/the-presidency/presidential-speeches/january-28-1992-state-union-address',
        'https://millercenter.org/the-presidency/presidential-speeches/january-25-1988-state-union-address',
        'https://millercenter.org/the-presidency/presidential-speeches/january-23-1980-state-union-address',
        'https://millercenter.org/the-presidency/presidential-speeches/january-30-1974-state-union-address',
        'https://millercenter.org/the-presidency/presidential-speeches/january-14-1969-state-union-address',
       'https://millercenter.org/the-presidency/presidential-speeches/january-14-1963-state-union-address']

# Short president names; used as dictionary keys and as file-name stems for the pickled transcripts
presidents = ['trump', 'obama', 'bushjunior', 'clinton', 'bushsenior', 'reagan', 'carter', 'nixon', 'johnson', 'kennedy']

In [2]:
# Actually request transcripts (takes a few minutes to run)
transcripts = [url_to_transcript(u) for u in urls]


https://millercenter.org/the-presidency/presidential-speeches/february-5-2019-state-union-address
https://millercenter.org/the-presidency/presidential-speeches/january-12-2016-2016-state-union-address
https://millercenter.org/the-presidency/presidential-speeches/january-28-2008-state-union-address
https://millercenter.org/the-presidency/presidential-speeches/january-27-2000-state-union-address
https://millercenter.org/the-presidency/presidential-speeches/january-28-1992-state-union-address
https://millercenter.org/the-presidency/presidential-speeches/january-25-1988-state-union-address
https://millercenter.org/the-presidency/presidential-speeches/january-23-1980-state-union-address
https://millercenter.org/the-presidency/presidential-speeches/january-30-1974-state-union-address
https://millercenter.org/the-presidency/presidential-speeches/january-14-1969-state-union-address
https://millercenter.org/the-presidency/presidential-speeches/january-14-1963-state-union-address


In [3]:
# Pickle each transcript for later use (one file per president)
import os

# Create the output directory if it is missing so this cell also works on a fresh checkout
os.makedirs("transcripts", exist_ok=True)

# The two lists are paired by position; refuse to write mislabelled files if they disagree
if len(presidents) != len(transcripts):
    raise ValueError("presidents and transcripts must have the same length")

for p, transcript in zip(presidents, transcripts):
    # Note: the files hold pickled lists of paragraphs, despite the .txt extension
    with open("transcripts/" + p + ".txt", "wb") as file:
        pickle.dump(transcript, file)

In [4]:
# Load pickled files into a dict keyed by president name
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
data = {}
for c in presidents:
    with open("transcripts/" + c + ".txt", "rb") as file:
        data[c] = pickle.load(file)

In [5]:
# Double check to make sure data has been loaded properly
data.keys()

dict_keys(['trump', 'obama', 'bushjunior', 'clinton', 'bushsenior', 'reagan', 'carter', 'nixon', 'johnson', 'kennedy'])

In [6]:
# More checks
data['obama'][:4]

['Mr. Speaker, Mr. Vice President, Members of Congress, my fellow Americans:',
 "Tonight marks the eighth year that I’ve come here to report on the State of the Union.\xa0And for this final one, I’m going to try to make it a little shorter. (Applause.) I know some of you are antsy to get back to Iowa. (Laughter.) I've been there. I'll be shaking hands afterwards if you want some tips. (Laughter.)",
 'And I understand that because it’s an election season, expectations for what we will achieve this year are low.\xa0But, Mr. Speaker, I appreciate the constructive approach that you and the other leaderstook at the end of last year to\xa0pass a budget\xa0and\xa0make\xa0tax cuts permanent\xa0for working families. So I hope we can work together this year on some bipartisan priorities like\xa0criminal justice reform\xa0-- (applause) -- and helping people who are battling prescription drug abuse and heroin abuse. (Applause.) So, who knows, we might surprise the cynics again.',
 "But tonight, I 

In [7]:
# Let's take a look at our data again
next(iter(data.keys()))

'trump'

In [8]:
# Notice that our dictionary is currently in key: president, value: list of text format
next(iter(data.values()))

['Madam Speaker, Mr. Vice President, Members of Congress, the\xa0First Lady of the United States, and my fellow Americans:',
 'We meet tonight at a moment of unlimited potential.\xa0As we begin a new Congress, I stand here ready to work with you to achieve historic breakthroughs for all Americans.',
 'Millions of our fellow citizens are watching us now, gathered in this great chamber, hoping that we will govern not as two parties but as one Nation.',
 'The agenda I will lay out this evening is not a Republican agenda or a Democrat agenda.\xa0It is the agenda of the American people.',
 'Many of us campaigned on the same core promises:\xa0to defend American jobs and demand fair trade for American workers; to rebuild and revitalize our Nation’s infrastructure; to reduce the price of healthcare and prescription drugs; to create an immigration system that is safe, lawful, modern, and secure; and to pursue a foreign policy that puts America’s interests first.',
 'There is a new opportunity i

In [9]:
# We are going to change this to key: president, value: string format
def combine_text(list_of_text):
    '''Join the given pieces of text into one string, separated by single spaces.'''
    return ' '.join(list_of_text)

In [10]:
# Combine it! Each president's list of paragraphs becomes a single string.
# The string is wrapped in a one-element list because this dict is fed to
# pd.DataFrame.from_dict in a later cell.
data_combined = {key: [combine_text(value)] for (key, value) in data.items()}

In [11]:
# We can either keep it in dictionary format or put it into a pandas dataframe
import pandas as pd
# Use the fully qualified option name; abbreviated names rely on partial matching
# and can break if pandas adds another option with a similar name.
pd.set_option('display.max_colwidth', 150)

# from_dict yields one column per president; transpose so each president is a row
data_df = pd.DataFrame.from_dict(data_combined).transpose()
data_df.columns = ['transcript']
# Sort rows alphabetically by president key
data_df = data_df.sort_index()
data_df

Unnamed: 0,transcript
bushjunior,"Madam Speaker, Vice President Cheney, members of Congress, distinguished guests, and fellow citizens: Seven years have passed since I first stood ..."
bushsenior,"Mr. Speaker and Mr. President, distinguished members of Congress, honored guests, and fellow citizens: Thank you very much for that warm reception..."
carter,"Mr. President, Mr. Speaker, members of the 96th Congress, fellow citizens: This last few months has not been an easy time for any of us. As we mee..."
clinton,"Mr. Speaker, Mr. Vice President, members of Congress, honored guests, my fellow Americans: We are fortunate to be alive at this moment in history...."
johnson,"Mr. Speaker, Mr. President, Members of the Congress and my fellow Americans For the sixth and the last time, I present to the Congress my assess..."
kennedy,"Mr. Vice President, Mr. Speaker, Members of the 88th Congress: I congratulate you all--not merely on your electoral victory but on your selected r..."
nixon,"Mr. Speaker, Mr. President, my colleagues in the Congress, our distinguished guests, my fellow Americans:\r\nWe meet here tonight at a time of gre..."
obama,"Mr. Speaker, Mr. Vice President, Members of Congress, my fellow Americans: Tonight marks the eighth year that I’ve come here to report on the Stat..."
reagan,"Mr. Speaker, Mr. President, and distinguished Members of the House and Senate: When we first met here seven years ago-many of us for the first tim..."
trump,"Madam Speaker, Mr. Vice President, Members of Congress, the First Lady of the United States, and my fellow Americans: We meet tonight at a moment ..."


In [12]:
# Let's take a look at the transcript for bushsenior:
data_df.transcript.loc['bushsenior']

'Mr. Speaker and Mr. President, distinguished members of Congress, honored guests, and fellow citizens: Thank you very much for that warm reception. You know, with the big buildup this address has had, I wanted to make sure it would be a big hit, but I couldn\'t convince Barbara to deliver it for me. I see the Speaker and the Vice President are laughing. They saw what I did in Japan, and they\'re just happy they\'re sitting behind me. I mean to speak tonight of big things, of big changes and the promises they hold, and of some big problems and how, together, we can solve them and move our country forward as the undisputed leader of the age. We gather tonight at a dramatic and deeply promising time in our history and in the history of man on Earth. For in the past 12 months, the world has known changes of almost biblical proportions. And even now, months after the failed coup that doomed a failed system, I\'m not sure we\'ve absorbed the full impact, the full import of what happened. Bu

In [13]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.

    Parameters:
        text: the string to clean.

    Returns:
        The cleaned string. Whitespace is left untouched, so removed tokens
        can leave consecutive spaces behind.
    '''
    text = text.lower()
    # Raw strings keep regex escapes such as \[ and \w from being read as
    # (invalid) Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    # Only the ASCII punctuation listed in string.punctuation is removed here
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    # Drop any run of word characters that contains at least one digit
    text = re.sub(r'\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [14]:
# Let's take a look at the updated text
data_clean = pd.DataFrame(data_df.transcript.apply(round1))
data_clean.transcript.loc['obama']

'mr speaker mr vice president members of congress my fellow americans tonight marks the eighth year that i’ve come here to report on the state of the union\xa0and for this final one i’m going to try to make it a little shorter applause i know some of you are antsy to get back to iowa laughter ive been there ill be shaking hands afterwards if you want some tips laughter and i understand that because it’s an election season expectations for what we will achieve this year are low\xa0but mr speaker i appreciate the constructive approach that you and the other leaderstook at the end of last year to\xa0pass a budget\xa0and\xa0make\xa0tax cuts permanent\xa0for working families so i hope we can work together this year on some bipartisan priorities like\xa0criminal justice reform\xa0 applause  and helping people who are battling prescription drug abuse and heroin abuse applause so who knows we might surprise the cynics again but tonight i want to go easy on the traditional list of proposals for

In [15]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.

    Removes curly quotes and ellipsis characters, turns line breaks into
    spaces, removes every occurrence of the substring "applause", and
    repairs the fused token "yearsand".

    Parameters:
        text: the string to clean.

    Returns:
        The cleaned string.
    '''
    text = re.sub('[‘’“”…]', '', text)
    # Replace line breaks (including the \r of \r\n) with a space; deleting
    # them would glue the words on either side together.
    text = re.sub(r'[\r\n]+', ' ', text)
    text = re.sub('applause', '', text)
    text = re.sub('yearsand', 'years and', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [16]:
# Let's take a look at the updated text
data_clean = pd.DataFrame(data_clean.transcript.apply(round2))
data_clean.transcript.loc['obama']

'mr speaker mr vice president members of congress my fellow americans tonight marks the eighth year that ive come here to report on the state of the union\xa0and for this final one im going to try to make it a little shorter  i know some of you are antsy to get back to iowa laughter ive been there ill be shaking hands afterwards if you want some tips laughter and i understand that because its an election season expectations for what we will achieve this year are low\xa0but mr speaker i appreciate the constructive approach that you and the other leaderstook at the end of last year to\xa0pass a budget\xa0and\xa0make\xa0tax cuts permanent\xa0for working families so i hope we can work together this year on some bipartisan priorities like\xa0criminal justice reform\xa0   and helping people who are battling prescription drug abuse and heroin abuse  so who knows we might surprise the cynics again but tonight i want to go easy on the traditional list of proposals for the year ahead dont worry 

In [17]:
# Apply a third round of cleaning
def clean_text_round3(text):
    '''Replace every run of non-ASCII characters (such as non-breaking spaces) with a single space.'''
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return non_ascii_run.sub(' ', text)

round3 = lambda x: clean_text_round3(x)

In [18]:
# Let's take a look at the updated text
data_clean = pd.DataFrame(data_clean.transcript.apply(round3))
data_clean.transcript.loc['obama']

'mr speaker mr vice president members of congress my fellow americans tonight marks the eighth year that ive come here to report on the state of the union and for this final one im going to try to make it a little shorter  i know some of you are antsy to get back to iowa laughter ive been there ill be shaking hands afterwards if you want some tips laughter and i understand that because its an election season expectations for what we will achieve this year are low but mr speaker i appreciate the constructive approach that you and the other leaderstook at the end of last year to pass a budget and make tax cuts permanent for working families so i hope we can work together this year on some bipartisan priorities like criminal justice reform    and helping people who are battling prescription drug abuse and heroin abuse  so who knows we might surprise the cynics again but tonight i want to go easy on the traditional list of proposals for the year ahead dont worry ive got plenty from helping

In [19]:
data_df

Unnamed: 0,transcript
bushjunior,"Madam Speaker, Vice President Cheney, members of Congress, distinguished guests, and fellow citizens: Seven years have passed since I first stood ..."
bushsenior,"Mr. Speaker and Mr. President, distinguished members of Congress, honored guests, and fellow citizens: Thank you very much for that warm reception..."
carter,"Mr. President, Mr. Speaker, members of the 96th Congress, fellow citizens: This last few months has not been an easy time for any of us. As we mee..."
clinton,"Mr. Speaker, Mr. Vice President, members of Congress, honored guests, my fellow Americans: We are fortunate to be alive at this moment in history...."
johnson,"Mr. Speaker, Mr. President, Members of the Congress and my fellow Americans For the sixth and the last time, I present to the Congress my assess..."
kennedy,"Mr. Vice President, Mr. Speaker, Members of the 88th Congress: I congratulate you all--not merely on your electoral victory but on your selected r..."
nixon,"Mr. Speaker, Mr. President, my colleagues in the Congress, our distinguished guests, my fellow Americans:\r\nWe meet here tonight at a time of gre..."
obama,"Mr. Speaker, Mr. Vice President, Members of Congress, my fellow Americans: Tonight marks the eighth year that I’ve come here to report on the Stat..."
reagan,"Mr. Speaker, Mr. President, and distinguished Members of the House and Senate: When we first met here seven years ago-many of us for the first tim..."
trump,"Madam Speaker, Mr. Vice President, Members of Congress, the First Lady of the United States, and my fellow Americans: We meet tonight at a moment ..."


In [20]:
# Let's add the presidents' full names and speech run times as well.
# The values are keyed by the DataFrame index so they stay attached to the
# right president even if the row order changes; a missing key raises KeyError.
president_info = {
    'bushjunior': ('George W. Bush', 53),
    'bushsenior': ('George H.W. Bush', 52),
    'carter': ('Jimmy Carter', 33),
    'clinton': ('Bill Clinton', 90),
    'johnson': ('Lyndon B. Johnson', 44),
    'kennedy': ('John F. Kennedy', 45),
    'nixon': ('Richard Nixon', 44),
    'obama': ('Barack Obama', 61),
    'reagan': ('Ronald Reagan', 44),
    'trump': ('Donald Trump', 52),
}
full_names = [president_info[key][0] for key in data_df.index]
run_times = [president_info[key][1] for key in data_df.index]
data_df['full_name'] = full_names
data_df['speech_time'] = run_times
data_df

Unnamed: 0,transcript,full_name,speech_time
bushjunior,"Madam Speaker, Vice President Cheney, members of Congress, distinguished guests, and fellow citizens: Seven years have passed since I first stood ...",George W. Bush,53
bushsenior,"Mr. Speaker and Mr. President, distinguished members of Congress, honored guests, and fellow citizens: Thank you very much for that warm reception...",George H.W. Bush,52
carter,"Mr. President, Mr. Speaker, members of the 96th Congress, fellow citizens: This last few months has not been an easy time for any of us. As we mee...",Jimmy Carter,33
clinton,"Mr. Speaker, Mr. Vice President, members of Congress, honored guests, my fellow Americans: We are fortunate to be alive at this moment in history....",Bill Clinton,90
johnson,"Mr. Speaker, Mr. President, Members of the Congress and my fellow Americans For the sixth and the last time, I present to the Congress my assess...",Lyndon B. Johnson,44
kennedy,"Mr. Vice President, Mr. Speaker, Members of the 88th Congress: I congratulate you all--not merely on your electoral victory but on your selected r...",John F. Kennedy,45
nixon,"Mr. Speaker, Mr. President, my colleagues in the Congress, our distinguished guests, my fellow Americans:\r\nWe meet here tonight at a time of gre...",Richard Nixon,44
obama,"Mr. Speaker, Mr. Vice President, Members of Congress, my fellow Americans: Tonight marks the eighth year that I’ve come here to report on the Stat...",Barack Obama,61
reagan,"Mr. Speaker, Mr. President, and distinguished Members of the House and Senate: When we first met here seven years ago-many of us for the first tim...",Ronald Reagan,44
trump,"Madam Speaker, Mr. Vice President, Members of Congress, the First Lady of the United States, and my fellow Americans: We meet tonight at a moment ...",Donald Trump,52


In [21]:
# Let's pickle it for later use
data_df.to_pickle("corpus.pkl")

In [22]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term matrix: one row per president, one column per term,
# with common English stop words excluded.
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.transcript)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when it exists and fall back to the old name on older releases.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_dtm.index = data_clean.index
data_dtm

Unnamed: 0,aaron,abandon,abandoned,abandoning,abandonment,abhorrent,ability,abject,able,ablebodied,...,young,youre,youth,youthful,youve,zeitchik,zero,zimbabwe,zone,zones
bushjunior,0,1,0,0,0,0,4,0,0,0,...,2,0,0,0,0,0,0,1,0,0
bushsenior,0,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0
carter,0,1,1,0,0,1,2,0,0,0,...,2,0,1,0,0,0,0,0,0,0
clinton,2,0,0,0,0,0,1,0,4,0,...,9,1,0,0,0,0,0,0,0,1
johnson,0,0,0,0,0,0,0,0,10,0,...,2,0,0,0,0,0,0,0,0,0
kennedy,0,1,0,0,1,0,4,0,0,0,...,3,0,2,1,0,0,0,0,0,0
nixon,0,0,0,1,0,0,1,0,4,0,...,2,0,1,0,0,0,1,0,0,0
obama,0,0,0,0,0,0,0,0,1,0,...,3,0,0,0,1,0,0,0,0,0
reagan,0,0,0,0,0,0,0,0,0,0,...,5,0,0,0,0,0,0,0,0,0
trump,0,0,0,0,0,0,0,1,3,0,...,5,0,0,0,0,2,0,0,0,0


In [23]:
# Let's pickle it for later use
data_dtm.to_pickle("dtm.pkl")

In [24]:
# Let's also pickle the cleaned data (before we put it in document-term matrix format) and the CountVectorizer object
data_clean.to_pickle('data_clean.pkl')
# Use a context manager so the file handle is flushed and closed deterministically
with open("cv.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)