## Data Cleaning

In [9]:
# Web scraping, pickle imports
import os
import pickle

import requests
from bs4 import BeautifulSoup

# Scrapes transcript paragraphs from a page that wraps its transcript in an
# element with class "transcript-inner" (the millercenter.org speech pages
# listed in `urls` are fetched this way)
def url_to_transcript(url, timeout=30):
    '''Return the transcript found at `url` as a list of paragraph strings.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        `requests.get` can block indefinitely on an unresponsive host.

    Returns
    -------
    list of str
        The text of every <p> element inside the first element whose class
        is "transcript-inner", in document order.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the page has no element with class "transcript-inner".
    '''
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of trying to parse them.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "lxml")
    container = soup.find(class_="transcript-inner")
    if container is None:
        # soup.find returns None when nothing matches; report that clearly
        # rather than letting `.find_all` raise AttributeError on None.
        raise ValueError("No element with class 'transcript-inner' found at " + url)

    text = [p.text for p in container.find_all('p')]
    # Progress marker: shows which page has just been scraped.
    print(url)
    return text

# Transcript pages in scope (Miller Center State of the Union addresses)
urls = [
    'https://millercenter.org/the-presidency/presidential-speeches/february-5-2019-state-union-address',
    'https://millercenter.org/the-presidency/presidential-speeches/january-12-2016-2016-state-union-address',
]

# President names, listed in the same order as `urls`
presidents = [
    'trump',
    'obama',
]

In [10]:
# Download every transcript over the network; results keep the order of `urls`
transcripts = list(map(url_to_transcript, urls))


https://millercenter.org/the-presidency/presidential-speeches/february-5-2019-state-union-address
https://millercenter.org/the-presidency/presidential-speeches/january-12-2016-2016-state-union-address


In [25]:
# Pickle each transcript for later use

# Make a directory to hold the pickled transcripts. os.makedirs with
# exist_ok=True works on every platform and is a no-op when the directory
# already exists, so this cell can be re-run without an error message.
os.makedirs("transcripts", exist_ok=True)

# Names and transcripts are paired by position, so the two lists must line up.
assert len(presidents) == len(transcripts), "expected one transcript per president"

for p, transcript in zip(presidents, transcripts):
    # NOTE: the files carry a .txt extension but hold binary pickle data;
    # the path is kept as-is because the loading cell reads the same name.
    with open("transcripts/" + p + ".txt", "wb") as file:
        pickle.dump(transcript, file)

A subdirectory or file transcripts already exists.


In [27]:
# Load the pickled transcripts back into a dict keyed by president name.
# pickle.load can execute arbitrary code, so only load files written by the
# pickling cell of this notebook, never files from an untrusted source.
data = {}
for c in presidents:
    with open("transcripts/" + c + ".txt", "rb") as file:
        data[c] = pickle.load(file)

In [28]:
# Double check to make sure data has been loaded properly:
# there should be one key for each name in `presidents`
data.keys()

dict_keys(['trump', 'obama'])

In [33]:
# More checks: peek at the first four paragraphs of the Obama transcript
data['obama'][:4]

['Mr. Speaker, Mr. Vice President, Members of Congress, my fellow Americans:',
 "Tonight marks the eighth year that I’ve come here to report on the State of the Union.\xa0And for this final one, I’m going to try to make it a little shorter. (Applause.) I know some of you are antsy to get back to Iowa. (Laughter.) I've been there. I'll be shaking hands afterwards if you want some tips. (Laughter.)",
 'And I understand that because it’s an election season, expectations for what we will achieve this year are low.\xa0But, Mr. Speaker, I appreciate the constructive approach that you and the other leaderstook at the end of last year to\xa0pass a budget\xa0and\xa0make\xa0tax cuts permanent\xa0for working families. So I hope we can work together this year on some bipartisan priorities like\xa0criminal justice reform\xa0-- (applause) -- and helping people who are battling prescription drug abuse and heroin abuse. (Applause.) So, who knows, we might surprise the cynics again.',
 "But tonight, I 

In [36]:
# Let's take a look at our data again: the first key in the dictionary
next(iter(data.keys()))

'trump'

In [37]:
# Notice that our dictionary is currently in key: president, value: list of text format
next(iter(data.values()))

['Madam Speaker, Mr. Vice President, Members of Congress, the\xa0First Lady of the United States, and my fellow Americans:',
 'We meet tonight at a moment of unlimited potential.\xa0As we begin a new Congress, I stand here ready to work with you to achieve historic breakthroughs for all Americans.',
 'Millions of our fellow citizens are watching us now, gathered in this great chamber, hoping that we will govern not as two parties but as one Nation.',
 'The agenda I will lay out this evening is not a Republican agenda or a Democrat agenda.\xa0It is the agenda of the American people.',
 'Many of us campaigned on the same core promises:\xa0to defend American jobs and demand fair trade for American workers; to rebuild and revitalize our Nation’s infrastructure; to reduce the price of healthcare and prescription drugs; to create an immigration system that is safe, lawful, modern, and secure; and to pursue a foreign policy that puts America’s interests first.',
 'There is a new opportunity i

In [38]:
# We are going to change this to key: president, value: string format
def combine_text(list_of_text):
    '''Join the given pieces of text into one string, separated by single spaces.'''
    separator = ' '
    return separator.join(list_of_text)

In [39]:
# Collapse each president's list of paragraphs into a single string,
# wrapped in a one-element list
data_combined = {
    name: [combine_text(paragraphs)]
    for name, paragraphs in data.items()
}

In [41]:
# We can either keep it in dictionary format or put it into a pandas dataframe
import pandas as pd

# Use the fully qualified option name: abbreviated keys such as 'max_colwidth'
# only work through pattern matching against the registered option names.
pd.set_option('display.max_colwidth', 150)

# from_dict turns each president into a column holding the one-element list;
# transposing gives one row per president with a single column.
data_df = pd.DataFrame.from_dict(data_combined).transpose()
data_df.columns = ['transcript']
# Sort rows alphabetically by president name.
data_df = data_df.sort_index()
data_df

Unnamed: 0,transcript
obama,"Mr. Speaker, Mr. Vice President, Members of Congress, my fellow Americans: Tonight marks the eighth year that I’ve come here to report on the Stat..."
trump,"Madam Speaker, Mr. Vice President, Members of Congress, the First Lady of the United States, and my fellow Americans: We meet tonight at a moment ..."


In [42]:
# Let's take a look at the full transcript for Trump
data_df.transcript.loc['trump']

'Madam Speaker, Mr. Vice President, Members of Congress, the\xa0First Lady of the United States, and my fellow Americans: We meet tonight at a moment of unlimited potential.\xa0As we begin a new Congress, I stand here ready to work with you to achieve historic breakthroughs for all Americans. Millions of our fellow citizens are watching us now, gathered in this great chamber, hoping that we will govern not as two parties but as one Nation. The agenda I will lay out this evening is not a Republican agenda or a Democrat agenda.\xa0It is the agenda of the American people. Many of us campaigned on the same core promises:\xa0to defend American jobs and demand fair trade for American workers; to rebuild and revitalize our Nation’s infrastructure; to reduce the price of healthcare and prescription drugs; to create an immigration system that is safe, lawful, modern, and secure; and to pursue a foreign policy that puts America’s interests first. There is a new opportunity in American politics, 