# Scraping Presidential Speeches

Scrapes the campaign speeches of Democratic and Republican presidential candidates for each election cycle available on the UCSB presidency website (http://www.presidency.ucsb.edu/).

In [74]:
import re
import time
import numpy as np
import pandas as pd
import json

from lxml import html
from bs4 import BeautifulSoup
import requests

from __future__ import division
import nltk, re, pprint
from nltk import word_tokenize

General scraping parameters from the UCSB Presidential Election Database:

In [2]:
# Root URL of the UCSB presidency site; the scraping cell prepends it to the
# relative speech links found on each campaign page.
ucsb_base_url = "http://www.presidency.ucsb.edu/"

In [3]:
def get_bs_html_text(page_url):
    """Parse a fetched page into a BeautifulSoup tree.

    Input: despite its name, `page_url` is not a URL string. It must be an
        already-fetched response object (for example the value returned by
        `requests.get(url)`) whose `.text` attribute holds the page HTML.
    Output: BeautifulSoup object of the page contents, parsed with the
        'html.parser' backend.
    """
    return BeautifulSoup(page_url.text, 'html.parser')

In [4]:
def remove_super_dir_prefix(url_path):
    """Strip leading "../" segments from a relative URL path.

    Input: string, e.g. "../ws/index.php?pid=1"
    Output: string with every leading "../" removed, e.g. "ws/index.php?pid=1".
        Only the start of the string is touched; a "../" sequence that appears
        later in the path is left in place.
    """
    super_dir_prefix = "../"
    # Loop so that stacked prefixes such as "../../page" are fully removed.
    while url_path.startswith(super_dir_prefix):
        url_path = url_path[len(super_dir_prefix):]
    return url_path

### Automating process to get a list of links

List of candidates:
- Trump 2016: http://www.presidency.ucsb.edu/2016_election_speeches.php?candidate=45&campaign=2016TRUMP&doctype=5000

- Hillary 2016: http://www.presidency.ucsb.edu/2016_election_speeches.php?candidate=70&campaign=2016CLINTON&doctype=5000

- Obama 2012: http://www.presidency.ucsb.edu/2012_election_speeches.php?candidate=44&doctype=1150

- Romney 2012: http://www.presidency.ucsb.edu/2012_election_speeches.php?candidate=79&campaign=2012ROMNEY&doctype=5000

- Obama 2008: http://www.presidency.ucsb.edu/2008_election_speeches.php?candidate=44&campaign=2008OBAMA&doctype=5000

- McCain 2008: http://www.presidency.ucsb.edu/2008_election_speeches.php?candidate=68&campaign=2008MCCAIN&doctype=5000

- Kerry 2004: http://www.presidency.ucsb.edu/2004_election_speeches.php?candidate=67&campaign=2004KERRY

Hard-coded URLs for scraping--not sure if there was an automated process available:

In [100]:
# Maps a "<Candidate>_<year>" label to the URL of that campaign's speech-listing
# page on the UCSB site. The scraping cell fetches each URL and also uses the
# label as the name of the JSON file it writes.
# NOTE(review): 'Hillary_2016' is keyed by first name while every other entry
# uses a surname -- confirm whether that is intentional before renaming, since
# the key determines the output file name.
campaign_dict = {
    'Trump_2016': 'http://www.presidency.ucsb.edu/2016_election_speeches.php?candidate=45&campaign=2016TRUMP&doctype=5000',
    'Hillary_2016': 'http://www.presidency.ucsb.edu/2016_election_speeches.php?candidate=70&campaign=2016CLINTON&doctype=5000',
    'Obama_2012': 'http://www.presidency.ucsb.edu/2012_election_speeches.php?candidate=44&doctype=1150',
    'Romney_2012': 'http://www.presidency.ucsb.edu/2012_election_speeches.php?candidate=79&campaign=2012ROMNEY&doctype=5000',
    'Obama_2008': 'http://www.presidency.ucsb.edu/2008_election_speeches.php?candidate=44&campaign=2008OBAMA&doctype=5000',
    'McCain_2008': 'http://www.presidency.ucsb.edu/2008_election_speeches.php?candidate=68&campaign=2008MCCAIN&doctype=5000',
    'Kerry_2004': 'http://www.presidency.ucsb.edu/2004_election_speeches.php?candidate=67&campaign=2004KERRY'}

In [108]:
%%time

# iterate over each candidate and campaign
for candidate in campaign_dict:
    
    print candidate
    
    # get candidate campaign landing page
    candidate_page = get_bs_html_text(requests.get(campaign_dict[candidate]))
    speech_table = candidate_page.findAll('table',{'align': 'center'})[0]
    
    # store dictionary of speech titles, dates, and links
    speeches_dict = {}
    speech_rows = speech_table.findAll('tr')

    # grab all speech links
    for row in speech_rows:
        
        if row.has_attr('bgcolor'):

            # record relevant information
            link_container = row.find('a')
            link = link_container['href']
            title = link_container.contents[0]
            row_date = row.findAll('td')[1].contents[0]

            # new data fields
            new_url = ucsb_base_url + remove_super_dir_prefix(link)
            speech_id = title + "; " + row_date

            speeches_dict[speech_id] = {'url': new_url, 'date': row_date}
            
    # scrape all speech pages
    for title in speeches_dict:
        
        # grab speech information
        url = speeches_dict[title]['url']
        speech_page = get_bs_html_text(requests.get(url))
        speech = speech_page.findAll('span', {'class': 'displaytext'})[0]
        speeches_dict[title]['speech'] = speech.get_text()

        time.sleep(1)
    
    # write json for each candidate campaign
    json_file_name = '../out/' + candidate + '.json'
    with open(json_file_name, 'w') as outfile:
        json.dump(speeches_dict, outfile)

November 3, 2012
September 1, 2012
July 23, 2012
July 5, 2012
August 13, 2012
August 12, 2012
October 4, 2012
November 4, 2012
September 7, 2012
September 17, 2012
August 1, 2012
September 13, 2012
August 22, 2012
September 18, 2008
January 26, 2008
August 21, 2007
August 4, 2008
November 4, 2008
July 24, 2008
February 7, 2008
September 10, 2008
October 26, 2008
September 8, 2008
September 25, 2008
June 25, 2008
June 26, 2008
June 27, 2008
October 9, 2008
October 16, 2008
September 24, 2008
June 15, 2008
September 24, 2008
April 14, 2008
June 23, 2007
September 30, 2008
October 23, 2008
October 27, 2008
July 20, 2008
September 12, 2008
June 22, 2007
June 24, 2008
January 8, 2008
September 6, 2016
October 15, 2016
September 14, 2016
April 27, 2016
July 27, 2016
July 11, 2016
September 12, 2016
November 2, 2012
September 24, 2012
October 5, 2012
May 17, 2012
September 1, 2012
October 18, 2012
October 12, 2012
September 25, 2012
October 12, 2012
April 13, 2012
August 30, 2011
February 7, 

### Process speech content

See http://www.nltk.org/book/ch03.html#fig-pipeline1 and surrounding text for more background information on the process.

In [11]:
# Split the speech text into tokens with NLTK's word_tokenize.
# NOTE(review): dt_candidacy_speech is not assigned anywhere in the visible
# notebook -- presumably a parsed speech element left over from an earlier,
# since-removed cell. Confirm and define it above, otherwise this cell raises
# NameError on a fresh kernel.
dt_speech_tokens = word_tokenize(dt_candidacy_speech.get_text())

In [12]:
# Wrap the token list in an nltk.Text object; the next cell iterates over it.
dt_speech_ttxt = nltk.Text(dt_speech_tokens)

In [13]:
# Normalise every token: lowercase it, delete all non-letter characters, and
# keep the result only when more than one character is left.
dt_speech_words = []
for token in dt_speech_ttxt:
    letters_only = re.sub('[^A-Za-z]+', '', token.lower())
    if len(letters_only) > 1:
        dt_speech_words.append(letters_only)

In [14]:
# Vocabulary: the distinct cleaned words, as a sorted list.
dt_speech_vocab = sorted(set(dt_speech_words))

In [15]:
# Convert the cleaned words to a pandas Series so the next cell can call
# value_counts() on them.
# NOTE(review): this rebinds dt_speech_words from a list to a Series, so the
# name means different things depending on which cells have run.
dt_speech_words = pd.Series(dt_speech_words)

In [17]:
# Frequency table: one row per distinct word (the index), with a single
# 'count' column holding how many times that word occurs.
dt_word_counts = dt_speech_words.value_counts().to_frame(name='count')