In [1]:
import re
import time
import numpy as np
import pandas as pd
from itertools import groupby

from lxml import html
from bs4 import BeautifulSoup
import requests

from __future__ import division  # Python 2 users only
import nltk, re, pprint
from nltk import word_tokenize
from pattern.en import parse

# from selenium import webdriver

General scraping parameters from the UCSB Presidential Election Database:

In [2]:
# Base URL of the UCSB site; every page path used below is appended to it.
ucsb_base_url = "http://www.presidency.ucsb.edu/"

# Fetch the site's 2016 election page. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() surfaces an HTTP
# error here instead of letting an error page be used later.
all_elections_list_page = requests.get(ucsb_base_url + "2016_election.php", timeout=30)
all_elections_list_page.raise_for_status()

In [3]:
def get_bs_html_text(page_url):
    """Parse a fetched page into a BeautifulSoup tree.

    Input: response object whose ``.text`` attribute holds the page's HTML
        (e.g. the value returned by ``requests.get``). Despite the parameter
        name, this is not a URL string.
    Output: BeautifulSoup object of page contents, built with the
        'html.parser' backend.
    """
    return BeautifulSoup(page_url.text, 'html.parser')

In [4]:
def remove_super_dir_prefix(url_path):
    """Strip leading parent-directory ("../") segments from a relative path.

    Input: string
    Output: string with initial superdirectory prefix removed. Only "../"
        segments at the very start are dropped (repeated ones included); any
        "../" that appears later in the path is left untouched.
    """
    prefix = "../"
    # Loop so that stacked prefixes such as "../../x" are fully removed.
    while url_path.startswith(prefix):
        url_path = url_path[len(prefix):]
    return url_path

# Trump scraping

### Get all links from specific candidate and document type

Scrape all links from the page that lists Trump's election documents, restricted to speeches and remarks only.

In [5]:
# ucsb page specific to trump
dt_speeches_remarks_url = (
    ucsb_base_url
    + "2016_election_speeches.php?candidate=45&campaign=2016TRUMP&doctype=5000"
)

# Keep the raw HTTP response under its own name so that `dt_speech_list_page`
# only ever refers to the parsed page.
dt_speech_list_response = requests.get(dt_speeches_remarks_url, timeout=30)
# Fail here on an HTTP error rather than parsing an error page below.
dt_speech_list_response.raise_for_status()
dt_speech_list_page = get_bs_html_text(dt_speech_list_response)

Most speech lists are found in a table that is center-aligned.

In [6]:
# Take the first center-aligned table on the list page. `find_all` is the
# current name for the deprecated `findAll` alias; indexing with [0] raises
# IndexError when no such table is found.
dt_speech_table = dt_speech_list_page.find_all('table', {'align': 'center'})[0]

In [7]:
# Collect the href of every anchor in the table. href=True restricts the match
# to anchors that actually carry an href attribute, so an anchor without one
# is skipped instead of raising KeyError.
dt_speech_links = [anchor['href'] for anchor in dt_speech_table.find_all('a', href=True)]

### Get speech content

Get text from a specific page containing speech info:

In [8]:
# Build the absolute URL of the first link in the list: its "../" prefix is
# dropped so the path can be appended to the base URL.
dt_candidacy_url = ucsb_base_url + remove_super_dir_prefix(dt_speech_links[0])

# The response gets its own name and is parsed in the same step. Rebinding
# `dt_candidacy_page` from response to parsed page in a separate step meant
# that re-running the parse alone would re-parse the already-parsed page's
# tag-stripped text.
dt_candidacy_response = requests.get(dt_candidacy_url, timeout=30)
# Fail here on an HTTP error rather than parsing an error page.
dt_candidacy_response.raise_for_status()
dt_candidacy_page = get_bs_html_text(dt_candidacy_response)

Most speeches' contents are found in a `span` element with class `displaytext`.

In [10]:
# Take the first <span class="displaytext"> on the page. `find_all` replaces
# the deprecated `findAll` alias; indexing with [0] raises IndexError when the
# page has no such span.
dt_candidacy_speech = dt_candidacy_page.find_all('span', {'class': 'displaytext'})[0]

### Process speech content

See http://www.nltk.org/book/ch03.html#fig-pipeline1 and surrounding text for more background information on the process.

In [11]:
# Flatten the speech container to plain text, then split that text into
# tokens with NLTK's word_tokenize.
dt_speech_plain_text = dt_candidacy_speech.get_text()
dt_speech_tokens = word_tokenize(dt_speech_plain_text)

In [12]:
# Wrap the token list in an nltk.Text object; the next step iterates over it
# token by token.
dt_speech_ttxt = nltk.Text(dt_speech_tokens)

In [13]:
# Lower-case each token and delete every run of non-letter characters, then
# keep only the results that are longer than one character.
lowered_letters_only = (
    re.sub('[^A-Za-z]+', '', token.lower())
    for token in dt_speech_ttxt
)
dt_speech_words = [word for word in lowered_letters_only if len(word) > 1]

In [14]:
# Distinct cleaned words, in sorted order.
dt_speech_vocab = sorted(set(dt_speech_words))

In [15]:
# NOTE(review): this rebinds `dt_speech_words` to a pandas Series built from
# its previous value; anything that runs after this line sees the Series.
dt_speech_words = pd.Series(dt_speech_words)

In [17]:
# Tally how often each word occurs and expose the tallies as a one-column
# DataFrame whose column is named 'count' (the words form the index).
word_frequencies = dt_speech_words.value_counts()
dt_word_counts = word_frequencies.to_frame(name='count')