In [1]:
import numpy as np
import pandas as pd
import re, string
from lxml import etree, html

In [2]:
# Fetch the Miller Center speech index and keep its root element in `doc`;
# relative hrefs are rewritten in place so extracted links are full URLs.
index_page = html.parse('http://millercenter.org/president/speeches')
doc = index_page.getroot()
doc.make_links_absolute()

In [3]:
# Parse the same index page a second time with the generic HTML parser;
# `tree` is the resulting element tree.
parser = etree.HTMLParser()
tree = etree.parse('http://millercenter.org/president/speeches', parser=parser)

In [4]:
# Text of every <h2 class="president"> heading, with the " ↑" marker removed.
pres = [
    heading.text_content().replace(' ↑', '')
    for heading in doc.cssselect('h2.president')
]

In [5]:
# Source line number of every <h2> whose class attribute is exactly
# "president". `iter()` replaces the deprecated `getiterator()`.
pres_line = [elt.sourceline for elt in tree.iter('h2')
             if elt.get('class') == 'president']

In [6]:
# Pair each heading's source line with the heading text: (line, name).
presidents = [(line_no, name) for line_no, name in zip(pres_line, pres)]

In [7]:
# Sentinel entry: a very large line number with no name, so the last real
# president has an upper bound when links are bucketed by source line.
presidents.extend([(100000000, None)])

In [8]:
# Build one [president, title, date, link] row per speech link.
# A link belongs to president i when its source line falls strictly between
# that president's heading line and the next entry's line in `presidents`.
speeches = []
duplicates = ['Transcript', 'Audio', 'Video']

# "(Month DD, YYYY)"-style suffix carried by most link texts.
date_pattern = r'\(\w*\s\d*,\s\d*\)'
# Escape the punctuation so characters such as ']' and '\\' are literal
# members of the character class.
punctuation_pattern = '[' + re.escape(string.punctuation) + ']'

# Select the candidate links once instead of once per president.
entry_links = doc.cssselect('div.entry a')

# The final element of `presidents` is the sentinel upper bound, so stop one
# short of it rather than relying on short-circuit evaluation to avoid
# indexing past the end of the list.
for i in range(len(presidents) - 1):
    start_line, president = presidents[i]
    end_line = presidents[i + 1][0]
    for link in entry_links:
        text = link.text_content()
        if not (start_line < link.sourceline < end_line) or text in duplicates:
            continue
        found_dates = re.findall(date_pattern, text)
        if found_dates:
            # Strip the parentheses and comma: "(August 28, 2008)" -> "August 28 2008".
            date = re.sub(punctuation_pattern, '', found_dates[0])
            title = re.sub(r'\s' + date_pattern, '', text)
        else:
            # No recognisable date; still drop any trailing parenthetical.
            date = np.nan
            title = re.sub(r'\s\(.*\)', '', text)
        speeches.append([president, title, date, link.get('href')])

In [9]:
columns = ['president', 'title', 'date', 'link']
# Build the frame straight from the list of rows. Routing it through
# np.array() coerced every value (including NaN dates) to a string and
# produced a 1-D array for an empty list, which does not fit four columns.
df_speeches = pd.DataFrame(speeches, columns=columns)

In [10]:
# Convert the textual dates to datetime values.
df_speeches['date'] = df_speeches['date'].pipe(pd.to_datetime)

In [11]:
# Remove, in place, every row that has a missing value in any column.
df_speeches.dropna(axis=0, how='any', inplace=True)

In [12]:
# Preview the first five rows.
df_speeches.head(n=5)

Unnamed: 0,president,title,date,link
0,Barack Obama,Acceptance Speech at the Democratic National C...,2008-08-28,http://millercenter.org/president/obama/speech...
1,Barack Obama,Remarks on Election Night,2008-11-04,http://millercenter.org/president/obama/speech...
2,Barack Obama,Inaugural Address,2009-01-20,http://millercenter.org/president/obama/speech...
3,Barack Obama,Remarks on the Lilly Ledbetter Fair Pay Restor...,2009-01-29,http://millercenter.org/president/obama/speech...
4,Barack Obama,Remarks on the American Recovery and Reinvestm...,2009-02-07,http://millercenter.org/president/obama/speech...


In [13]:
def get_transcript(link):
    """Download one speech page and return its transcript text.

    Parameters
    ----------
    link : str
        URL of the speech page.

    Returns
    -------
    str or float
        Text of the first ``div.indent`` element, joined with spaces, with
        non-breaking spaces replaced by regular spaces, a leading
        "Transcript" label removed and trailing whitespace stripped.
        ``np.nan`` when the page cannot be fetched or parsed, or when it has
        no ``div.indent`` element.
    """
    try:
        speech_doc = html.parse(link).getroot()
        speech = ' '.join(speech_doc.cssselect('div.indent')[0].itertext()).replace(u'\xa0', ' ')
    except Exception:
        # Best-effort scrape: one bad page must not abort the whole run.
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still stop a long-running apply().
        return np.nan
    speech = re.sub(r'^\s*Transcript\s*', '', speech)
    return re.sub(r'\s*$', '', speech)

In [14]:
# Fetch the transcript behind every speech link (one request per row).
df_speeches['transcript'] = df_speeches['link'].map(get_transcript)

In [15]:
# Remove, in place, every row with a missing value; get_transcript returns
# NaN when a transcript could not be retrieved.
df_speeches.dropna(axis=0, how='any', inplace=True)

In [16]:
# Preview the first five rows, now including the transcript column.
df_speeches.head(n=5)

Unnamed: 0,president,title,date,link,transcript
0,Barack Obama,Acceptance Speech at the Democratic National C...,2008-08-28,http://millercenter.org/president/obama/speech...,To Chairman Dean and my great friend Dick Durb...
1,Barack Obama,Remarks on Election Night,2008-11-04,http://millercenter.org/president/obama/speech...,If there is anyone out there who still doubts ...
2,Barack Obama,Inaugural Address,2009-01-20,http://millercenter.org/president/obama/speech...,I stand here today humbled by the task before ...
3,Barack Obama,Remarks on the Lilly Ledbetter Fair Pay Restor...,2009-01-29,http://millercenter.org/president/obama/speech...,It is fitting that with the very first bill I ...
4,Barack Obama,Remarks on the American Recovery and Reinvestm...,2009-02-07,http://millercenter.org/president/obama/speech...,"Thank you, everybody. Please have a seat. Yo..."


In [17]:
# Persist the scraped speeches without the index column.
df_speeches.to_csv(path_or_buf='speeches_raw.csv', index=False)

In [18]:
# Collect one [text, text, link text, href] row per listed document for each
# 2016 candidate page.
candidates = []
for url in ['http://www.presidency.ucsb.edu/2016_election_speeches.php?candidate=70&campaign=2016CLINTON&doctype=5000',
            'http://www.presidency.ucsb.edu/2016_election_speeches.php?candidate=115&campaign=2016TRUMP&doctype=5000']:
    # Use a dedicated name here: rebinding the notebook-level `doc` would make
    # the earlier speech-index cells operate on this page if they are re-run.
    cand_doc = html.parse(url).getroot()
    cand_doc.make_links_absolute()

    # Single pass over the table cells: cells that directly contain an <a>
    # give (link text, href); all other cells contribute their plain text.
    links = []
    data = []
    for cell in cand_doc.cssselect('td.listdate'):
        anchor = cell.find('a')
        if anchor is not None:
            links.append((cell.text_content(), anchor.get('href')))
        else:
            data.append(cell.text_content())

    # Plain-text cells are consumed two at a time, one pair per listed
    # document (presumably candidate name then date - confirm against the page).
    data = list(zip(data[::2], data[1::2]))

    # zip() stops at the shorter sequence, so unmatched trailing cells are ignored.
    joined = [list(pair + link) for pair, link in zip(data, links)]
    candidates.extend(joined)

In [19]:
columns = ['president', 'date', 'title', 'link']
# Build the frame straight from the list of rows; np.array() would coerce the
# values to a single dtype and yields a 1-D array for an empty list, which
# does not fit four columns.
df_candidates = pd.DataFrame(candidates, columns=columns)
df_candidates['date'] = pd.to_datetime(df_candidates['date'])

In [20]:
# Keep only documents dated after 1 January 2015.
df_candidates = df_candidates[
    df_candidates['date'].gt(pd.to_datetime('2015-01-01'))
]

In [21]:
# Drop rows whose title mentions an interview or a question-and-answer
# session. `~` negates the mask instead of comparing it with `== False`;
# na=True makes rows with a missing title count as matches, so they are
# dropped exactly as the `== False` comparison dropped them.
df_candidates = df_candidates[
    ~df_candidates['title'].str.contains('Interview|Question and Answer', na=True)
]

In [22]:
def cand_transcript(url):
    """Return the text of the first ``span.displaytext`` element at ``url``.

    The page is parsed, its links are made absolute, and the text fragments
    of the first matching span are joined with single spaces. Raises
    ``IndexError`` when the page has no ``span.displaytext`` element.
    """
    page = html.parse(url).getroot()
    page.make_links_absolute()
    text_span = page.cssselect('span.displaytext')[0]
    return ' '.join(text_span.itertext())

In [23]:
# Fetch the transcript behind every candidate link (one request per row).
df_candidates['transcript'] = df_candidates['link'].map(cand_transcript)

In [24]:
# Drop transcripts whose text starts with at most 20 characters followed by a
# colon (presumably a "SPEAKER:" label marking dialogue-style documents -
# confirm). The group is non-capturing so pandas does not emit its
# "pattern has match groups" warning; na=True drops rows with a missing
# transcript, as the former `== False` comparison did.
df_candidates = df_candidates[
    ~df_candidates['transcript'].str.contains(r'^(?:.{,20}?):', na=True)
]
# NOTE(review): 163 is a hard-coded index label; the reason this row is
# excluded is not visible here - confirm it still refers to the intended
# document. Reassigning avoids mutating a filtered frame in place.
df_candidates = df_candidates.drop(163, axis=0)

  if __name__ == '__main__':


In [25]:
# Persist the candidate speeches without the index column.
df_candidates.to_csv(path_or_buf='candidates.csv', index=False)