In [None]:
# Web Scraping Political Speeches from the 2016 US Presidential Election

In [None]:
from bs4 import BeautifulSoup as bs
import requests
import numpy as np
import csv
import pandas

In [None]:
# Start page of the scrape and the prefix used to build absolute URLs.
source_url = "http://www.presidency.ucsb.edu/2016_election.php"
base_url = "http://www.presidency.ucsb.edu/"

# A timeout keeps the cell from hanging forever on a stalled connection
# (30 seconds is an arbitrary but generous choice).
response = requests.get(source_url, timeout=30)
# Stop here with a clear HTTP error instead of parsing an error page and
# failing later when the expected table cells are missing.
response.raise_for_status()
soup = bs(response.text, "html.parser")

In [None]:
def to_text(bs_expression):
    """Return the ``.text`` attribute of each element of ``bs_expression``.

    ``bs_expression`` is any iterable whose items expose ``.text`` (for
    example the result of ``find_all``); the result is a new list in the
    same order.
    """
    return [node.text for node in bs_expression]

In [None]:
#specifies which candidates we want to scrape (by indicating their row index)
# Query the start page once; the original re-ran the selector for every index.
doctext_cells = soup.select("td.doctext")

#these are the indices for all Republicans: 10, 12, ..., 40 (16 cells)
candidate_rows = [doctext_cells[i] for i in range(10, 42, 2)]

#for dems, uncomment the line below (indices 0, 2, ..., 8)
# candidate_rows = [doctext_cells[i] for i in range(0, 10, 2)]

In [None]:
#gets names and last names of all candidates
# The name is the text of the first <span> inside each candidate row.
names = [
    span_texts[0]
    for span_texts in (to_text(row.find_all("span")) for row in candidate_rows)
]
# Last name = second space-separated token of the full name.
# NOTE(review): assumes every name is exactly "First Last"; a middle name or
# initial would make index 1 pick the wrong token - confirm against the page.
lastnames = [
    name_parts[1]
    for name_parts in (full_name.split(" ") for full_name in names)
]

In [None]:
#find links to each candidate's list of speeches, statements, and press releases
# All anchors of each candidate row; the first three are used, in the order
# campaign speeches, statements, press releases.
link_of_sources = [row.find_all("a") for row in candidate_rows]
link_of_campaign_speeches, link_of_statements, link_of_press_releases = (
    [anchors[position]['href'] for anchors in link_of_sources]
    for position in range(3)
)

In [None]:
#generate urls
# Prefix every collected href with base_url, one list per document kind.
campaign_speeches_url, statements_url, press_releases_url = (
    [base_url + href for href in hrefs]
    for hrefs in (link_of_campaign_speeches, link_of_statements, link_of_press_releases)
)

In [None]:
# One GET per candidate for each of the three listing pages.
response_campaign_speeches = list(map(requests.get, campaign_speeches_url))
response_statements = list(map(requests.get, statements_url))
response_press_releases = list(map(requests.get, press_releases_url))

#html of candidate pages listing individual speeches
soup_campaign_speeches = [bs(page.text, "html.parser") for page in response_campaign_speeches]
soup_statements = [bs(page.text, "html.parser") for page in response_statements]
soup_press_releases = [bs(page.text, "html.parser") for page in response_press_releases]

In [None]:
# finds links to individual speeches - use indices 49 through (len - 5)
FIRST_DOC_ROW = 49   # index of the first <tr> that is used
TRAILING_ROWS = 4    # number of <tr> elements skipped at the end of the page


def _document_urls(listing_soup):
    """Return the absolute document URLs found on one listing page.

    The page's <tr> elements are selected once (the original re-ran the
    selector for every index), the leading and trailing rows are skipped, and
    for each remaining row the href of its first <a> is split on ".." and the
    part after it is appended to ``base_url``.
    """
    rows = listing_soup.select("tr")
    # Explicit stop index (not a negative slice) so the bounds match
    # range(49, len(rows) - 4) for every page length.
    wanted = rows[FIRST_DOC_ROW:len(rows) - TRAILING_ROWS]
    return [base_url + row.a['href'].split("..")[1] for row in wanted]


# One inner list of URLs per candidate.
speech_url = [_document_urls(listing) for listing in soup_campaign_speeches]
# NOTE: this rebinds statements_url, which until now held the flat list of
# statement listing-page URLs.
statements_url = [_document_urls(listing) for listing in soup_statements]
press_url = [_document_urls(listing) for listing in soup_press_releases]

In [None]:
# One GET per document URL, keeping the per-candidate grouping.
response_speech = [list(map(requests.get, urls)) for urls in speech_url]
response_statement = [list(map(requests.get, urls)) for urls in statements_url]
response_press = [list(map(requests.get, urls)) for urls in press_url]

#html of individual speeches
soup_speech = [[bs(page.text, "html.parser") for page in pages] for pages in response_speech]
soup_statement = [[bs(page.text, "html.parser") for page in pages] for pages in response_statement]
soup_press = [[bs(page.text, "html.parser") for page in pages] for pages in response_press]

In [None]:
#find titles of each text
# Title = text of the first <span class="paperstitle"> on each document page.
# Uses find_all (the spelling already used for the candidate rows) instead of
# the legacy findAll alias, and states the extraction once for all three
# document kinds. Each result is a list (per candidate) of lists (per document).
speech_titles, statement_titles, press_titles = (
    [
        [to_text(doc.find_all("span", {"class": "paperstitle"}))[0] for doc in candidate_docs]
        for candidate_docs in soups
    ]
    for soups in (soup_speech, soup_statement, soup_press)
)

In [None]:
#find the body text of each document
# Body = text of the first <span class="displaytext"> on each document page.
# Uses find_all (the spelling already used for the candidate rows) instead of
# the legacy findAll alias, and states the extraction once for all three
# document kinds. Each result is a list (per candidate) of lists (per document).
speech_text, statement_text, press_text = (
    [
        [to_text(doc.find_all("span", {"class": "displaytext"}))[0] for doc in candidate_docs]
        for candidate_docs in soups
    ]
    for soups in (soup_speech, soup_statement, soup_press)
)

In [None]:
#find date of each text
# Date = text of the first <span class="docdate"> on each document page.
# Uses find_all (the spelling already used for the candidate rows) instead of
# the legacy findAll alias, and states the extraction once for all three
# document kinds. Each result is a list (per candidate) of lists (per document).
speech_date, statement_date, press_date = (
    [
        [to_text(doc.find_all("span", {"class": "docdate"}))[0] for doc in candidate_docs]
        for candidate_docs in soups
    ]
    for soups in (soup_speech, soup_statement, soup_press)
)

In [None]:
#remove non-ASCII characters
def _ascii_only(nested_strings):
    """Return a copy of a list of lists of strings with non-ASCII characters dropped.

    ``encode('ascii', errors='ignore')`` discards the characters, but on
    Python 3 it returns ``bytes``, which ``csv.writer`` would write out as the
    literal text ``b'...'``. Decoding straight back yields ASCII-only text.
    """
    return [
        [s.encode('ascii', errors='ignore').decode('ascii') for s in group]
        for group in nested_strings
    ]


clean_speech_text = _ascii_only(speech_text)
clean_statement_text = _ascii_only(statement_text)
clean_press_text = _ascii_only(press_text)

clean_speech_titles = _ascii_only(speech_titles)
clean_statement_titles = _ascii_only(statement_titles)
clean_press_titles = _ascii_only(press_titles)

In [None]:
#function takes in an index (which row are they on the start page?), a type ('c', 's', or 'p')
#and three lists of lists that we've generated
def generate_csv(index, type, title, text, date, party='R'):
    """Write one candidate's documents of one kind to ``csv/<lastname>_<type>.csv``.

    Parameters
    ----------
    index : int
        Position of the candidate in the module-level ``names`` and
        ``lastnames`` lists and in the outer level of ``title``/``text``/``date``.
    type : str
        Document-kind code ('c', 's', or 'p'); written to the ``Type`` column
        and used in the file name.
    title, text, date : list of lists
        Per-candidate lists of titles, bodies and dates; one row is written
        per entry of ``title[index]``.
    party : str, optional
        Value of the ``Party`` column. Defaults to 'R', the value that was
        previously hard-coded, so existing calls are unaffected.

    Raises
    ------
    IndexError
        If ``text[index]`` or ``date[index]`` is shorter than ``title[index]``.
    """
    import os  # local import: only this function needs it

    # Create the output directory on first use instead of failing in open().
    if not os.path.isdir('csv'):
        os.makedirs('csv')
    path = 'csv/' + lastnames[index] + '_' + type + '.csv'

    header = ['Candidate', 'Party', 'Type', 'Date', 'Title', 'Speech']
    # NOTE(review): on Python 3 the csv docs recommend open(..., newline='');
    # it is left out because Python 2's built-in open() rejects that keyword.
    with open(path, 'w') as output_file:
        csv_writer = csv.writer(output_file)
        csv_writer.writerow(header)
        for i in range(len(title[index])):
            csv_writer.writerow(
                [names[index], party, type, date[index][i], title[index][i], text[index][i]]
            )
    # Re-read the file that was just written. The result is discarded, so this
    # only has the effect of raising if pandas cannot parse the output.
    pandas.read_csv(path)

In [None]:
#for each candidate in names, generate three csv files
# (type code, titles, texts, dates) for campaign speeches, statements and
# press releases, in the order the files are written for each candidate.
document_sets = (
    ('c', clean_speech_titles, clean_speech_text, speech_date),
    ('s', clean_statement_titles, clean_statement_text, statement_date),
    ('p', clean_press_titles, clean_press_text, press_date),
)
for candidate_index, _ in enumerate(names):
    for doc_type, titles, texts, dates in document_sets:
        generate_csv(candidate_index, doc_type, titles, texts, dates)