In [351]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import pickle

### 1. Get debate page URLs and dates by party.

In [5]:
def get_page(url, timeout=30):
    """
    Downloads a web page and parses it into a soup object.
    Args:
        url (str): url of the page to download
        timeout (float): seconds to wait for the server before giving up
    Returns:
        page (soup object): parsed html of the page
    Raises:
        requests.exceptions.RequestException: if the request fails, times
            out, or the server answers with an HTTP error status
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx responses instead of silently parsing an
    # error page as if it were the requested document.
    response.raise_for_status()
    return BeautifulSoup(response.text, "lxml")

In [160]:
# Fetch and parse the page at the debates.php address; the cells below
# read their date and description elements from it.
debates_index_url = 'http://www.presidency.ucsb.edu/debates.php'
main_page = get_page(debates_index_url)

In [204]:
dates = []
description = []
links = []

# Keep only the date cells whose text starts with a letter; empty cells and
# cells starting with any other character are skipped.
for row in main_page.select('.docdate'):
    cell_text = row.text
    if cell_text and cell_text[0].isalpha():
        dates.append(cell_text)

# Record every description together with the href of its first link, using
# an empty string as the placeholder when the row contains no <a> tag.
# NOTE(review): dates are filtered above but descriptions/links are not, so
# the three lists only line up if every .doctext row has a matching lettered
# .docdate cell — confirm against the page before building a table from them.
for row in main_page.select('.doctext'):
    description.append(row.text)
    anchor = row.find('a')
    links.append(anchor.get('href') if anchor is not None else '')

In [256]:
# Combine the three parallel lists into one table, one row per scraped entry.
links_df = pd.DataFrame(dict(date=dates, desc=description, url=links))

### 2. Get debate text by candidate, party, and date.

In [328]:
def loopUntilB(text, firstElement):
    """Get all text from firstElement up to the next <b> tag.

    Walks the parsed document in order, starting at firstElement, and appends
    the stripped content of every text node to `text` until the first <b> tag
    that follows firstElement is reached (or the document ends).

    Args:
        text (str): text accumulated so far; the collected text is appended
            to it
        firstElement: element to start from; None is accepted and yields
            `text` unchanged
    Returns:
        str: `text` followed by the stripped text nodes, concatenated with no
            separator between them
    """
    if firstElement is None:
        return text

    # The stopping tag is the same for every element before it, so look it
    # up once instead of once per visited element.
    stop_tag = firstElement.find_next('b')

    pieces = [text]
    element = firstElement
    # Iterative walk: long speeches span more elements than the recursion
    # limit allows, so one stack frame per element is not an option.
    while element is not None and element is not stop_tag:
        # Only text nodes (str subclasses) contribute. A tag's own .string
        # repeats the text of its single child, which is visited right after
        # the tag, so reading it from the tag would duplicate that text.
        if isinstance(element, str):
            pieces.append(element.strip())
        element = element.next_element
    # NOTE(review): pieces are joined without a separator, as before, so words
    # on either side of an element boundary run together — confirm whether
    # downstream analysis wants a space here.
    return ''.join(pieces)

In [339]:
def get_all_debates(url_dict, candidate_list):
    """Get all debate text by candidate and date.

    Args:
        url_dict (dict): maps a debate date to the url of its transcript page
        candidate_list (list of str): speaker names to search for; they are
            joined with '|' into a single regular expression, so any regex
            syntax inside a name is honoured
    Returns:
        dict: three parallel lists under the keys 'date', 'candidate' and
            'text', with one entry per text node matching a candidate name
    """
    results = {'date': [], 'candidate': [], 'text': []}

    # Joining an empty list gives the pattern '', which matches every string
    # and would record every text node on every page as a "candidate".
    if not candidate_list:
        return results

    # Compiled once; the pattern does not depend on the page being processed.
    speaker_pattern = re.compile('|'.join(candidate_list))

    for date, url in url_dict.items():
        soup = get_page(url)
        for spiel in soup.find_all(text=speaker_pattern):
            results['date'].append(date)
            # Drops the last character of the matched text node
            # (presumably the ':' after the speaker's name — confirm).
            results['candidate'].append(spiel[:-1])
            # The speech starts at the element right after the matched node.
            results['text'].append(loopUntilB('', spiel.next_element))
    return results

In [273]:
# Build {parsed date: transcript url} lookups from two row ranges of links_df.
# NOTE(review): the offsets 6:12 and 15:25 presumably select the Democratic
# and Republican debates as the index page was laid out when it was scraped —
# confirm before re-running against the live page.
dem_urls = {
    pd.to_datetime(entry['date']): entry['url']
    for _row_id, entry in links_df[6:12].iterrows()
}
gop_urls = {
    pd.to_datetime(entry['date']): entry['url']
    for _row_id, entry in links_df[15:25].iterrows()
}

# Upper-case names passed to get_all_debates as the candidates to search for.
dem_cands = ['CLINTON', 'SANDERS']
gop_cands = ['TRUMP', 'CRUZ', 'RUBIO', 'CARSON', 'KASICH']

In [343]:
# Scrape every selected transcript; each result is a dict of parallel lists
# with the keys 'date', 'candidate' and 'text'.
dem_debates = get_all_debates(url_dict=dem_urls, candidate_list=dem_cands)
gop_debates = get_all_debates(url_dict=gop_urls, candidate_list=gop_cands)

In [352]:
# pickle.dump writes bytes, so the target files must be opened in binary
# mode ('wb'); text mode fails on Python 3 and can corrupt data on Windows.
with open('transcripts/dem_debates.pkl', 'wb') as picklefile:
    pickle.dump(dem_debates, picklefile)

with open('transcripts/gop_debates.pkl', 'wb') as picklefile:
    pickle.dump(gop_debates, picklefile)