In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
# Download the debates index page. Fail fast on an HTTP error status so that
# an error page is never silently parsed as if it were the real listing.
response = requests.get('http://www.presidency.ucsb.edu/debates.php')
response.raise_for_status()
soup = BeautifulSoup(response.text)

In [3]:
# Collect every <td> whose CSS class is "doctext" (all matches, not only the
# first one) and report how many were found.
debate_links = soup.find_all('td', class_='doctext')
print(len(debate_links))

61


In [4]:
# Pull the first link out of each listing cell. A listing counts as having
# "no link" when it contains no <a> tag at all, or when its first <a> tag
# carries no href (appending None here would break the later downloads).
transcript_urls = []
n_sans_links = 0
for listing in debate_links:
    anchor = listing.find('a')
    href = anchor.get('href') if anchor is not None else None
    if href:
        transcript_urls.append(href)
    else:
        n_sans_links += 1

print('found %i urls, %i listings with no link.' % (len(transcript_urls), n_sans_links))

found 49 urls, 12 listings with no link.


In [100]:
# Fetch two sample transcripts for side-by-side inspection: the first URL in
# the list and the one sixteenth from the end.
url_first = transcript_urls[0]
response1 = requests.get(url_first)
url_sixteenth_from_end = transcript_urls[-16]
response2 = requests.get(url_sixteenth_from_end)

In [106]:
# The html of interest starts after a large block of javascript and XML, so
# each page is parsed only from the first "displaytext" span onwards.
TRANSCRIPT_MARKER = '<span class="displaytext">'

# new pages have the participants names in bold
start1 = response1.text.index(TRANSCRIPT_MARKER)
soup1 = BeautifulSoup(response1.text[start1:])

# old pages do not bold the participants names
start2 = response2.text.index(TRANSCRIPT_MARKER)
soup2 = BeautifulSoup(response2.text[start2:])

In [5]:
import re

In [6]:
def parse_speakers_and_quotes(page):
    """
    Groups the paragraphs of a debate transcript page by speaker.

    Parsing starts at the first ``<span class="displaytext">`` in the page.
    Every ``<p>`` after that point is searched for the first run of capital
    letters followed by a colon (e.g. ``BUSH:``). When such a label is found
    it becomes the current speaker and everything after the label is the
    quote. A paragraph with no label is a continuation and is attributed,
    in full, to the most recent speaker.

    :param page: response-like object whose ``text`` attribute holds the html

    :returns dict: {speaker label: [quotes]}. Labels keep their trailing
        colon (``'BUSH:'``). Paragraphs that appear before any label has
        been seen are collected under the key ``None``.

    :raises ValueError: if the page has no ``<span class="displaytext">``
    """

    # start at appropriate spot in page
    start = page.text.index('<span class="displaytext">')
    page_soup = BeautifulSoup(page.text[start:])

    speaker_dict = dict()

    current_speaker = None
    for paragraph in page_soup.find_all('p'):
        text = paragraph.text

        # search for name of person speaking
        label = re.search('[A-Z]+:', text)
        if label is None:
            # no name in this paragraph: the previous speaker is still talking
            quote = text
        else:
            current_speaker = label.group()
            # Keep everything after the label. Splitting the paragraph on
            # ':' would cut the quote short at any later colon ("9:30").
            quote = text[label.end():]

        speaker_dict.setdefault(current_speaker, []).append(quote)
    return speaker_dict

In [7]:
from requests_futures.sessions import FuturesSession

# Download every transcript through a FuturesSession created with
# max_workers=5. The first comprehension issues a session.get() for every
# URL and keeps the returned futures, in the same order as transcript_urls.
session = FuturesSession(max_workers=5)
futures = [session.get(url) for url in transcript_urls]

# Only after every request has been issued are the results collected:
# future.result() yields the page that is handed to the parser, giving one
# {speaker: [quotes]} dict per transcript URL.
debate_transcripts = [parse_speakers_and_quotes(future.result()) for future in futures]

In [10]:
import dill

In [11]:
# Persist the parsed transcripts to disk. The context manager guarantees the
# file is flushed and closed even if serialisation fails part-way through.
with open('debate_transcripts_list.dill', 'wb') as dill_file:
    dill.dump(debate_transcripts, dill_file)

In [90]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [85]:
# TF-IDF over 2- and 3-word n-grams with English stop words removed.
# min_df=1 keeps every n-gram that occurs in at least one document; this is
# the value shown in the fitted vectorizer's repr.
tfidf = TfidfVectorizer(ngram_range=(2, 3), stop_words='english', min_df=1)

In [87]:
# Learn the n-gram vocabulary and idf weights from the quotes stored under
# the 'BUSH' key.
# NOTE(review): `speaker_dict` is not assigned at notebook scope in any cell
# shown here (it is a local inside parse_speakers_and_quotes) - confirm where
# it comes from before relying on Restart & Run All.
bush_quotes = speaker_dict['BUSH']
tfidf.fit(bush_quotes)

TfidfVectorizer(analyzer=u'word', binary=False, charset=None,
        charset_error=None, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 3), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [88]:
# Display the vocabulary learned by fit(): a dict mapping each n-gram string
# to its integer feature index.
tfidf.vocabulary_

{u'purely waste weren': 2113,
 u'tough calls social': 2787,
 u'republicans going': 2223,
 u'head didn': 1171,
 u'flexible freeze': 916,
 u'choice know don': 329,
 u'soviets modernizing continue': 2467,
 u'strategic weapons': 2530,
 u'office social security': 1802,
 u'come long long': 363,
 u'way did': 2999,
 u'principle asymmetrical': 2058,
 u'doctor said beautiful': 678,
 u'expert national defense': 837,
 u'ran conviction': 2152,
 u'democratic primary': 608,
 u'country governor talks': 474,
 u'home went doctor': 1222,
 u'percent remember': 1944,
 u'deserves word': 622,
 u'athletics corny': 148,
 u'seen precinct politics': 2390,
 u'sound think flexible': 2454,
 u'leader think': 1428,
 u'going discipline': 1012,
 u'weren protected congress': 3044,
 u'years want': 3127,
 u'president taking': 2030,
 u'credit president': 503,
 u'tremendously think facing': 2801,
 u'defense possible': 569,
 u'criterion ll': 511,
 u'andrea didn predicate': 105,
 u'said said': 2301,
 u'25 percent years': 1,
 

In [91]:
# Plain count vectorizer over 2- and 3-word n-grams with English stop words
# removed.
countvec = CountVectorizer(
    stop_words='english',
    ngram_range=(2, 3),
)

In [94]:
# Learn the n-gram vocabulary from the quotes stored under the 'BUSH' key,
# then display the fitted vectorizer.
# NOTE(review): `speaker_dict` is not assigned at notebook scope in any cell
# shown here - confirm where it comes from.
bush_quotes = speaker_dict['BUSH']
countvec.fit(bush_quotes)
countvec

CountVectorizer(analyzer=u'word', binary=False, charset=None,
        charset_error=None, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 3), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)