# STEP 1: Presidential and Vice Presidential Debate scraper

## Polisticians Semester Project
### data sourced from CPD website

In [3]:
# import statements for packages to be used later

from bs4 import BeautifulSoup, SoupStrainer
from urllib.request import Request, urlopen
import pandas as pd
import time
import io
from selenium import webdriver
from ftfy import fix_encoding


## Create the webscraper function

In [8]:
def transcript_scraper():
    
    # Option so that selenium doesn't open a new Chrome window with each pull
    
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    
    t_0 = time.time() # will use to time the process later
     
    # input headers in a dict to bypass issue loading transcript site
    
    hd = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
           'Accept-Encoding': 'none',
           'Accept-Language': 'en-US,en;q=0.8',
           'Connection': 'keep-alive'}
    
    # URL base, from which we pull the set of links 
    
    root = 'https://www.debates.org/voter-education/debate-transcripts'
    
    # send request to site with headers to bypass Forbidden issue
    req = Request(root, headers = hd)
    
    # read site
    
    html_page = urlopen(req).read()
    
    # create HTML "soup" from which we will pull our information
    
    soup = BeautifulSoup(html_page, "lxml")
    
    # initiate web driver for Chrome
    driver = webdriver.Chrome(options=options)
    
    #use driver to open root url specified earlier
    driver.get(root)
    
    links = [] # empty list to which we will append the links to the transcripts
    
    for link in soup.findAll('a'): # locate the link
        links.append(str(link.get('href'))) # append the link to the link list
    
    # we only want the data if the link contains 'transcript', which is the convention used by the CPD
    t = [i for i in links if 'transcript' in i] 
    
    t = list(set(t)) # use set to ensure that there won't be anyu 
    
    fin_list = [] # empty list to which we will input the transcript strings
    
    for i in t: # iterate through our list of links
        
        loop_time = time.time()

        url = 'https://www.debates.org/' + str(i) # create the transcript URL
        
        #print(url) # for testing
        
        #Option so that selenium doesn't open a new Chrome window during each loop
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        
        #initiate web driver
        driver = webdriver.Chrome(options=options)
        
        #use driver to open url
        driver.get(url)
        
        #wait three seconds to load page, just in casse
        time.sleep(3)
        
        #extract page HTML and parse with BeautifulSoup
        html=driver.page_source
        soup=BeautifulSoup(html,'html.parser')
        
        # open file for appending ('a') # test appending data to text
        #f = io.open('debate_final.txt', 'a', encoding = 'utf-8')
        
        # transcript name and date are tagged with 'h1' on the website, so we pull those data
        h = soup('h1')
        h = str(h)[1:-1].replace('<h1>', '').replace('</h1>', '') # replace the taggings so the data are clean
        #print(h) # print the transcript for testing
          
        tr = str(soup('p')) # pull each block of the transcript parsed above
        spl_tr = tr.split('</p>') # split the transcript into each speaker's block of text
        
        
        ## Speaker identification and transcript cleansing steps ##
        
        
        l = 1 # will use this as a counter to indicate the transcript line 
        speaker = ''
        for j in spl_tr:
            fix_encoding(j)
            j = j.replace('<p>', '')
            j = j.replace('</p>', '') # similarly remove HTML tagging from each block of transcript text
            j = j[2:].strip() # set the transcript block
            
            # find first instance of a space in the block of text. If word is Mr., Ms., or Mrs., remove it 
            
            if j.split(' ', 1)[0].strip() in['MR.', 'MS.', 'MRS.' ]: 
                temp = j.split(' ', 1)[1].strip()
                first_word = temp.split(' ', 1)[0]
                #print(type(first_word), first_word)
            else:
                
                # otherwise, take the first word from the block, which will be the speaker name in most instances
                
                first_word = j.split(' ', 1)[0].strip() 
            
            # the majority of transcripts follow the convention <Speaker>: <Transcript>. 
            # We want to check each block of text's first word for a colon to see if we have a speaker
            
            try: 
                    
                last_char = first_word[-1] # pull the last character in the first string
                
            except:
                last_char = '' # if there is an error due to empty data, set the string to a blank

            #print(last_char)
            try:
            
            # our initial stab at setting a speaker for each block of text sets the speaker as the word before the colon
                if last_char == ':' and first_word.upper() == first_word:
                    #print(True)
                    speaker = first_word.replace(':', '') # replace the colon
                    #print(speaker)
            except:
                
                # if there's an error, the data aren't in the <Speaker>: <Transcript> format in which we are interested, 
                # so we skip to next the next transcript
                # BUT, in this case, if there's no speaker in the text, we want to keep the prior speaker since this 
                # block of text is likely associated with the same speaker. We tested this later on
                
                continue 
                    
            fin_list.append((l, h, speaker, j)) # append the line count, debate title, speaker, and 
            #f.write(str(l) +',' + h + ',' + speaker + ',' +  j)
            
            l += 1 # add one to the line count
        
        # How many loops have run and in how long?
        
        print('{0} loops for {1} took {2: .2f} seconds.'.format(l, h, time.time()-loop_time))

     
        ######################################     
    
    # once we've looped through each debate link, collecting each block of text, corresponding speaker,
    # line count and the debate title and date, we need to create a data from from this list
    
    
    # set columns from list created in loop and insert data
    df = pd.DataFrame(fin_list, columns = ['Line Count', 'Debate', 'Speaker', 'Transcript']) 
    df.to_csv('Transcripts_df.csv', index = False) # save dataframe to csv (no index)
        
    print('Finished in {0: .2f} seconds'.format(time.time()-t_0)) # let the user know the process worked
    
    #return fin_list # have the function return the list, if you'd like


## Run the function!!

In [9]:
# run the function to pull the transcripts
transcript_scraper()

174 loops for October 3, 2000 Transcript took  6.57 seconds.
114 loops for October 5, 2000 Debate Transcript took  6.58 seconds.
20 loops for Debate Transcript Translations took  12.46 seconds.
94 loops for October 6, 1976 Debate Transcript took  6.48 seconds.
686 loops for October 8, 2004 Debate Transcript took  8.73 seconds.
479 loops for October 19, 1992 Debate Transcript took  6.51 seconds.
119 loops for October 9, 1996 Debate Transcript took  8.52 seconds.
131 loops for October 28, 1980 Debate Transcript took  6.46 seconds.
75 loops for September 26, 1960 Debate Transcript took  6.71 seconds.
60 loops for Debate Transcripts took  6.42 seconds.
178 loops for October 17, 2000 Debate Transcript took  13.61 seconds.
547 loops for October 16, 2012 Debate Transcript took  24.81 seconds.
156 loops for October 5, 1988 Debate Transcripts took  6.55 seconds.
124 loops for October 11, 1984 Debate Transcript took  9.00 seconds.
676 loops for October 11, 2012 Debate Transcript took  6.58 secon

### Script typically takes approx. 6.5 minutes to extract all of the transcripts
### Output is the transcripts_df.csv file