# In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import threading, requests
from queue import Queue
from time import sleep
import pdb

# In [None]:
class __data_mining__():
    """Scrape a debate index page and every transcript page it links to.

    ``start_scrape`` downloads ``inp_url``, walks the ``<tr>`` rows of its
    ``field-body`` container, fetches each linked transcript on a pool of
    worker threads, and writes the collected records to an Excel workbook.

    Args:
        inp_url: URL of the index page listing the debates.
        timeout: seconds passed to every ``requests.get`` call, so that a
            stalled connection cannot block a worker (and therefore
            ``start_scrape``) forever.
    """

    def __init__(self, inp_url, timeout=30):
        self.inp_url = inp_url
        self.timeout = timeout
        # one list per scraped debate, in the order given by col_headers
        self.out_list = []
        # filled by start_scrape() from out_list
        self.d_frame = pd.DataFrame()
        self.col_headers = ['full date of debate','year of debate','classification if speech is : General, Primary-D (democratic), Primary-R (republican)','link/URL to debate','title of debate','participant names as a list [name 1, name 2 etc]','moderator name','full transcript of debate','text as a dictionary{speaker_name: (first speech, second speech) etc }','text as a list of lists [ [speaker 1, text1] etc ]']

    @staticmethod
    def _classify_debate(title_of_debate):
        """Return the classification label derived from the debate title."""
        if 'Democratic' in title_of_debate:
            return 'Primary-D (democratic)'
        if 'Republican' in title_of_debate:
            return 'Primary-R (republican)'
        return 'General'

    @staticmethod
    def _split_names(text):
        """Split a ';'-separated paragraph into a list of clean names.

        Whitespace (including newlines) inside each name is collapsed to a
        single space and empty pieces are dropped, so the result is the same
        whether or not the paragraph ends with a trailing ';'.
        """
        cleaned = (' '.join(piece.split()) for piece in text.split(';'))
        return [name for name in cleaned if name]

    @staticmethod
    def _parse_transcript(paragraphs):
        """Turn transcript paragraphs into the three transcript columns.

        Args:
            paragraphs: objects exposing ``.text`` and ``.strong`` (a tag
                with its own ``.text``, or ``None``), e.g. the ``<p>`` tags
                returned by BeautifulSoup.

        Returns:
            ``(full_transcript, speeches_by_speaker, speech_sequence)`` where
            ``full_transcript`` is every non-empty paragraph joined by a
            space, ``speeches_by_speaker`` maps speaker name to the list of
            that speaker's paragraphs, and ``speech_sequence`` is a list of
            ``[speaker, text]`` pairs in document order.
        """
        transcript_parts = []
        speeches_by_speaker = {}
        speech_sequence = []
        # speaker of the most recent labelled paragraph
        current_speaker = None
        for paragraph in paragraphs:
            text = paragraph.text.strip()
            if not text:
                continue
            transcript_parts.append(text)
            strong_tag = paragraph.strong
            label = strong_tag.text.strip() if strong_tag is not None else ''
            if label and text.startswith(label):
                # a leading <strong> is the speaker label; everything after
                # it (including any later ':') is the speech itself
                current_speaker = label.rstrip(':').strip()
                speech = text[len(label):].strip().lstrip(':').strip()
            else:
                # no leading label: continuation of the current speaker
                speech = text
            if current_speaker is None:
                # text before the first speaker only goes into the full
                # transcript, there is nobody to attribute it to
                continue
            speeches_by_speaker.setdefault(current_speaker, []).append(speech)
            speech_sequence.append([current_speaker, speech])
        return ' '.join(transcript_parts), speeches_by_speaker, speech_sequence

    def _build_record(self, row):
        """Scrape one index row; return its record list or ``None`` to skip.

        A row is skipped when it has no date/title pair, no link, when the
        transcript page cannot be downloaded, or when that page has fewer
        than the two leading paragraphs (participants, moderators).
        """
        # date and title are separated by a non-breaking space in the row text
        rows_text = row.text.strip().replace('\n', '')
        parts = rows_text.split('\xa0')
        if len(parts) < 2:
            return None
        full_date_debate, title_of_debate = parts[0].strip(), parts[1].strip()
        link_tag = row.find('a')
        if link_tag is None or not link_tag.get('href'):
            return None
        speach_link = link_tag['href']
        print(speach_link)
        speach_type = self._classify_debate(title_of_debate)
        try:
            speach_resp = requests.get(speach_link, timeout=self.timeout)
            speach_resp.raise_for_status()
        except requests.RequestException as err:
            print('skipping {}: {}'.format(speach_link, err))
            return None
        speach_soup = BeautifulSoup(speach_resp.text, 'lxml')
        page = speach_soup.find("div", attrs={"class": "field-docs-content"})
        paragraphs = page.find_all('p') if page is not None else []
        if len(paragraphs) < 2:
            print('skipping {}: no transcript content found'.format(speach_link))
            return None
        # the first two paragraphs hold participants and moderators
        participant_list = self._split_names(paragraphs[0].text)
        moderator_list = self._split_names(paragraphs[1].text)
        full_transript, text_as_dict, text_as_list = self._parse_transcript(paragraphs[2:])
        # the year is whatever follows the last comma of the date, if any
        if ',' in full_date_debate:
            year_of_debate = full_date_debate.rsplit(',', 1)[-1].strip()
        else:
            year_of_debate = ''
        return [full_date_debate, year_of_debate, speach_type, speach_link, title_of_debate, participant_list, moderator_list, full_transript, text_as_dict, text_as_list]

    def data_mine(self, row):
        """Scrape one index-table row and append its record to ``out_list``.

        Args:
            row: a BeautifulSoup ``<tr>`` tag taken from the index table.

        Returns:
            The appended record (a list ordered like ``col_headers``), or
            ``None`` when the row was skipped.
        """
        record = self._build_record(row)
        if record is not None:
            self.out_list.append(record)
        return record

    def start_scrape(self, num_threads=40, xlsx_path=r'Scraping_Xls.xlsx'):
        """Scrape every debate on the index page and write the Excel file.

        Args:
            num_threads: upper bound on the number of worker threads.
            xlsx_path: path of the workbook to write.

        Raises:
            requests.RequestException: the index page could not be fetched.
            ValueError: the index page has no ``field-body`` container.
        """
        resp = requests.get(self.inp_url, timeout=self.timeout)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'lxml')
        table = soup.find("div", attrs={"class": "field-body"})
        if table is None:
            raise ValueError('no "field-body" container found at {}'.format(self.inp_url))
        rows = table.find_all('tr')
        # one slot per row so the output keeps the table order no matter
        # which worker finishes first; each worker writes only its own slot
        records = [None] * len(rows)
        q = Queue()

        def threader():
            while True:
                index, row = q.get()
                try:
                    records[index] = self._build_record(row)
                except Exception as err:
                    # thread boundary: report the row and keep the worker alive
                    print('row {} failed: {!r}'.format(index, err))
                finally:
                    # always mark the job done, otherwise q.join() never returns
                    q.task_done()

        print('start processing')
        # never start more workers than there are rows, and at least one
        # when there is work to do
        worker_count = min(max(1, num_threads), len(rows))
        for _ in range(worker_count):
            t = threading.Thread(target=threader)
            # daemon threads die with the main thread
            t.daemon = True
            t.start()
        for job in enumerate(rows):
            q.put(job)
        q.join()
        self.out_list.extend(record for record in records if record is not None)
        self.d_frame = pd.DataFrame(self.out_list, columns=self.col_headers)
        # engine_kwargs is forwarded to xlsxwriter.Workbook; strings_to_urls
        # is switched off so link text is written as plain strings
        with pd.ExcelWriter(xlsx_path, engine='xlsxwriter', engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
            self.d_frame.to_excel(writer, index=False, columns=self.col_headers)

# In [None]:
# Entry point: build the scraper for the debates index page, then run the full scrape
DEBATES_INDEX_URL = 'https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/presidential-candidates-debates-1960-2016'
class_obj = __data_mining__(DEBATES_INDEX_URL)
class_obj.start_scrape()