In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

## Load QuoteBank Dataset
We refer to the pipeline provided in Google Colab to load the quotations from 2015 to 2020.

Because the dataset is very large, we only load the quotations said by candidates in Senate elections held since 2015. The list of Senate election candidates is obtained from the MIT Election Data and Science Lab [1].

We also provide the code for loading these senate candidate quotations below, but it was run ahead of time in Google Colab (it takes around 30 minutes for each year's data), so we did not re-run it here.

[1] MIT Election Data and Science Lab, 2017, "U.S. Senate 1976–2020", https://doi.org/10.7910/DVN/PEJ5QU, Harvard Dataverse, V5, UNF:6:cIUB3CEIKhMi9tiY4BffLg== [fileUNF]

In [3]:
# Pipeline obtained from the course google colab
from tld import get_tld

def get_domain(url):
    """Return the ``tld`` attribute of the object ``get_tld`` produces for ``url``."""
    return get_tld(url, as_object=True).tld


In [5]:
# Read the senate election results CSV into a DataFrame
import pandas as pd
senate_file = 'SenateData/1976-2020-senate.csv'
senate_election = pd.read_csv(senate_file, encoding='unicode_escape')
senate_election.sample(n=10)  # show ten random rows to illustrate the data format

Unnamed: 0,year,state,state_po,state_fips,state_cen,state_ic,office,district,stage,special,candidate,party_detailed,writein,mode,candidatevotes,totalvotes,unofficial,version,party_simplified
1385,1996,MASSACHUSETTS,MA,25,14,3,US SENATE,statewide,gen,False,ROBERT C. STOWE,NATURAL LAW,False,total,7169,2555942,False,20210114,OTHER
468,1982,NEW JERSEY,NJ,34,22,12,US SENATE,statewide,gen,False,ROBERT T. BASTIEN,GRASSROOTS,False,total,2955,2193945,False,20210114,OTHER
3453,2020,COLORADO,CO,8,84,62,US SENATE,statewide,gen,False,RAYMON ANTHONY DOANE,LIBERTARIAN,False,total,56262,3235790,False,20210114,LIBERTARIAN
3597,2020,TENNESSEE,TN,47,62,54,US SENATE,statewide,gen,False,BILL HAGERTY,REPUBLICAN,False,total,1840926,2959761,False,20210114,REPUBLICAN
2704,2012,HAWAII,HI,15,95,82,US SENATE,statewide,gen,False,MAZIE K. HIRONO,DEMOCRAT,False,total,269489,437159,False,20210114,DEMOCRAT
342,1980,MISSOURI,MO,29,43,34,US SENATE,statewide,gen,False,MARTHA PETTIS,SOCIALIST WORKERS,False,total,6707,2066965,False,20210114,OTHER
1293,1994,TEXAS,TX,48,74,49,US SENATE,statewide,gen,False,KAY BAILEY HUTCHISON,REPUBLICAN,False,total,2604218,4279940,False,20210114,REPUBLICAN
2705,2012,INDIANA,IN,18,32,22,US SENATE,statewide,gen,False,RICHARD E. MOURDOCK,REPUBLICAN,False,total,1133621,2560102,False,20210114,REPUBLICAN
2460,2010,CALIFORNIA,CA,6,93,71,US SENATE,statewide,gen,False,,,True,total,5,10000160,False,20210114,OTHER
1806,2000,WISCONSIN,WI,55,35,25,US SENATE,statewide,gen,False,EUGENE A. HEM,INDEPENDENT,False,total,9555,2540083,False,20210114,OTHER


In [6]:
# Collect the distinct candidate names from elections held in 2016 or later.
# Rows with a missing (NaN) candidate name are discarded before de-duplicating.
senate_candidates = (
    senate_election
    .loc[senate_election['year'] >= 2016, 'candidate']
    .dropna()
    .unique()
)

**We do not run the cell below because it takes too long**

In [None]:
# Filter one year's raw quotation dump down to the quotations whose speaker is a
# senate candidate, and add a 'domains' field holding get_domain() of each source URL.
# For now, we ignore name variations of the same person and match only the exact
# (upper-cased) names provided in the senate candidates dataset.
import bz2
import json

path_to_file = 'RawData/quotes-2015.json.bz2' 
path_to_out = 'SenateData/quotes-2015-senates.json.bz2'

# Build the lookup once: `in` on a NumPy array compares against every element
# for every quotation, whereas a set gives constant-time membership tests.
candidate_names = set(senate_candidates)

with bz2.open(path_to_file, 'rb') as s_file, bz2.open(path_to_out, 'wb') as d_file:
    for line in s_file:
        instance = json.loads(line)  # each line of the dump is one JSON record
        if instance['speaker'].upper() not in candidate_names:
            continue  # keep only quotes attributed to a wanted candidate
        # Update the record with one entry per source URL
        instance['domains'] = [get_domain(url) for url in instance['urls']]
        # Write the record back out as one JSON line
        d_file.write((json.dumps(instance) + '\n').encode('utf-8'))

## Process Senate Candidate Data

In [2]:
# Load the stored senate-candidate quotations, one DataFrame per year (2020 down to 2015).
# Each file is bz2-compressed JSON Lines.
(
    senate_quote_20,
    senate_quote_19,
    senate_quote_18,
    senate_quote_17,
    senate_quote_16,
    senate_quote_15,
) = (
    pd.read_json(f'SenateData/quotes-{year}-senates.json.bz2', lines=True, compression='bz2')
    for year in range(2020, 2014, -1)
)