In [None]:
from google.colab import drive

# Make Google Drive available inside the Colab runtime; the data paths used
# further down in this notebook all start with this mount point.
mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Filtering the data of interest

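For our study, we are only interested in quotes that talk about the US. The following cells allow us to keep only the quotes containing the keywords "US ", "USA", "U.S" and "United States". The cell shown below uses a second keyword list of US-related names (e.g. "Trump", "New York", "Apple") and writes its matches to a separate `us_speakers` file.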

Although very preliminary, this filtering method gives us a satisfactory amount of data with which to start the analysis.

In [None]:
import bz2
import json

# Years of the Quotebank dump to filter (used verbatim in the file names).
years = ["2020"]

# Substrings searched for in each quotation. Matching is a plain, case-sensitive
# `in` test, so short entries such as "Barr" or "Bush" also match inside longer
# words, and "Donald J. Trump" is already covered by "Trump".
keywords = ["Trump", "New York", "Washington", "Biden", "Obama", "Warren",
            "Sanders", "Hollywood", "L.A.", "Harris", "Cohen", "Apple", "Nixon",
            "Clinton", "Williams", "Bush", "Lincoln", "Kennedy", "American", "Pompeo",
            "Donald J. Trump", "Ocasio-Cortez", "Pence", "Tesla", "Don Jr.", "Nelson",
            "Tarantino", "Baldwin", "Bannon", "Bolton", "Barr"]

for year in years:
    path_to_file = '/content/drive/MyDrive/us_data/Quotebank/quotes-' + year + '.json.bz2'
    # The output name carries the processed year twice
    # (quotes-<year>-us_speakers_<year>.json.bz2), which is the pattern the
    # loading cell reads back. The suffix used to be hard-coded to 2020, so any
    # other year would have been written under a name the loader never opens.
    path_to_out = ('/content/drive/MyDrive/us_data/Filtered data/'
                   'quotes-{0}-us_speakers_{0}.json.bz2'.format(year))
    with bz2.open(path_to_file, 'rb') as s_file, bz2.open(path_to_out, 'wb') as d_file:
        # Stream the dump line by line: each line is one JSON-encoded quote.
        for line in s_file:
            instance = json.loads(line)
            quote = instance['quotation']  # text of the quote
            if any(word in quote for word in keywords):
                # Re-serialise the matching record as one JSON line.
                d_file.write((json.dumps(instance) + '\n').encode('utf-8'))

In [None]:
import pandas as pd

def drop_none_speaker(year, df):
    """Return the rows of ``df`` whose speaker is not the string "None".

    Prints the number of quotes before filtering and the number removed.

    Parameters
    ----------
    year
        Label printed in the progress message; it is not used for filtering.
    df : pandas.DataFrame
        Frame with a ``speaker`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame without the "None"-speaker rows; the index labels of the
        remaining rows are preserved and ``df`` itself is left untouched.
    """
    print("Year : {} with {} quotes".format(year, len(df)))
    # Filter with a boolean mask rather than dropping by index label: a
    # label-based drop also removes attributed rows that merely share an index
    # label with a "None" row when the index is not unique.
    is_none = df.speaker == "None"
    df_dropped = df[~is_none]
    print("{} quotes are removed".format(int(is_none.sum())))
    return df_dropped

# Load, for every year, the two keyword-filtered extracts (the "-us" file and
# the "-us_speakers_<year>" file), discard the quotes attributed to "None" and
# stack everything into a single frame.
#
# The frames are collected in a list and concatenated once at the end:
# concatenating inside the loop re-copies all previously loaded rows on every
# pass and starts from an empty DataFrame, which pandas handles as a special case.
# NOTE(review): a quote present in both extracts of the same year will appear
# twice in US_data, and index labels repeat across files — confirm whether
# de-duplicating on quoteID is wanted downstream.
us_frames = []
for year in [2015, 2016, 2019, 2020]:
    year_df = pd.read_json('/content/drive/MyDrive/us_data/Filtered data/quotes-{}-us.json.bz2'.format(year), lines=True, compression='bz2')
    year_df = drop_none_speaker(year, year_df)
    us_frames.append(year_df)

    speaker_df = pd.read_json('/content/drive/MyDrive/us_data/Filtered data/quotes-{}-us_speakers_{}.json.bz2'.format(year, year), lines=True, compression='bz2')
    speaker_df = drop_none_speaker(year, speaker_df)
    us_frames.append(speaker_df)

US_data = pd.concat(us_frames, axis=0)

# Show a random handful of rows as a sanity check.
US_data.sample(n=10)

Year : 2015 with 190401 quotes
65755 quotes are removed
Year : 2015 with 456925 quotes
161782 quotes are removed
Year : 2016 with 135853 quotes
49131 quotes are removed
Year : 2016 with 456897 quotes
173200 quotes are removed
Year : 2019 with 255897 quotes
96480 quotes are removed
Year : 2019 with 597664 quotes
220407 quotes are removed
Year : 2020 with 60896 quotes
23040 quotes are removed
Year : 2020 with 168203 quotes
61484 quotes are removed


Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
59410,2016-02-23-025684,"Every country, including the US and Singapore,...",Ajay Singh,"[Q15734101, Q20630466, Q25991768, Q4699657, Q4...",2016-02-23 19:26:00,3,"[[Ajay Singh, 0.6389], [None, 0.2201], [Wolfga...",[http://www.business-standard.com/article/comp...,E
29052,2015-12-11-008074,Another promotional cut to enable motorists to...,Simon Williams,"[Q15647498, Q17020826, Q4470100, Q57901554, Q5...",2015-12-11 05:06:00,1,"[[Simon Williams, 0.9663], [None, 0.0337]]",[http://express.co.uk/news/uk/625908/Asda-Morr...,E
106360,2020-01-30-125938,"Years ago, somebody connected [ Run-D.M.C. ] t...",Rev Run,[Q742642],2020-01-30 19:58:51,3,"[[Rev Run, 0.8924], [None, 0.0905], [Kobe Brya...",[https://www.okayplayer.com/culture/rev-run-dm...,E
360926,2016-05-10-133972,We are thrilled that seven of Canada's leading...,Jennifer Bailey,"[Q17385608, Q57434480]",2016-05-10 13:16:26,2,"[[Jennifer Bailey, 0.8047], [None, 0.1953]]",[http://montrealgazette.com/storyline/apple-pa...,E
424713,2015-11-18-093006,There should be a pause until we have the conf...,John McCain,[Q10390],2015-11-18 02:57:45,1,"[[John McCain, 0.8025], [None, 0.1047], [Ted C...",[http://feeds.huffingtonpost.com/c/35496/f/677...,E
576688,2019-08-21-011522,But it's time for Republicans and President Tr...,Chris Murphy,"[Q1077594, Q20022484, Q2964809, Q5107535, Q510...",2019-08-21 08:30:00,1,"[[Chris Murphy, 0.8118], [None, 0.1787], [Mitc...",[http://www.msn.com/en-us/news/politics/hobble...,E
121702,2015-10-06-092616,The judgment means businesses that use Safe Ha...,David Smith,"[Q1176671, Q1176674, Q16194812, Q16201315, Q19...",2015-10-06 16:52:47,1,"[[David Smith, 0.8519], [None, 0.1481]]",[http://www.computerweekly.com/news/4500254944...,E
146449,2015-07-30-049764,Illegals flood the country; illegals get drive...,Chuck Norris,"[Q17083833, Q2673]",2015-07-30 06:01:20,1,"[[Chuck Norris, 0.5232], [None, 0.4768]]",[http://www.examiner.com/article/liberals-clai...,E
20033,2020-02-06-096130,The US is one of the largest defence exporters...,Rajnath Singh,[Q3506475],2020-02-06 15:01:35,1,"[[Rajnath Singh, 0.8759], [None, 0.0942], [Mar...",[https://www.hindustantimes.com/india-news/ind...,E
242163,2019-03-12-109490,You made it clear you were not thinking about ...,Gayle King,[Q5528805],2019-03-12 17:29:28,1,"[[Gayle King, 0.6347], [Jeff Flake, 0.2507], [...",[https://www.newsbusters.org/blogs/nb/scott-wh...,E


In [None]:
# Only the `id` and `nationality` columns are used, so ask the parquet reader
# for just those instead of loading every column and discarding the rest.
speaker_columns = ['id', 'nationality']
speaker_df = pd.read_parquet(
    "/content/drive/MyDrive/us_data/speaker_attributes.parquet",
    columns=speaker_columns,
)[speaker_columns]  # re-select to guarantee the column order
speaker_df.head()