# ToDo

* Map speaker attribute entries from QIDs to meaningful labels
  * save all dictionaries as JSONs

* Jana - loading Quotebank per year, discarding None speakers, saving as JSON


## Installing and importing dependencies, mounting Google Drive

In [None]:
!pip install tld
!pip install pyarrow
!pip install Wikidata
!pip install aspect_based_sentiment_analysis

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import json
import os
import bz2
import itertools 

# Import NLP library (aspect-based sentiment analysis) and load its model
# object once; `nlp` is later handed to add_sentiment_columns.
import aspect_based_sentiment_analysis as absa
nlp = absa.load()

# Mount Google Drive: the data paths used below all live under /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Wikidata client, later handed to map_qids_to_labels to resolve QIDs to labels
from wikidata.client import Client
wiki_client = Client()

Some layers from the model checkpoint at absa/classifier-rest-0.2 were not used when initializing BertABSClassifier: ['dropout_379']
- This IS expected if you are initializing BertABSClassifier from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertABSClassifier from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of BertABSClassifier were not initialized from the model checkpoint at absa/classifier-rest-0.2 and are newly initialized: ['dropout_37']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load Quotebank and discard 'None' speakers

In [None]:
# Iterate through the years of existing Quotebank files and, for each year,
# write a copy of the archive that drops the quotes attributed to 'None'.
for year in range(2015, 2021):

  path_to_file = f'/content/drive/MyDrive/Quotebank/quotes-{year}.json.bz2' 
  path_to_out = f'/content/drive/MyDrive/Quotebank_limunADA/quotes-no-nones-{year}.json.bz2'
  # Scratch file: the output is only given its final name once it is complete,
  # so an interrupted run can never leave a truncated archive that the
  # "already exists" check below would mistake for a finished one.
  path_to_tmp = f'{path_to_out}.tmp'

  # If it already exists, skip it
  if os.path.isfile(path_to_out):
    print(f'\nFile for year {year} already exists. Moving on...')
    continue

  print(f'\nExtracting non-None quotations for year {year}')

  with bz2.open(path_to_file, 'rb') as s_file, bz2.open(path_to_tmp, 'wb') as d_file:
    for line in s_file:
      # loading a sample (one JSON object per line)
      instance = json.loads(line)

      # The unattributed speaker is stored as the string 'None'
      if instance['speaker'] == 'None':
        continue

      # writing in the new file
      d_file.write((json.dumps(instance)+'\n').encode('utf-8'))

  # Both files are closed at this point: publish the finished archive
  os.replace(path_to_tmp, path_to_out)

## Speaker attributes parquet

In [None]:
# Speaker metadata table, read from Drive into a DataFrame; its columns are
# queried by name further down (e.g. 'gender', 'occupation').
parquet_path = '/content/drive/MyDrive/Project datasets/speaker_attributes.parquet'
speaker_attributes = pd.read_parquet(parquet_path)

In [None]:
speaker_attributes.head()

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
0,"[Washington, President Washington, G. Washingt...",[+1732-02-22T00:00:00Z],"[Q161885, Q30]",[Q6581097],1395141751,,W000178,"[Q82955, Q189290, Q131512, Q1734662, Q294126, ...",[Q327591],,Q23,George Washington,"[Q698073, Q697949]",item,[Q682443]
1,"[Douglas Noel Adams, Douglas Noël Adams, Dougl...",[+1952-03-11T00:00:00Z],[Q145],[Q6581097],1395737157,[Q7994501],,"[Q214917, Q28389, Q6625963, Q4853732, Q1884422...",,,Q42,Douglas Adams,,item,
2,"[Paul Marie Ghislain Otlet, Paul Marie Otlet]",[+1868-08-23T00:00:00Z],[Q31],[Q6581097],1380367296,,,"[Q36180, Q40348, Q182436, Q1265807, Q205375, Q...",,,Q1868,Paul Otlet,,item,
3,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",[+1946-07-06T00:00:00Z],[Q30],[Q6581097],1395142029,,,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,Q207,George W. Bush,"[Q327959, Q464075, Q3586276, Q4450587]",item,"[Q329646, Q682443, Q33203]"
4,"[Velázquez, Diego Rodríguez de Silva y Velázqu...",[+1599-06-06T00:00:00Z],[Q29],[Q6581097],1391704596,,,[Q1028181],,,Q297,Diego Velázquez,,item,


## Quotebank samples from 2019

In [None]:
# Load a 2019 Quotebank sample: bz2-compressed, one JSON object per line.
# NOTE(review): the file name suggests a nytimes-related subset — confirm how
# this sample file was produced.
quotes_19_sample = pd.read_json(
    '/content/drive/MyDrive/quotes-2019-nytimes.json.bz2', 
    compression='bz2', 
    lines=True
    )

In [None]:
 quotes_19_sample.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
0,2019-04-17-024782,"It is not a low-income immigration,",James Fisher,"[Q16213953, Q20707104, Q43143598, Q58886302, Q...",2019-04-17 13:31:18,1,"[[James Fisher, 0.7475], [None, 0.2525]]",[https://www.nytimes.com/2019/04/17/realestate...,E
1,2019-04-02-001128,a champion figure skater switching to roller s...,John Updike,[Q105756],2019-04-02 14:58:33,2,"[[John Updike, 0.5856], [None, 0.4144]]",[https://www.nytimes.com/2019/04/02/opinion/vl...,E
2,2019-05-09-055187,It makes it much more difficult for him to mak...,,[],2019-05-09 18:11:29,1,"[[None, 0.6493], [President Bill Clinton, 0.27...",[http://mobile.nytimes.com/2019/05/09/world/as...,E
3,2019-10-31-056366,"It puts me in a predicament,",Xavier Becerra,[Q1855840],2019-10-31 16:45:15,3,"[[Xavier Becerra, 0.9065], [None, 0.0909], [St...",[http://www.nytimes.com/2019/10/31/technology/...,E
4,2019-01-04-001792,A Pile of Leaves.,,[],2019-01-04 10:00:07,1,"[[None, 0.8737], [Jason Fulford, 0.1263]]",[https://www.nytimes.com/2019/01/04/books/revi...,E


## Separating Trump quotes from the sample

In [None]:
# Speaker strings that are treated as referring to Donald Trump
TRUMP_ALIASES = ['Donald Trump', 'President Donald Trump', 'President Trump']

# Vectorised membership test on the speaker column: no Python-level call per
# row, and the mask is always a boolean Series aligned with the frame.
trump_quotes = quotes_19_sample[quotes_19_sample['speaker'].isin(TRUMP_ALIASES)]


In [None]:
trump_quotes.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
15,2019-02-15-009152,Asylum seekers in Mexico face a heightened ris...,President Donald Trump,[Q22686],2019-02-15 03:20:00,5,"[[President Donald Trump, 0.7752], [None, 0.22...",[http://www.abajournal.com/news/article/lawsui...,E
202,2019-05-13-016076,"Don't forget they're a member of NATO, and a v...",President Donald Trump,[Q22686],2019-05-13 00:00:00,68,"[[President Donald Trump, 0.6683], [None, 0.23...",[http://kazu.org/post/trump-greets-hungarys-ha...,E
244,2019-08-25-016871,I think it's a positive. Other people agree wi...,President Donald Trump,[Q22686],2019-08-25 23:19:22,7,"[[President Donald Trump, 0.6994], [None, 0.15...",[http://www.nytimes.com/2019/08/25/world/europ...,E
254,2019-08-31-024043,"In other words, they're running badly and they...",President Donald Trump,[Q22686],2019-08-31 00:21:27,2,"[[President Donald Trump, 0.7363], [None, 0.26...",[http://www.nytimes.com/2019/08/30/us/politics...,E
255,2019-05-09-051472,"In typical fashion, as soon as Trump Park was ...",President Trump,[Q22686],2019-05-09 09:25:21,1,"[[President Trump, 0.6722], [None, 0.3037], [D...",[https://www.nytimes.com/2019/05/09/nyregion/n...,E


## Get quotes mentioning given keywords (Biden in this case)

In [None]:
def get_mentions(quotes_df, keyword):
  """ 
  Returns a DataFrame of quotations containing the keyword in them. 

  The match is a case-insensitive, literal (non-regex) substring test on the
  'quotation' column. Rows whose quotation is missing (None/NaN) are treated
  as not mentioning the keyword instead of raising.

  Parameters:
    quotes_df: DataFrame with a 'quotation' column of strings.
    keyword: substring to look for (any casing).

  Returns:
    The rows of quotes_df whose quotation contains the keyword, with the
    original index and columns preserved.
  """
  keyword = keyword.lower()
  mentions_bool = (
      quotes_df['quotation']
      .str.lower()
      # regex=False keeps characters such as '.' or '*' literal;
      # na=False maps missing quotations to "no match"
      .str.contains(keyword, regex=False, na=False)
      )
  
  return quotes_df[mentions_bool]

In [None]:
# Keep only the Trump quotes that mention Biden (case-insensitive substring match)
biden_mentions = get_mentions(trump_quotes, 'biden')
biden_mentions.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
4302,2019-10-02-015565,China should start an investigation into the B...,President Donald Trump,[Q22686],2019-10-02 21:33:06,902,"[[President Donald Trump, 0.4752], [None, 0.3]...",[https://www.seattletimes.com/nation-world/nat...,E
5172,2019-09-25-097031,there's a lot of talk about Biden's son....,President Donald Trump,[Q22686],2019-09-25 00:00:00,167,"[[President Donald Trump, 0.6706], [None, 0.23...",[http://dailyherald.com/article/20190925/news/...,E
11201,2019-09-20-074323,Someone ought to look into Joe Biden.,President Donald Trump,[Q22686],2019-09-20 00:00:00,65,"[[President Donald Trump, 0.5329], [None, 0.24...",[https://www.rawstory.com/2019/09/defiant-trum...,E
17845,2019-04-16-026156,I believe it will be Crazy Bernie Sanders vs. ...,President Donald Trump,[Q22686],2019-04-16 00:00:00,14,"[[President Donald Trump, 0.5721], [None, 0.32...",[https://www.foxnews.com/politics/trump-bernie...,E
21544,2019-10-12-037463,quite a bit of background on Hunter Biden from...,Donald Trump,"[Q22686, Q27947481]",2019-10-12 01:03:48,2,"[[Donald Trump, 0.3729], [Michael Pillsbury, 0...",[http://mobile.nytimes.com/2019/10/11/opinion/...,E


## Add sentiment analysis columns (towards a keyword) to DataFrame

In [None]:
# Dictionary for mapping sentiment to index, corresponding to 
# the output of the NLP model from ABSA
SENTIMENT_INDEXING = {'neutral': 0, 'negative': 1, 'positive': 2}

def add_sentiment_columns(df_original, keyword, nlp_model):
  """ 
  Function that computes Aspect Based Sentiment Analysis towards the 
  given keyword, for each quote in the DataFrame.
  Also, it adds 3 columns (neutral, negative, positive) to the DataFrame, 
  containing sentiment scores corresponding to each of the columns.  

  Parameters:
    df_original: DataFrame with a 'quotation' column of strings. It is not
      modified; a copy is returned.
    keyword: aspect towards which the sentiment is scored. It is passed to
      the model as given, while the quotation is lower-cased first.
      NOTE(review): callers should probably pass a lower-case keyword.
    nlp_model: callable invoked as nlp_model(text, aspects=[keyword]); the
      scores are read from result.examples[0].scores and indexed with
      SENTIMENT_INDEXING.

  Returns:
    A copy of df_original with float columns 'neutral', 'negative' and
    'positive' added.
  """
  df = df_original.copy()

  # Score the quotes in row order. Collecting the scores positionally (rather
  # than writing through df.loc[index_label, ...]) keeps rows independent even
  # when the index contains duplicated labels.
  scores_per_row = []
  for quotation in df['quotation']:
    completed_task = nlp_model(quotation.lower(), aspects=[keyword])
    scores_per_row.append(completed_task.examples[0].scores)

  # Create the columns directly as floats instead of filling integer
  # placeholder columns with float values afterwards.
  for sentiment_key, sentiment_ind in SENTIMENT_INDEXING.items():
    df[sentiment_key] = np.array(
        [float(scores[sentiment_ind]) for scores in scores_per_row],
        dtype=float
        )

  return df

In [None]:
# Score every Biden-mentioning Trump quote for sentiment towards 'biden'
# (adds the neutral / negative / positive columns) and preview the result
biden_sentiment = add_sentiment_columns(biden_mentions, 'biden', nlp)
biden_sentiment.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,neutral,negative,positive
4302,2019-10-02-015565,China should start an investigation into the B...,President Donald Trump,[Q22686],2019-10-02 21:33:06,902,"[[President Donald Trump, 0.4752], [None, 0.3]...",[https://www.seattletimes.com/nation-world/nat...,E,0.71422,0.164673,0.121108
5172,2019-09-25-097031,there's a lot of talk about Biden's son....,President Donald Trump,[Q22686],2019-09-25 00:00:00,167,"[[President Donald Trump, 0.6706], [None, 0.23...",[http://dailyherald.com/article/20190925/news/...,E,0.986754,0.005743,0.007503
11201,2019-09-20-074323,Someone ought to look into Joe Biden.,President Donald Trump,[Q22686],2019-09-20 00:00:00,65,"[[President Donald Trump, 0.5329], [None, 0.24...",[https://www.rawstory.com/2019/09/defiant-trum...,E,0.022231,0.002093,0.975676
17845,2019-04-16-026156,I believe it will be Crazy Bernie Sanders vs. ...,President Donald Trump,[Q22686],2019-04-16 00:00:00,14,"[[President Donald Trump, 0.5721], [None, 0.32...",[https://www.foxnews.com/politics/trump-bernie...,E,0.16582,0.293936,0.540244
21544,2019-10-12-037463,quite a bit of background on Hunter Biden from...,Donald Trump,"[Q22686, Q27947481]",2019-10-12 01:03:48,2,"[[Donald Trump, 0.3729], [Michael Pillsbury, 0...",[http://mobile.nytimes.com/2019/10/11/opinion/...,E,0.032083,0.003521,0.964396


In [None]:
# Rank the quotes from the most to the least negative score
biden_sentiment.sort_values('negative', ascending=False, inplace=True)

# Print the three scores followed by the text for the top-ranked quotes only
examples_number = 5
for _, quote_row in biden_sentiment.head(examples_number).iterrows():
  print(f"\n\nNEGATIVE SCORE: {quote_row['negative']}")
  print(f"POSITIVE SCORE: {quote_row['positive']}")
  print(f"NEUTRAL SCORE: {quote_row['neutral']}")

  print(quote_row['quotation'])



NEGATIVE SCORE: 0.9939144253730774
POSITIVE SCORE: 0.003951252903789282
NEUTRAL SCORE: 0.002134413458406925
Joe Biden was a disaster,


NEGATIVE SCORE: 0.9912710189819336
POSITIVE SCORE: 0.0073827290907502174
NEUTRAL SCORE: 0.001346296863630414
What Biden did is a disgrace. What his son did is a disgrace,


NEGATIVE SCORE: 0.9894962906837463
POSITIVE SCORE: 0.008044823072850704
NEUTRAL SCORE: 0.002458946080878377
Biden was never very smart. He was a terrible student. His gaffes are unbelievable. When I say something that you might think is a gaffe, it's on purpose; it's not a gaffe. When Biden say something dumb, it's because he's dumb.


NEGATIVE SCORE: 0.9756239652633667
POSITIVE SCORE: 0.00740540586411953
NEUTRAL SCORE: 0.016970649361610413
that Biden stopped the prosecution and a lot of people want to find out about that.


NEGATIVE SCORE: 0.9736927151679993
POSITIVE SCORE: 0.004033029079437256
NEUTRAL SCORE: 0.02227422408759594
I don't know what the hell happened to Biden - that

## Wikidata Python API

### Extracting a set of QIDs from a given column 

In [None]:
def get_qid_set(df, column_name, num_of_rows=None):
  """ 
  Given a DataFrame and column name, returns a set of QIDs in it. 

  Parameters:
    df: DataFrame whose column `column_name` holds, per row, either an
      iterable of QID strings (list / array) or a missing value (None / NaN).
    column_name: name of the column to scan.
    num_of_rows: if given, only the first `num_of_rows` rows are scanned
      (same semantics as DataFrame.head); by default all rows are used.

  Returns:
    A set with every distinct, non-empty QID string found.
  """
  column = df[column_name]
  if num_of_rows is not None:
    column = column.head(num_of_rows)

  qids_set = set()
  for entry in column:
    # Rows without the attribute carry None or a float NaN: nothing to collect
    if entry is None or (isinstance(entry, float) and np.isnan(entry)):
      continue
    # Iterate the row's QIDs directly; works for lists and np.arrays alike
    qids_set.update(qid for qid in entry if qid != '')

  return qids_set



def map_qids_to_labels(qids, wiki_client):
  """
  Build a {QID: label} dictionary for the given collection of QIDs.

  Each QID is looked up through the Wikidata client and its label is stored
  as a plain string. QIDs whose lookup fails are reported on stdout and left
  out of the result.
  """
  labels_by_qid = {}
  for current_qid in qids:
    try:
      entity = wiki_client.get(current_qid, load=True)
      # The label is a multilingual object; keep its basic string form
      labels_by_qid[current_qid] = str(entity.label)
    except Exception:
      # E.g. the QID doesn't exist on Wikidata
      print(f'Problem with {current_qid}. Skipping...')

  return labels_by_qid


### Get **genders** QID-label mapping

In [108]:
# Distinct gender QIDs found in the speaker attributes
print('Getting QIDs set')
gender_qids = get_qid_set(speaker_attributes, 'gender')

# Resolve every QID to its Wikidata label
print('Getting labels from Wikidata')
gender_label_dict = map_qids_to_labels(gender_qids, wiki_client)

# Preview the first ten entries of the mapping
{qid: gender_label_dict[qid] for qid in itertools.islice(gender_label_dict, 10)}


{'Q106299064': 'Erkek',
 'Q1289754': 'neutrois',
 'Q15145782': 'transgender female',
 'Q1775415': 'feminine',
 'Q18116794': 'genderfluid',
 'Q189125': 'transgender person',
 'Q207959': 'androgyny',
 'Q301702': 'two-spirit',
 'Q505371': 'agender',
 'Q6636': 'homosexuality'}

In [None]:
# Set to True to regenerate the JSON even when it is already saved
OVERWRITE_EXISTING = False

# Save the mapping
genders_save_path = '/content/drive/MyDrive/Quotebank_limunADA/genders_qids_labels.json'

# Write only when no saved mapping exists yet, unless overwriting is requested
if OVERWRITE_EXISTING or not os.path.isfile(genders_save_path):
  with open(genders_save_path, 'w') as f:
      json.dump(gender_label_dict, f)

### Get **occupations** QID-label mapping

In [109]:
# Distinct occupation QIDs found in the speaker attributes
print('Getting QIDs set')
occupation_qids = get_qid_set(speaker_attributes, 'occupation')

# Resolve every QID to its Wikidata label
print('Getting labels from Wikidata')
occupation_label_dict = map_qids_to_labels(occupation_qids, wiki_client)

# Preview the first ten entries of the mapping
{qid: occupation_label_dict[qid] for qid in itertools.islice(occupation_label_dict, 10)}


{'Q11124885': 'martial artist',
 'Q1157586': 'Daijō-daijin',
 'Q1241157': 'Freedom Fighters',
 'Q35377': 'Efik',
 'Q4892566': 'Celedon',
 'Q59314023': 'Knight Hospitaller',
 'Q62116991': 'environmental toxicologist',
 'Q66363203': 'art photographer',
 'Q690928': 'Schultheiß',
 'Q97768332': 'anciens ouvriers'}

In [97]:
# Set to True to regenerate the JSON even when it is already saved
OVERWRITE_EXISTING = False

# Destination of the occupation QID -> label mapping
occupations_save_path = '/content/drive/MyDrive/Quotebank_limunADA/occupations_qids_labels.json'

# Write only when no saved mapping exists yet, unless overwriting is requested
occupations_file_missing = not os.path.isfile(occupations_save_path)
if OVERWRITE_EXISTING or occupations_file_missing:
  with open(occupations_save_path, 'w') as out_file:
    json.dump(occupation_label_dict, out_file)

### Get **religions** QID-label mapping

In [110]:
# Distinct religion QIDs found in the speaker attributes
print('Getting QIDs set')
religion_qids = get_qid_set(speaker_attributes, 'religion')

# Resolve every QID to its Wikidata label
print('Getting labels from Wikidata')
religion_label_dict = map_qids_to_labels(religion_qids, wiki_client)

# Preview the first ten entries of the mapping
{qid: religion_label_dict[qid] for qid in itertools.islice(religion_label_dict, 10)}


{'Q10452632': 'Church of Christ',
 'Q1089816': 'Church of South India',
 'Q1258552': 'Celtic Christianity',
 'Q2965829': 'Nicene Christianity',
 'Q425381': 'Hanif',
 'Q5043': 'Christianity',
 'Q5883857': 'Holo',
 'Q624477': 'Christian Science',
 'Q6908412': 'Moorish Orthodox Church of America',
 'Q910556': 'Brethren in Christ Church'}

In [99]:
# Set to True to regenerate the JSON even when it is already saved
OVERWRITE_EXISTING = False

# Destination of the religion QID -> label mapping
religions_save_path = '/content/drive/MyDrive/Quotebank_limunADA/religions_qids_labels.json'

# Write only when no saved mapping exists yet, unless overwriting is requested
religions_file_missing = not os.path.isfile(religions_save_path)
if OVERWRITE_EXISTING or religions_file_missing:
  with open(religions_save_path, 'w') as out_file:
    json.dump(religion_label_dict, out_file)

### Get **nationality** QID-label mapping

In [111]:
# Distinct nationality QIDs found in the speaker attributes
print('Getting QIDs set')
nationality_qids = get_qid_set(speaker_attributes, 'nationality')

# Resolve every QID to its Wikidata label
print('Getting labels from Wikidata')
nationality_label_dict = map_qids_to_labels(nationality_qids, wiki_client)

# Preview the first ten entries of the mapping
{qid: nationality_label_dict[qid] for qid in itertools.islice(nationality_label_dict, 10)}


{'Q109128': 'Gwynedd',
 'Q140472': 'Silesians',
 'Q1443132': 'Kalaureia',
 'Q178630': 'Portuguese',
 'Q218': 'Romania',
 'Q284964': 'Cao',
 'Q414': 'Argentina',
 'Q63158027': 'Qajar Iran',
 'Q639100': 'Principality of Reuss-Gera',
 'Q6714285': 'MAM'}

In [101]:
# Set to True to regenerate the JSON even when it is already saved
OVERWRITE_EXISTING = False

# Destination of the nationality QID -> label mapping
nationality_save_path = '/content/drive/MyDrive/Quotebank_limunADA/nationalities_qids_labels.json'

# Write only when no saved mapping exists yet, unless overwriting is requested
nationality_file_missing = not os.path.isfile(nationality_save_path)
if OVERWRITE_EXISTING or nationality_file_missing:
  with open(nationality_save_path, 'w') as out_file:
    json.dump(nationality_label_dict, out_file)