# ToDo

* map speaker attribute entries from Qids to meaningful labels
  * save all dictionaries as JSONs

* Jana - loading Quotebank per year, discarding None speakers, saving as JSON


## Mounting to drive + loading Quotebank

In [2]:
!pip install tld
!pip install pyarrow



In [49]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import json

from google.colab import drive
drive.mount('/content/drive')

from wikidata.client import Client
client = Client()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Speaker attributes parquet

In [3]:
parquet_path = '/content/drive/MyDrive/Project datasets/speaker_attributes.parquet'
speaker_attributes = pd.read_parquet(parquet_path)

In [4]:
speaker_attributes.head()

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
0,"[Washington, President Washington, G. Washingt...",[+1732-02-22T00:00:00Z],"[Q161885, Q30]",[Q6581097],1395141751,,W000178,"[Q82955, Q189290, Q131512, Q1734662, Q294126, ...",[Q327591],,Q23,George Washington,"[Q698073, Q697949]",item,[Q682443]
1,"[Douglas Noel Adams, Douglas Noël Adams, Dougl...",[+1952-03-11T00:00:00Z],[Q145],[Q6581097],1395737157,[Q7994501],,"[Q214917, Q28389, Q6625963, Q4853732, Q1884422...",,,Q42,Douglas Adams,,item,
2,"[Paul Marie Ghislain Otlet, Paul Marie Otlet]",[+1868-08-23T00:00:00Z],[Q31],[Q6581097],1380367296,,,"[Q36180, Q40348, Q182436, Q1265807, Q205375, Q...",,,Q1868,Paul Otlet,,item,
3,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",[+1946-07-06T00:00:00Z],[Q30],[Q6581097],1395142029,,,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,Q207,George W. Bush,"[Q327959, Q464075, Q3586276, Q4450587]",item,"[Q329646, Q682443, Q33203]"
4,"[Velázquez, Diego Rodríguez de Silva y Velázqu...",[+1599-06-06T00:00:00Z],[Q29],[Q6581097],1391704596,,,[Q1028181],,,Q297,Diego Velázquez,,item,


## Quotebank samples from 2019

In [5]:
quotes_19_sample = pd.read_json(
    '/content/drive/MyDrive/quotes-2019-nytimes.json.bz2', 
    compression='bz2', 
    lines=True
    )

In [6]:
 quotes_19_sample.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
0,2019-04-17-024782,"It is not a low-income immigration,",James Fisher,"[Q16213953, Q20707104, Q43143598, Q58886302, Q...",2019-04-17 13:31:18,1,"[[James Fisher, 0.7475], [None, 0.2525]]",[https://www.nytimes.com/2019/04/17/realestate...,E
1,2019-04-02-001128,a champion figure skater switching to roller s...,John Updike,[Q105756],2019-04-02 14:58:33,2,"[[John Updike, 0.5856], [None, 0.4144]]",[https://www.nytimes.com/2019/04/02/opinion/vl...,E
2,2019-05-09-055187,It makes it much more difficult for him to mak...,,[],2019-05-09 18:11:29,1,"[[None, 0.6493], [President Bill Clinton, 0.27...",[http://mobile.nytimes.com/2019/05/09/world/as...,E
3,2019-10-31-056366,"It puts me in a predicament,",Xavier Becerra,[Q1855840],2019-10-31 16:45:15,3,"[[Xavier Becerra, 0.9065], [None, 0.0909], [St...",[http://www.nytimes.com/2019/10/31/technology/...,E
4,2019-01-04-001792,A Pile of Leaves.,,[],2019-01-04 10:00:07,1,"[[None, 0.8737], [Jason Fulford, 0.1263]]",[https://www.nytimes.com/2019/01/04/books/revi...,E


## Separating Trump quotes from the sample

In [9]:
TRUMP_ALIASES = ['Donald Trump', 'President Donald Trump', 'President Trump']

trump_quotes = quotes_19_sample[
  quotes_19_sample.apply(lambda row: row['speaker'] in TRUMP_ALIASES, axis=1)
  ]


In [10]:
trump_quotes.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
15,2019-02-15-009152,Asylum seekers in Mexico face a heightened ris...,President Donald Trump,[Q22686],2019-02-15 03:20:00,5,"[[President Donald Trump, 0.7752], [None, 0.22...",[http://www.abajournal.com/news/article/lawsui...,E
202,2019-05-13-016076,"Don't forget they're a member of NATO, and a v...",President Donald Trump,[Q22686],2019-05-13 00:00:00,68,"[[President Donald Trump, 0.6683], [None, 0.23...",[http://kazu.org/post/trump-greets-hungarys-ha...,E
244,2019-08-25-016871,I think it's a positive. Other people agree wi...,President Donald Trump,[Q22686],2019-08-25 23:19:22,7,"[[President Donald Trump, 0.6994], [None, 0.15...",[http://www.nytimes.com/2019/08/25/world/europ...,E
254,2019-08-31-024043,"In other words, they're running badly and they...",President Donald Trump,[Q22686],2019-08-31 00:21:27,2,"[[President Donald Trump, 0.7363], [None, 0.26...",[http://www.nytimes.com/2019/08/30/us/politics...,E
255,2019-05-09-051472,"In typical fashion, as soon as Trump Park was ...",President Trump,[Q22686],2019-05-09 09:25:21,1,"[[President Trump, 0.6722], [None, 0.3037], [D...",[https://www.nytimes.com/2019/05/09/nyregion/n...,E


In [30]:
def get_mentions(quotes_df, keyword):
  keyword = keyword.lower()
  mentions_bool = quotes_df.apply(
      lambda row: keyword in row['quotation'].lower(), axis=1
      )
  return quotes_df[mentions_bool]

In [31]:
biden_mentions = get_mentions(trump_quotes, 'biden')
biden_mentions.head()

NameError: ignored

In [59]:
biden_mentions.loc[37913, 'quotation']

"Biden was never very smart. He was a terrible student. His gaffes are unbelievable. When I say something that you might think is a gaffe, it's on purpose; it's not a gaffe. When Biden say something dumb, it's because he's dumb."

In [56]:
# !pip install aspect_based_sentiment_analysis
# import aspect_based_sentiment_analysis as absa
nlp = absa.load()

In [60]:
quotation_id = 37913
text = (biden_mentions.loc[quotation_id, 'quotation'].lower())

completed_task = nlp(text, aspects=['biden'])

In [61]:
biden_result = completed_task.examples

print(absa.summary(biden_result[0]))

Sentiment.negative for "biden"
Scores (neutral/negative/positive): [0.002 0.989 0.008]
None


In [67]:
dir(biden_result[0])
biden_result[0].scores

[0.0024589438, 0.9894963, 0.008044804]

In [90]:
SENTIMENT_INDEXING = {'neutral': 0, 'negative': 1, 'positive': 2}

def add_sentiment_columns(df_original, keyword):
  df = df_original.copy()
  for sentiment_key in SENTIMENT_INDEXING:
    df[sentiment_key] = 0

  for ind, row in df.iterrows():
    text = row['quotation'].lower()
    completed_task = nlp(text, aspects=[keyword])
    sentiment_scores = completed_task.examples[0].scores 

    for sentiment_key, sentiment_ind in SENTIMENT_INDEXING.items():
      df.loc[ind, sentiment_key] = sentiment_scores[sentiment_ind]

  return df

In [94]:
biden_sentiment = add_sentiment_columns(biden_mentions, 'biden')
biden_sentiment 

TypeError: ignored

In [97]:
biden_sentiment.sort_values('negative', ascending=False, inplace=True)

In [102]:
for ind, row in biden_sentiment.iterrows():
  print(f"\n\nNEGATIVE SCORE: {row['negative']}")
  print(f"POSITIVE SCORE: {row['positive']}")
  print(f"NEUTRAL SCORE: {row['neutral']}")

  print(row['quotation'])



NEGATIVE SCORE: 0.9939144253730774
POSITIVE SCORE: 0.003951251041144133
NEUTRAL SCORE: 0.0021344113629311323
Joe Biden was a disaster,


NEGATIVE SCORE: 0.9912710189819336
POSITIVE SCORE: 0.007382732350379229
NEUTRAL SCORE: 0.001346296863630414
What Biden did is a disgrace. What his son did is a disgrace,


NEGATIVE SCORE: 0.9894962906837463
POSITIVE SCORE: 0.008044804446399212
NEUTRAL SCORE: 0.0024589437525719404
Biden was never very smart. He was a terrible student. His gaffes are unbelievable. When I say something that you might think is a gaffe, it's on purpose; it's not a gaffe. When Biden say something dumb, it's because he's dumb.


NEGATIVE SCORE: 0.9756238460540771
POSITIVE SCORE: 0.007405432872474194
NEUTRAL SCORE: 0.016970712691545486
that Biden stopped the prosecution and a lot of people want to find out about that.


NEGATIVE SCORE: 0.9736927151679993
POSITIVE SCORE: 0.004033033270388842
NEUTRAL SCORE: 0.022274235263466835
I don't know what the hell happened to Biden - t

In [105]:
np.mean(quotes_19_sample['speaker'] != 'None')

0.6357967878878411

In [124]:
threshold = 0.2
none_speakers = quotes_19_sample[quotes_19_sample['speaker'] == 'None']
np.mean(none_speakers['probas'].apply(lambda x: float(x[0][1]) - float(x[1][1])) > threshold)

0.8520150300336059

## Wikidata Python API

In [45]:
!pip install Wikidata



### Genders

In [None]:
genders = speaker_attributes['gender'].to_frame().apply(
    lambda x: 'Q6581097' if x is None or x[0] is None else x[0][0], axis=1
    ).unique()


In [52]:
# genders = [
#        'Q6581097', 'Q6581072', 'Q1052281', 'Q2449503',
#        'Q48270', 'Q1097630', 'Q12964198', 'Q189125', 'Q15145779',
#        'Q301702', 'Q179294', 'Q27679766', 'Q18116794', 'Q859614',
#        'Q44148', 'Q1289754', 'Q106299064', 'Q27679684', 'Q15145778',
#        'Q52261234', 'Q207959', 'Q43445', 'Q505371', 'Q7130936',
#        'Q96000630', 'Q1984232', 'Q93954933', 'Q746411', 'Q48279',
#        'Q3177577', 'Q1775415', 'Q6636'
#        ]
for gender in genders:
  print(client.get(gender, load=True).label)



male
female
transgender female
transgender male
non-binary
intersex
genderqueer
transgender person
cisgender female
two-spirit
eunuch
transmasculine
genderfluid
bigender
male organism
neutrois
Erkek
transfeminine
cisgender male
neutral sex
androgyny
female organism
agender
pangender
X-gender
shemale
demiboy
kathoey
third gender
muxe
feminine
homosexuality


### Occupations

In [79]:
def return_string_from_arr(x):
  if x is None or x[0] is None:
    return '';
  sol = ''
  for i in range(x[0].shape[0]):
    sol += x[0][i]
    sol += ','
  return sol[0:-1]

occupations_series = speaker_attributes['occupation'].to_frame().apply(
    return_string_from_arr, axis=1
    )

In [88]:
occupation_ids = occupations_series.unique()

for ids in occupation_ids:
  if ids == '':
    continue
  for id in ids.split(','):
    if id not in occupations:
      try:
        # Multilngual to basic string
        occupations[id] = str(client.get(id, load=True).label)
      except Exception:
        print(id)
      

Q57557390
Q102046591
Q98384826
Q105645755
Q57557390
Q99753484
Q57557390
Q56411328
Q96143085
Q96144081


In [90]:
occupation_backup = occupations.copy()

In [95]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [106]:
import json

with open('/content/drive/MyDrive/occupations.json', 'w') as f:
    json.dump(occupations_single_ling, f)

### Religions

In [70]:
religion_series = speaker_attributes['religion'].to_frame().apply(
    return_string_from_arr, axis=1
    )

religions = {}

religion_ids = religion_series.unique()
for ids in religion_ids:
  if ids == '':
    continue
  for id in ids.split(','):
    if id not in religions:
      religions[id] = str(client.get(id, load=True).label)

with open('/content/drive/MyDrive/religions.json', 'w') as f:
    json.dump(religions, f)    

0            Q82955
1           Q214917
2            Q36180
3            Q82955
4          Q1028181
             ...   
9055976      Q82955
9055977      Q82955
9055978            
9055979            
9055980      Q82955
Length: 9055981, dtype: object

### Nationality

In [None]:
nationality_series = speaker_attributes['nationality'].to_frame().apply(
    return_string_from_arr, axis=1
    )

nationalities = {}

nationality_ids = n_series.unique()
for ids in nationality_ids:
  if ids == '':
    continue
  for id in ids.split(','):
    if id not in religions:
      religions[id] = str(client.get(id, load=True).label)

with open('religions.json', 'w') as f:
    json.dump(religions, f)    