# Setup

In [1]:
PATH_ROOT = 'data'
PATH_PARQUET = PATH_ROOT + '/project_datasets'
PATH_QUOTEBANK = PATH_ROOT + '/Quotebank'
PATH_TO_QUOTES = PATH_QUOTEBANK + '/quotes-{year}.json.bz2'

In [2]:
PATH_OUTPUT = 'output'

In [3]:
!pip install pyarrow



In [29]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import seaborn as sns

#NLP libraries
import spacy, nltk, gensim, sklearn
import pyLDAvis.gensim_models

import matplotlib.pyplot as plt
%matplotlib inline  

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Loading Data

## Reading wikidata labels

In [5]:
df_wikidata_labels = pd.read_csv(PATH_PARQUET + '/wikidata_labels_descriptions_quotebank.csv.bz2', compression='bz2', index_col='QID')
df_wikidata_labels.head()

Unnamed: 0_level_0,Label,Description
QID,Unnamed: 1_level_1,Unnamed: 2_level_1
Q31,Belgium,country in western Europe
Q45,Portugal,country in southwestern Europe
Q75,Internet,global system of connected computer networks
Q148,People's Republic of China,sovereign state in East Asia
Q155,Brazil,country in South America


## Reading speakers parquet file

In [6]:
df_speakers = pd.read_parquet(PATH_PARQUET + '/speaker_attributes.parquet')
df_speakers.set_index(keys='id', inplace=True)
df_speakers.head()

Unnamed: 0_level_0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,label,candidacy,type,religion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Q23,"[Washington, President Washington, G. Washingt...",[+1732-02-22T00:00:00Z],"[Q161885, Q30]",[Q6581097],1395141751,,W000178,"[Q82955, Q189290, Q131512, Q1734662, Q294126, ...",[Q327591],,George Washington,"[Q698073, Q697949]",item,[Q682443]
Q42,"[Douglas Noel Adams, Douglas Noël Adams, Dougl...",[+1952-03-11T00:00:00Z],[Q145],[Q6581097],1395737157,[Q7994501],,"[Q214917, Q28389, Q6625963, Q4853732, Q1884422...",,,Douglas Adams,,item,
Q1868,"[Paul Marie Ghislain Otlet, Paul Marie Otlet]",[+1868-08-23T00:00:00Z],[Q31],[Q6581097],1380367296,,,"[Q36180, Q40348, Q182436, Q1265807, Q205375, Q...",,,Paul Otlet,,item,
Q207,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",[+1946-07-06T00:00:00Z],[Q30],[Q6581097],1395142029,,,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,George W. Bush,"[Q327959, Q464075, Q3586276, Q4450587]",item,"[Q329646, Q682443, Q33203]"
Q297,"[Velázquez, Diego Rodríguez de Silva y Velázqu...",[+1599-06-06T00:00:00Z],[Q29],[Q6581097],1391704596,,,[Q1028181],,,Diego Velázquez,,item,


In [7]:
df_speakers_cleaned = df_speakers.dropna()
print('Total number of speakers : ', len(df_speakers))
print('Total number of speakers with filled attributes : ', len(df_speakers_cleaned))
print('{} rows dropped'.format(len(df_speakers) - len(df_speakers_cleaned)))

Total number of speakers :  9055981
Total number of speakers with filled attributes :  3
9055978 rows dropped


As we can see that dropping all speakers with undefined attributes is unrealistic to work with, since we're only left with 3 speakers with all their attributes defined to work with. We're going to have to some fine-grained filtering of undefined values column wise instead of on whole rows (all attributes).

# 2. Cleaning & handling data

The quotes dataset is too big to process in memory at once. Here we define a method that generates a sample of speakers from the wikidata dump, along with attributes that interest us, that describe these speakers.
Then we get their corresponding quotes from the quotebank.

## 2.1 Generating samples (of speakers) and merge them with their quotations

In [8]:
def generate_quotes_sample(number_of_samples=10000, year=2020,
                           quotes_columns=['quoteID', 'quotation', 'speaker', 'qids', 'party'],
                           speakers_columns=['date_of_birth']):
    '''
    Generate a sample of speakers with their attributes to their quotes.
    
    Some quotes have multiple qids for the speaker (for example multiple speakers with the same name).
    We use pandas explode to treat quotes with multiple qids as a separate quote by each of the speakers.
    '''
    speakers_sample = df_speakers.sample(n=number_of_samples)[speakers_columns]
    
    merged_chunks = []

    with pd.read_json(path_or_buf=PATH_TO_QUOTES.format(year=year), compression='bz2', lines=True, chunksize=500000) as df_reader:
        for chunk in df_reader:
            # filter the columns
            chunk = chunk[quotes_columns]
            # TODO: filter quotes with None speaker?

            # Some quote have multiple speaker qids associated to the same name. need to explode that and treat them each as a separate quote
            chunk = chunk.explode('qids')
            merged_chunks.append(chunk.merge(right=speakers_sample, right_index=True, left_on='qids'))

    sample = pd.concat(merged_chunks, ignore_index=True)
    return sample

In [9]:
quotes_filter_columns = ['quoteID', 'quotation', 'speaker', 'qids']

# We choose the column features we're interested in 
speaker_attributes_filter_columns = ['date_of_birth', 'gender', 'party', 'religion', 'occupation']
n_samples = 1000000

sample = generate_quotes_sample(number_of_samples=n_samples, quotes_columns=quotes_filter_columns, speakers_columns=speaker_attributes_filter_columns)
sample

Unnamed: 0,quoteID,quotation,speaker,qids,date_of_birth,gender,party,religion,occupation
0,2020-01-24-000168,[ I met them ] when they just turned 4 and 7. ...,Meghan King Edmonds,Q20684375,[+1984-09-26T00:00:00Z],[Q6581072],,,
1,2020-01-07-048692,My partner was a professional baseball player....,Meghan King Edmonds,Q20684375,[+1984-09-26T00:00:00Z],[Q6581072],,,
2,2020-01-21-031706,"I remember asking why they were so weird, and ...",Meghan King Edmonds,Q20684375,[+1984-09-26T00:00:00Z],[Q6581072],,,
3,2020-02-10-088672,to all the `Nancys in Nebraska. ',Meghan King Edmonds,Q20684375,[+1984-09-26T00:00:00Z],[Q6581072],,,
4,2020-02-20-059272,She's basically replaced Jim in what he would ...,Meghan King Edmonds,Q20684375,[+1984-09-26T00:00:00Z],[Q6581072],,,
...,...,...,...,...,...,...,...,...,...
800543,2020-02-15-056376,"weak, confusing and invalid",Tony Bates,Q7821852,[+1967-04-29T00:00:00Z],[Q6581097],,,[Q43845]
800544,2020-03-24-087490,We're going to get out of this place. I can't ...,Russell Yuen,Q2176962,[+1965-10-30T00:00:00Z],[Q6581097],,,"[Q33999, Q10800557]"
800545,2020-01-09-108973,We're looking at 180 years of neglect and chan...,Colin Walker,Q26464525,[+1962-10-29T00:00:00Z],[Q6581097],,,"[Q13381753, Q13854733]"
800546,2020-04-10-059728,"When I looked at it at around 11 a.m., it was ...",Huang Wen,Q45542512,,[Q6581097],,,


### Explode column values of sample

Some columns store their values as lists, such as a speaker who has multiple occupations or genders. We need to explode those so that we have a row for each unique value in each column. This presents some inconsistencies, **we don't take into account at what point of his life a speaker may have uttered a certain quotation**. For example, he may have had a different occupation when he said it. 

In [10]:
original_sample_size = len(sample)

for col in speaker_attributes_filter_columns:
    sample = sample.explode(col)
sample.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date_of_birth,gender,party,religion,occupation
0,2020-01-24-000168,[ I met them ] when they just turned 4 and 7. ...,Meghan King Edmonds,Q20684375,+1984-09-26T00:00:00Z,Q6581072,,,
1,2020-01-07-048692,My partner was a professional baseball player....,Meghan King Edmonds,Q20684375,+1984-09-26T00:00:00Z,Q6581072,,,
2,2020-01-21-031706,"I remember asking why they were so weird, and ...",Meghan King Edmonds,Q20684375,+1984-09-26T00:00:00Z,Q6581072,,,
3,2020-02-10-088672,to all the `Nancys in Nebraska. ',Meghan King Edmonds,Q20684375,+1984-09-26T00:00:00Z,Q6581072,,,
4,2020-02-20-059272,She's basically replaced Jim in what he would ...,Meghan King Edmonds,Q20684375,+1984-09-26T00:00:00Z,Q6581072,,,


In [11]:
print('{} rows exploded'.format(len(sample) - original_sample_size))

659162 rows exploded


In [12]:
# Save to csv (for later retrieval)
sample.to_csv(path_or_buf=PATH_OUTPUT + '/speakers_quotes_example_samples.csv', index=False)

## Retrieve the sample saved in csv (You can continue work from here)

In [13]:
retrieved_sample = pd.read_csv(PATH_OUTPUT + '/speakers_quotes_example_samples.csv')
retrieved_sample

Unnamed: 0,quoteID,quotation,speaker,qids,date_of_birth,gender,party,religion,occupation
0,2020-01-24-000168,[ I met them ] when they just turned 4 and 7. ...,Meghan King Edmonds,Q20684375,+1984-09-26T00:00:00Z,Q6581072,,,
1,2020-01-07-048692,My partner was a professional baseball player....,Meghan King Edmonds,Q20684375,+1984-09-26T00:00:00Z,Q6581072,,,
2,2020-01-21-031706,"I remember asking why they were so weird, and ...",Meghan King Edmonds,Q20684375,+1984-09-26T00:00:00Z,Q6581072,,,
3,2020-02-10-088672,to all the `Nancys in Nebraska. ',Meghan King Edmonds,Q20684375,+1984-09-26T00:00:00Z,Q6581072,,,
4,2020-02-20-059272,She's basically replaced Jim in what he would ...,Meghan King Edmonds,Q20684375,+1984-09-26T00:00:00Z,Q6581072,,,
...,...,...,...,...,...,...,...,...,...
1459705,2020-03-24-087490,We're going to get out of this place. I can't ...,Russell Yuen,Q2176962,+1965-10-30T00:00:00Z,Q6581097,,,Q10800557
1459706,2020-01-09-108973,We're looking at 180 years of neglect and chan...,Colin Walker,Q26464525,+1962-10-29T00:00:00Z,Q6581097,,,Q13381753
1459707,2020-01-09-108973,We're looking at 180 years of neglect and chan...,Colin Walker,Q26464525,+1962-10-29T00:00:00Z,Q6581097,,,Q13854733
1459708,2020-04-10-059728,"When I looked at it at around 11 a.m., it was ...",Huang Wen,Q45542512,,Q6581097,,,


## Merge our speaker feature columns (qids) with their actual labels from the speakers attributes parquet file

In [14]:
merged_labels = retrieved_sample.copy()
for col in speaker_attributes_filter_columns:
    merged_labels = merged_labels.merge(df_wikidata_labels, left_on=col, right_index=True, how='left')
    merged_labels[col] = merged_labels['Label']
    merged_labels.drop(columns=['Label', 'Description'], inplace=True)

In [15]:
merged_labels

Unnamed: 0,quoteID,quotation,speaker,qids,date_of_birth,gender,party,religion,occupation
0,2020-01-24-000168,[ I met them ] when they just turned 4 and 7. ...,Meghan King Edmonds,Q20684375,,female,,,
1,2020-01-07-048692,My partner was a professional baseball player....,Meghan King Edmonds,Q20684375,,female,,,
2,2020-01-21-031706,"I remember asking why they were so weird, and ...",Meghan King Edmonds,Q20684375,,female,,,
3,2020-02-10-088672,to all the `Nancys in Nebraska. ',Meghan King Edmonds,Q20684375,,female,,,
4,2020-02-20-059272,She's basically replaced Jim in what he would ...,Meghan King Edmonds,Q20684375,,female,,,
...,...,...,...,...,...,...,...,...,...
1459705,2020-03-24-087490,We're going to get out of this place. I can't ...,Russell Yuen,Q2176962,,male,,,film actor
1459706,2020-01-09-108973,We're looking at 180 years of neglect and chan...,Colin Walker,Q26464525,,male,,,middle-distance runner
1459707,2020-01-09-108973,We're looking at 180 years of neglect and chan...,Colin Walker,Q26464525,,male,,,steeplechase runner
1459708,2020-04-10-059728,"When I looked at it at around 11 a.m., it was ...",Huang Wen,Q45542512,,male,,,


### Exploring data on occupation of speaker

In [16]:
print('We have {} different occupations in our sample'.format(len(merged_labels.occupation.unique())))

We have 1419 different occupations in our sample


Let's try to keep only those that relate to politics for example. For that we have to relate the profession to **politics**. 

In [17]:
#Importing required modules
from nltk.stem.porter import PorterStemmer
 
#Creating the class object
stemmer = PorterStemmer()
 
#words to stem
words = ['politician', 'politics', 'policy']
 
#Stemming the words
for word in words:
    print(word+' -> '+ stemmer.stem(word))

politician -> politician
politics -> polit
policy -> polici


### Analyzing the sample

**Who has the most quotes?**

In [18]:
grouped_speakers = retrieved_sample.groupby(by=['qids', 'speaker'])['quotation'].agg(['count'])
grouped_speakers.sort_values(by='count', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
qids,speaker,Unnamed: 2_level_1
Q6294,Hillary Clinton,24822
Q22237,Amy Klobuchar,22362
Q43723,Benjamin Netanyahu,18264
Q55800,Oprah Winfrey,18160
Q432473,Melania Trump,10992
...,...,...
Q45430702,Zhu Jiang,1
Q21427269,Daniel Fernandez,1
Q56425228,David king,1
Q56426163,Jenny Owen,1


## Replacing speaker attributes wikidata qids with their labels

Since speaker attributes are only described by their qids, we need replace them by their labels to get their actual meaning. For that
we're going to need the wikidata, and the following code shows an example of doing it.

In [19]:
from datetime import datetime

def create_features_set(QIDs, attributes_name):
    '''
    Creates the features dataset.
    :param QIDs: Pandas Series with the Wikidata ids of each speaker.
    :param attributes_name: the list of feature attributes that we're going to use for the regression
    :return: Pandas DataFrame with the attributes used as variables (one per column).
    '''
    speaker_attr = df_speakers[df_speakers.index.isin(QIDs.tolist())]

    attributes = {}
    for attribute_name in attributes_name:
        if (attribute_name != 'date_of_birth'):

            attr_qids = speaker_attr[attribute_name].apply(lambda x: None if type(x) is type(None) else x[-1]) # TODO: HOW TO CHOOSE THE QIDS (PER ATTRIBUTE) WHEN THERE ARE MULTIPLES (E.G. WHEN HAVING MULTIPLE POLITICAL PARTIES)?
            attr = df_wikidata_labels['Label'].reindex(attr_qids)

            if (attribute_name == 'academic_degree'):
                attr.fillna(value='High school', inplace=True)

        else:
            attr = speaker_attr[attribute_name].apply(lambda x: datetime.strptime(x[0], '+%Y-%m-%dT%H:%M:%S%z').year)

        attributes[attribute_name] = attr.tolist()

    feature_set = (pd.DataFrame(attributes)).dropna()
    obj_columns = feature_set.select_dtypes(['object']).columns
    feature_set[obj_columns] = feature_set[obj_columns].astype('category')

    return feature_set


# TODO: REMOVE (USED FOR TESTING)
qids = pd.Series(['Q38111', 'Q17714', 'Q22686'])
attr_names = ['date_of_birth', 'nationality', 'gender', 'party', 'academic_degree', 'religion']

df3 = create_features_set(qids, attr_names)
#df3.to_pickle('dataX.pkl')
display(df3.head())

cat_columns = df3.select_dtypes(['category']).columns
df3[cat_columns] = df3[cat_columns].apply(lambda x: x.cat.codes) 
x = df3.to_numpy()
display(x)


Unnamed: 0,date_of_birth,nationality,gender,party,academic_degree,religion
1,1946,United States of America,male,Republican Party,Bachelor of Science,Presbyterianism
2,1974,United States of America,male,Democratic Party,High school,Roman Catholic


array([[1946,    0,    0,    1,    0,    0],
       [1974,    0,    0,    0,    1,    1]], dtype=int64)

# 2. Topic labeling

Zero shot classficiation using the transformers library and BART model

In [20]:
#!pip install pytorch
#!pip install tensorflow
#!pip install transformers

In [21]:
import transformers
#import json 
import os
import torch

In [22]:
# Loading the model
classifier = transformers.pipeline('zero-shot-classification',model='xlnet-base-cased')

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.bias', 'logits_proj.weight', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [23]:
# Define the attributes that belong to a certain topic and are used for classification
labels_climate = ['climate','pollution','waste','dirty','ozone','warming','temperature']
hypothesis_template = 'This text is about {}.' # what the model should do

In [24]:
def get_labels(data,labels,score_thresh):
    N = len(data)
    results = []
    for i in range(N):
        quote = data[i]['quotation']
        prediction = classifier(quote, labels, hypothesis_template=hypothesis_template, multi_label=True)
        score = np.mean(prediction['scores'])
        if score > score_thresh:
            results.append(score,1) # 1 for that the quote deals with the defined topic
        else:
            results.append(score,0) # 0 for that the quote does not deal with the topic
        
    return results


# 3. Logistic Regression Pipeline

In [25]:
# TODO

# 4. Data Analysis

In [26]:
# TODO

# 5. Data Visualization & Results

In [27]:
# TODO

# 6. Conclusions and Interpretation

In [28]:
# TODO