# Setup

In [1]:
PATH_ROOT = 'data'
PATH_PARQUET = PATH_ROOT + '/project_datasets'
PATH_QUOTEBANK = PATH_ROOT + '/Quotebank'
PATH_TO_QUOTES = PATH_QUOTEBANK + '/quotes-{year}.json.bz2'

In [2]:
PATH_OUTPUT = 'output'

In [3]:
!pip install pyarrow



In [4]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import seaborn as sns

#NLP libraries
import spacy, nltk, gensim, sklearn
import pyLDAvis.gensim_models

# 1. Loading Data

## Reading wikidata labels

In [5]:
df_wikidata_labels = pd.read_csv(PATH_PARQUET + '/wikidata_labels_descriptions_quotebank.csv.bz2', compression='bz2', index_col='QID')
df_wikidata_labels.head()

Unnamed: 0_level_0,Label,Description
QID,Unnamed: 1_level_1,Unnamed: 2_level_1
Q31,Belgium,country in western Europe
Q45,Portugal,country in southwestern Europe
Q75,Internet,global system of connected computer networks
Q148,People's Republic of China,sovereign state in East Asia
Q155,Brazil,country in South America


## Reading speakers parquet file

In [18]:
df_speakers = pd.read_parquet(PATH_PARQUET + '/speaker_attributes.parquet')
df_speakers.set_index(keys='id', inplace=True)
df_speakers.head()

Unnamed: 0_level_0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,label,candidacy,type,religion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Q23,"[Washington, President Washington, G. Washingt...",[+1732-02-22T00:00:00Z],"[Q161885, Q30]",[Q6581097],1395141751,,W000178,"[Q82955, Q189290, Q131512, Q1734662, Q294126, ...",[Q327591],,George Washington,"[Q698073, Q697949]",item,[Q682443]
Q42,"[Douglas Noel Adams, Douglas Noël Adams, Dougl...",[+1952-03-11T00:00:00Z],[Q145],[Q6581097],1395737157,[Q7994501],,"[Q214917, Q28389, Q6625963, Q4853732, Q1884422...",,,Douglas Adams,,item,
Q1868,"[Paul Marie Ghislain Otlet, Paul Marie Otlet]",[+1868-08-23T00:00:00Z],[Q31],[Q6581097],1380367296,,,"[Q36180, Q40348, Q182436, Q1265807, Q205375, Q...",,,Paul Otlet,,item,
Q207,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",[+1946-07-06T00:00:00Z],[Q30],[Q6581097],1395142029,,,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,George W. Bush,"[Q327959, Q464075, Q3586276, Q4450587]",item,"[Q329646, Q682443, Q33203]"
Q297,"[Velázquez, Diego Rodríguez de Silva y Velázqu...",[+1599-06-06T00:00:00Z],[Q29],[Q6581097],1391704596,,,[Q1028181],,,Diego Velázquez,,item,


In [21]:
df_speakers_cleaned = df_speakers.dropna()
print('Total number of speakers : ', len(df_speakers))
print('Total number of speakers with filled attributes : ', len(df_speakers_cleaned))
print('{} rows dropped'.format(len(df_speakers) - len(df_speakers_cleaned)))

Total number of speakers :  9055981
Total number of speakers with filled attributes :  3
9055978 rows dropped


As we can see that dropping all speakers with undefined attributes is unrealistic to work with, since we're only left with 3 speakers with all their attributes defined to work with. We're going to have to some fine-grained filtering of undefined values column wise instead of on whole rows (all attributes).

# 2. Cleaning & handling data

The quotes dataset is too big to process in memory at once. Here we define a method that generates a sample of speakers from the wikidata dump, along with attributes that interest us, that describe these speakers.
Then we get their corresponding quotes from the quotebank.

## 2.1 Generating samples (of speakers) and merge them with their quotations

In [9]:
def generate_quotes_sample(number_of_samples=10000, year=2020,
                           quotes_columns=['quoteID', 'quotation', 'speaker', 'qids', 'party'],
                           speakers_columns=['date_of_birth']):
    '''
    Generate a sample of speakers with their attributes to their quotes.
    
    Some quotes have multiple qids for the speaker (for example multiple speakers with the same name).
    We use pandas explode to treat quotes with multiple qids as a separate quote by each of the speakers.
    '''
    speakers_sample = df_speakers.sample(n=number_of_samples)[speakers_columns]
    
    merged_chunks = []

    with pd.read_json(path_or_buf=PATH_TO_QUOTES.format(year=year), compression='bz2', lines=True, chunksize=500000) as df_reader:
        for chunk in df_reader:
            # filter the columns
            chunk = chunk[quotes_columns]
            # TODO: filter quotes with None speaker?

            # Some quote have multiple speaker qids associated to the same name. need to explode that and treat them each as a separate quote
            chunk = chunk.explode('qids')
            merged_chunks.append(chunk.merge(right=speakers_sample, right_index=True, left_on='qids'))

    sample = pd.concat(merged_chunks, ignore_index=True)
    return sample

In [10]:
quotes_filter_columns = ['quoteID', 'quotation', 'speaker', 'qids']

# We choose the column features we're interested in 
speaker_attributes_filter_columns = ['date_of_birth', 'gender', 'party', 'religion', 'occupation']
n_samples = 100000

sample = generate_quotes_sample(number_of_samples=n_samples, quotes_columns=quotes_filter_columns, speakers_columns=speaker_attributes_filter_columns)
sample

Unnamed: 0,quoteID,quotation,speaker,qids,date_of_birth,gender,party,religion,occupation
0,2020-01-15-008152,Assuring a legal workforce through E-Verify wi...,Ron DeSantis,Q3105215,[+1978-09-14T00:00:00Z],[Q6581097],[Q29468],,"[Q82955, Q189290, Q40348, Q36180]"
1,2020-01-14-052069,Lower-income workers also shouldn't have their...,Ron DeSantis,Q3105215,[+1978-09-14T00:00:00Z],[Q6581097],[Q29468],,"[Q82955, Q189290, Q40348, Q36180]"
2,2020-04-14-058701,They were badgering about it and he was saying...,Ron DeSantis,Q3105215,[+1978-09-14T00:00:00Z],[Q6581097],[Q29468],,"[Q82955, Q189290, Q40348, Q36180]"
3,2020-03-26-009255,But what's happened is -- and it's a problem f...,Ron DeSantis,Q3105215,[+1978-09-14T00:00:00Z],[Q6581097],[Q29468],,"[Q82955, Q189290, Q40348, Q36180]"
4,2020-03-26-014826,Florida faces the possibility of exposing its ...,Ron DeSantis,Q3105215,[+1978-09-14T00:00:00Z],[Q6581097],[Q29468],,"[Q82955, Q189290, Q40348, Q36180]"
...,...,...,...,...,...,...,...,...,...
86510,2020-02-06-039469,I think he'll need to be patient a little bit ...,James Tarkowski,Q6144020,[+1992-11-19T00:00:00Z],[Q6581097],,,[Q937857]
86511,2020-01-03-048152,politically incorrect Quiz of the Decade,Rod Liddle,Q7356281,[+1960-04-01T00:00:00Z],[Q6581097],[Q405492],[Q6423963],[Q1930187]
86512,2020-02-03-092399,We can't play Russian roulette with the lives ...,Charlene Lima,Q5074657,[+1953-08-18T00:00:00Z],[Q6581072],[Q29552],,[Q82955]
86513,2020-02-18-023355,"he was like,' I can't get time off.' Yeah [ Ed...",Vickie Guerrero,Q231501,[+1968-04-16T00:00:00Z],[Q6581072],,,"[Q721834, Q13474373, Q578109]"


### Explode column values of sample

Some columns store their values as lists, such as a speaker who has multiple occupations or genders. We need to explode those so that we have a row for each unique value in each column. This presents some inconsistencies, **we don't take into account at what point of his life a speaker may have uttered a certain quotation**. For example, he may have had a different occupation when he said it. 

In [11]:
original_sample_size = len(sample)

for col in speaker_attributes_filter_columns:
    sample = sample.explode(col)
sample.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date_of_birth,gender,party,religion,occupation
0,2020-01-15-008152,Assuring a legal workforce through E-Verify wi...,Ron DeSantis,Q3105215,+1978-09-14T00:00:00Z,Q6581097,Q29468,,Q82955
0,2020-01-15-008152,Assuring a legal workforce through E-Verify wi...,Ron DeSantis,Q3105215,+1978-09-14T00:00:00Z,Q6581097,Q29468,,Q189290
0,2020-01-15-008152,Assuring a legal workforce through E-Verify wi...,Ron DeSantis,Q3105215,+1978-09-14T00:00:00Z,Q6581097,Q29468,,Q40348
0,2020-01-15-008152,Assuring a legal workforce through E-Verify wi...,Ron DeSantis,Q3105215,+1978-09-14T00:00:00Z,Q6581097,Q29468,,Q36180
1,2020-01-14-052069,Lower-income workers also shouldn't have their...,Ron DeSantis,Q3105215,+1978-09-14T00:00:00Z,Q6581097,Q29468,,Q82955


In [12]:
print('{} rows exploded'.format(len(sample) - original_sample_size))

80779 rows exploded


In [23]:
# Save to csv (for later retrieval)
sample.to_csv(path_or_buf=PATH_OUTPUT + '/speakers_quotes_example_samples.csv', index=False)

## Retrieve the sample saved in csv (You can continue work from here)

In [24]:
retrieved_sample = pd.read_csv(PATH_OUTPUT + '/speakers_quotes_example_samples.csv')
retrieved_sample

Unnamed: 0,quoteID,quotation,speaker,qids,date_of_birth,gender,party,religion,occupation
0,2020-01-15-008152,Assuring a legal workforce through E-Verify wi...,Ron DeSantis,Q3105215,+1978-09-14T00:00:00Z,Q6581097,Q29468,,Q82955
1,2020-01-15-008152,Assuring a legal workforce through E-Verify wi...,Ron DeSantis,Q3105215,+1978-09-14T00:00:00Z,Q6581097,Q29468,,Q189290
2,2020-01-15-008152,Assuring a legal workforce through E-Verify wi...,Ron DeSantis,Q3105215,+1978-09-14T00:00:00Z,Q6581097,Q29468,,Q40348
3,2020-01-15-008152,Assuring a legal workforce through E-Verify wi...,Ron DeSantis,Q3105215,+1978-09-14T00:00:00Z,Q6581097,Q29468,,Q36180
4,2020-01-14-052069,Lower-income workers also shouldn't have their...,Ron DeSantis,Q3105215,+1978-09-14T00:00:00Z,Q6581097,Q29468,,Q82955
...,...,...,...,...,...,...,...,...,...
167289,2020-02-03-092399,We can't play Russian roulette with the lives ...,Charlene Lima,Q5074657,+1953-08-18T00:00:00Z,Q6581072,Q29552,,Q82955
167290,2020-02-18-023355,"he was like,' I can't get time off.' Yeah [ Ed...",Vickie Guerrero,Q231501,+1968-04-16T00:00:00Z,Q6581072,,,Q721834
167291,2020-02-18-023355,"he was like,' I can't get time off.' Yeah [ Ed...",Vickie Guerrero,Q231501,+1968-04-16T00:00:00Z,Q6581072,,,Q13474373
167292,2020-02-18-023355,"he was like,' I can't get time off.' Yeah [ Ed...",Vickie Guerrero,Q231501,+1968-04-16T00:00:00Z,Q6581072,,,Q578109


## Merge our speaker feature columns (qids) with their actual labels from the speakers attributes parquet file

In [25]:
merged_labels = retrieved_sample.copy()
for col in speaker_attributes_filter_columns:
    merged_labels = merged_labels.merge(df_wikidata_labels, left_on=col, right_index=True, how='left')
    merged_labels[col] = merged_labels['Label']
    merged_labels.drop(columns=['Label', 'Description'], inplace=True)

In [26]:
merged_labels

Unnamed: 0,quoteID,quotation,speaker,qids,date_of_birth,gender,party,religion,occupation
0,2020-01-15-008152,Assuring a legal workforce through E-Verify wi...,Ron DeSantis,Q3105215,,male,Republican Party,,politician
1,2020-01-15-008152,Assuring a legal workforce through E-Verify wi...,Ron DeSantis,Q3105215,,male,Republican Party,,military officer
2,2020-01-15-008152,Assuring a legal workforce through E-Verify wi...,Ron DeSantis,Q3105215,,male,Republican Party,,lawyer
3,2020-01-15-008152,Assuring a legal workforce through E-Verify wi...,Ron DeSantis,Q3105215,,male,Republican Party,,writer
4,2020-01-14-052069,Lower-income workers also shouldn't have their...,Ron DeSantis,Q3105215,,male,Republican Party,,politician
...,...,...,...,...,...,...,...,...,...
167289,2020-02-03-092399,We can't play Russian roulette with the lives ...,Charlene Lima,Q5074657,,female,Democratic Party,,politician
167290,2020-02-18-023355,"he was like,' I can't get time off.' Yeah [ Ed...",Vickie Guerrero,Q231501,,female,,,manager
167291,2020-02-18-023355,"he was like,' I can't get time off.' Yeah [ Ed...",Vickie Guerrero,Q231501,,female,,,professional wrestler
167292,2020-02-18-023355,"he was like,' I can't get time off.' Yeah [ Ed...",Vickie Guerrero,Q231501,,female,,,television producer


### Exploring data on occupation of speaker

In [27]:
print('We have {} different occupations in our sample'.format(len(merged_labels.occupation.unique())))

We have 523 different occupations in our sample


Let's try to keep only those that relate to politics for example. For that we have to relate the profession to **politics**. 

In [36]:
#Importing required modules
from nltk.stem.porter import PorterStemmer
 
#Creating the class object
stemmer = PorterStemmer()
 
#words to stem
words = ['politician', 'politics', 'policy']
 
#Stemming the words
for word in words:
    print(word+' -> '+ stemmer.stem(word))

politician -> politician
politics -> polit
policy -> polici


### Analyzing the sample

**Who has the most quotes?**

In [15]:
grouped_speakers = retrieved_sample.groupby(by=['qids', 'speaker'])['quotation'].agg(['count'])
grouped_speakers.sort_values(by='count', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
qids,speaker,Unnamed: 2_level_1
Q15993658,David Williams,716
Q22005681,Chris Jones,338
Q51397308,David Jones,323
Q30600781,Jack Ross,258
Q16186315,Brian Kelly,230
...,...,...
Q6184125,JERRY MOORE,1
Q6177769,Jenna Morasca,1
Q45443285,Zhang he,1
Q6161021,Jarrett J. Krosoczka,1


## Replacing speaker attributes wikidata qids with their labels

Since speaker attributes are only described by their qids, we need replace them by their labels to get their actual meaning. For that
we're going to need the wikidata, and the following code shows an example of doing it.

In [11]:
from datetime import datetime

def create_features_set(QIDs, attributes_name):
    '''
    Creates the features dataset.
    :param QIDs: Pandas Series with the Wikidata ids of each speaker.
    :param attributes_name: the list of feature attributes that we're going to use for the regression
    :return: Pandas DataFrame with the attributes used as variables (one per column).
    '''
    speaker_attr = df_speakers[df_speakers.index.isin(QIDs.tolist())]

    attributes = {}
    for attribute_name in attributes_name:
        if (attribute_name != 'date_of_birth'):

            attr_qids = speaker_attr[attribute_name].apply(lambda x: None if type(x) is type(None) else x[-1]) # TODO: HOW TO CHOOSE THE QIDS (PER ATTRIBUTE) WHEN THERE ARE MULTIPLES (E.G. WHEN HAVING MULTIPLE POLITICAL PARTIES)?
            attr = df_wikidata_labels['Label'].reindex(attr_qids)

            if (attribute_name == 'academic_degree'):
                attr.fillna(value='High school', inplace=True)

        else:
            attr = speaker_attr[attribute_name].apply(lambda x: datetime.strptime(x[0], '+%Y-%m-%dT%H:%M:%S%z').year)

        attributes[attribute_name] = attr.tolist()

    feature_set = (pd.DataFrame(attributes)).dropna()
    obj_columns = feature_set.select_dtypes(['object']).columns
    feature_set[obj_columns] = feature_set[obj_columns].astype('category')

    return feature_set


# TODO: REMOVE (USED FOR TESTING)
qids = pd.Series(['Q38111', 'Q17714', 'Q22686'])
attr_names = ['date_of_birth', 'nationality', 'gender', 'party', 'academic_degree', 'religion']

df3 = create_features_set(qids, attr_names)
#df3.to_pickle('dataX.pkl')
display(df3.head())

cat_columns = df3.select_dtypes(['category']).columns
df3[cat_columns] = df3[cat_columns].apply(lambda x: x.cat.codes) 
x = df3.to_numpy()
display(x)


Unnamed: 0,date_of_birth,nationality,gender,party,academic_degree,religion
1,1946,United States of America,male,Republican Party,Bachelor of Science,Presbyterianism
2,1974,United States of America,male,Democratic Party,High school,Roman Catholic


array([[1946,    0,    0,    1,    0,    0],
       [1974,    0,    0,    0,    1,    1]], dtype=int64)

# 2. Topic labeling

Zero shot classficiation using the transformers library and BART model

In [12]:
#!pip install pytorch
#!pip install tensorflow
#!pip install transformers

In [13]:
import transformers
#import json 
import os
import torch

In [14]:
# Loading the model
classifier = transformers.pipeline('zero-shot-classification',model='xlnet-base-cased')

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [15]:
# Define the attributes that belong to a certain topic and are used for classification
labels_climate = ['climate','pollution','waste','dirty','ozone','warming','temperature']
hypothesis_template = 'This text is about {}.' # what the model should do

In [16]:
def get_labels(data,labels,score_thresh):
    N = len(data)
    results = []
    for i in range(N):
        quote = data[i]['quotation']
        prediction = classifier(quote, labels, hypothesis_template=hypothesis_template, multi_label=True)
        score = np.mean(prediction['scores'])
        if score > score_thresh:
            results.append(score,1) # 1 for that the quote deals with the defined topic
        else:
            results.append(score,0) # 0 for that the quote does not deal with the topic
        
    return results


# 3. Logistic Regression Pipeline

In [17]:
# TODO

# 4. Data Analysis

In [18]:
# TODO

# 5. Data Visualization & Results

In [19]:
# TODO

# 6. Conclusions and Interpretation

In [20]:
# TODO