# Setup

In [1]:
PATH_ROOT = 'data'
PATH_PARQUET = PATH_ROOT + '/project_datasets'
PATH_QUOTEBANK = PATH_ROOT + '/Quotebank'
PATH_TO_QUOTES = PATH_QUOTEBANK + '/quotes-{year}.json.bz2'

In [2]:
PATH_OUTPUT = 'output'

In [3]:
!pip install pyarrow



In [4]:
import pandas as pd
import numpy as np
import seaborn as sns

# 1. Loading Data

## Reading wikidata labels

In [5]:
df_wikidata_labels = pd.read_csv(PATH_PARQUET + '/wikidata_labels_descriptions_quotebank.csv.bz2', compression='bz2', index_col='QID')

## Reading speakers parquet file

In [6]:
df_speakers = pd.read_parquet(PATH_PARQUET + '/speaker_attributes.parquet')
df_speakers.set_index(keys='id', inplace=True)

# 2. Cleaning & handling data

The quotes dataset is too big to process in memory at once. Here we define a method that generates a sample of speakers from the wikidata dump, along with attributes that interest us, that describe these speakers.
Then we get their corresponding quotes from the quotebank.

## Generating samples

In [7]:
def generate_quotes_sample(number_of_samples=10000, year=2020,
                           quotes_columns=['quoteID', 'quotation', 'speaker', 'qids'],
                           speakers_columns=['date_of_birth']):
    '''
    Generate a sample of speakers with their attributes to their quotes.
    
    Some quotes have multiple qids for the speaker (for example multiple speakers with the same name).
    We use pandas explode to treat quotes with multiple qids as a separate quote by each of the speakers.
    '''
    speakers_sample = df_speakers.sample(n=number_of_samples)[speakers_columns]
    
    merged_chunks = []

    with pd.read_json(path_or_buf=PATH_TO_QUOTES.format(year=year), compression='bz2', lines=True, chunksize=500000) as df_reader:
        for chunk in df_reader:
            # filter the columns
            chunk = chunk[quotes_columns]
            # TODO: filter quotes with None speaker?

            # Some quote have multiple speaker qids. need to explode that and treat them each as a separate quote
            chunk = chunk.explode('qids')
            merged_chunks.append(chunk.merge(right=speakers_sample, right_index=True, left_on='qids'))

    sample = pd.concat(merged_chunks, ignore_index=True)
    return sample

In [8]:
# We choose the column features we are interested in 
quotes_filter_columns = ['quoteID', 'quotation', 'speaker', 'qids']
speaker_attributes_filter_columns = ['date_of_birth', 'gender', 'party', 'religion']

sample = generate_quotes_sample(quotes_columns=quotes_filter_columns, speakers_columns=speaker_attributes_filter_columns)
sample

Unnamed: 0,quoteID,quotation,speaker,qids,date_of_birth,gender,party,religion
0,2020-01-23-004492,"Although not a lubricant issue, tire wear is e...",John Burke,Q15451851,[+1922-03-08T00:00:00Z],[Q6581097],,
1,2020-01-07-064108,The cost of getting new goodies is far too high.,John Burke,Q15451851,[+1922-03-08T00:00:00Z],[Q6581097],,
2,2020-01-15-102577,We have just seen the Garden of Remembrance an...,John Burke,Q15451851,[+1922-03-08T00:00:00Z],[Q6581097],,
3,2020-02-01-011393,good bottle of French wine,John Burke,Q15451851,[+1922-03-08T00:00:00Z],[Q6581097],,
4,2020-03-01-014423,I thought we had a pretty good following but t...,John Burke,Q15451851,[+1922-03-08T00:00:00Z],[Q6581097],,
...,...,...,...,...,...,...,...,...
8030,2020-01-07-030083,I usually don't get caught up with things like...,Elizabeth Powell,Q5363374,,[Q6581072],,
8031,2020-01-06-082102,We've lost them all but we keep getting better...,Michael Fennelly,Q6830262,[+1949-04-04T00:00:00Z],[Q6581097],,
8032,2020-02-18-103362,"We're building a picture that feels right to us,",M. Ward,Q201514,[+1973-10-04T00:00:00Z],[Q6581097],,
8033,2020-01-29-093841,"The vehicle was parked in Mollision Street, Be...",Scott Andrews,Q23542620,[+1994-06-30T00:00:00Z],[Q6581097],,


In [9]:
# Save to csv
sample.to_csv(path_or_buf=PATH_OUTPUT + '/speakers_quotes_1000_samples.csv', index=False)

## Analyzing the sample

**Who has the most quotes?**

In [10]:
grouped_speakers = sample.groupby(by=['qids', 'speaker'])['quotation'].agg(['count'])
grouped_speakers.sort_values(by='count', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
qids,speaker,Unnamed: 2_level_1
Q6255170,John Roberts,679
Q47138769,Tua Tagovailoa,363
Q16235144,Joe Walsh,349
Q5230773,David Anderson,331
Q231648,Gabrielle Union,331
...,...,...
Q42665815,Richard L. Fox,1
Q56418995,Ruth Kirk,1
Q171511,Peter Gethin,1
Q56087342,Jordan Cowan,1


## Replacing speaker attributes wikidata qids with their labels

Since speaker attributes are only described by their qids, we need replace them by their labels to get their actual meaning. For that
we're going to need the wikidata, and the following code shows an example of doing it.

In [11]:
from datetime import datetime

def create_features_set(QIDs, attributes_name):
    '''
    Creates the features dataset.
    :param QIDs: Pandas Series with the Wikidata ids of each speaker.
    :param attributes_name: the list of feature attributes that we're going to use for the regression
    :return: Pandas DataFrame with the attributes used as variables (one per column).
    '''
    speaker_attr = df_speakers[df_speakers.index.isin(QIDs.tolist())]

    attributes = {}
    for attribute_name in attributes_name:
        if (attribute_name != 'date_of_birth'):

            attr_qids = speaker_attr[attribute_name].apply(lambda x: None if type(x) is type(None) else x[-1]) # TODO: HOW TO CHOOSE THE QIDS (PER ATTRIBUTE) WHEN THERE ARE MULTIPLES (E.G. WHEN HAVING MULTIPLE POLITICAL PARTIES)?
            attr = df_wikidata_labels['Label'].reindex(attr_qids)

            if (attribute_name == 'academic_degree'):
                attr.fillna(value='High school', inplace=True)

        else:
            attr = speaker_attr[attribute_name].apply(lambda x: datetime.strptime(x[0], '+%Y-%m-%dT%H:%M:%S%z').year)

        attributes[attribute_name] = attr.tolist()

    feature_set = (pd.DataFrame(attributes)).dropna()
    obj_columns = feature_set.select_dtypes(['object']).columns
    feature_set[obj_columns] = feature_set[obj_columns].astype('category')

    return feature_set


# TODO: REMOVE (USED FOR TESTING)
qids = pd.Series(['Q38111', 'Q17714', 'Q22686'])
attr_names = ['date_of_birth', 'nationality', 'gender', 'party', 'academic_degree', 'religion']

df3 = create_features_set(qids, attr_names)
#df3.to_pickle('dataX.pkl')
display(df3.head())

cat_columns = df3.select_dtypes(['category']).columns
df3[cat_columns] = df3[cat_columns].apply(lambda x: x.cat.codes) 
x = df3.to_numpy()
display(x)


Unnamed: 0,date_of_birth,nationality,gender,party,academic_degree,religion
1,1946,United States of America,male,Republican Party,Bachelor of Science,Presbyterianism
2,1974,United States of America,male,Democratic Party,High school,Roman Catholic


array([[1946,    0,    0,    1,    0,    0],
       [1974,    0,    0,    0,    1,    1]], dtype=int64)

# 2. Topic labeling

Zero shot classficiation using the transformers library and BART model

In [12]:
#!pip install pytorch
#!pip install tensorflow
#!pip install transformers

In [13]:
import transformers
#import json 
import os
import torch

In [14]:
# Loading the model
classifier = transformers.pipeline('zero-shot-classification',model='xlnet-base-cased')

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [15]:
# Define the attributes that belong to a certain topic and are used for classification
labels_climate = ['climate','pollution','waste','dirty','ozone','warming','temperature']
hypothesis_template = 'This text is about {}.' # what the model should do

In [16]:
def get_labels(data,labels,score_thresh):
    N = len(data)
    results = []
    for i in range(N):
        quote = data[i]['quotation']
        prediction = classifier(quote, labels, hypothesis_template=hypothesis_template, multi_label=True)
        score = np.mean(prediction['scores'])
        if score > score_thresh:
            results.append(score,1) # 1 for that the quote deals with the defined topic
        else:
            results.append(score,0) # 0 for that the quote does not deal with the topic
        
    return results


# 3. Logistic Regression Pipeline

In [17]:
# TODO

# 4. Data Analysis

In [18]:
# TODO

# 5. Data Visualization & Results

In [19]:
# TODO

# 6. Conludions and Interpretation

In [20]:
# TODO