# Setup

In [12]:
# Root folder that holds every input dataset read by this notebook.
PATH_ROOT = 'data'
# Sub-folders of the data root.
PATH_PARQUET = f'{PATH_ROOT}/project_datasets'
PATH_QUOTEBANK = f'{PATH_ROOT}/Quotebank'
# Template for the per-year quotes dump; fill it with PATH_TO_QUOTES.format(year=...).
# Plain concatenation keeps the literal '{year}' placeholder intact.
PATH_TO_QUOTES = PATH_QUOTEBANK + '/quotes-{year}.json.bz2'

In [13]:
# Folder where files generated by this notebook (e.g. the sampled quotes CSV) are written.
PATH_OUTPUT = 'output'

In [14]:
!pip install pyarrow



In [15]:
import os

import numpy as np
import pandas as pd
import seaborn as sns

# Loading Data

## Reading wikidata labels

In [16]:
# Load the bz2-compressed wikidata labels/descriptions CSV, indexed by its 'QID' column.
df_wikidata_labels = pd.read_csv(
    PATH_PARQUET + '/wikidata_labels_descriptions_quotebank.csv.bz2',
    index_col='QID',
    compression='bz2',
)

## Reading speakers parquet file

In [17]:
# Load the speaker attributes and index them by the 'id' column so that
# quotes can later be joined to speakers through that index.
df_speakers = (
    pd.read_parquet(PATH_PARQUET + '/speaker_attributes.parquet')
    .set_index(keys='id')
)

# Handling data

The quotes dataset is too big to process in memory at once. Instead, we define a function that draws a random sample of speakers from the Wikidata speaker attributes, keeping only the attributes that interest us.
We then stream the Quotebank quotes in chunks and keep only the quotes attributed to the sampled speakers.

## Replacing speaker attributes wikidata qids with their labels

In [None]:
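# TODO: replace the Wikidata QIDs stored in the speaker attribute columns with their labels (see df_wikidata_labels)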

## Generating samples

In [18]:
def generate_quotes_sample(number_of_samples=10000, year=2020,
                           quotes_columns=None,
                           speakers_columns=None,
                           random_state=None):
    '''
    Draw a random sample of speakers and collect the quotes attributed to them.

    Speakers are sampled from the notebook-level `df_speakers` frame (indexed by 'id').
    The quotes are streamed in chunks from the Quotebank file of the requested year
    (see PATH_TO_QUOTES), so the whole dump never has to fit in memory.

    Some quotes have multiple qids for the speaker (for example multiple speakers with the same name).
    We use pandas explode to treat quotes with multiple qids as a separate quote by each of the speakers.

    Parameters
    ----------
    number_of_samples : int
        Number of speakers (not quotes) to sample from `df_speakers`.
    year : int
        Year substituted into the PATH_TO_QUOTES template.
    quotes_columns : list of str, optional
        Columns to keep from the quotes; must contain 'qids', which is used for the join.
        Defaults to ['quoteID', 'quotation', 'speaker', 'qids'].
    speakers_columns : list of str, optional
        Speaker attribute columns to keep. Defaults to ['date_of_birth'].
    random_state : int, optional
        Seed forwarded to DataFrame.sample. The default (None) keeps the draw
        unseeded, as before; pass an int to make the sample reproducible.

    Returns
    -------
    pandas.DataFrame
        One row per (quote, sampled speaker) pair, holding the selected quote
        columns followed by the selected speaker columns.
    '''
    # None sentinels instead of list defaults, so the defaults are never shared between calls.
    if quotes_columns is None:
        quotes_columns = ['quoteID', 'quotation', 'speaker', 'qids']
    if speakers_columns is None:
        speakers_columns = ['date_of_birth']

    speakers_sample = df_speakers.sample(n=number_of_samples, random_state=random_state)[speakers_columns]

    merged_chunks = []

    with pd.read_json(path_or_buf=PATH_TO_QUOTES.format(year=year), compression='bz2', lines=True, chunksize=500000) as df_reader:
        for chunk in df_reader:
            # filter the columns
            chunk = chunk[quotes_columns]
            # TODO: filter quotes with None speaker?

            # Some quotes have multiple speaker qids: explode them so that each
            # (quote, qid) pair becomes its own row.
            chunk = chunk.explode('qids')
            # Inner join on the speaker index: only quotes of sampled speakers survive.
            merged_chunks.append(chunk.merge(right=speakers_sample, right_index=True, left_on='qids'))

    sample = pd.concat(merged_chunks, ignore_index=True)
    return sample

In [19]:
# Columns to keep from the quotes and from the speaker attributes, respectively
quotes_filter_columns = ['quoteID', 'quotation', 'speaker', 'qids']
speaker_attributes_filter_columns = ['date_of_birth', 'gender', 'party', 'religion']

# Sample speakers (default sample size and year) and gather their quotes
sample = generate_quotes_sample(
    speakers_columns=speaker_attributes_filter_columns,
    quotes_columns=quotes_filter_columns,
)
sample

Unnamed: 0,quoteID,quotation,speaker,qids,date_of_birth,gender,party,religion
0,2020-03-26-054930,That is being done in other jurisdictions thro...,Bruce Anderson,Q56253811,[+1998-09-23T00:00:00Z],[Q6581097],,
1,2020-03-25-007352,At this point in time we don't have any eviden...,Bruce Anderson,Q56253811,[+1998-09-23T00:00:00Z],[Q6581097],,
2,2020-03-09-071113,We have not yet had any cases of COVID-19 that...,Bruce Anderson,Q56253811,[+1998-09-23T00:00:00Z],[Q6581097],,
3,2020-04-01-041759,No one should assume that because they don't l...,Bruce Anderson,Q56253811,[+1998-09-23T00:00:00Z],[Q6581097],,
4,2020-02-05-095650,They are going to be well. We just need to ens...,Bruce Anderson,Q56253811,[+1998-09-23T00:00:00Z],[Q6581097],,
...,...,...,...,...,...,...,...,...
6943,2020-01-11-009004,"First, buy a good piece of meat,",Marcus Wareing,Q6758522,[+1970-06-29T00:00:00Z],[Q6581097],,
6944,2020-01-10-044528,"It just popped out, babes, casual,",Johannes Radebe,Q56044093,[+1987-04-27T00:00:00Z],[Q6581097],,
6945,2020-01-24-001828,a missed chance in the fight against antisemit...,Sabine Müller,Q50225700,,[Q6581072],,
6946,2020-03-11-030202,If you're cancelling in Tokyo because the epid...,Neal Pilson,Q6984170,[+1940-04-18T00:00:00Z],[Q6581097],,


In [21]:
# Save to csv
# Create the output folder first: to_csv does not create missing directories,
# so on a fresh checkout the write would otherwise fail.
os.makedirs(PATH_OUTPUT, exist_ok=True)
# NOTE(review): the file name says "1000" while generate_quotes_sample is called with its
# default of 10000 speakers; the name is kept as-is in case other code reads this path.
sample.to_csv(path_or_buf=PATH_OUTPUT + '/speakers_quotes_1000_samples.csv', index=False)

## Analyzing the sample

**Who has the most quotes?**

In [41]:
# Number of quotations per (qid, speaker name) pair
grouped_speakers = (
    sample
    .groupby(by=['qids', 'speaker'])['quotation']
    .agg(['count'])
)
# Most quoted speakers first
grouped_speakers.sort_values(by='count', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
qids,speaker,Unnamed: 2_level_1
Q4961017,Brendan Murphy,1522
Q6397512,Kevin Smith,488
Q56253811,Bruce Anderson,393
Q558189,Kapil Dev,248
Q24577530,Mark Thompson,240
...,...,...
Q45442459,Shen Yan,1
Q56809258,Shahrukh Ali,1
Q16466635,Emil Chynn,1
Q45502627,Huang Cheng,1


# Extracting topics from quotes (2020) 

While processing the quotes chunk by chunk, clean each chunk and extract the topics from its quotes directly.