In [1]:
# imports
import numpy as np
import pandas as pd
import json
from tqdm import tqdm
import bz2

In [2]:
# dataset loading
DATASET_PATH_JSON_BZ2 = "../data/raw/quotes-2020.json.bz2"
DATASET_PATH_JSON = "../data/raw/quotes-2020.json"
DATASET_PATH_PICKLE ="../data/binary/data.pickle"

data = [] 
columns = ['quoteID', 'quotation', 'speaker']

def save_pickle(json_path_bz2, pickle_path):
    with bz2.open(json_path_bz2, 'rb') as s_file:
        count = 0
        for instance in tqdm(s_file):
            count += 1
            instance = json.loads(instance) # loading a sample
            if instance['speaker'] == "None":
                continue
            row = dict((k, instance[k]) for k in columns)
            data.append(row)
            if count % 100000 == 0:
                print(count)
        df = pd.DataFrame(data)
        df.to_pickle(pickle_path)

save = False
if save:
    df = save_pickle(DATASET_PATH_JSON_BZ2, DATASET_PATH_PICKLE)

In [3]:
df = pd.read_pickle(DATASET_PATH_PICKLE)
df.head()

Unnamed: 0,quoteID,quotation,speaker
0,2020-01-16-000088,[ Department of Homeland Security ] was livid ...,Sue Myrick
1,2020-01-24-000168,[ I met them ] when they just turned 4 and 7. ...,Meghan King Edmonds
2,2020-01-17-000357,[ The delay ] will have an impact [ on Slough ...,Dexter Smith
3,2020-04-02-000239,[ The scheme ] treats addiction as an illness ...,Barry Coppinger
4,2020-03-19-000276,[ These ] actions will allow households who ha...,Ben Carson


In [4]:
df['speaker'].value_counts()

President Donald Trump    28023
Bernie Sanders            13189
Joe Biden                 12817
President Trump           12415
Andrew Cuomo              11088
                          ...  
Gail Peck                     1
Huy Pham                      1
Prince Johnson                1
Ahmad Zubi                    1
Tjungkara Ken                 1
Name: speaker, Length: 218414, dtype: int64

In [5]:
# load speaker info
PARQUET_PATH = "../data/raw/speaker_attributes.parquet"
df_meta = pd.read_parquet(PARQUET_PATH)
df_meta = df_meta[["aliases", "party", "label"]]
# take the speakers that have an assigned political affiliation
df_meta_notna = df_meta.dropna()
# take the first affiliation only (TODO: this is most likely the best method, think what to do when someone has multiple affiliations)
df_meta_notna["party"] = df_meta_notna["party"].apply(lambda x: x[0])
df_meta_notna.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_meta_notna["party"] = df_meta_notna["party"].apply(lambda x: x[0])


Unnamed: 0,aliases,party,label
0,"[Washington, President Washington, G. Washingt...",Q327591,George Washington
3,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",Q29468,George W. Bush
5,"[Augusto Pinochet Ugarte, Augusto José Ramón P...",Q327591,Augusto Pinochet
11,"[Gabriel González Videla, Gabriel Gonzales Vid...",Q1759368,Gabriel Gonzáles Videla
14,"[Mary Louise Streep, Meryl Louise Streep, Stre...",Q29552,Meryl Streep


In [6]:
# generate a mapping from all aliases to one single name
alias_to_label = df_meta.explode("aliases")[["aliases", "label"]]
alias_to_label = alias_to_label.rename(columns = {"aliases": "speaker"})
alias_to_label.head()

Unnamed: 0,speaker,label
0,Washington,George Washington
0,President Washington,George Washington
0,G. Washington,George Washington
0,Father of the United States,George Washington
0,The American Fabius,George Washington


In [7]:
# join the quote data with their corresponding labels
with_label = pd.merge(left=df, left_on="speaker", right=alias_to_label, right_on="speaker")
with_label = with_label.drop(columns=["speaker"])
with_label.head()

Unnamed: 0,quoteID,quotation,label
0,2020-01-16-000088,[ Department of Homeland Security ] was livid ...,Susan Dowdell Myrick
1,2020-01-16-012811,cancel contracts to train Saudi police and oth...,Susan Dowdell Myrick
2,2020-01-16-094237,to block the sale of sensitive military munitions,Susan Dowdell Myrick
3,2020-02-28-013721,enlisted the help of Hezbollah,Susan Dowdell Myrick
4,2020-02-28-034001,implies a Persian influence that can likely be...,Susan Dowdell Myrick


In [8]:
# join quotes with affiliations of their speakers
party_affiliations = df_meta_notna[["label", "party"]]
speaker_with_affiliation = pd.merge(left = with_label, left_on="label", right=party_affiliations, right_on="label")
speaker_with_affiliation.head()

Unnamed: 0,quoteID,quotation,label,party
0,2020-01-26-000499,a few of the candidates who will do better in ...,David Loebsack,Q29552
1,2020-01-26-040663,"The generational thing is important, quite hon...",David Loebsack,Q29552
2,2020-02-01-035378,Once I could stand in front of audiences and s...,David Loebsack,Q29552
3,2020-01-22-024629,He knows we have a problem of gun violence. He...,David Loebsack,Q29552
4,2020-01-12-041928,They can't be solved with the same political w...,David Loebsack,Q29552


In [10]:
# serialize into pickle
SPEAKER_WITH_AFFILIATION_PATH = "../data/binary/speaker_with_affiliation.pickle"
speaker_with_affiliation.to_pickle(SPEAKER_WITH_AFFILIATION_PATH)