In [1]:
%load_ext autoreload
%autoreload 2

import bz2
import json

import pickle

# import numpy
import pandas as pd

from helpers import *

In [2]:
# Read only the first 10,000-line chunk of the bz2-compressed JSON-lines file,
# so the notebook works on a sample instead of loading the whole file.
with pd.read_json(QUOTES_2020_PATH, lines=True, compression='bz2', chunksize=10000) as df_reader:
    # next() with a default replaces the for/break idiom: with an empty file the
    # loop body never ran, leaving `chunk` unbound (or, in a live kernel,
    # silently pointing at a stale value from a previous run).
    chunk = next(iter(df_reader), None)

if chunk is None:
    raise ValueError(f'No quotes could be read from {QUOTES_2020_PATH}')

df_quotes = chunk

In [3]:
# Read the Wikidata speaker-attributes parquet file into a DataFrame
# (one row per speaker; the `id` column holds the speaker's QID)
df_sa = pd.read_parquet(SPEAKER_ATTRIBUTES_PATH)

In [4]:
# Sanity check: dimensions and first rows of the loaded quotes chunk
print(df_quotes.shape)
df_quotes.head()

(10000, 9)


Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
0,2020-01-28-000082,[ D ] espite the efforts of the partners to cr...,,[],2020-01-28 08:04:05,1,"[[None, 0.7272], [Prime Minister Netanyahu, 0....",[http://israelnationalnews.com/News/News.aspx/...,E
1,2020-01-16-000088,[ Department of Homeland Security ] was livid ...,Sue Myrick,[Q367796],2020-01-16 12:00:13,1,"[[Sue Myrick, 0.8867], [None, 0.0992], [Ron Wy...",[http://thehill.com/opinion/international/4782...,E
2,2020-02-10-000142,... He (Madhav) also disclosed that the illega...,,[],2020-02-10 23:45:54,1,"[[None, 0.8926], [Prakash Rai, 0.1074]]",[https://indianexpress.com/article/business/ec...,E
3,2020-02-15-000053,"... [ I ] f it gets to the floor,",,[],2020-02-15 14:12:51,2,"[[None, 0.581], [Andy Harris, 0.4191]]",[https://patriotpost.us/opinion/68622-trump-bu...,E
4,2020-01-24-000168,[ I met them ] when they just turned 4 and 7. ...,Meghan King Edmonds,[Q20684375],2020-01-24 20:37:09,4,"[[Meghan King Edmonds, 0.5446], [None, 0.2705]...",[https://people.com/parents/meghan-king-edmond...,E


In [5]:
def quotes_clean(df_quotes):
    """Keep only quotes attributed to a speaker and reduce them to one QID.

    Parameters
    ----------
    df_quotes : pandas.DataFrame
        Quotes with at least the columns 'phase', 'urls', 'probas' and 'qids',
        where every 'qids' entry is a sequence of QID strings (possibly empty).

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) without the 'phase', 'urls',
        'probas' and 'qids' columns, restricted to the rows whose 'qids'
        sequence was non-empty, with the first QID stored in 'top_qid'.
        The original index labels are preserved.
    """
    trimmed = df_quotes.drop(columns=['phase', 'urls', 'probas'])

    # Drop rows which don't have any qid
    has_speaker = trimmed['qids'].map(len) > 0
    attributed = trimmed[has_speaker].copy()

    # Only the first qid of each list is kept
    attributed['top_qid'] = attributed['qids'].map(lambda qid_list: qid_list[0])

    return attributed.drop(columns='qids')

# Clean the loaded quotes chunk; quotes_clean returns a new frame, so df_quotes
# itself is left unchanged
df_quotes_clean = quotes_clean(df_quotes)


In [6]:
# Display the cleaned quotes (pandas truncates the rendering of long frames)
df_quotes_clean

Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,top_qid
1,2020-01-16-000088,[ Department of Homeland Security ] was livid ...,Sue Myrick,2020-01-16 12:00:13,1,Q367796
4,2020-01-24-000168,[ I met them ] when they just turned 4 and 7. ...,Meghan King Edmonds,2020-01-24 20:37:09,4,Q20684375
7,2020-01-17-000357,[ The delay ] will have an impact [ on Slough ...,Dexter Smith,2020-01-17 13:03:00,1,Q5268447
8,2020-04-02-000239,[ The scheme ] treats addiction as an illness ...,Barry Coppinger,2020-04-02 14:18:20,1,Q4864119
9,2020-03-19-000276,[ These ] actions will allow households who ha...,Ben Carson,2020-03-19 19:14:00,1,Q816459
...,...,...,...,...,...,...
9990,2020-02-17-085956,"whoever is the interested player,",Hardeep Singh Puri,2020-02-17 10:15:26,6,Q5655835
9995,2020-03-02-078576,"Why now, 2021? Here's why,",Michael J. Graham,2020-03-02 21:03:20,1,Q6831418
9996,2020-04-14-074488,Why should a running back be treated less than...,Mark Ingram,2020-04-14 22:03:37,1,Q452154
9997,2020-04-15-078413,"Why would I leave here where I'm healthy, to t...",Pauline Pearce,2020-04-15 10:10:33,1,Q55076551


In [56]:
# QIDs searched for in the `party` column of the speaker attributes when
# assigning the 'R' / 'D' labels (presumably the Wikidata items of the US
# Republican and Democratic parties — verify against Wikidata)
REPUBLICAN_QID = 'Q29468'
DEMOCRAT_QID = 'Q29552'

# def filter_party(df_sa, party_qid):
#     df_has_party = df_sa[df_sa['party'].notna()]
#     return df_has_party[df_has_party['party'].map(lambda x: party_qid in x)]


def sa_keep_congress_members(df_sa):
    """Return the rows of ``df_sa`` that have a non-null 'US_congress_bio_ID'.

    The input frame is not modified; index labels and columns are preserved.
    """
    has_bio_id = df_sa['US_congress_bio_ID'].notna()
    return df_sa[has_bio_id]


def sa_label_parties(df_sa):
    """Add a single-letter 'party_label' column and keep only labeled speakers.

    Rows whose 'party' value is null are discarded first. Each remaining row is
    labeled 'R' when its 'party' collection contains REPUBLICAN_QID only, 'D'
    when it contains DEMOCRAT_QID only, and is dropped otherwise (neither
    party, or both at once).

    Parameters
    ----------
    df_sa : pandas.DataFrame
        Speaker attributes with a 'party' column holding collections of QIDs.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the extra 'party_label'
        column, restricted to the rows that received a label.
    """

    def label_party(parties):
        is_republican = REPUBLICAN_QID in parties
        is_democrat = DEMOCRAT_QID in parties

        # TODO Figure out what to do if a person in a member of multiple parties at the same time
        # For now both-parties and neither-party are treated alike: no label.
        if is_republican == is_democrat:
            return None
        return 'R' if is_republican else 'D'

    # assign() works on a copy of the filtered rows, so df_sa stays untouched
    labeled = df_sa[df_sa['party'].notna()].assign(
        party_label=lambda frame: frame['party'].map(label_party)
    )

    # Drop any person that didn't get attributed a party_label
    labeled.dropna(subset=['party_label'], inplace=True)

    return labeled
    

def sa_keep_useful_stuff(df_sa):
    """Project ``df_sa`` onto the 'label', 'id' and 'party_label' columns, in that order."""
    kept_columns = ['label', 'id', 'party_label']
    return df_sa[kept_columns]

# republicans = filter_party(df_sa, REPUBLICAN_QID)
# democrats = filter_party(df_sa, DEMOCRAT_QID)

In [59]:
# Label every speaker that has party information with 'R' or 'D'; speakers
# without a label are dropped by sa_label_parties
df_sa_party_labeled = sa_label_parties(df_sa)

# Sanity check: dimensions and first rows of the labeled speakers
print(df_sa_party_labeled.shape)
df_sa_party_labeled.head()

(49997, 16)


Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion,party_label
3,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",[+1946-07-06T00:00:00Z],[Q30],[Q6581097],1395142029,,,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,Q207,George W. Bush,"[Q327959, Q464075, Q3586276, Q4450587]",item,"[Q329646, Q682443, Q33203]",R
14,"[Mary Louise Streep, Meryl Louise Streep, Stre...",[+1949-06-22T00:00:00Z],[Q30],[Q6581072],1388232380,,,"[Q10800557, Q2259451, Q10798782, Q2405480, Q33...",[Q29552],,Q873,Meryl Streep,,item,[Q7066],D
57,"[Elisha Meredith, Elisha Edward Meredith]",[+1848-12-26T00:00:00Z],[Q30],[Q6581097],1395914283,,M000647,"[Q82955, Q40348]",[Q29552],,Q3251,Elisha E. Meredith,,item,,D
71,"[Thomas Matthew DeLonge, Thomas DeLonge, Ava D...",[+1975-12-13T00:00:00Z],[Q30],[Q6581097],1377404832,,,"[Q855091, Q177220, Q753110, Q488205, Q18921227]",[Q29552],,Q4270,Tom DeLonge,,item,,D
79,"[Willard Mitt Romney, Pierre Delecto]",[+1947-03-12T00:00:00Z],[Q30],[Q6581097],1393565531,,R000615,"[Q82955, Q15978655, Q43845, Q15980158, Q219477]",[Q29468],"[Q1765120, Q191701, Q1540185]",Q4496,Mitt Romney,"[Q937607, Q4226, Q4791860, Q17100322]",item,[Q42504],R


In [60]:
# Restrict the labeled speakers to those with a US_congress_bio_ID, then keep
# only the label, id and party_label columns
df_sa_party_labeled_congressmen = sa_keep_useful_stuff(sa_keep_congress_members(df_sa_party_labeled))

print(df_sa_party_labeled_congressmen.shape)
df_sa_party_labeled_congressmen.head()

(10261, 3)


Unnamed: 0,label,id,party_label
57,Elisha E. Meredith,Q3251,D
79,Mitt Romney,Q4496,R
196,Richard Nixon,Q9588,R
235,John Tyler,Q11881,D
236,James K. Polk,Q11891,D


In [63]:
# NOTE(review): this rebinds df_sa_party_labeled to its 3-column projection
# (label, id, party_label), so the name refers to different frames before and
# after this cell. Anything that needs the dropped columns (for example
# sa_keep_congress_members, which reads US_congress_bio_ID) must run before it.
df_sa_party_labeled = sa_keep_useful_stuff(df_sa_party_labeled)

In [64]:
# Number of rows (quotes) and columns left after cleaning
df_quotes_clean.shape

(6575, 6)

In [70]:
# Attach each speaker's party label to their quotes by joining the quote's
# top_qid on the speaker id. merge() defaults to an inner join, so quotes whose
# speaker is not in the labeled congress-member table are dropped.
df_merged = (
    df_quotes_clean
    .merge(df_sa_party_labeled_congressmen, left_on='top_qid', right_on='id')
    .drop(columns=['top_qid', 'label'])  # 'id' and 'speaker' already carry this information
)
df_merged

Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,id,party_label
0,2020-01-16-000088,[ Department of Homeland Security ] was livid ...,Sue Myrick,2020-01-16 12:00:13,1,Q367796,R
1,2020-01-26-000499,a few of the candidates who will do better in ...,Dave Loebsack,2020-01-26 13:21:36,11,Q771586,D
2,2020-01-26-040663,"The generational thing is important, quite hon...",Dave Loebsack,2020-01-26 13:21:36,11,Q771586,D
3,2020-01-20-000982,a host of other protections,Debbie Lesko,2020-01-20 15:32:48,1,Q16731415,R
4,2020-03-24-004650,And they are working towards delivering their ...,Mike Pompeo,2020-03-24 03:45:00,2,Q473239,R
...,...,...,...,...,...,...,...
332,2020-03-31-054725,The stimulus has failed. But anyone with commo...,Cory Gardner,2020-03-31 14:14:09,1,Q1135774,R
333,2020-02-23-033117,There are issues to resolve in the civil servi...,David Davis,2020-02-23 12:01:00,1,Q1174175,R
334,2020-04-16-056321,"Under my plan, we will put Americans back to w...",Jeff Sessions,2020-04-16 16:04:20,1,Q358443,R
335,2020-01-27-090364,We have to have somebody that wants to come in...,Cindy Axne,2020-01-27 00:00:00,31,Q58324150,D
