In [None]:
# imports
import numpy as np
import pandas as pd
import json
from tqdm import tqdm
import os
import bz2

In [None]:
def generate_speaker_affiliations(parquet_path, out_path, remove_raw=False):
    """Build and pickle a lookup table from speaker aliases to label and party.

    Reads the ``aliases``, ``party`` and ``label`` columns of the parquet file,
    keeps the rows in which all three are present, reduces ``party`` to its
    first entry and emits one row per alias.

    Args:
        parquet_path: Path of the speaker-attributes parquet file.
        out_path: Destination of the pickled DataFrame. Its columns are
            ``speaker`` (a single alias), ``label`` and ``party``.
        remove_raw: When true, delete ``parquet_path`` once the pickle has
            been written.
    """
    wanted = ["aliases", "party", "label"]

    # load speaker info; only the three needed columns are read from disk
    speaker_info = pd.read_parquet(parquet_path, columns=wanted)

    # take the speakers that have an assigned political affiliation
    speaker_info = speaker_info.dropna()

    # an empty party container has no first element, so such rows carry no
    # usable affiliation and would otherwise raise IndexError below
    speaker_info = speaker_info[speaker_info["party"].map(len) > 0]

    # take the first affiliation only (TODO: this is most likely the best method,
    # think what to do when someone has multiple affiliations)
    first_party = speaker_info["party"].map(lambda parties: parties[0])

    # generate a mapping from all aliases to one single name
    speaker_info = (
        speaker_info.assign(party=first_party)
        .explode("aliases")
        # an empty alias list explodes to NaN, which is not a usable lookup key
        .dropna(subset=["aliases"])
        .loc[:, ["aliases", "label", "party"]]
        .rename(columns={"aliases": "speaker"})
        # explode repeats the source index once per alias; store a clean one
        .reset_index(drop=True)
    )
    print(f"Speaker affiliation DF:\n {speaker_info.head()}")

    # make sure the destination directory exists before writing
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    speaker_info.to_pickle(out_path)

    if remove_raw:
        os.remove(parquet_path)

In [None]:
def save_pickle(json_path_bz2, pickle_path, remove_raw=False):
    """Convert a bz2-compressed JSON-lines quote dump into a pickled DataFrame.

    Every non-blank line of the input is parsed as one JSON object. Records
    whose ``speaker`` field is the literal string ``"None"`` are skipped; from
    the remaining records only ``quoteID``, ``quotation`` and ``speaker`` are
    kept.

    Args:
        json_path_bz2: Path of the ``.json.bz2`` file, one JSON object per line.
        pickle_path: Destination of the pickled DataFrame.
        remove_raw: When true, delete ``json_path_bz2`` once the pickle has
            been written.

    Raises:
        KeyError: If a record lacks one of the three expected fields.
    """
    columns = ['quoteID', 'quotation', 'speaker']
    data = []
    with bz2.open(json_path_bz2, 'rb') as s_file:
        print("Quotation file opened...")
        for line in tqdm(s_file):
            # tolerate blank lines (e.g. a trailing newline) instead of
            # letting json.loads fail on them
            if not line.strip():
                continue
            instance = json.loads(line)  # loading a sample
            if instance['speaker'] == "None":
                continue
            data.append({key: instance[key] for key in columns})

    # passing the column list keeps the schema even when no record survived,
    # so downstream merges on "speaker" still find the column
    df = pd.DataFrame(data, columns=columns)

    # make sure the destination directory exists before writing
    out_dir = os.path.dirname(pickle_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_pickle(pickle_path)

    if remove_raw:
        os.remove(json_path_bz2)

In [None]:
def join_quotes_with_speaker_affiliations(df_quotes, df_affiliations, out_path):
    """Attach affiliation data to quotes and pickle the result.

    The two frames are inner-joined on their ``speaker`` columns. The join key
    is then dropped and the ``label`` column coming from ``df_affiliations``
    is renamed to ``speaker`` in its place.

    Args:
        df_quotes: Quote DataFrame with a ``speaker`` column.
        df_affiliations: DataFrame with ``speaker`` and ``label`` columns.
        out_path: Destination of the pickled, merged DataFrame.
    """
    labelled_quotes = (
        df_quotes
        .merge(df_affiliations, left_on="speaker", right_on="speaker")
        .drop(columns=["speaker"])          # the matched name is no longer needed
        .rename(columns={"label": "speaker"})  # the label takes over as speaker
    )
    print(f"Merged DF: \n{labelled_quotes.head()}")
    labelled_quotes.to_pickle(out_path)

In [None]:
# one time operation - generate a pickle file containing speaker's affiliations
PARQUET_PATH = "../data/raw/speaker_attributes.parquet"
SPEAKER_AFFILIATIONS_OUT_PATH = "../data/binary/speaker_attributes.pickle"

print("Generating speaker affiliations DF...")
# the pickle acts as a cache: the parquet file is only processed when it is missing
if not os.path.exists(SPEAKER_AFFILIATIONS_OUT_PATH):
    generate_speaker_affiliations(PARQUET_PATH, SPEAKER_AFFILIATIONS_OUT_PATH)
print("Done.\n")

# the affiliation table does not depend on the year, so it is loaded once
# here instead of being re-read from disk on every loop iteration
df_affiliations = pd.read_pickle(SPEAKER_AFFILIATIONS_OUT_PATH)

# dataset loading - perform for each batch of the data (2015, 2016, ..., 2020)
years = [2020]
for year in years:
    DATASET_PATH_JSON_BZ2 = f"../data/raw/quotes-{year}.json.bz2"
    DATASET_PATH_PICKLE = f"../data/binary/quotes-{year}.pickle"
    MERGED_OUT_PATH = f"../data/binary/data-{year}.pickle"

    print("Generating quotes DF...")
    # skip the slow bz2 -> pickle conversion when the pickle already exists
    if not os.path.exists(DATASET_PATH_PICKLE):
        save_pickle(DATASET_PATH_JSON_BZ2, DATASET_PATH_PICKLE)
    print("Done.\n")

    # NOTE(review): df_quotes is loaded even when the merged file already
    # exists; kept that way in case later cells rely on the variable
    df_quotes = pd.read_pickle(DATASET_PATH_PICKLE)

    print("Generating merged df...")
    if not os.path.exists(MERGED_OUT_PATH):
        join_quotes_with_speaker_affiliations(df_quotes, df_affiliations, MERGED_OUT_PATH)
    print("Done.\n")