In [None]:
# imports
import textstat as ts
import numpy as np
import pandas as pd
import json
from tqdm import tqdm
import os
import bz2
from statistics import mode
from datetime import datetime

### Getting politicians dataset
To obtain a dataset of politicians' quotations, we need to enrich the original *Quotebank* dataset with additional information about the speakers' political affiliations. To achieve that, we propose a preprocessing pipeline.

First, load the *Wikidata* table from the `.parquet` file (provided on the *Quotebank* Google Drive). Keep only the columns relevant to us - the speaker's QID and label, and the party's QID - discard the others, and drop the rows with missing values. When a speaker has several party affiliations, keep the most common one. Convert the result to a `pandas DataFrame` and dump it into a `.pickle` file.

Then, for each of the years, load the initial *Quotebank* data. The initial dataset is stored as a sequence of `json` objects, one per line - process it into a `DataFrame` line by line, skipping the quotations that have no speaker and keeping only the quotation text, the quotation ID, and the QID of the (first) speaker. Dump the dataframe into a `.pickle` file.

Next, perform an inner join between the two dataframes on the speaker QID - that way you end up with a dataframe containing quotations, along with information about the speaker's political affiliation.

The *Quotebank* dataset is divided into batches based on the quotation date - there are 6 files, one for each year from 2015 to 2020. Therefore, we run the pipeline explained above 6 times and then merge the results into a single dataframe. We end up with a dataset of around 17 million quotations.

In [None]:
def generate_speaker_affiliations(parquet_path, out_path, remove_raw=False):
    """Build the speaker -> party lookup table and save it as a pickle.

    Reads the speaker attributes table from ``parquet_path``, keeps the
    ``id``, ``label`` and ``party`` columns, drops every row that has a missing
    value in any of them, and stores both the speaker id and the party id as
    integers by stripping the first character of the QID
    (e.g. ``"Q29468"`` -> ``29468``).

    Args:
        parquet_path: Path to the speaker attributes ``.parquet`` file.
        out_path: Path the resulting DataFrame is pickled to.
        remove_raw: If True, delete ``parquet_path`` after the pickle is written.

    Returns:
        None. The result is written to ``out_path``.
    """
    # load speaker info
    speaker_info = pd.read_parquet(parquet_path)

    # take the speakers that have an assigned political affiliation;
    # .copy() so the column assignments below never write into a view
    speaker_info = speaker_info[["id", "label", "party"]].dropna().copy()

    # in case of multiple affiliations, select the most common party.
    # The party id is cast to int (like the speaker id below) so that integer
    # comparisons such as .isin([29468, 29552]) actually match.
    speaker_info["party"] = speaker_info["party"].apply(lambda x: int(mode(x)[1:]))

    # transform speaker id into int
    speaker_info["id"] = speaker_info["id"].apply(lambda x: int(x[1:]))

    print(f"Speaker affiliation DF:\n {speaker_info.head()}")

    speaker_info.to_pickle(out_path)

    if remove_raw:
        os.remove(parquet_path)

In [None]:
def save_pickle(json_path_bz2, pickle_path, remove_raw=False):
    """Convert a bz2-compressed file of JSON lines into a pickled DataFrame.

    Each line of ``json_path_bz2`` is parsed as one JSON object. Objects whose
    ``qids`` list is empty (no speaker) are skipped. For the others, the first
    entry of ``qids`` (converted to int by stripping its first character), the
    ``quoteID`` and the ``quotation`` are kept.

    Args:
        json_path_bz2: Path to the ``.json.bz2`` quotation file.
        pickle_path: Path the resulting DataFrame is pickled to.
        remove_raw: If True, delete ``json_path_bz2`` after the pickle is written.

    Returns:
        None. The result is written to ``pickle_path`` with the columns
        ``speaker_id``, ``quote_id`` and ``quotation``.
    """
    columns = ["speaker_id", "quote_id", "quotation"]

    data = []
    with bz2.open(json_path_bz2, 'rb') as s_file:
        print("Quotation file opened...")
        for instance in tqdm(s_file):
            instance = json.loads(instance)  # loading a sample

            # if there is no speaker, skip current row
            if not instance['qids']:
                continue

            # else proceed to read the data
            data.append({
                'speaker_id': int(instance['qids'][0][1:]),
                'quote_id': instance['quoteID'],
                'quotation': instance['quotation'],
            })

    # Fix the schema explicitly: without `columns`, a file in which no row has
    # a speaker would yield a frame with no columns at all, and the later
    # merge on "speaker_id" would fail with a KeyError.
    df = pd.DataFrame(data, columns=columns)
    df.to_pickle(pickle_path)

    if remove_raw:
        os.remove(json_path_bz2)

In [None]:
def join_quotes_with_speaker_affiliations(df_quotes, df_affiliations, out_path):
    """Attach speaker affiliation data to the quotes and pickle the result.

    Joins ``df_quotes`` (key ``speaker_id``) with ``df_affiliations`` (key
    ``id``) using the default inner join, removes the redundant ``id`` column
    and renames ``label`` to ``speaker``.

    Args:
        df_quotes: DataFrame of quotes with a ``speaker_id`` column.
        df_affiliations: DataFrame of speakers with ``id`` and ``label`` columns.
        out_path: Path the joined DataFrame is pickled to.

    Returns:
        None. The result is written to ``out_path``.
    """
    joined = (
        df_quotes
        .merge(df_affiliations, left_on="speaker_id", right_on="id")
        .drop(columns=["id"])
        .rename(columns={"label": "speaker"})
    )
    print(f"Merged DF: \n{joined.head()}")
    joined.to_pickle(out_path)

In [None]:
# one time operation - generate a pickle file containing speaker's affiliations
PARQUET_PATH = "../data/raw/speaker_attributes.parquet"
SPEAKER_AFFILIATIONS_OUT_PATH = "../data/binary/speaker_attributes.pickle"

print("Generating speaker affiliations DF...")
if not os.path.exists(SPEAKER_AFFILIATIONS_OUT_PATH):
    generate_speaker_affiliations(PARQUET_PATH, SPEAKER_AFFILIATIONS_OUT_PATH)
print("Done.\n")

# dataset loading - perform for each batch of the data (2015, 2016, ..., 2020).
# The list must be populated: with an empty list the loop is a no-op and the
# per-year files read by the merging cell are never produced on a fresh run.
years = [2015, 2016, 2017, 2018, 2019, 2020]
for year in years:
    DATASET_PATH_JSON_BZ2 = f"../data/raw/quotes-{year}.json.bz2"
    DATASET_PATH_PICKLE = f"../data/binary/quotes-{year}.pickle"
    MERGED_OUT_PATH = f"../data/binary/data-{year}.pickle"

    # the merged output is the only artifact later cells need from this loop;
    # if it is already cached, skip the year without loading anything
    if os.path.exists(MERGED_OUT_PATH):
        print(f"Merged df for {year} already exists - skipping.\n")
        continue

    print("Generating quotes DF...")
    if not os.path.exists(DATASET_PATH_PICKLE):
        save_pickle(DATASET_PATH_JSON_BZ2, DATASET_PATH_PICKLE)
    print("Done.\n")

    print("Generating merged df...")
    # load the (large) inputs only when the join actually has to be computed
    df_quotes = pd.read_pickle(DATASET_PATH_PICKLE)
    df_affiliations = pd.read_pickle(SPEAKER_AFFILIATIONS_OUT_PATH)
    join_quotes_with_speaker_affiliations(df_quotes, df_affiliations, MERGED_OUT_PATH)
    print("Done.\n")

In [None]:
# merge the results for the separate years into a single dataframe,
# extract the date from the quote_id and save it into a new datetime column.
# Save everything into a pickle named data.pickle

if not os.path.exists("../data/binary/data.pickle"):
    years = [2015, 2016, 2017, 2018, 2019, 2020]
    paths = [f"../data/binary/data-{year}.pickle" for year in years]
    dfs = [pd.read_pickle(path) for path in paths]
    merged_df = pd.concat(dfs, ignore_index=True)
    # create an additional datetime column - useful for temporal analysis.
    # The first 10 characters of quote_id are parsed as a %Y-%m-%d date; this is
    # done in one vectorized call rather than a Python-level strptime per row.
    merged_df["Date-Time"] = pd.to_datetime(merged_df["quote_id"].str[:10], format="%Y-%m-%d")
    merged_df.to_pickle("../data/binary/data.pickle")

### US Politicians dataset
To obtain a dataset of US Politician quotations, we proceed to:
1. Only keep the quotations where the speaker's party is `29468` (Republican party) or `29552` (Democratic party).
2. Filter out the rows whose speaker has `None` as the value of the `candidacy` column in the *Wikidata* table - the majority of the speakers affiliated with the political parties are not actual politicians - they are often celebrities, sports stars, TV personalities, etc. We believe it is beneficial to only keep the actual politicians, as they are more likely to speak about actual political matters and represent their party's ideology.

Performing step 1 reduces the size of the dataset from 17 million rows to around 8 million rows, and step 2 reduces the size further to around 1.6 million.

In [None]:
# additional preprocessing to obtain the US politicians data
df = pd.read_pickle("../data/binary/data.pickle")
# filter to only keep the american politicians:
# 29468 = Republican party, 29552 = Democratic party.
# The party column may hold the ids as strings (e.g. "29468"), which never
# compare equal to the integers below, so cast to int before matching.
df = df[df["party"].astype(int).isin([29468, 29552])]

In [None]:
# Build the list of speakers that were candidates in at least one election:
# keep the rows of the speaker table that have both an id and a candidacy value,
# retain only the id column and turn the QID string into an int (drop the first character).
speaker_data = pd.read_parquet(PARQUET_PATH)
candidates = (
    speaker_data[["id", "candidacy"]]
    .dropna()
    .drop(columns=["candidacy"])
    .assign(id=lambda frame: frame["id"].apply(lambda qid: int(qid[1:])))
)
candidates.head()

In [None]:
# inner join on the speaker id: only the quotes whose speaker actually
# participated in an election remain (excludes celebrities etc.);
# the join key `id` duplicates `speaker_id`, so it is dropped afterwards
df = df.merge(candidates, left_on="speaker_id", right_on="id").drop(columns=["id"])
print(df.head())

In [None]:
# persist the US politicians dataset; an already existing pickle is left untouched
us_politicians_path = "../data/binary/us-politicians.pickle"
if not os.path.exists(us_politicians_path):
    df.to_pickle(us_politicians_path)