In [1]:
import pandas as pd
import numpy as np
from tld import get_tld
import tldextract
from tqdm.notebook import tqdm

import dask
import bz2
import json
import dask.dataframe as dd

In [2]:
path = "E:\\PycharmProject\\ada_project\\data\\speaker_attributes.parquet"

# Speaker attributes taken into consideration (nationality, gender, party),
# plus `id`, the speaker QID that the quotations are later merged on.
wiki_columns = ['nationality', 'gender', 'party', 'id']

# Load the speakers' QID data. Only the needed columns are read: the table has
# millions of rows, so materialising the unused columns first just to drop
# them again wastes memory.
wiki_data = pd.read_parquet(path, engine='pyarrow', columns=wiki_columns)
# Pin the column order explicitly so it does not depend on the reader.
wiki_data = wiki_data[wiki_columns]
wiki_data

Unnamed: 0,nationality,gender,party,id
0,"[Q161885, Q30]",[Q6581097],[Q327591],Q23
1,[Q145],[Q6581097],,Q42
2,[Q31],[Q6581097],,Q1868
3,[Q30],[Q6581097],[Q29468],Q207
4,[Q29],[Q6581097],,Q297
...,...,...,...,...
9055976,[Q30],[Q6581097],,Q106406560
9055977,[Q30],[Q6581097],,Q106406571
9055978,,[Q6581072],,Q106406588
9055979,,[Q6581072],,Q106406593


In [3]:
# Extract the domain of each URL, which can be used to identify the news publisher.
def extract_domain(data):
    """Add a ``domains`` column with the domain of every URL of each quotation.

    Parameters
    ----------
    data : pandas.DataFrame
        Quotations. Must contain a ``urls`` column whose cells are iterables
        of URL strings.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object (modified in place) with a new ``domains``
        column. Each cell is a list holding ``tldextract.extract(url).domain``
        for every URL of that row, in the original order.
    """
    # Iterate over the column itself instead of `iterrows()`, which builds a
    # full Series for every row. `total` lets tqdm show real progress.
    all_domains = [
        [tldextract.extract(url).domain for url in urls]
        for urls in tqdm(data['urls'], total=len(data))
    ]

    # Build the Series on the frame's own index. A Series with a fresh
    # 0..n-1 index would be aligned by label on assignment and leave NaN (or
    # mismatched rows) whenever `data` does not have a default RangeIndex.
    data['domains'] = pd.Series(all_domains, index=data.index)

    return data

In [4]:
def map_qid2value(data):
    """Replace the QIDs in the speaker-attribute columns by their labels.

    For each of the columns ``party``, ``gender`` and ``nationality`` a lookup
    table is read from ``<column>_mapper.csv`` in the current working
    directory. The first CSV column is used as the key and the ``Label``
    column as the value. Values without an entry in the lookup table are left
    unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``party``, ``gender`` and ``nationality`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the three columns replaced.
    """
    columns = ('party', 'gender', 'nationality')

    # Load every mapper before touching `data`, so a missing or malformed file
    # does not leave the frame half-converted.
    mappers = {
        column: pd.read_csv(column + '_mapper.csv', index_col=0)['Label'].to_dict()
        for column in columns
    }

    for column, mapper in mappers.items():
        # Assign the result back explicitly. `data.<col>.replace(..., inplace=True)`
        # is a chained in-place update on a column accessor, which does not
        # propagate to `data` under pandas Copy-on-Write.
        data[column] = data[column].replace(mapper)

    return data

def _first_or_missing(value):
    """Return the first element of a list-like cell, or a missing value.

    Non-list-like values (``None``, ``NaN``, an already extracted string) are
    returned unchanged. An empty list-like yields ``np.nan``.
    """
    if isinstance(value, (list, tuple, np.ndarray)):
        return value[0] if len(value) > 0 else np.nan
    return value


def merge_with_wiki(data, wiki_data):
    """Merge the quotation data with the speaker attributes.

    Parameters
    ----------
    data : pandas.DataFrame
        Quotations with a ``qids`` column whose cells are list-likes of
        speaker QIDs. The caller's frame is not modified.
    wiki_data : pandas.DataFrame
        Speaker attributes with an ``id`` column (QID) and the list-valued
        columns ``party``, ``gender`` and ``nationality``.

    Returns
    -------
    pandas.DataFrame
        Left merge of ``data`` with ``wiki_data`` on ``qids == id``, in which
        ``qids``, ``party``, ``gender`` and ``nationality`` each hold a single
        value, passed through ``map_qid2value``.
    """
    # For speakers with multiple QIDs we keep only the first one, on the
    # authors' assumption that the QIDs are sorted by popularity, so the first
    # is the most likely ground-truth speaker.
    # `assign` returns a new frame, so the caller's `qids` column stays intact.
    data = data.assign(qids=data['qids'].apply(_first_or_missing))
    merge_data = data.merge(wiki_data, left_on='qids', right_on='id', how='left')

    # The attribute columns may also hold several values per speaker; keep the
    # first. Rows without a match carry a missing value, which the helper
    # passes through by type check rather than by `is np.nan` identity.
    for column in ('party', 'gender', 'nationality'):
        merge_data[column] = merge_data[column].apply(_first_or_missing)

    return map_qid2value(merge_data)


We found that a large number of entries in the speaker column are 'None'. Since these quotations provide nothing useful for our analysis, we filter them out. We also noticed that Donald Trump's name appears in several forms, such as *President Donald Trump*, *President Trump*, and so on. Therefore, we replace all names containing 'Trump' with 'Donald Trump' for consistency (note that this rule also relabels any other speaker whose name contains 'Trump'). It is hard for us to handle other speakers in the same way, because we are unlikely to find every variant name that actually refers to the same person.

In [5]:
def preprocess_data(data, wiki_data):
    """Clean one chunk of raw quotations and attach the speaker attributes.

    Steps, in order:
      1. drop quotations whose ``speaker`` is the string ``'None'``;
      2. drop the ``probas`` and ``phase`` columns;
      3. rename every speaker whose name contains ``'Trump'`` to
         ``'Donald Trump'``;
      4. remove the characters ``[`` and ``]`` from ``quotation``;
      5. add the ``domains`` column (``extract_domain``) and drop ``urls``;
      6. merge with the speaker attributes (``merge_with_wiki``).

    Parameters
    ----------
    data : pandas.DataFrame
        Raw quotations with at least the columns ``speaker``, ``quotation``,
        ``probas``, ``phase`` and ``urls``, plus whatever ``merge_with_wiki``
        needs.
    wiki_data : pandas.DataFrame
        Speaker attributes, forwarded to ``merge_with_wiki``.

    Returns
    -------
    pandas.DataFrame
        The processed quotations as returned by ``merge_with_wiki``.
    """
    # For simplification, drop quotations that have no attributed speaker.
    data = data[data['speaker'] != 'None'].reset_index(drop=True)

    # These two columns are not used in the analysis.
    data = data.drop(['probas', 'phase'], axis=1)

    # Collapse the different spellings of Donald Trump's name into one.
    # NOTE: this matches any name containing 'Trump', so other people with
    # that surname are relabelled too; only the `speaker` column is changed.
    # `na=False` keeps non-string speakers untouched instead of raising.
    is_trump = data['speaker'].str.contains('Trump', regex=False, na=False)
    data['speaker'] = data['speaker'].mask(is_trump, 'Donald Trump')

    # Remove the square brackets from the quotation. `regex=False` makes the
    # literal match explicit: '[' on its own is not a valid regular expression
    # and the default of `regex` differs between pandas versions.
    for bracket in ('[', ']'):
        data['quotation'] = data['quotation'].str.replace(bracket, '', regex=False)

    data = extract_domain(data)

    # The URLs are no longer needed once the domains are extracted.
    data = data.drop(['urls'], axis=1)

    data = merge_with_wiki(data, wiki_data)

    return data

In [None]:
path = "E:\\PycharmProject\\ada_project\\data\\"
# Number of quotations (JSON lines) held in memory and processed at once.
chunk_size = 10 ** 6


def _process_and_save_chunk(records, chunk_num, out_path):
    """Preprocess one chunk of raw quotation dicts and write it to ``out_path``.

    The first chunk (``chunk_num == 1``) overwrites the file and writes the
    header; later chunks are appended without a header.
    """
    print('*' * 10 + str(chunk_num) + '-th chunk' + '*' * 10)
    processed_chunk = preprocess_data(pd.DataFrame(records), wiki_data)
    is_first = chunk_num == 1
    # Writing the first chunk with mode 'w' keeps a re-run of this cell from
    # appending a second copy of the data (and a second header) to an
    # existing output file.
    processed_chunk.to_csv(out_path, mode='w' if is_first else 'a',
                           header=is_first, index=False)


# Self-defined chunked processing pipeline: the built-in chunked reading
# provided by pandas did not solve the out-of-memory error.
with bz2.open(path + 'quotes-2019.json.bz2', 'r') as f:
    lines = []
    chunk_num = 1
    for line in f:
        lines.append(json.loads(line))
        if len(lines) >= chunk_size:
            _process_and_save_chunk(lines, chunk_num, path + 'processed-quotes-2019.csv')
            lines = []
            chunk_num += 1

    # Flush the last, partially filled chunk; without this every quotation
    # after the final full chunk would be silently dropped.
    if lines:
        _process_and_save_chunk(lines, chunk_num, path + 'processed-quotes-2019.csv')