In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import bz2
import urllib
from datetime import datetime, date

# Folder holding the input files read below; the extended quotes file is written there too.
# Paths are built by plain string concatenation, so the trailing slash is required.
data_folder = 'data/'

In [31]:
# Substrings searched for in each quotation: a quote is kept when it contains any of them.
lexical_field = ['2nd amendment'] # TODO: extend with more terms

In [10]:
# Speaker attribute table (one row per speaker, looked up by its 'id' column).
speakers = pd.read_parquet(
    data_folder + 'speaker_attributes.parquet'
)

# QID -> label/description table, indexed by QID so labels can be fetched by QID.
label = pd.read_csv(
    data_folder + 'wikidata_labels_descriptions_quotebank.csv.bz2',
    index_col='QID',
    compression='bz2',
)

In [32]:
path_to_file = data_folder + 'quotes-2015.json.bz2'
path_to_out = data_folder + 'quotes-2015-extended.json.bz2'

# Test cap: reading stops once more than MAX_LINES input lines have been read.
# Set to None to process the whole input file.
MAX_LINES = 20000

# Wikidata QIDs used by the nationality filter and the gender mapping.
US_QID = 'Q30'
MALE_QID = 'Q6581097'
FEMALE_QID = 'Q6581072'

# Plain dict for QID -> label lookups: fast, and unknown QIDs do not raise.
_label_by_qid = label['Label'].to_dict()

# Lower-cased search terms so that e.g. "2nd Amendment" also matches.
_search_terms = [term.lower() for term in lexical_field]


def _label_for(qid):
    """Return the label of `qid`, or `qid` itself when no usable label exists."""
    name = _label_by_qid.get(qid)
    # Unknown QIDs give None and empty CSV cells give NaN; writing NaN would
    # produce invalid JSON, so fall back to the raw QID in both cases.
    return name if isinstance(name, str) else qid


def _labels_for(qids):
    """Translate a collection of QIDs into labels; None (attribute absent) stays None."""
    if qids is None:
        return None
    return [_label_for(qid) for qid in qids]


def _gender_of(gender_qids):
    """Return 'male'/'female' for the first gender QID, its label otherwise, None if absent."""
    if gender_qids is None or len(gender_qids) == 0:
        return None
    first = gender_qids[0]
    if first == MALE_QID:
        return 'male'
    if first == FEMALE_QID:
        return 'female'
    # Any other gender keeps its own label instead of being reported as 'female'.
    return _label_for(first)


def _age_of(birth_dates):
    """Return the age in whole years as of today, or None if the birth date is unusable."""
    try:
        # Characters 1..10 of the first entry are parsed as YYYY-MM-DD
        # (the leading character is skipped).
        born = datetime.strptime(birth_dates[0][1:11], "%Y-%m-%d").date()
    except (TypeError, IndexError, ValueError):
        # Attribute absent, empty, or not a valid calendar date.
        return None
    # NOTE(review): age is relative to the run date, not to the date of the quote,
    # so the output changes depending on when the notebook is executed.
    today = date.today()
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))


def _extend_quote(instance):
    """Add speaker attributes to a quote, or return None when the quote is filtered out.

    A quote is kept when its text contains one of the search terms
    (case-insensitively), it has at least one candidate speaker, and the first
    candidate is found in `speakers` with the US among their nationalities.
    """
    quotation = instance['quotation'].lower()
    if not any(term in quotation for term in _search_terms):
        return None
    if not instance['qids']:
        return None

    # Only the first candidate speaker is considered.
    matches = speakers.loc[speakers['id'] == instance['qids'][0]]
    if matches.empty:
        return None
    # iloc[0] always yields one row, even if the id appears more than once.
    speaker = matches.iloc[0]

    nationality = speaker['nationality']
    if nationality is None or not any(qid == US_QID for qid in nationality):
        return None

    # Attributes missing for the speaker are stored as None (JSON null).
    instance['gender'] = _gender_of(speaker['gender'])
    instance['nationality'] = _labels_for(nationality)
    instance['occupation'] = _labels_for(speaker['occupation'])
    instance['age'] = _age_of(speaker['date_of_birth'])
    instance['ethnic_group'] = _labels_for(speaker['ethnic_group'])
    instance['party'] = _labels_for(speaker['party'])
    instance['religion'] = _labels_for(speaker['religion'])
    return instance


with bz2.open(path_to_file, 'rb') as s_file, bz2.open(path_to_out, 'wb') as d_file:
    # enumerate replaces the former `iter` counter, which shadowed the builtin.
    for line_no, raw_line in enumerate(s_file, start=1):
        extended = _extend_quote(json.loads(raw_line))
        if extended is not None:
            # One JSON document per line in the output file.
            d_file.write((json.dumps(extended) + '\n').encode('utf-8'))

        if MAX_LINES is not None and line_no > MAX_LINES:
            break

In [33]:
def process_chunk(chunk):
    """Print how many rows `chunk` holds, then render it with the notebook's `display`.

    Placeholder for per-chunk analysis; nothing is returned.
    """
    row_count = len(chunk)
    print(f'Processing chunk with {row_count} rows')
    display(chunk)
    # TODO: sentiment scoring sketch, still disabled (`sid` is not defined in the visible cells):
    # chunk['scores'] = chunk['quotation'].apply(lambda review: sid.polarity_scores(review))
    # chunk['compound']  = chunk['scores'].apply(lambda score_dict: score_dict['compound'])

# Smoke test: stream the extended quotes file in chunks and process only the first one.
with pd.read_json(
    data_folder + 'quotes-2015-extended.json.bz2',
    lines=True,
    compression='bz2',
    chunksize=10000,
) as df_reader:
    chunk = next(df_reader, None)
    if chunk is not None:
        process_chunk(chunk)