# imports and data mountage

In [1]:
pip install --upgrade pandas
pip install --upgrade pyarrow



In [2]:
from google.colab import drive
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import numpy as np
import math
from tqdm import tqdm
import logging

pd.__version__

In [3]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
logging.basicConfig(level=logging.DEBUG, filename='/content/drive/MyDrive/quotes/quotes.log')

# read files

In [6]:
path_to_attr = '/content/drive/MyDrive/Project datasets/speaker_attributes.parquet'
path_to_quotbank_desc = '/content/drive/MyDrive/Project datasets/wikidata_labels_descriptions_quotebank.csv.bz2'
path_to_quotbank = '/content/drive/MyDrive/Quotebank/quotes-2020.json.bz2' 

In [8]:
columns = ['date_of_birth', 'nationality', 'gender', 'ethnic_group', 'occupation', 'party', 'academic_degree', 'id', 'candidacy', 'religion']
attr = pd.read_parquet(path_to_attr, columns=columns)

In [9]:
desc = pd.read_csv(path_to_quotbank_desc, compression='bz2', index_col='QID') 

# process data

## Profile

In [10]:
def to_profile(chunk_size, path_to_dataset):
  quotebank_reader = pd.read_json(path_to_quotbank, lines=True, compression='bz2', chunksize=chunk_size)
  columns = ['nationality', 'gender', 'ethnic_group', 'occupation', 'party', 'academic_degree', 'candidacy', 'religion'] 
  path_to_dataset = path_to_dataset 

  for i, quote in tqdm(enumerate(quotebank_reader)):
    #multiple people in one quote
    quote = quote.explode('qids')

    #marge with atributes
    quote = quote.merge(attr, how='left', left_on='qids', right_on='id', indicator=True)

    #decript values
    try:
      quote[columns] = quote[columns].applymap(lambda x: desc.loc[x]['Label'].values[0], na_action='ignore')
    except:
      logging.exception(f"on chunk {i} after {i*chunk_size}:")
      print(f"error {i}")
      continue

    #unify NaN to None
    quote = quote.where(pd.notnull(quote), None)

    #save
    table = pa.Table.from_pandas(quote, preserve_index=False)         
    pq.write_table(table, path_to_dataset + f"/{i}" + ".parquet")
    if i % 100 == 0:
      print((i+1)*chunk_size)

In [None]:
%%time
to_profile(10000,"/content/drive/MyDrive/quotes/quotes-2020-enhanced10000")

1it [00:35, 35.42s/it]

10000


12it [06:42, 31.86s/it]

error 11


18it [09:56, 30.72s/it]

error 17


26it [14:15, 31.83s/it]

error 25


27it [14:42, 30.33s/it]

error 26


28it [15:08, 29.19s/it]

error 27


33it [17:48, 31.14s/it]

error 32


39it [21:03, 31.86s/it]

error 38


47it [25:25, 32.02s/it]

error 46


52it [28:04, 31.03s/it]

error 51


59it [31:54, 32.15s/it]

error 58


64it [34:37, 31.85s/it]

error 63


68it [36:47, 31.80s/it]

error 67


83it [45:03, 31.14s/it]

error 82


### Check 

In [None]:
dataset = ds.dataset("/content/drive/MyDrive/quotes/quotes-2020-enhanced10000", format="parquet")
print(dataset.files)
df = dataset.head(100).to_pandas()
df.info()


In [None]:
df.head()