In [2]:
import dask.bag as db
import dask.dataframe as df
import json

In [3]:
# Year of the Quotebank dump to process; it is reused below to name the output.
dataset_year = 2008
filename = f"quotes-{dataset_year}.json"

# Read the raw file lazily as a bag of text lines, cut into blocks of
# `blocksize` bytes (150e6), then decode every line as one JSON document.
raw_lines = db.read_text(f"../quotebank/raw/{filename}", blocksize=150e6)
dataset = raw_lines.map(json.loads)

In [4]:
def remove_probability_of_others_then_most(quote):
    """Return a copy of ``quote`` without its ``"probas"`` entry.

    The input mapping is left untouched (consistent with
    ``extract_only_one_qid``, which also works on a copy), and a quote that
    has no ``"probas"`` key is returned unchanged instead of raising
    ``KeyError``, so the function can safely be applied more than once.

    Args:
        quote: dict describing one quote.

    Returns:
        A new dict holding every key of ``quote`` except ``"probas"``,
        in the original key order.
    """
    return {key: value for key, value in quote.items() if key != "probas"}

In [5]:
def extract_probability_of_most_prob_speaker(quote):
    """Return a copy of ``quote`` with a numeric ``"speaker_prob"`` field added.

    ``"speaker_prob"`` is the second element of the first ``"probas"`` entry,
    converted to ``float``. The input mapping is not modified.

    NOTE(review): this presumes ``"probas"`` is ordered with the most probable
    speaker first, so that entry 0 is the one wanted -- confirm against the
    dataset schema.

    Args:
        quote: dict describing one quote; expected to carry a ``"probas"``
            sequence of ``(speaker, probability)`` pairs.

    Returns:
        A shallow copy of ``quote`` with ``"speaker_prob"`` set to the float
        probability, or to ``None`` when ``"probas"`` is missing or empty.

    Raises:
        ValueError: if the probability cannot be parsed as a float.
    """
    probas = quote.get("probas")
    result = quote.copy()
    # An empty or missing list has no first entry; record that as None rather
    # than letting a single record abort the whole computation.
    result["speaker_prob"] = float(probas[0][1]) if probas else None
    return result

In [6]:
def extract_only_one_qid(quote):
    """Collapse the ``"qids"`` list of a quote into a single value.

    The input is not modified; a shallow copy is returned in which ``"qids"``
    is replaced by:

    * ``None`` when the list is empty,
    * the sole element when the list has exactly one entry,
    * the string ``"Multiple"`` when it has more than one entry.
    """
    result = quote.copy()
    candidates = result["qids"]
    how_many = len(candidates)

    if how_many == 1:
        collapsed = candidates[0]
    elif how_many == 0:
        collapsed = None
    else:
        collapsed = "Multiple"

    result["qids"] = collapsed
    return result

In [7]:
import csv

In [8]:
# Build a lookup table from the tab-separated GDELT file: the first column of
# each row becomes the key and the second column the value.
# NOTE(review): presumably column 0 is the domain and column 1 a country code;
# verify against the file.
# newline='' is what the csv module documentation asks for when opening files
# that are handed to csv.reader.
with open('datasets/GDELTDOMAINSBYCOUNTRY-MAY2018.TXT', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    # The first row is discarded unconditionally as a header.
    # NOTE(review): confirm the file really starts with a header row; if not,
    # this drops one real mapping.
    next(reader, None)
    # Blank or one-column rows cannot provide a (key, value) pair; skip them
    # instead of failing with IndexError.
    domains_by_country_dict = {
        row[0]: row[1] for row in reader if len(row) >= 2
    }
    

In [9]:
import tld

In [10]:
def convert_domain_to_country(quote, domains_by_country_dict):
    """Return a copy of ``quote`` with a ``"url_countries"`` field added.

    Each URL in ``quote["urls"]`` is reduced to its first-level domain with
    ``tld.get_fld`` and looked up in ``domains_by_country_dict``. The input
    mapping is not modified.

    Args:
        quote: dict describing one quote; must contain an iterable ``"urls"``.
        domains_by_country_dict: mapping from first-level domain to country
            (built from the GDELT domains-by-country file).

    Returns:
        A shallow copy of ``quote`` where ``"url_countries"`` is the sorted
        list of distinct countries found, or ``None`` when no URL could be
        mapped to a country.
    """
    countries = set()
    for url in quote["urls"]:
        # fail_silently=True makes tld return None for a URL it cannot parse
        # instead of raising, so one malformed URL does not abort the run.
        domain_name = tld.get_fld(url, fail_silently=True)
        if domain_name is not None and domain_name in domains_by_country_dict:
            countries.add(domains_by_country_dict[domain_name])

    result = quote.copy()
    # Sorting gives a deterministic order; plain set iteration order can
    # differ between interpreter runs.
    result["url_countries"] = sorted(countries) if countries else None
    return result


In [11]:
# Per-record transforms, applied in this order. The speaker probability must
# be extracted before "probas" is removed, because the extraction reads that
# key.
records = (
    dataset
    .map(extract_probability_of_most_prob_speaker)
    .map(remove_probability_of_others_then_most)
    .map(extract_only_one_qid)
    .map(convert_domain_to_country, domains_by_country_dict)
)

# Turn the bag of dicts into a Dask DataFrame and persist it so that the
# cells below reuse the computed partitions.
dataframe = records.to_dataframe().persist()

In [12]:
# Write the processed quotes for this year as a parquet dataset.
parquet_path = f"../quotebank/parquet/quotes-{dataset_year}.parquet"
dataframe.to_parquet(parquet_path)

[None]

In [13]:
# Sanity check: read the parquet output back from disk and count the
# non-null values in every column.
written = df.read_parquet(f"../quotebank/parquet/quotes-{dataset_year}.parquet")
written.count().compute()

quoteID           4641330
quotation         4641330
speaker           4641330
qids              4198748
date              4641330
numOccurrences    4641330
urls              4641330
phase             4641330
speaker_prob      4641330
url_countries     4617942
dtype: int64

In [14]:
# Same per-column non-null counts, this time computed from the persisted
# in-memory frame, for comparison with the counts read back from parquet.
in_memory_counts = dataframe.count().compute()
in_memory_counts

quoteID           4641330
quotation         4641330
speaker           4641330
qids              4198748
date              4641330
numOccurrences    4641330
urls              4641330
phase             4641330
speaker_prob      4641330
url_countries     4617942
dtype: int64