# Setup

In [1]:
import pandas as pd
import numpy as np
import json
from datetime import datetime
import html


# Input JSON files, one per trivia dataset. Each path is opened and parsed into
# its own DataFrame by the loading cell that follows. The paths are relative, so
# they resolve against the kernel's current working directory.
file_tdb = "../edited-data/fixedJson.json"  # TDB dataset
file_opentdb = "../raw-data/openTDB api call, all data_2020-05-03.json"  # openTDB dataset
file_tfyi = "../raw-data/triviafyi_2020-05-04.json"  # tFYI dataset
file_rtg = "../raw-data/rtg dataset/rtg20522-1.json"  # RTG dataset


In [2]:
def _read_json_frame(path):
    """Load one JSON file into a DataFrame.

    Parameters
    ----------
    path : str
        Location of a UTF-8 encoded JSON file whose top-level value is
        something ``pd.DataFrame`` accepts (e.g. a list of records).

    Returns
    -------
    pandas.DataFrame
        The parsed JSON content wrapped in a DataFrame.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    # json.load parses straight from the file handle, so the raw text is never
    # bound to a notebook-level name and can be freed once parsing is done.
    with open(path, encoding="utf8") as json_file:
        return pd.DataFrame(json.load(json_file))


# One DataFrame per source dataset.
tdb = _read_json_frame(file_tdb)
opentdb = _read_json_frame(file_opentdb)
tfyi = _read_json_frame(file_tfyi)
rtg = _read_json_frame(file_rtg)

# Data cleanup

## RTG dataset (~20500 rows)

In [3]:
# CLEANING DATASET

# Drop the record whose '_id' is 58a0f48f732c90c4fd3069ce: it carries no data
# besides the id itself. Re-running this cell is harmless because the lookup
# simply matches nothing once the row is gone.
empty_row_index = rtg.index[rtg['_id'] == '58a0f48f732c90c4fd3069ce']
rtg = rtg.drop(index=empty_row_index).reset_index(drop=True)

# Unwrap 'categories' from a one-element list to a plain string (every row was
# checked to hold exactly one category). Values that are already strings are
# left alone: without this guard, running the cell a second time would index
# into the string and silently truncate each category to its first character.
# Any other unexpected value is still indexed, so it fails loudly as before.
rtg['categories'] = rtg['categories'].apply(
    lambda cats: cats if isinstance(cats, str) else cats[0]
)


In [4]:
# Convert 'created' to pandas timestamps

def _to_timestamps(values):
    """Parse ``values`` as datetimes, leaving them untouched if nothing fits.

    Two interpretations are attempted in order:

    1. numbers counting milliseconds since the Unix epoch (``unit='ms'``);
    2. whatever ``pd.to_datetime`` can infer without a unit (e.g. date strings).

    This spells out the fallback chain that two consecutive
    ``pd.to_datetime(..., errors='ignore')`` calls used to provide.
    ``errors='ignore'`` is deprecated in pandas and hides which parse, if any,
    actually succeeded, so the failures are caught explicitly instead.

    Parameters
    ----------
    values : pandas.Series
        Column to convert.

    Returns
    -------
    pandas.Series
        The parsed datetimes, or ``values`` unchanged when neither
        interpretation applies to the column.
    """
    # NOTE(review): these are the exception types to_datetime is expected to
    # raise on unparseable or out-of-range input -- confirm against the pandas
    # version in use.
    parse_errors = (ValueError, TypeError, OverflowError)

    try:
        return pd.to_datetime(values, unit='ms')
    except parse_errors:
        pass  # not epoch milliseconds; fall through to generic parsing

    try:
        return pd.to_datetime(values)
    except parse_errors:
        return values  # best effort: keep the original data as-is


rtg['created'] = _to_timestamps(rtg['created'])


## openTDB dataset (~3700 rows)

In [5]:
# Unescape HTML character entities (e.g. &amp;, &quot;) in 'question', 'correct_answer', and 'incorrect_answers'

In [6]:
def unescapeListContent(aList):
    """Return a new list holding every item of ``aList`` with HTML entities decoded.

    Each item is passed through ``html.unescape``; the input list itself is
    not modified.
    """
    return [html.unescape(item) for item in aList]

In [7]:
# Decode HTML entities in the text columns. The two scalar columns go straight
# through html.unescape; 'incorrect_answers' holds a list per row, so each list
# is decoded item by item via unescapeListContent.
opentdb['question'] = opentdb['question'].apply(html.unescape)
opentdb['correct_answer'] = opentdb['correct_answer'].apply(html.unescape)
opentdb['incorrect_answers'] = opentdb['incorrect_answers'].apply(unescapeListContent)

## TDB dataset (~45600 rows)

In [8]:
# CLEANING DATASET

# A ';' inside "category_id" marks a corrupt row. Keep those rows in
# tdb_errors for inspection, remove them from tdb in place, then renumber
# the index so it is contiguous again.
corrupt_mask = tdb['category_id'].str.contains(';')
tdb_errors = tdb[corrupt_mask]

tdb.drop(index=tdb_errors.index, inplace=True)
tdb.reset_index(drop=True, inplace=True)

## tFYI dataset (~1100 rows)

In [9]:
# CLEANING DATASET

# 'question' and 'answer' hold a list of strings per row; concatenate each
# list (no separator) so both columns contain plain strings.
tfyi['question'] = tfyi['question'].apply(''.join)
tfyi['answer'] = tfyi['answer'].apply(''.join)

# Save to pickle

In [10]:
# Write each cleaned DataFrame to its own pickle file, in the same order as
# the datasets were loaded.
pickle_targets = [
    (tdb, '../workproduct-files/cleaned-dataframes/tdb.pkl'),
    (opentdb, '../workproduct-files/cleaned-dataframes/opentdb.pkl'),
    (tfyi, '../workproduct-files/cleaned-dataframes/tfyi.pkl'),
    (rtg, '../workproduct-files/cleaned-dataframes/rtg.pkl'),
]
for cleaned_frame, pickle_path in pickle_targets:
    cleaned_frame.to_pickle(pickle_path)