#### ToDo
- Take dict formatted strings, add to separate list as dicts  - DONE
- Take error strings, add to separate list as strings - DONE
- Perform correction operations on erroneous strings (see previous files)
- [Perform additional corrections as necessary - TBC]
- Combine to export list
- Export to JSON / Json lines with json.dumps

## Next: Add new dicts to 'As dict', document outcome per row of doubleQuoteCharConv() in one column, delete 'doubleQuoteCharConv - dict'


# Cleaning of data to output JSON master file

## Setup

In [1]:
import json
import pandas as pd
import numpy as np
import os

# Let pandas print up to 500 rows and 500 columns before truncating output
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# Single-file input path (disabled); the folder below is used instead
#file = './raw-data/Open-trivia-database-master/Open-trivia-database-master/en/todo/uncategorized.json'
# Folder that is passed to importData() further down
folder = './raw-data/Open-trivia-database-master/Open-trivia-database-master/en/todo/'


## Function definitions

In [2]:
# Function that reads the input file(s) in a folder and returns their rows as a list of strings
def importData(x):
    """Read every ``.json`` file in folder ``x`` and return its rows as strings.

    Each file is read line by line. The first and the last line of a file
    (presumably the enclosing ``[`` and ``]``) are discarded. Every remaining
    line is stripped of surrounding whitespace and of one trailing comma.
    Nothing is parsed here, so malformed rows are returned as they are and
    can be repaired later.

    Args:
        x: Folder to scan (``str``, ``bytes`` or ``os.PathLike``). A trailing
            path separator is optional.

    Returns:
        list: Row strings of all ``.json`` files, files visited in sorted
        file-name order.
    """
    directory = os.fsdecode(x)
    dataListFull = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # positions reproducible from one run to the next.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.json'):
            continue

        # os.path.join works whether or not the folder ends with a separator.
        with open(os.path.join(directory, filename), encoding="utf8") as f:
            data = f.read()

        # Slicing (instead of del) also copes with files shorter than two lines.
        for row in data.splitlines()[1:-1]:
            row = row.strip()
            # Remove the separating comma only when there is one, so a row
            # without it (e.g. the last object of a file) keeps its final
            # character.
            dataListFull.append(row[:-1] if row.endswith(',') else row)

    return dataListFull

In [3]:
# Function that converts a string to a Python object (JSON first, eval() as fallback).
# Errors are handled and an error text is returned for the log.
def dictConv1(x):
    """Convert the string ``x`` to a Python object, or return an error text.

    The input is parsed as JSON first, which also handles ``true``,
    ``false`` and ``null``. If that fails, the original ``eval()`` path is
    tried so that Python-literal input (e.g. single-quoted keys) keeps
    working.

    Args:
        x: Text of one object. Non-string values end up in the error branch.

    Returns:
        The parsed object, or the string
        ``"ERROR-Incorrect formatting for dict conversion"`` when neither
        parser accepts the input.
    """
    try:
        return json.loads(x)
    except (TypeError, ValueError, RecursionError):
        # Not valid JSON (or not a string at all): try the legacy path below.
        pass

    try:
        # NOTE(review): eval() executes arbitrary expressions. Only feed it
        # trusted data; consider ast.literal_eval() if the input files can
        # come from an untrusted source.
        return eval(x)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        return "ERROR-Incorrect formatting for dict conversion"

In [4]:
# Function that replaces double quote chars with single quotes within the question part of the broken JSON object.
# Returns a string: '<corrected object>_correctionPerformed_True' when something was replaced,
# otherwise '_correctionPerformed_False' (the caller splits on '_correctionPerformed_').

def doubleQuoteCharConv(x):
    """Replace stray ``"`` inside the question text of ``x`` with ``'``.

    The question text is whatever lies between ``"question":"`` and the next
    ``", "answer"``.

    Args:
        x: One row of the input file as a string.

    Returns:
        str: ``<corrected row>_correctionPerformed_True`` when at least one
        double quote was replaced. ``_correctionPerformed_False`` when the
        question holds no double quote, or when either marker is missing
        from ``x``.
    """
    qStartStr = '"question":"'
    qEndStr = '", "answer"'
    resultTag = '_correctionPerformed_'

    startMarkerPos = x.find(qStartStr)
    if startMarkerPos == -1:
        # Without the start marker there is no question span to repair;
        # slicing with find()'s -1 would rewrite an unrelated part of the row.
        return resultTag + "False"

    start = startMarkerPos + len(qStartStr)
    # Search from the start of the question so an earlier match cannot be picked.
    end = x.find(qEndStr, start)
    if end == -1:
        return resultTag + "False"

    question = x[start:end]
    if '"' not in question:
        return resultTag + "False"

    stringCorrected = x[:start] + question.replace('"', "'") + x[end:]
    return stringCorrected + resultTag + "True"


In [5]:
# Function that saves index and best string of error strings to csv file for manual correction
def errorsToCsv(x):
    """Write ``x`` (index and values) to ``./edited-data/errorsForManualCorr.csv``.

    The output folder is created first when it does not exist yet, so the
    export does not fail on a fresh checkout.

    Args:
        x: pandas object to export; its ``to_csv`` method is used.
    """
    outPath = './edited-data/errorsForManualCorr.csv'
    os.makedirs(os.path.dirname(outPath), exist_ok=True)
    x.to_csv(outPath, encoding='utf-8')

In [6]:
# Function that loads the manually corrected strings, adds them to the df and converts them with dictConv1()
# Save correction file in ./edited-data/corrected.csv
def correctionsToDf(x):
    """Attach the manual corrections from ``./edited-data/corrected.csv`` to ``x``.

    The CSV column ``Unnamed: 0`` becomes the index of the loaded table and
    its ``Best string`` column is copied into ``x['ManualCorrections']``.
    Every non-null entry of that column is then replaced by the result of
    ``dictConv1``. ``x`` is changed in place; nothing is returned.
    """
    manual = pd.read_csv('./edited-data/corrected.csv', encoding='utf-8').set_index('Unnamed: 0')
    x['ManualCorrections'] = manual['Best string']

    # Only rows that actually received a correction are converted
    hasCorrection = x['ManualCorrections'].notnull()
    x.loc[hasCorrection, 'ManualCorrections'] = x.loc[hasCorrection, 'ManualCorrections'].apply(dictConv1)

In [7]:
# Function that converts the column to export form and exports it to json
def exportJson(x):
    """Write the entries of ``x`` as one JSON array to ``./edited-data/fixedJson.json``.

    Missing entries (``None`` / NaN, i.e. rows that were never converted) are
    left out: ``json.dumps`` would write NaN as the bare token ``NaN``, which
    is not valid JSON. The output folder is created when it does not exist.

    Args:
        x: Object with a ``tolist()`` method (e.g. a pandas Series) holding
            the converted rows.
    """
    outPath = "./edited-data/fixedJson.json"

    # `item != item` is only true for NaN
    records = [
        item for item in x.tolist()
        if not (item is None or (isinstance(item, float) and item != item))
    ]
    jsonData = json.dumps(records)

    os.makedirs(os.path.dirname(outPath), exist_ok=True)
    with open(outPath, "w", encoding="utf-8") as f:
        f.write(jsonData)

## Opening data and running functions

In [8]:
# Open file
# Disabled: the statements are wrapped in a string literal, so nothing is opened here.
"""with open(file, encoding="utf8") as f:
    data = f.read()"""

'with open(file, encoding="utf8") as f:\n    data = f.read()'

In [9]:
# Read (potentially broken) JSON to list
# Disabled: the statements are wrapped in a string literal and are not executed.
"""dataList = []
lineiterator = data.splitlines()
for row in lineiterator:
    dataList.append(row.strip()[:-1])
del dataList[0]
del dataList[-1]"""


'dataList = []\nlineiterator = data.splitlines()\nfor row in lineiterator:\n    dataList.append(row.strip()[:-1])\ndel dataList[0]\ndel dataList[-1]'

In [10]:
# Load all row strings from the input folder with importData() and wrap them
# in a pandas DataFrame whose single column is named "Original data str".

dataListFull = importData(folder)

df = pd.DataFrame(dataListFull).rename(columns={0: "Original data str"})


In [11]:
# First conversion attempt on every raw row. Fills 'As dict' (master column with
# the converted rows), 'Run dictConv1' (outcome of the attempt), 'Best string'
# and 'Row log'.
firstAttempt = df["Original data str"].apply(dictConv1)
convertedOnFirstTry = firstAttempt != 'ERROR-Incorrect formatting for dict conversion'

# Converted rows go to 'As dict'; rows that failed stay empty there
df['As dict'] = firstAttempt.where(convertedOnFirstTry)
# The outcome column holds 'SUCCESS' for converted rows and the error text otherwise
df['Run dictConv1'] = firstAttempt.where(~convertedOnFirstTry, 'SUCCESS')

# Fix the column order
df = df[["Original data str", "As dict", "Run dictConv1"]]

# 'Best string' stores the latest string version of rows that cannot yet be converted to dict
df['Best string'] = df['Original data str']
# 'Row log' stores the final status of each row
df['Row log'] = 'STR - No update done'
df.loc[convertedOnFirstTry, 'Row log'] = 'DICT - Converted to dict on first attempt'

In [12]:
# Runs doubleQuoteCharConv() on the rows that dictConv1 could not convert, adds new dicts to 'As dict'
df['Run doubleQuoteCharConv'] = df.loc[df['Run dictConv1'] != 'SUCCESS', 'Original data str'].apply(doubleQuoteCharConv)
# The function result has the form '<string>_correctionPerformed_<True|False>'; split it into its two parts
df[['doubleQuoteCharConv - str','doubleQuoteCharConv - perf']] = df['Run doubleQuoteCharConv'].str.split("_correctionPerformed_",expand=True)
# Assign the result back instead of calling replace(..., inplace=True) on df[...]:
# the chained in-place call works on a temporary object and does not reliably update df
df['doubleQuoteCharConv - str'] = df['doubleQuoteCharConv - str'].replace("", np.nan)
df['doubleQuoteCharConv - dict'] = df.loc[df['doubleQuoteCharConv - perf'] == 'True' ,'doubleQuoteCharConv - str'].apply(dictConv1)


# 1. New dicts filtered from 'doubleQuoteCharConv - dict' (attempted and not an error text)
convAttempted = df['doubleQuoteCharConv - dict'].notnull()
convFailed = df['doubleQuoteCharConv - dict'] == 'ERROR-Incorrect formatting for dict conversion'
newDicts = df.loc[convAttempted & ~convFailed, 'doubleQuoteCharConv - dict']
# 2. New dicts put at right index to 'As dict'
df.loc[newDicts.index, 'As dict'] = newDicts
# 3. Update 'Best string' with the corrected string for every row where a correction was performed
correctionDone = df['doubleQuoteCharConv - perf'] == 'True'
df.loc[correctionDone, 'Best string'] = df.loc[correctionDone, 'doubleQuoteCharConv - str']
# 4. Update logs
toDictsI = newDicts.index
newStringnoDictI = df.loc[convFailed, 'doubleQuoteCharConv - str'].index
df.loc[toDictsI, 'Row log'] = 'DICT - Corrected double quotes in q and converted to dict'
df.loc[newStringnoDictI, 'Row log'] = 'STR - Corrected double quotes in q - json format errors persist. Best string updated.'

In [13]:
# Pick the 'Best string' of every row that is still a string (both STR statuses)
# and hand it to errorsToCsv()
needsManualFix = df['Row log'].isin([
    'STR - Corrected double quotes in q - json format errors persist. Best string updated.',
    'STR - No update done',
])
errorRowsforManual = df.loc[needsManualFix, 'Best string']
errorsToCsv(errorRowsforManual)

In [14]:
# Number of rows per 'Row log' status
print(df.groupby('Row log')['Original data str'].count())

Row log
DICT - Converted to dict on first attempt                                                43985
DICT - Corrected double quotes in q and converted to dict                                 1621
STR - Corrected double quotes in q - json format errors persist. Best string updated.        6
STR - No update done                                                                        22
Name: Original data str, dtype: int64


## Run after manual corrections done and saved to ./edited-data/corrected.csv

In [15]:
# Run correctionsToDf(), update 'As dict' where the manual correction converted, and update logs
correctionsToDf(df)

hasManual = df['ManualCorrections'].notnull()
# dictConv1() returns exactly this text on failure. The previous comparison value
# had extra double quotes inside the string and therefore never matched, so
# failed corrections were copied into 'As dict' and logged as successes.
manualFailed = df['ManualCorrections'] == 'ERROR-Incorrect formatting for dict conversion'
manualConverted = hasManual & ~manualFailed

df.loc[manualConverted, 'As dict'] = df.loc[manualConverted, 'ManualCorrections']
df.loc[manualConverted, 'Row log'] = 'DICT - Manually corrected'
df.loc[manualFailed, 'Row log'] = 'STR - Manually corrected, still error'

In [16]:
# Number of rows per 'Row log' status after the manual corrections
print(df.groupby('Row log')['Original data str'].count())

Row log
DICT - Converted to dict on first attempt                    43985
DICT - Corrected double quotes in q and converted to dict     1621
DICT - Manually corrected                                       28
Name: Original data str, dtype: int64


## Export usable JSON

In [17]:
# Pass the 'As dict' column to exportJson(), which writes ./edited-data/fixedJson.json
exportJson(df['As dict'])

# Output printing:

In [None]:
# Total number of rows, then the number of rows per 'Row log' status
print(df.shape[0])
print(df.groupby('Row log')['Original data str'].count())

In [None]:
# Serialise the 'As dict' column as one JSON array and write it to
# ./edited-data/uncategorized-fixed.json
asDictRows = df['As dict'].tolist()
jsonData = json.dumps(asDictRows)

with open("./edited-data/uncategorized-fixed.json", "w+") as outFile:
    outFile.write(jsonData)

In [None]:
# Show the 'ManualCorrections' value of every row that has one
print(df.loc[df['ManualCorrections'].notnull() , 'ManualCorrections'])



In [None]:
# Copy successfully converted manual corrections into 'As dict'.
# The mask is built once so both sides of the assignment select the same rows:
# the old right-hand side lacked parentheses around the `!=` comparison (`&`
# binds tighter than `!=`), and the comparison text carried extra double quotes
# that dictConv1() never returns.
manualConverted = (df['ManualCorrections'].notnull()) & (df['ManualCorrections'] != 'ERROR-Incorrect formatting for dict conversion')
df.loc[manualConverted, 'As dict'] = df.loc[manualConverted, 'ManualCorrections']

In [None]:
# Show all columns of the rows that have a manual correction
print(df.loc[df['ManualCorrections'].notnull() , :])

In [None]:
# Show 'Best string' and 'Row log' of the rows that are still in one of the two STR statuses
print(df.loc[df['Row log'].isin(['STR - Corrected double quotes in q - json format errors persist. Best string updated.', 'STR - No update done']), ['Best string', 'Row log']])

In [None]:
# First three entries of the 'Row log' column.
# (df.loc['Row log'] would look for a row labelled 'Row log' and raise KeyError.)
print(df['Row log'][:3])

In [None]:
# Load the manual-correction file and use its 'Unnamed: 0' column as index
dfTest = pd.read_csv('./edited-data/corrected.csv', encoding='utf-8').set_index('Unnamed: 0')

In [None]:
# Show the contents of dfTest
print(dfTest)

In [None]:
# Scratch column: copy 'Best string' from dfTest into df (pandas matches the rows by index label)
df['TEST'] = dfTest['Best string']

In [None]:
# Show the rows with index labels 1320 through 1340 (label slices include both ends)
print(df.loc[1320:1340])

In [None]:
# Number of rows per 'Row log' status
print(df.groupby('Row log')['Original data str'].count())

In [None]:
# Index labels of the rows whose double-quote-corrected string still failed dict conversion
print(df.loc[df['doubleQuoteCharConv - dict'] == 'ERROR-Incorrect formatting for dict conversion', 'doubleQuoteCharConv - str'].index)
# Index labels of the rows for which doubleQuoteCharConv() reported that no correction was performed
print(df.loc[df['doubleQuoteCharConv - perf'] == 'False', 'doubleQuoteCharConv - str'].index)

In [None]:
# Total row count, then count and contents of the rows that have no entry in 'As dict'
missingDict = df['As dict'].isnull()
print(len(df))
print(len(df.loc[missingDict]))
print(df.loc[missingDict])

In [None]:
# For rows whose corrected string still failed dict conversion, store that corrected string in 'Best string'
stillFailing = df['doubleQuoteCharConv - dict'] == 'ERROR-Incorrect formatting for dict conversion'
df.loc[stillFailing, 'Best string'] = df.loc[stillFailing, 'doubleQuoteCharConv - str']

In [None]:
# Compare the original string and the current best string of the row labelled 22998
print(df.loc[22998,'Original data str'])
print(df.loc[22998,'Best string'])

In [None]:
"""newDicts =  df.loc[df['doubleQuoteCharConv - dict'].notnull(), 'doubleQuoteCharConv - dict'].loc[df['doubleQuoteCharConv - dict'] != 'ERROR-Incorrect formatting for dict conversion']
print(len(newDicts))
print(type(newDicts.loc[64]))
print(newDicts.head(100))"""

# Collect the entries of 'doubleQuoteCharConv - dict' that are neither null nor the error text ...
newDicts =  df.loc[df['doubleQuoteCharConv - dict'].notnull(), 'doubleQuoteCharConv - dict'].loc[df['doubleQuoteCharConv - dict'] != 'ERROR-Incorrect formatting for dict conversion']
# ... and write them into 'As dict' at the same index labels
df.loc[newDicts.index, 'As dict'] = newDicts
#print(df.loc[newDicts.index, 'As dict'])

# Check the Python type stored in 'As dict' for the rows labelled 0 and 64
print(type(df.loc[0, 'As dict']))
print(type(df.loc[64, 'As dict']))

# First 100 entries of 'As dict'
print(df['As dict'].head(100))

In [None]:
# Spot checks by position: value in the last column of the row at position 65,
# and the type of that column's value at position 64
print(df.iloc[65, -1])
print(type(df.iloc[64, -1]))
# Row counts: entries in 'doubleQuoteCharConv - dict', filled and empty 'As dict',
# their sum, and the total number of rows for comparison
print(len(df.loc[df['doubleQuoteCharConv - dict'].notnull()]))
print(len(df.loc[df['As dict'].notnull()]))
print(len(df.loc[df['As dict'].isnull()]))
print(len(df.loc[df['As dict'].notnull()]) + len(df.loc[df['As dict'].isnull()]))
print(len(df))
print('\n')
print(len( df.loc[df['doubleQuoteCharConv - dict'].notnull()] ))
print(len(df.loc[df['As dict'].isnull()]))
# print(len(df.loc[df['doubleQuoteCharConv - dict'].notnull()].loc[df['doubleQuoteCharConv - dict'] != 'ERROR-Incorrect formatting for dict conversion'])) - Replaces all of them, including those that are null

In [None]:
#print(df.iloc[1327, 5])
#print(type(df.iloc[1327, 5]))

# Column names of df
print(df.columns.values)
print('\n')
# Second row (position 1) among those whose corrected string still failed:
# its first column and its third-from-last column
print(df.loc[df['doubleQuoteCharConv - dict'] == "ERROR-Incorrect formatting for dict conversion"].iloc[1,0])
print(df.loc[df['doubleQuoteCharConv - dict'] == "ERROR-Incorrect formatting for dict conversion"].iloc[1,-3])
print('\n')
# Number of rows that still failed vs. number of rows with any entry in 'doubleQuoteCharConv - dict'
print(len(df.loc[df['doubleQuoteCharConv - dict'] == "ERROR-Incorrect formatting for dict conversion"]))
print(len(df.loc[df['doubleQuoteCharConv - dict'].notnull()]))

# Type of the column object itself
print(type(df['doubleQuoteCharConv - dict']))
#print(df.iloc[0, 3])
#print('\n')
#print(len(df.loc[df['Run doubleQuoteCharConv'].notnull()]))
#print(df.loc[df['Run doubleQuoteCharConv'].notnull()].head(100))

In [None]:
# Number of entries in 'doubleQuoteCharConv - dict' that are neither null nor the error text
print(len(df.loc[df['doubleQuoteCharConv - dict'].notnull(), 'doubleQuoteCharConv - dict'].loc[df['doubleQuoteCharConv - dict'] != 'ERROR-Incorrect formatting for dict conversion']))
#print(df.loc[df['doubleQuoteCharConv - dict'].notnull(), 'doubleQuoteCharConv - dict'].loc[df['doubleQuoteCharConv - dict'] != 'ERROR-Incorrect formatting for dict conversion'].head(100))

# The entries that hold the error text
print(df.loc[df['doubleQuoteCharConv - dict'].notnull(), 'doubleQuoteCharConv - dict'].loc[df['doubleQuoteCharConv - dict'] == 'ERROR-Incorrect formatting for dict conversion'])

#print(df.loc[df['doubleQuoteCharConv - dict'].notnull(), 'doubleQuoteCharConv - dict'].head(100))
# Number of non-null entries in the column
print(len(df.loc[df['doubleQuoteCharConv - dict'].notnull(), 'doubleQuoteCharConv - dict']))

# Test etc. -->

In [None]:
# Manual check of doubleQuoteCharConv() and dictConv1() on one broken sample row
testString = '{"category_id":"UNCATEGORIZED", "lang":"en", "tags":[], "question":"The word "cumulus" refers to a type of ___________."", "answer":0, "answers":["Cloud"], "source":""}'
testOut = doubleQuoteCharConv(testString)
# Cut off the trailing result tag (25 characters) to get the corrected row only
correctedPart = testOut[:-len('_correctionPerformed_True')]
print(correctedPart)
print(type(testOut))

print(type(dictConv1(correctedPart)))

In [None]:
#print(df["Original data str"].where(df['As dict'] == 'NaN').head(100))
# Number of rows whose 'Run dictConv1' outcome is not 'SUCCESS', then the first 100 of their original strings
print(len(df.loc[df['Run dictConv1'] != 'SUCCESS', 'Original data str']))
print(df.loc[df['Run dictConv1'] != 'SUCCESS', 'Original data str'].head(100))
#print(df["Original data str"].head(100))