In [1]:
# Explode appropriate parts of category data
# Implement categorization rules
# Test accuracy against existing categorization

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import time
import glob
import re

In [3]:
# Widen pandas' display limits so large frames and long text cells are shown in full.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.max_colwidth = 1000

In [4]:
# Directory holding the per-batch result pickles (globbed as "*.pkl" further down).
Run = "../workproduct-files/batchRuns/"


In [5]:
# Batch-run metadata table; the CSV file is semicolon-delimited.
RunMeta = pd.read_csv("../workproduct-files/batchRuns.csv", delimiter=";")


In [6]:
# Master question table (per the file name, with keywords already identified).
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
t_data = pd.read_pickle("../workproduct-files/t_dataMaster-keywordsIdentified.pkl")

In [7]:
RunMeta

Unnamed: 0,startIndex,stopIndex,startTime,endTime,runTime
0,-1,-1,,,
1,0,9,2020-10-03 15:57:52,2020-10-03 16:02:54,302.0
2,10,19,2020-10-03 16:02:54,2020-10-03 16:04:33,99.0
3,20,29,2020-10-03 16:04:33,2020-10-03 16:10:09,336.0
4,30,39,2020-10-03 16:10:09,2020-10-03 16:12:35,146.0
...,...,...,...,...,...
12770,64515,64519,2020-10-26 00:37:08,2020-10-26 00:38:54,106.0
12771,64520,64524,2020-10-26 00:38:54,2020-10-26 00:39:06,12.0
12772,64525,64529,2020-10-26 00:39:06,2020-10-26 00:39:54,48.0
12773,64530,64534,2020-10-26 00:39:54,2020-10-26 00:42:05,131.0


In [8]:
# Sum of the last RunMeta column, ignoring NaN (the displayed table shows runTime last).
# presumably seconds, given the /3600/24 conversion to days in the next cell - TODO confirm
Run_time = RunMeta.iloc[:,-1].sum(skipna = True)

In [9]:
# Total run time in days (divides by 3600 and 24, i.e. assumes Run_time is in seconds)
Run_time/3600/24

16.51244212962963

In [10]:
# Days needed for 65000 rows at the average time per row observed so far.
# RunMeta.iloc[-1,1] is the last row's value in the second column (by position) -
# presumably the index of the last processed row; TODO confirm which column that is.
# 65000 is presumably the approximate total number of rows - TODO confirm.
Run_time/RunMeta.iloc[-1,1]*65000/3600/24

16.630647656046452

In [11]:
# Paths of all batch pickle files in the Run directory (glob guarantees no particular order).
Run_files = glob.glob(Run + "*.pkl")


In [None]:
%%time
# Load every batch pickle and stack them with a single concat call.
# The previous DataFrame.append-inside-a-loop copied the accumulated frame on each
# iteration (quadratic cost) and DataFrame.append was removed in pandas 2.0.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
_batch_frames = [pd.read_pickle(filename) for filename in Run_files]
# pd.concat raises on an empty list; keep the old "no files -> empty frame" result.
RunDF = pd.concat(_batch_frames) if _batch_frames else pd.DataFrame()
# Batches are read in arbitrary file order, so restore row order by index.
RunDF = RunDF.sort_index()

In [None]:
# Count the rows whose Wikipedia search was flagged as not successful.
# The comparison is kept as "== False" (not "~") so missing values are not counted.
Run_wikisearchFailed = int((RunDF["wikipediaSearchSuccessful"] == False).sum())
Run_wikisearchFailed

In [None]:
def successIndex(a):
    """Return the position of the first "SUCCESS" entry in ``a``, or -1 if absent.

    ``a`` must provide an ``index`` method (e.g. a list); only the ValueError
    raised for a missing entry is translated into -1.
    """
    try:
        return a.index("SUCCESS")
    except ValueError:
        return -1

In [None]:
# Position of the "SUCCESS" marker in each row's search log (-1 when there is none).
RunDF["searchTermUsed_idx"] = RunDF["findQuestionCategories_meta"].apply(successIndex)

In [None]:
%%time
# Pick, per row, the search term at the position where the "SUCCESS" marker was found
# (None when no search succeeded).
# The column is built in one pass and assigned as a whole: the previous per-row
# .loc loop was very slow on a large frame, raised an ambiguous-truth-value error
# when the index contained duplicate labels, and never created the column on an
# empty frame.
RunDF["searchTermUsed"] = [
    terms[idx] if idx != -1 else None
    for terms, idx in zip(RunDF["searchTerms"], RunDF["searchTermUsed_idx"])
]
    

In [None]:
RunDF

In [None]:
# Combine dataframes leaving out searchTerms from RunDF (duplicate in result).
# RunDF.iloc[:,1:] drops RunDF's first column by position - presumably searchTerms; TODO confirm.
# join="inner" keeps only the index labels present in both frames.
t_dataRun = pd.concat([t_data, RunDF.iloc[:,1:]], axis = 1, join = "inner")

In [None]:
# Suboptimal solution, only for preanalyses. It skips the iloc lookup on rows whose categoryPath is None.
# iloc[-2, 0] reads the first column of the second-to-last row of the path table.
t_dataRun["WikipediaHighestCat"] = t_dataRun["categoryPath"].apply(
    lambda path: None if path is None else path.iloc[-2, 0]
)

In [None]:
# First column of each path table as a list in reversed row order; rows without a path get [].
t_dataRun["categoryPathList"] = t_dataRun["categoryPath"].apply(
    lambda path: [] if path is None else list(reversed(path.iloc[:, 0]))
)

In [None]:
t_dataRun["categoryPathList"]

In [None]:
# One WikiCat_<level> column per level of the deepest category path.
# Paths that do not reach a given level get None in that column.
for lvl in range(max(len(path) for path in t_dataRun["categoryPathList"])):
    t_dataRun["WikiCat_" + str(lvl)] = t_dataRun["categoryPathList"].apply(
        lambda path, depth=lvl: path[depth] if depth < len(path) else None
    )

In [None]:
t_dataRun.columns

# CATEGORIZATION RULES

## Wikipedia path final category

In [None]:
# Wikipedia category level 1 mapping
# See spreadsheet https://docs.google.com/spreadsheets/d/1c4g4MujTgaqTn5wG5I-ARY06g7rlnHst0whojsFbps8/edit?usp=sharing
# Note: 'Reference' maps to the *string* 'None' (not Python None); rows carrying it
# are filtered out before the beta export.
MAPPING_wiki_to_trivia = {
    'History': 'History and society',
    'Reference': 'None',
    'Law': 'History and society',
    'Religion': 'History and society',
    'Education': 'History and society',
    'Philosophy': 'Science and nature',
    'Politics': 'History and society',
    'Science_and_technology': 'Science and nature',
    'Sports': 'Sports and hobbies',
    'Language': 'Art and literature',
    'Nature': 'Science and nature',
    'Geography': 'Geography and places',
    'Arts': 'Art and literature',
    'Society': 'History and society',
    'Culture': 'History and society',
    'Health': 'Science and nature',
    'Mathematics': 'Science and nature',
    'Universe': 'Science and nature',
    'Events': 'History and society',
    'Humanities': 'Art and literature',
    'Life': 'Science and nature',
    'Entertainment': 'Entertainment and games',
}


In [None]:
# Translate the level-1 Wikipedia category into a trivia category.
# Series.replace leaves values that are not keys of the mapping unchanged.
t_dataRun["WikiPath_Category"] = t_dataRun["WikiCat_1"].replace(MAPPING_wiki_to_trivia)

## Wikipedia path exceptions

In [None]:
# https://docs.google.com/spreadsheets/d/1c4g4MujTgaqTn5wG5I-ARY06g7rlnHst0whojsFbps8/edit#gid=2055637170
# Mapping list
# Each entry is (path level as a string, Wikipedia category at that level, trivia category).
# The level is appended to "WikiCat_" to select the column that is compared.
exceptionCategories = [
    ('3', 'Literature', 'Art and literature'),
    ('2', 'Food_and_drink', 'Sports and hobbies'),
    ('5', 'Plants', 'Science and nature'),
    ('3', 'Ethnobiology', 'Science and nature'),
    ('2', 'Artificial_objects', 'Science and nature'),
    ('4', 'Diseases_and_disorders', 'Science and nature'),
    ('2', 'Performing_arts', 'Art and literature'),
    ('2', 'Gaming', 'Entertainment and games'),
    ('4', 'Anthropology_of_religion', 'History and society'),
    ('3', 'Leisure', 'Sports and hobbies'),
    ('4', 'Physical_geography', 'Geography and places'),
]

In [None]:
# Take dataframe row as input
# If several exceptions apply, algorithm returns the one listed last in the exception list
def findExceptionWikiCatRows(a, exceptions=None):
    """Return the trivia category forced by a Wikipedia-path exception, or "".

    Parameters
    ----------
    a : pandas Series or dict
        One row; values are read with ``a.get("WikiCat_<level>")``.
    exceptions : iterable of (level, wiki_category, trivia_category), optional
        Rules to test. When omitted, the notebook-level ``exceptionCategories``
        list is used (looked up at call time).

    Returns
    -------
    str
        Trivia category of the last matching rule, or "" when no rule matches.
    """
    if exceptions is None:
        exceptions = exceptionCategories

    result = ""
    for level, wiki_category, trivia_category in exceptions:
        # .get() rather than []: the number of WikiCat_<n> columns depends on the
        # deepest path in the data, so a rule's level may not exist as a column.
        # A missing level simply cannot match instead of raising KeyError.
        if a.get("WikiCat_" + str(level)) == wiki_category:
            result = trivia_category

    return result
    

In [None]:
# Spot check of the exception lookup on a single row, selected by position (4277).
row = t_dataRun.iloc[4277]
findExceptionWikiCatRows(row)

In [None]:
%%time
# Row-wise exception lookup; "" marks rows where no exception rule applies.
t_dataRun["WikiPath_Exceptions"] = t_dataRun.apply(findExceptionWikiCatRows, axis=1)

In [None]:
%%time
# An exception category (non-empty string) wins; otherwise fall back to WikiPath_Category.
t_dataRun["WikiPath_withExceptions"] = [
    exception if exception != "" else category
    for exception, category in zip(t_dataRun["WikiPath_Exceptions"], t_dataRun["WikiPath_Category"])
]

In [None]:
# t_dataRun.drop("WikiPath_Exceptions", axis = 1, inplace = True)

In [None]:
# Cross-tabulate the plain level-1 mapping against the mapping with exceptions applied.
# NOTE: despite its name, the active frame holds raw counts; the column-normalized
# percent variant is the commented-out line.
#crosstab_percentCols = pd.crosstab(t_dataRun.WikiPath_Category, t_dataRun.WikiPath_withExceptions, margins=True, normalize = "columns").round(3)*100
crosstab_percentCols = pd.crosstab(t_dataRun.WikiPath_Category, t_dataRun.WikiPath_withExceptions)
crosstab_percentCols.style.background_gradient(cmap='Blues', axis = 0)

In [None]:
# Number of rows per exception category ("" = no exception applied).
t_dataRun.groupby("WikiPath_Exceptions")["WikiPath_Exceptions"].count()

## Key word exceptions

In [None]:
# Except specific search term / key word


### Category is History if Q / A contains number within range

In [None]:
# 1970 as upper bound seems better. Many questions regarding 1970's are better categorized elsewhere.
# Both bounds are exclusive: withinHistoryRange compares with strict > and <.
historyLowerBound = 1000
historyUpperBound = 1970

In [None]:
# Takes list of numbers as input
def withinHistoryRange(a, lowerBound=None, upperBound=None):
    """Tell whether any entry of ``a`` is an integer strictly inside the history range.

    Parameters
    ----------
    a : iterable
        Candidate numbers, typically digit strings produced by ``re.findall``.
    lowerBound, upperBound : int, optional
        Exclusive bounds. When omitted, the notebook-level ``historyLowerBound``
        and ``historyUpperBound`` are used (looked up at call time).

    Returns
    -------
    bool
        True as soon as one entry satisfies ``lowerBound < int(entry) < upperBound``,
        otherwise False.
    """
    if lowerBound is None:
        lowerBound = historyLowerBound
    if upperBound is None:
        upperBound = historyUpperBound

    for x in a:
        try:
            number = int(x)
        except (TypeError, ValueError):
            # An entry that is not an integer cannot be a year; skip it and keep
            # checking. The old bare "except" returned an error *string* here,
            # which is truthy and so read as "has a history number" downstream.
            continue
        if lowerBound < number < upperBound:
            return True
    return False

In [None]:
%%time
# Scan question and answer together for numbers in the history range.
# The two texts are joined with a space: plain concatenation fuses a number that
# ends the question with one that starts the answer (e.g. "... 19" + "45" -> "1945",
# or "... 1946" + "3" -> "19463"), which creates or hides matches.
t_dataRun["hasHistoryNumber"] = (t_dataRun["CONS_question"] + " " + t_dataRun["CONS_answer"]).apply(lambda x: withinHistoryRange(re.findall(r'\d+', x)))

In [None]:
# NOTE: Categorization currently not impacted by presence of historical date

### Search term exceptions

In [None]:
# Search term mapping: https://docs.google.com/spreadsheets/d/1c4g4MujTgaqTn5wG5I-ARY06g7rlnHst0whojsFbps8/edit#gid=2055637170

In [None]:
# Search term -> trivia category. Despite the name this is a dict, not a list.
# 'sport' now maps to 'Sports and hobbies': the previous 'Sports and leisure' is the
# old label that the rest of the notebook renames to 'Sports and hobbies'.
termMappingList = {
    'capital': 'Geography and places',
    'state': 'Geography and places',
    'sport': 'Sports and hobbies',
}

In [None]:
# Take search term list as input
# If several exceptions apply, algorithm returns the one corresponding to search term listed first
def findExceptionSearchTerms(a):
    """Return ``(category, term)`` for the first term of ``a`` found in termMappingList.

    Returns None when none of the terms has a mapping.
    """
    hits = ((termMappingList[term], term) for term in a if term in termMappingList)
    return next(hits, None)
    

In [None]:
%%time
# (category, term) of the first mapped search term per row, or None when nothing matches.
t_dataRun["searchTermCatExceptions"] = t_dataRun["searchTerms"].apply(findExceptionSearchTerms)

In [None]:
t_dataRun["WikiPath_withExceptions"].unique()

In [None]:
t_dataRun.columns

### Combine original category and wikicategory fields

In [44]:
t_dataRun["CONS_category"].unique()

array(['Art and literature', 'Entertainment', 'Sports and leisure',
       'Geography and places', 'Science and nature',
       'History and society', 'Uncategorized'], dtype=object)

In [45]:
# Rename the original category labels to the new category names; labels that are
# not keys of the mapping are left as they are.
MAPPING_old_cat_to_new = {
    'Entertainment': 'Entertainment and games',
    'Sports and leisure': 'Sports and hobbies',
}
t_dataRun["CONS_category"] = t_dataRun["CONS_category"].replace(to_replace=MAPPING_old_cat_to_new)

In [46]:
t_dataRun["CONS_category"].unique()

array(['Art and literature', 'Entertainment and games',
       'Sports and hobbies', 'Geography and places', 'Science and nature',
       'History and society', 'Uncategorized'], dtype=object)

In [None]:
t_dataRun.groupby('CONS_category').count()

In [52]:
# Start Export_category as a copy of CONS_category; rows labelled "Uncategorized"
# are overwritten with WikiPath_withExceptions in the following cell.
t_dataRun["Export_category"] = t_dataRun["CONS_category"]

In [53]:
# For rows whose CONS_category is "Uncategorized", use the Wikipedia-derived category instead.
# The result can still hold None and the string "None" (see the unique() output that follows).
t_dataRun.loc[t_dataRun.CONS_category == "Uncategorized", "Export_category"] = t_dataRun.loc[t_dataRun.CONS_category == "Uncategorized", "WikiPath_withExceptions"]

In [54]:
t_dataRun["Export_category"].unique()

array(['Art and literature', 'Entertainment and games',
       'Sports and hobbies', 'Geography and places', 'Science and nature',
       'History and society', None, 'None'], dtype=object)

In [None]:
"What was the world's first computer bug in 1946?"

In [62]:
t_dataRun.loc[t_dataRun.CONS_question == "What was the world's first computer bug in 1946?"]

Unnamed: 0,CONS_id,CONS_question,CONS_answer,CONS_alt answers,CONS_category,CONS_alt categories - NOT USED,CONS_type-formulation,CONS_type-multipleChoice,ORIG_id,ORIG_question,ORIG_answer,ORIG_alt answers,ORIG_category,ORIG_alt categories,ORIG_difficulty,ORIG_type,Source,Duplicate_removed,namedEntities,nouns,objects,subjects,nounsObjectsSubjects,searchTerms,wikipediaSearchSuccessful,findQuestionCategories_meta,wikipediaArticleTitle,wikipediaArticleID,categoryPath,parentCategories,searchTermUsed_idx,searchTermUsed,WikipediaHighestCat,categoryPathList,WikiCat_0,WikiCat_1,WikiCat_2,WikiCat_3,WikiCat_4,WikiCat_5,WikiCat_6,WikiCat_7,WikiCat_8,WikiCat_9,WikiCat_10,WikiCat_11,WikiCat_12,WikiCat_13,WikiCat_14,WikiCat_15,WikiCat_16,WikiPath_Category,WikiPath_Exceptions,WikiPath_withExceptions,hasHistoryNumber,searchTermCatExceptions,Export_category
34999,tdb_0x00925e,What was the world's first computer bug in 1946?,A moth,,Uncategorized,,Question,False,tdb_0x00925e,What was the world's first computer bug in 1946,0,[A moth],UNCATEGORIZED,,,,tdb,,"[(first, ORDINAL), (1946, DATE)]","[world, computer, bug]",[1946],[bug],"[[world, 431934249], [computer, 224177047], [bug, 39672754], [1946, 0]]","[1946, bug, computer, world]",True,"[Database call not successful (error), SUCCESS]",Bug_(Rügen),35022925,pages.title pages.id intersection union \ 0 Geography_of_Rügen 34025283 NaN NaN 1 Geography_of_Mecklenburg-Vorpommern 18098162 1.0 74.0 2 Geography_of_Germany_by_state 23907075 0.0 33.0 3 Geography_of_Germany 1062681 0.0 42.0 4 Geography_by_country 700174 0.0 268.0 5 Geography_by_place 5782300 0.0 261.0 6 Geography 693800 0.0 113.0 7 Main_topic_classifications 7345184 0.0 118.0 jaccard depth similarityBAC mostSimilar \ 0 NaN 7 NaN NaN 1 0.013514 6 5.5 True 2 0.000000 5 5.0 True 3 0.000000 4 4.0 True 4 0.000000 3 3.0 True 5 0.000000 2 2.0 Tru...,pages.title pages.id 0 Wittow 35023142 1 Peninsulas_of_the_Baltic_Sea 34208118 2 Peninsulas_of_Mecklenburg-Vorpommern 34208069 3 Spits_of_Europe 52608943 4 Coordinates_on_Wikidata 41138143 5 Geography_of_Rügen 34025283,1,bug,Geography,"[Main_topic_classifications, Geography, Geography_by_place, Geography_by_country, Geography_of_Germany, Geography_of_Germany_by_state, Geography_of_Mecklenburg-Vorpommern, Geography_of_Rügen]",Main_topic_classifications,Geography,Geography_by_place,Geography_by_country,Geography_of_Germany,Geography_of_Germany_by_state,Geography_of_Mecklenburg-Vorpommern,Geography_of_Rügen,,,,,,,,,,Geography and places,,Geography and places,True,,Geography and places


### TODO: Correct sentence case changes made in data cleaning phase

In [None]:
# If question or answer is sentence case and original is not, use original --> Check original implementation to ensure logic

### Beta data export

In [58]:
%%time
# Select up to EXPORT_ROWS_PER_CATEGORY random rows of each category
EXPORT_ROWS_PER_CATEGORY = 400
# Fixed seed: without it every run of this cell exports a different sample,
# so a published export file could not be regenerated.
EXPORT_RANDOM_SEED = 42

out = t_dataRun.loc[:, ["Export_category", "CONS_question", "CONS_answer"]]
# Drop rows without a usable category: missing values and the literal string "None"
# (produced by the 'Reference' entry of the Wikipedia mapping).
out = out.loc[out["Export_category"].notnull() & (out["Export_category"] != "None")]
# Shuffle all rows, then keep the first rows of each category.
out = (
    out.sample(frac=1, random_state=EXPORT_RANDOM_SEED)
    .groupby("Export_category")
    .head(EXPORT_ROWS_PER_CATEGORY)
)

Wall time: 27.9 ms


In [59]:
print(out["Export_category"].unique())
print(len(out))

['Science and nature' 'Sports and hobbies' 'Entertainment and games'
 'Geography and places' 'History and society' 'Art and literature']
2400


In [60]:
print(out["Export_category"].unique())

['Science and nature' 'Sports and hobbies' 'Entertainment and games'
 'Geography and places' 'History and society' 'Art and literature']


In [61]:
# Write the sampled rows with a header line and without the index column.
out.to_csv("../workproduct-files/beta-data-export_v2.csv", header=True, index=False)