In [1]:
# ADD
# Batch run functionality
    # Take rows not yet processed, run in batches, save after each batch
# Improve cleaned format of final dataframe (column names and contents)
# Write final category guess rules

### Setup

In [50]:
import pandas as pd
from string import punctuation, digits
import re
import ast
import stopit
from py2neo import *
import time

In [3]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 1000)

#### Keyword to article title mapping data

In [4]:
%%time
redirects = pd.read_csv("F:/wikipedia-data/outputs/redirect.csv")

Wall time: 9.76 s


In [5]:
%%time
articles = pd.read_csv("F:/wikipedia-data/outputs/articles.csv")

Wall time: 2min 41s


In [6]:
%%time
articles.dropna(subset = ["title"], inplace = True)
redirects.dropna(subset = ["title"], inplace = True)

Wall time: 10.8 s


In [7]:
%%time
articles["titleLower"] = articles["title"].apply(lambda x: x.lower())
redirects["titleLower"] = redirects["title"].apply(lambda x: x.lower())

Wall time: 8.94 s


#### Graph database connection

In [8]:
# Connect to neo4j database - start database separately
graph = Graph()

In [9]:
# Set timeout-limit in seconds for database calls
maxSearchTime = 120

#### Trivia data

In [10]:
%%time
t_data = pd.read_pickle("../workproduct-files/t_dataMaster-keywordsIdentified.pkl")

Wall time: 1.2 s


### Functions

#### Search term to wikipedia article name linking

In [11]:
#Returns wikipedia article formatted for database search, if not found, returns FALSE
def inArticles(a):
    match = articles.loc[articles["titleLower"] == a.lower(), :]
    if len(match) > 0:
        return match.iloc[0, 1].replace(" ", "_")
    else:
        return False

In [12]:
#Returns wikipedia article formatted for database search, if not found, returns FALSE
def inRedirects(a):
    match = redirects.loc[redirects["titleLower"] == a.lower(), :]
    if len(match) > 0:
        return match.iloc[0, 2].replace(" ", "_")
    else:
        return False

In [13]:
#Get first link from article based on title (DB formatting). Return False if no links exist
def getFirstLink(a):
    #match will be a pandas series of len=1
    match = articles.loc[articles["title"] == a.replace("_", " "), "links"]
    
    if len(match) > 0:
        #Change first series value into list
        asList = ast.literal_eval(match.iloc[0]) 
        #result = asList[0].replace(" ", "_")
        result = asList[0]
        
        #Take string only until |
        result = re.sub("(\|)(.+)", '', result)
        result = re.sub("(\|)", '', result)
        
        return result
    else:
        return False

#### Wikipedia article name to neo4j database calls

In [14]:
# Call database for category tree and parents of given wikipedia title
@stopit.threading_timeoutable(default='Database call timed out (' + str(maxSearchTime) + ' seconds)')
def getCategoryInfo(a):
    # kill function if runs to long (>2min ?)
        # https://stackoverflow.com/questions/14920384/stop-code-after-time-period
        # https://pypi.org/project/stopit/#id14
    
    #result [wikipediID, path to MTC, parents]
    result = []
    
    try:
        articleID = articleByTitle(a).iloc[0,1]
        parents = parentCategories(articleID)
        
        if "Disambiguation_pages" in parents["pages.title"].values:
            firstLink = getFirstLink(a)
            return getWikipediaInfo(firstLink)
        else:
            path = chosenPathArticleToMTC(articleID)
        
        return [a, articleID, path, parents]
    
    except (IndexError, ValueError, TypeError, ClientError):
        return "Database call not successful (error)"
        
        

In [15]:
# Performs search functions from given search term --> Output from wikipedia database
def getWikipediaInfo(a):
    
    term = a.lower()
    
    out = inArticles(term)
    if out != False:
        return getCategoryInfo(out, timeout = maxSearchTime)
    
    out = inRedirects(term)
    if out != False:
        return getCategoryInfo(out, timeout = maxSearchTime)
    
    return "Search term not found"
        
    
    # if search term is in articles
        # Perform database search
        # Return (WikipediaID, Category tree, Parent categories)
    # else if search term is in redirects
        # Perform database search
        # Return (WikipediaID, Category tree, Parent categories)
    # else return FALSE

#### neo4j database calls

##### Return node info based on wikipedia id

In [16]:
def nodeInfo(a):
    commandToRun = 'MATCH (pages:Page {id: %s}) \
                RETURN pages' % (a)
    return graph.run(commandToRun).data()

##### Return similarity statistics for two sets (intersection, union, Jaccard coefficient)

In [17]:
# compute similarity statistics
def similarityStats(a,b):
    intSize = len(a.intersection(b))
    unionSize = len(a.union(b))
    
    if unionSize == 0:
        jaccard = 0
    else:
        jaccard = intSize / unionSize
    
    return (intSize, unionSize, jaccard)

##### Return identifying information of parent categories of chosen article or category as pandas dataframe

In [18]:
# Give wikipedia id as integer as function argument (e.g. cateogry "Finland" = 693995)
def parentCategories(a):
    wikiID = a
    commandToRun = 'MATCH (pages:Category:Page) \
                <-[:BELONGS_TO]- \
                (:Page {id: %s}) \
                RETURN pages.title, pages.id' % (wikiID)

    # ensure that a dataframe with correct columns is returnd also when query is empty
    out = pd.DataFrame(columns = ["pages.title", "pages.id"])
    out = out.append(graph.run(commandToRun).to_data_frame())
    
    return out


##### Return identifying information of children (both category and article) of chosen category as pandas dataframe

In [19]:
# Give wikipedia id as integer as function argument (e.g. cateogry "Finland" = 693995)
def childPages(a):
    wikiID = a
    commandToRun = 'MATCH (pages:Page) \
                -[:BELONGS_TO]-> \
                (:Page {id: %s}) \
                RETURN pages.title, pages.id' % (wikiID)

    # ensure that a dataframe with correct columns is returnd also when query is empty
    out = pd.DataFrame(columns = ["pages.title", "pages.id"])
    out = out.append(graph.run(commandToRun).to_data_frame())
    
    return out

##### Return dataframe with all articles (but not categories) with given title [only one result is expected]

In [20]:
# Give wikipedia title as string as function argument
# Will not work if article title contains "-character --> "ClientError". Escape fixes do not work, not worth debugging.
def articleByTitle(a):
    ArticleToFind = a
    commandToRun = 'MATCH (articles:Page {title: "%s"}) \
                    WHERE NONE(art IN [articles] WHERE art:Category) \
                    RETURN articles.title, articles.id, ID(articles)' % (ArticleToFind)
    return graph.run(commandToRun).to_data_frame()

##### Return dataframe with all categories (but not articles) with given title [only one result is expected]

In [21]:
# Give wikipedia title as string as function argument
def categoryByTitle(a):
    CategoryToFind = a
    commandToRun = 'MATCH (categories:Category:Page {title: "%s"}) \
                    RETURN categories.title, categories.id, ID(categories)' % (CategoryToFind)
    return graph.run(commandToRun).to_data_frame()

##### Return dataframe containing shortest path between input node (article or category) and Main_topics_classification category

In [22]:
def shortestPathToMTC(a):
    # (:Page {id: 7345184}) is Main_topics_classifications category node
    inputNode = a

    commandToRun = 'MATCH path=shortestPath( \
                    (:Page {id: %s})-[:BELONGS_TO*0..10]->(:Page {id: 7345184})) \
                    UNWIND nodes(path) AS pages \
                    RETURN pages.title, pages.id, ID(pages)' % (inputNode)

    return pd.DataFrame(graph.run(commandToRun).data())

##### Return similarity value between two categories as defined by Biuk-Aghai & Cheang (2011)

In [23]:
# Take as input tuple containing depth of category to compare to as well as intersection of children between two categories

'''
Given parent category p and child category c,
and given a root category node r, we calculate the category similarity
Sp;c as: Sp;c = Dc - Cp;c / k , where Dc is the depth of category
c in the category graph, i.e. the shortest distance from the root
category node r; Cp;c is the number of co-assigned articles of categories
p and c; and k is a constant that is empirically determined.
Through experimentation we have found that a value of k = 2 produces
the best results, i.e. results that agree with human intuition as
to similarity of a given pair of categories. A smaller value of Sp;c
indicates a greater similarity (i.e. a smaller distance between the
nodes). The number of co-assigned articles Cp;c of parent category
p and child category c is simply the cardinality of the intersection
of their assigned article sets: Cp;c = jAp \ Acj, where Ap and Ac
are the sets of articles assigned to categories p and c, respectively.
'''
# Depth is calcualted for parent when going bottom-up in graph
# C is calculated using intersection of both child articles and sub-categories

def similarityBAC(a):
    d = a[0]
    c = a[1]
    k = 2
    return d - (c/k)

##### Return dataframe containing all parent categories of category a and similarity statistics to each as well as parent depth to MTC

In [24]:
# Give wikipedia id as integer as function argument (e.g. cateogry "Finland" = 693995)
def parentSimilarities(a):
    parents = parentCategories(a)
    children = childPages(a)
    
    if len(parents) == 0:
        raise ValueError("Category processed does not have parents (likely input category to chosenPathUpToMTC() if called)")
    
    # Create columns with similarity stats using functions similarityStats
    parents["similarities"] = parents["pages.id"].apply(lambda x: similarityStats(set(children["pages.id"]), set(childPages(x)["pages.id"])))
    parents[["intersection", "union", "jaccard"]] = pd.DataFrame(parents['similarities'].tolist(), index = parents.index)
    parents.drop(["similarities"], axis = 1, inplace = True)
        
    # Add column with parent category depth (steps to Main_topics_classifications node)
    parents["depth"] = parents["pages.id"].apply(lambda x: len(shortestPathToMTC(x))-1)
    
    # Add column with similarityBAC
    parents["similarityBAC-aid"] = list(zip(parents["depth"], parents["intersection"]))
    parents["similarityBAC"] = parents["similarityBAC-aid"].apply(lambda x: similarityBAC(x))
    parents.drop(["similarityBAC-aid"], axis = 1, inplace = True)
    
    # Sort ascending
    parents.sort_values(by = "similarityBAC", ascending = True, inplace = True)
    parents.reset_index(drop = True, inplace = True)
    
    return parents

##### Return node based on neo4j database ID [NOTE: not same as wikipedia ID used elsewhere]

In [25]:
def getWithNeoID(a):
    return NodeMatcher(graph).get(a)

##### Return dataframe containing info of what category to choose from parentSimilarities() output

In [26]:
'''
we choose which parent link to keep according to
following rules: (1) Choose the parent whose similarity value Sp;c
is lower; (2) If Sp1;c = Sp2;c, choose the parent whose depth D is
lower; (3) If Dp1 = Dp2, choose the parent with the larger value
of Cp;c; (4) If Cp1;c = Cp2;c, choose the parent with the lower
page ID.
'''
# Takes parentSimilarities() / or potentially child similarities output dataframe as input
def chooseCategoryPath(a):
    a.sort_values(by = ["similarityBAC", "depth"], ascending = True, inplace = True)
    a["mostSimilar"] = "False"
    a["comment"] = ""
    
    # Set value for mostSimilar to "Not connected" for rows with depth = -1 i.e. no connection to MTC
    a.loc[a["depth"] == -1, "comment"] = "Not connected"
    
    # Set value for mostSimilar to "True" for rows that are not "Not connected" and that have the minimum value of similarityBAC
    workingDF = a.loc[a["comment"] != "Not connected"]
    selectedIndexes = workingDF.loc[workingDF["similarityBAC"] == workingDF["similarityBAC"].min()].index
    
    a.loc[selectedIndexes, "mostSimilar"] = "True"
    a.loc[selectedIndexes, "comment"] = "Lowest similarityBAC"
    
    workingDF = a.loc[a["mostSimilar"] == "True"]
    
    if len(workingDF) > 1:
        # Set all mostSimilar of partial dataframe and output back to False, then set min depth rows to true
        workingDF["mostSimilar"] = "False"
        a["mostSimilar"] = "False"
        selectedIndexes = workingDF.loc[workingDF["depth"] == workingDF["depth"].min()].index
        
        a.loc[selectedIndexes, "mostSimilar"] = "True"
        a.loc[selectedIndexes, "comment"] = a.loc[selectedIndexes, "comment"] + "; Lowest depth"
        
        workingDF = a.loc[a["mostSimilar"] == "True"]
        
        # If several rows now set to true, test for highest intersection
        if len(workingDF) > 1:
            # Set all mostSimilar of partial dataframe and output back to False, then set max intersection rows to true
            workingDF["mostSimilar"] = "False"
            a["mostSimilar"] = "False"
            selectedIndexes = workingDF.loc[workingDF["intersection"] == workingDF["intersection"].max()].index
            
            a.loc[selectedIndexes, "mostSimilar"] = "True"
            a.loc[selectedIndexes, "comment"] = a.loc[selectedIndexes, "comment"] + "; Highest intersection"
            
            workingDF = a.loc[a["mostSimilar"] == "True"]
            
            # If several rows now set to true, choose row with lowes pages.id
            if len(workingDF) > 1:
                # Set all mostSimilar of partial dataframe and output back to False, then set min wikipedia id row (only one) to true
                workingDF["mostSimilar"] = "False"
                a["mostSimilar"] = "False"
                selectedIndexes = workingDF.loc[workingDF["pages.id"] == workingDF["pages.id"].min()].index
                
                a.loc[selectedIndexes, "mostSimilar"] = "True"
                a.loc[selectedIndexes, "comment"] = a.loc[selectedIndexes, "comment"] + "; Lowest wikipedia id"
    
    
    return a

##### Return dataframe containing info of chosen path to MTC (iterates chooseCategoryPath() upwards)

In [27]:
# Iterate chooseCategoryPath() from input category (wikipedia id as input) until MTC is reached. Return dataframe with chosen path rows
# Root node category "Main_topic_classifications" has pages.id = 7345184

# NOTE: Error if input category does not have parents
def chosenPathUpToMTC(a):
    mtcFound = False
    nextStep = a
    chosenPath = pd.DataFrame()
    
    while(not mtcFound):
        allParents = parentSimilarities(nextStep)
        allParents = chooseCategoryPath(allParents)
        
        # If allParents contains MTC category
        if(len(allParents.loc[allParents["pages.id"] == 7345184]) == 1):
            rowToAppend = allParents.loc[allParents["pages.id"] == 7345184]
            mtcFound = True
        else:
            rowToAppend = allParents.loc[allParents["mostSimilar"] == "True"]
            nextStep = int(allParents.loc[allParents["mostSimilar"] == "True", "pages.id"])
        
        chosenPath = chosenPath.append(rowToAppend)
        chosenPath.reset_index(drop = True, inplace = True)
    
    
    return chosenPath

##### Article strength calculations

In [28]:
# Return dataframe with all pages linking to or from input page
def linksBetween(a):
    wikiID = a
    commandToRun = 'MATCH (pages:Page) \
                -[:LINKS_TO]- \
                (:Page {id: %s}) \
                RETURN pages.title, pages.id' % (wikiID)

    # ensure that a dataframe with correct columns is returnd also when query is empty
    out = pd.DataFrame(columns = ["pages.title", "pages.id"])
    out = out.append(graph.run(commandToRun).to_data_frame())
    
    return out


In [29]:
# a as pages.id for artice, c as pages.id for parent category
def articleClassificationStrength(a, c):
    aLinks = set(linksBetween(a)["pages.id"])
    cChildren = set(childPages(c)["pages.id"])
    
    intersectionSize = len(aLinks.intersection(cChildren))
    
    return 1 + intersectionSize

In [30]:
def strongestArticleParents(a):
    parents = parentCategories(a)
    parents["depth"] = parents["pages.id"].apply(lambda x: len(shortestPathToMTC(x)) -1 )
    parents.loc[parents["depth"] != -1 , "Strength"] = parents["pages.id"].apply(lambda x: articleClassificationStrength(a, x))
    parents.sort_values(by = ["Strength"], ascending = False, inplace = True)
   
    
    return parents

In [31]:
def chosenPathArticleToMTC(a):
    strongestParent = strongestArticleParents(a)
    path = chosenPathUpToMTC(strongestParent.iloc[0,1])
    
    path.loc[-1] = strongestParent.iloc[0, :3]
    path.sort_index(inplace = True)
    path.reset_index(drop = True, inplace = True)
    
    return path

# Run "question to wikipedia category" analyses

In [32]:
# Write batch processing


In [33]:
# Take list with search terms
# Run getWikipediaInfo() until search term works
# If result is False (search term not found) or "Database call not successful (error)" (search term found but path to MTC not available)
    # --> continue to next search term
# Return [(categoriesFound (boolean)), [n x (search term used, result), [1 x successful output]]]

def findQuestionCategories(a):
    categoriesFound = False
    result = []
    getWikipediaInfo_out = [None, None, None, None]
    toReturn = []
    possibleFailureMessages = ('Search term not found', 'Database call timed out (' + str(maxSearchTime) + ' seconds)', 'Database call not successful (error)')
    
    for term in a:
        termResult = getWikipediaInfo(term)        
        
        if termResult not in possibleFailureMessages:
            categoriesFound = True
            # CHANGE: NOT NECESSARY TO SAVE TERM
            result.append( ("SUCCESS") )
            getWikipediaInfo_out = termResult
            break
        else:
            result.append( (termResult) )
    
    if len(result) == 0:
        result.append( ("NO SEARCH TERMS GIVEN") )
    
    # Insert categorieFound  at start of toReturn
    toReturn.append(categoriesFound)
    toReturn.append(result)
    toReturn.append(getWikipediaInfo_out)
    
    return toReturn
       
        

In [40]:
use = t_data.iloc[0:3]

In [41]:
%%time
# Run time for 7 rows: 38,2 seconds
# Run time for 65 rows: 7min 31s
# Run time for 108 rows: 13min 49s
use["findQuestionCategories_Out"] = use["searchTerms"].apply(lambda x: findQuestionCategories(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  workingDF["mostSimilar"] = "False"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  workingDF["mostSimilar"] = "False"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  workingDF["mostSimilar"] = "False"


Wall time: 1min 19s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [42]:
# Explode result from findQuestionCategories() into columns: 'wikipediaSearchSuccessful', 'usedSearchTerm', 'wikipediaArticleTitle', 'wikipediaArticleID', 'categoryPath', 'parentCategories'
use[['wikipediaSearchSuccessful','findQuestionCategories_meta', 'findQuestionCategories_result']] = pd.DataFrame(use["findQuestionCategories_Out"].tolist(), index= use.index)
use.drop(columns = ["findQuestionCategories_Out"], inplace = True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [43]:
use[["wikipediaArticleTitle", "wikipediaArticleID", "categoryPath", "parentCategories"]] = pd.DataFrame(use["findQuestionCategories_result"].tolist(), index= use.index)
use.drop(columns = ["findQuestionCategories_result"], inplace = True)

In [44]:
use

Unnamed: 0,CONS_id,CONS_question,CONS_answer,CONS_alt answers,CONS_category,CONS_alt categories - NOT USED,CONS_type-formulation,CONS_type-multipleChoice,ORIG_id,ORIG_question,ORIG_answer,ORIG_alt answers,ORIG_category,ORIG_alt categories,ORIG_difficulty,ORIG_type,Source,Duplicate_removed,namedEntities,nouns,searchTerms,wikipediaSearchSuccessful,findQuestionCategories_meta,wikipediaArticleTitle,wikipediaArticleID,categoryPath,parentCategories
0,tdb_0x000000,"""Now is the winter of our discontent"" is a line from which Shakespearian play?",Richard III,"[Romeo and Juliet, Macbeth]",Art and literature,,Question,True,tdb_0x000000,"""Now is the winter of our discontent"" is a line from which Shakespearian play?",0,"[Richard III, Romeo and Juliet, Macbeth]",ART_AND_LITERATURE,,,,tdb,,"[(winter, DATE), (Shakespearian, NORP)]","[[Now is the winter of our discontent, 0], [line, 280009597], [play, 150748333]]","[Now is the winter of our discontent, play, line]",True,[SUCCESS],Richard_III_(play),176961,pages.title pages.id intersection union jaccard depth \ 0 English_Renaissance_plays 1034365 NaN NaN NaN 5 1 English_plays 3258485 3.0 703.0 0.004267 5 2 British_plays 1928104 10.0 759.0 0.013175 5 3 British_drama 1376798 3.0 534.0 0.005618 4 4 Drama_by_nationality 25776489 0.0 63.0 0.000000 3 5 Drama 709071 0.0 115.0 0.000000 2 6 Theatre 698600 8.0 225.0 0.035556 2 7 Performing_arts 991222 16.0 258.0 0.062016 2 8 Entertainment 693016 15.0 253.0 0.059289 1 9 Main_topic_classifications 7345184 0.0 148.0 0.000000 0 similarityBAC mostSimilar comment 0 NaN NaN ...,pages.title pages.id 0 Use_British_English_from_February_2013 38379146 1 Plays_about_English_royalty 42128233 2 Cultural_depictions_of_English_monarchs 15621221 3 Shakespearean_histories 736938 4 British_plays_adapted_into_films 51477341 5 Richard_III_of_England 33466348 6 English_Renaissance_plays 1034365 7 Use_dmy_dates_from_February_2013 38378953
1,tdb_0x000001,"""Our Town"" is a play by whom?",Thornton Wilder,,Art and literature,,Question,False,tdb_0x000001,"""Our Town"" is a play by whom?",0,[Thornton Wilder],ART_AND_LITERATURE,,,,tdb,,[],"[[Our Town, 0], [play, 150748333]]","[Our Town, play]",True,[SUCCESS],Our_Town,62695,pages.title pages.id intersection union \ 0 Pulitzer_Prize_for_Drama-winning_works 34794170 NaN NaN 1 Theatre_award_winners 7129331 0.0 108.0 2 Theatre_awards 1975750 0.0 35.0 3 Theatre 698600 0.0 147.0 4 Performing_arts 991222 16.0 258.0 5 Entertainment 693016 15.0 253.0 6 Main_topic_classifications 7345184 0.0 148.0 jaccard depth similarityBAC mostSimilar \ 0 NaN 5 NaN NaN 1 0.000000 4 4.0 True 2 0.000000 3 3.0 True 3 0.000000 2 2.0 True 4 0.062016 2 -6.0 True 5 0.059289 1 -6.5 True 6 0.000000 0 0.0 Fals...,pages.title pages.id 0 Drama_Desk_Award-winning_plays 16736972 1 American_plays_adapted_into_films 51476012 2 Tony_Award-winning_plays 18147375 3 Plays_by_Thornton_Wilder 24334082 4 Broadway_plays 2133987 5 Pulitzer_Prize_for_Drama-winning_works 34794170 6 West_End_plays 18242373
2,tdb_0x000002,"""The Diary of Anne Frank"" was first published in English under what title?",The diary of a young girl,,Art and literature,,Question,False,tdb_0x000002,"""The Diary of Anne Frank"" was first published in English under what title?",0,[The diary of a young girl],ART_AND_LITERATURE,,,,tdb,[tdb_0x006650],"[(The Diary of Anne Frank, WORK_OF_ART), (first, ORDINAL), (English, LANGUAGE)]","[[The Diary of Anne Frank, 0], [title, 196676017]]","[Diary of Anne Frank, Diary of Anne Frank, title]",True,[SUCCESS],The_Diary_of_a_Young_Girl,1466910,pages.title pages.id intersection union \ 0 Personal_accounts_of_the_Holocaust 22222763 NaN NaN 1 World_War_II_memoirs 42752119 6.0 153.0 2 Memoirs_by_topic 37246118 1.0 98.0 3 Works_by_topic 21046246 0.0 197.0 4 Creative_works 7390225 0.0 209.0 5 Arts 4892515 0.0 120.0 6 Main_topic_classifications 7345184 0.0 116.0 jaccard depth similarityBAC mostSimilar \ 0 NaN 5 NaN NaN 1 0.039216 5 2.0 True 2 0.010204 4 3.5 True 3 0.000000 3 3.0 True 4 0.000000 2 2.0 True 5 0.000000 1 1.0 True 6 0.000000 0 0.0 False ...,pages.title pages.id 0 Dutch-language_books 54556063 1 Personal_accounts_of_the_Holocaust 22222763 2 CS1_Dutch-language_sources_(nl) 43966272 3 Forgery_controversies 36807531 4 CS1_Japanese-language_sources_(ja) 43966204 5 World_War_II_memoirs 42752119 6 Public_domain_books 9394833 7 Memory_of_the_World_Register 11945693 8 Books_published_posthumously 32947582 9 Anne_Frank 3020256 10 Jewish_literature 3687043 11 Books_relating_to_Anne_Frank 31369624 12 Dutch_literature 2685573 13 Books_adapted_into_films 24068918 14 Diaries 9519958 15 Doubleday_(publisher)_books 40590218


# Test

In [80]:
batchRuns = pd.read_csv("../workproduct-files/batchRuns.csv", delimiter=";")


In [48]:
batchRuns

Unnamed: 0,startIndex,stopIndex,startTime,endTime,runTime
0,-1,-1,,,
1,0,101,,,
2,102,130,,,


In [None]:
batchSize = 10

In [49]:
startIndex = batchRuns.iloc[-1,1] + 1
startIndex

131

In [82]:
useTest = t_data.loc[startIndex : startIndex+batchSize , "searchTerms"]

In [83]:
useTest

131                                                                                 [tragedy]
132                                                           [Piggy, schoolboy, star, story]
133                               [fool the eye, illusion, rendering, painting, object, form]
134                                           [plaster, lime, technique, fresh, color, water]
135    [Europe, antiquity, rebirth, representation, perspective, style, space, form, subject]
136    [1940, quallitie, painting, paint, expression, movement, freedom, surface, act, value]
137                                          [Catcher in the Rye, Catcher in the Rye, author]
138                                      [Doctor Zhivago, novel, society, author, view, time]
139                                                                                  [writer]
140                                                                      [artist, art, world]
141                         [Shakespeare, plot, wife, king, 

In [None]:
startTime = time.gmtime()

In [75]:
time.mktime(ts2)-time.mktime(ts1)

22.0

In [53]:
ts1 = time.gmtime()

In [57]:
ts2 = time.gmtime()

In [76]:
difference = time.mktime(ts2)-time.mktime(ts1)
difference

22.0

In [58]:
print(time.strftime("%Y-%m-%d %H:%M:%S", ts1))
print(time.strftime("%Y-%m-%d %H:%M:%S", ts2))

2020-09-23 10:54:09
2020-09-23 10:54:31


In [None]:
testTerms1 = []
testTerms2 = ["asdfölakjhsg", "asdgiieieiei"]
testTerms3 = ["meaning", "wood"]

In [None]:
findQuestionCategories(testTerms1)

In [None]:
findQuestionCategories(testTerms2)

In [None]:
findQuestionCategories(testTerms3)

In [None]:
# NEXT: Batch runs
# Save correct version of t_data (with key words identified) to dedicated folder
# Create text file with index to start next batch (0)
# Run batch as .apply(lambda x: [TERMS].findQuestionCategories(x)) on n rows
# Save output (with indexes intact) as pickle in subfolder
# Update text file with index + n

# ITERATE FULL DATA SET

# Combine all files with rows
# Append result rows to t_data dataset

# Perform categorization based on all existing info