In [None]:
# ADD
# Batch run functionality
    # Take rows not yet processed, run in batches, save after each batch
# Improve cleaned format of final dataframe (column names and contents)
# Write final category guess rules

### Setup

In [1]:
import pandas as pd
from string import punctuation, digits
import re
import ast
import stopit
from py2neo import *

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 1000)

#### Keyword to article title mapping data

In [3]:
%%time
redirects = pd.read_csv("F:/wikipedia-data/outputs/redirect.csv")

Wall time: 10.3 s


In [4]:
%%time
articles = pd.read_csv("F:/wikipedia-data/outputs/articles.csv")

Wall time: 2min 11s


In [5]:
%%time
articles.dropna(subset = ["title"], inplace = True)
redirects.dropna(subset = ["title"], inplace = True)

Wall time: 13.3 s


In [6]:
%%time
articles["titleLower"] = articles["title"].apply(lambda x: x.lower())
redirects["titleLower"] = redirects["title"].apply(lambda x: x.lower())

Wall time: 5.07 s


#### Graph database connection

In [7]:
# Connect to neo4j database - start database separately
graph = Graph()

In [8]:
# Set timeout-limit in seconds for database calls
maxSearchTime = 45

#### Trivia data

In [9]:
%%time
t_data = pd.read_pickle("../workproduct-files/t_dataMaster-keywordsIdentified.pkl")

Wall time: 714 ms


### Functions

#### Search term to wikipedia article name linking

In [10]:
#Returns wikipedia article formatted for database search, if not found, returns FALSE
def inArticles(a):
    match = articles.loc[articles["titleLower"] == a.lower(), :]
    if len(match) > 0:
        return match.iloc[0, 1].replace(" ", "_")
    else:
        return False

In [11]:
#Returns wikipedia article formatted for database search, if not found, returns FALSE
def inRedirects(a):
    match = redirects.loc[redirects["titleLower"] == a.lower(), :]
    if len(match) > 0:
        return match.iloc[0, 2].replace(" ", "_")
    else:
        return False

In [12]:
#Get first link from article based on title (DB formatting). Return False if no links exist
def getFirstLink(a):
    #match will be a pandas series of len=1
    match = articles.loc[articles["title"] == a.replace("_", " "), "links"]
    
    if len(match) > 0:
        #Change first series value into list
        asList = ast.literal_eval(match.iloc[0]) 
        #result = asList[0].replace(" ", "_")
        result = asList[0]
        
        #Take string only until |
        result = re.sub("(\|)(.+)", '', result)
        result = re.sub("(\|)", '', result)
        
        return result
    else:
        return False

#### Wikipedia article name to neo4j database calls

In [13]:
# Call database for category tree and parents of given wikipedia title
@stopit.threading_timeoutable(default='Database call timed out (' + str(maxSearchTime) + ' seconds)')
def getCategoryInfo(a):
    # kill function if runs to long (>2min ?)
        # https://stackoverflow.com/questions/14920384/stop-code-after-time-period
        # https://pypi.org/project/stopit/#id14
    
    #result [wikipediID, path to MTC, parents]
    result = []
    
    try:
        articleID = articleByTitle(a).iloc[0,1]
        parents = parentCategories(articleID)
        
        if "Disambiguation_pages" in parents["pages.title"].values:
            firstLink = getFirstLink(a)
            return getWikipediaInfo(firstLink)
        else:
            path = chosenPathArticleToMTC(articleID)
        
        return [a, articleID, path, parents]
    
    except (IndexError, ValueError, TypeError, ClientError):
        return "Database call not successful (error)"
        
        

In [14]:
# Performs search functions from given search term --> Output from wikipedia database
def getWikipediaInfo(a):
    
    term = a.lower()
    
    out = inArticles(term)
    if out != False:
        return getCategoryInfo(out, timeout = maxSearchTime)
    
    out = inRedirects(term)
    if out != False:
        return getCategoryInfo(out, timeout = maxSearchTime)
    
    return "Search term not found"
        
    
    # if search term is in articles
        # Perform database search
        # Return (WikipediaID, Category tree, Parent categories)
    # else if search term is in redirects
        # Perform database search
        # Return (WikipediaID, Category tree, Parent categories)
    # else return FALSE

#### neo4j database calls

##### Return node info based on wikipedia id

In [15]:
def nodeInfo(a):
    commandToRun = 'MATCH (pages:Page {id: %s}) \
                RETURN pages' % (a)
    return graph.run(commandToRun).data()

##### Return similarity statistics for two sets (intersection, union, Jaccard coefficient)

In [16]:
# compute similarity statistics
def similarityStats(a,b):
    intSize = len(a.intersection(b))
    unionSize = len(a.union(b))
    
    if unionSize == 0:
        jaccard = 0
    else:
        jaccard = intSize / unionSize
    
    return (intSize, unionSize, jaccard)

##### Return identifying information of parent categories of chosen article or category as pandas dataframe

In [17]:
# Give wikipedia id as integer as function argument (e.g. cateogry "Finland" = 693995)
def parentCategories(a):
    wikiID = a
    commandToRun = 'MATCH (pages:Category:Page) \
                <-[:BELONGS_TO]- \
                (:Page {id: %s}) \
                RETURN pages.title, pages.id' % (wikiID)

    # ensure that a dataframe with correct columns is returnd also when query is empty
    out = pd.DataFrame(columns = ["pages.title", "pages.id"])
    out = out.append(graph.run(commandToRun).to_data_frame())
    
    return out


##### Return identifying information of children (both category and article) of chosen category as pandas dataframe

In [18]:
# Give wikipedia id as integer as function argument (e.g. cateogry "Finland" = 693995)
def childPages(a):
    wikiID = a
    commandToRun = 'MATCH (pages:Page) \
                -[:BELONGS_TO]-> \
                (:Page {id: %s}) \
                RETURN pages.title, pages.id' % (wikiID)

    # ensure that a dataframe with correct columns is returnd also when query is empty
    out = pd.DataFrame(columns = ["pages.title", "pages.id"])
    out = out.append(graph.run(commandToRun).to_data_frame())
    
    return out

##### Return dataframe with all articles (but not categories) with given title [only one result is expected]

In [19]:
# Give wikipedia title as string as function argument
# Will not work if article title contains "-character --> "ClientError". Escape fixes do not work, not worth debugging.
def articleByTitle(a):
    ArticleToFind = a
    commandToRun = 'MATCH (articles:Page {title: "%s"}) \
                    WHERE NONE(art IN [articles] WHERE art:Category) \
                    RETURN articles.title, articles.id, ID(articles)' % (ArticleToFind)
    return graph.run(commandToRun).to_data_frame()

##### Return dataframe with all categories (but not articles) with given title [only one result is expected]

In [20]:
# Give wikipedia title as string as function argument
def categoryByTitle(a):
    CategoryToFind = a
    commandToRun = 'MATCH (categories:Category:Page {title: "%s"}) \
                    RETURN categories.title, categories.id, ID(categories)' % (CategoryToFind)
    return graph.run(commandToRun).to_data_frame()

##### Return dataframe containing shortest path between input node (article or category) and Main_topics_classification category

In [21]:
def shortestPathToMTC(a):
    # (:Page {id: 7345184}) is Main_topics_classifications category node
    inputNode = a

    commandToRun = 'MATCH path=shortestPath( \
                    (:Page {id: %s})-[:BELONGS_TO*0..10]->(:Page {id: 7345184})) \
                    UNWIND nodes(path) AS pages \
                    RETURN pages.title, pages.id, ID(pages)' % (inputNode)

    return pd.DataFrame(graph.run(commandToRun).data())

##### Return similarity value between two categories as defined by Biuk-Aghai & Cheang (2011)

In [22]:
# Take as input tuple containing depth of category to compare to as well as intersection of children between two categories

'''
Given parent category p and child category c,
and given a root category node r, we calculate the category similarity
Sp;c as: Sp;c = Dc - Cp;c / k , where Dc is the depth of category
c in the category graph, i.e. the shortest distance from the root
category node r; Cp;c is the number of co-assigned articles of categories
p and c; and k is a constant that is empirically determined.
Through experimentation we have found that a value of k = 2 produces
the best results, i.e. results that agree with human intuition as
to similarity of a given pair of categories. A smaller value of Sp;c
indicates a greater similarity (i.e. a smaller distance between the
nodes). The number of co-assigned articles Cp;c of parent category
p and child category c is simply the cardinality of the intersection
of their assigned article sets: Cp;c = jAp \ Acj, where Ap and Ac
are the sets of articles assigned to categories p and c, respectively.
'''
# Depth is calcualted for parent when going bottom-up in graph
# C is calculated using intersection of both child articles and sub-categories

def similarityBAC(a):
    d = a[0]
    c = a[1]
    k = 2
    return d - (c/k)

##### Return dataframe containing all parent categories of category a and similarity statistics to each as well as parent depth to MTC

In [23]:
# Give wikipedia id as integer as function argument (e.g. cateogry "Finland" = 693995)
def parentSimilarities(a):
    parents = parentCategories(a)
    children = childPages(a)
    
    if len(parents) == 0:
        raise ValueError("Category processed does not have parents (likely input category to chosenPathUpToMTC() if called)")
    
    # Create columns with similarity stats using functions similarityStats
    parents["similarities"] = parents["pages.id"].apply(lambda x: similarityStats(set(children["pages.id"]), set(childPages(x)["pages.id"])))
    parents[["intersection", "union", "jaccard"]] = pd.DataFrame(parents['similarities'].tolist(), index = parents.index)
    parents.drop(["similarities"], axis = 1, inplace = True)
        
    # Add column with parent category depth (steps to Main_topics_classifications node)
    parents["depth"] = parents["pages.id"].apply(lambda x: len(shortestPathToMTC(x))-1)
    
    # Add column with similarityBAC
    parents["similarityBAC-aid"] = list(zip(parents["depth"], parents["intersection"]))
    parents["similarityBAC"] = parents["similarityBAC-aid"].apply(lambda x: similarityBAC(x))
    parents.drop(["similarityBAC-aid"], axis = 1, inplace = True)
    
    # Sort ascending
    parents.sort_values(by = "similarityBAC", ascending = True, inplace = True)
    parents.reset_index(drop = True, inplace = True)
    
    return parents

##### Return node based on neo4j database ID [NOTE: not same as wikipedia ID used elsewhere]

In [24]:
def getWithNeoID(a):
    return NodeMatcher(graph).get(a)

##### Return dataframe containing info of what category to choose from parentSimilarities() output

In [25]:
'''
we choose which parent link to keep according to
following rules: (1) Choose the parent whose similarity value Sp;c
is lower; (2) If Sp1;c = Sp2;c, choose the parent whose depth D is
lower; (3) If Dp1 = Dp2, choose the parent with the larger value
of Cp;c; (4) If Cp1;c = Cp2;c, choose the parent with the lower
page ID.
'''
# Takes parentSimilarities() / or potentially child similarities output dataframe as input
def chooseCategoryPath(a):
    a.sort_values(by = ["similarityBAC", "depth"], ascending = True, inplace = True)
    a["mostSimilar"] = "False"
    a["comment"] = ""
    
    # Set value for mostSimilar to "Not connected" for rows with depth = -1 i.e. no connection to MTC
    a.loc[a["depth"] == -1, "comment"] = "Not connected"
    
    # Set value for mostSimilar to "True" for rows that are not "Not connected" and that have the minimum value of similarityBAC
    workingDF = a.loc[a["comment"] != "Not connected"]
    selectedIndexes = workingDF.loc[workingDF["similarityBAC"] == workingDF["similarityBAC"].min()].index
    
    a.loc[selectedIndexes, "mostSimilar"] = "True"
    a.loc[selectedIndexes, "comment"] = "Lowest similarityBAC"
    
    workingDF = a.loc[a["mostSimilar"] == "True"]
    
    if len(workingDF) > 1:
        # Set all mostSimilar of partial dataframe and output back to False, then set min depth rows to true
        workingDF["mostSimilar"] = "False"
        a["mostSimilar"] = "False"
        selectedIndexes = workingDF.loc[workingDF["depth"] == workingDF["depth"].min()].index
        
        a.loc[selectedIndexes, "mostSimilar"] = "True"
        a.loc[selectedIndexes, "comment"] = a.loc[selectedIndexes, "comment"] + "; Lowest depth"
        
        workingDF = a.loc[a["mostSimilar"] == "True"]
        
        # If several rows now set to true, test for highest intersection
        if len(workingDF) > 1:
            # Set all mostSimilar of partial dataframe and output back to False, then set max intersection rows to true
            workingDF["mostSimilar"] = "False"
            a["mostSimilar"] = "False"
            selectedIndexes = workingDF.loc[workingDF["intersection"] == workingDF["intersection"].max()].index
            
            a.loc[selectedIndexes, "mostSimilar"] = "True"
            a.loc[selectedIndexes, "comment"] = a.loc[selectedIndexes, "comment"] + "; Highest intersection"
            
            workingDF = a.loc[a["mostSimilar"] == "True"]
            
            # If several rows now set to true, choose row with lowes pages.id
            if len(workingDF) > 1:
                # Set all mostSimilar of partial dataframe and output back to False, then set min wikipedia id row (only one) to true
                workingDF["mostSimilar"] = "False"
                a["mostSimilar"] = "False"
                selectedIndexes = workingDF.loc[workingDF["pages.id"] == workingDF["pages.id"].min()].index
                
                a.loc[selectedIndexes, "mostSimilar"] = "True"
                a.loc[selectedIndexes, "comment"] = a.loc[selectedIndexes, "comment"] + "; Lowest wikipedia id"
    
    
    return a

##### Return dataframe containing info of chosen path to MTC (iterates chooseCategoryPath() upwards)

In [26]:
# Iterate chooseCategoryPath() from input category (wikipedia id as input) until MTC is reached. Return dataframe with chosen path rows
# Root node category "Main_topic_classifications" has pages.id = 7345184

# NOTE: Error if input category does not have parents
def chosenPathUpToMTC(a):
    mtcFound = False
    nextStep = a
    chosenPath = pd.DataFrame()
    
    while(not mtcFound):
        allParents = parentSimilarities(nextStep)
        allParents = chooseCategoryPath(allParents)
        
        # If allParents contains MTC category
        if(len(allParents.loc[allParents["pages.id"] == 7345184]) == 1):
            rowToAppend = allParents.loc[allParents["pages.id"] == 7345184]
            mtcFound = True
        else:
            rowToAppend = allParents.loc[allParents["mostSimilar"] == "True"]
            nextStep = int(allParents.loc[allParents["mostSimilar"] == "True", "pages.id"])
        
        chosenPath = chosenPath.append(rowToAppend)
        chosenPath.reset_index(drop = True, inplace = True)
    
    
    return chosenPath

##### Article strength calculations

In [27]:
# Return dataframe with all pages linking to or from input page
def linksBetween(a):
    wikiID = a
    commandToRun = 'MATCH (pages:Page) \
                -[:LINKS_TO]- \
                (:Page {id: %s}) \
                RETURN pages.title, pages.id' % (wikiID)

    # ensure that a dataframe with correct columns is returnd also when query is empty
    out = pd.DataFrame(columns = ["pages.title", "pages.id"])
    out = out.append(graph.run(commandToRun).to_data_frame())
    
    return out


In [28]:
# a as pages.id for artice, c as pages.id for parent category
def articleClassificationStrength(a, c):
    aLinks = set(linksBetween(a)["pages.id"])
    cChildren = set(childPages(c)["pages.id"])
    
    intersectionSize = len(aLinks.intersection(cChildren))
    
    return 1 + intersectionSize

In [29]:
def strongestArticleParents(a):
    parents = parentCategories(a)
    parents["depth"] = parents["pages.id"].apply(lambda x: len(shortestPathToMTC(x)) -1 )
    parents.loc[parents["depth"] != -1 , "Strength"] = parents["pages.id"].apply(lambda x: articleClassificationStrength(a, x))
    parents.sort_values(by = ["Strength"], ascending = False, inplace = True)
   
    
    return parents

In [30]:
def chosenPathArticleToMTC(a):
    strongestParent = strongestArticleParents(a)
    path = chosenPathUpToMTC(strongestParent.iloc[0,1])
    
    path.loc[-1] = strongestParent.iloc[0, :3]
    path.sort_index(inplace = True)
    path.reset_index(drop = True, inplace = True)
    
    return path

# Test - Run "question to wikipedia category" analyses

In [None]:
# Fix functionality in findQuestionCategories
    # Save error messages
# Write batch processing


In [112]:
# Take list with search terms
# Run getWikipediaInfo() until search term works
# If result is False (search term not found) or "Database call not successful (error)" (search term found but path to MTC not available)
    # --> continue to next search term
# Return [search term used, result]

def findQuestionCategories(a):
    result = []
    for term in a:
        termResult = getWikipediaInfo(term)
        possibleFailureMessages = ('Search term not found', 'Database call timed out (' + str(maxSearchTime) + ' seconds)', 'Database call not successful (error)')
        
        result.append((term, termResult))
        if termResult not in possibleFailureMessages:
            break
    
    if len(result) == 0:
        result.append(("NO SEARCH TERMS GIVEN"))
            
    return result
       
        

In [101]:
%%time
a = findQuestionCategories(["aslökdfhjaölskjdhf", "sanna marin", "Ben Hur"])

Wall time: 50.7 s


In [111]:
a[-1][1]

['Ben-Hur:_A_Tale_of_the_Christ',
 1343296,
                   pages.title  pages.id  intersection  union  jaccard  depth  \
 0                     Ben-Hur  45418958           NaN    NaN      NaN      3   
 1            Media_franchises   1170420           0.0  706.0      0.0      2   
 2               Entertainment    693016           0.0  826.0      0.0      1   
 3  Main_topic_classifications   7345184           0.0  148.0      0.0      0   
 
    similarityBAC mostSimilar               comment  
 0            NaN         NaN                   NaN  
 1            2.0        True  Lowest similarityBAC  
 2            1.0        True  Lowest similarityBAC  
 3            0.0       False                        ,
                                                     pages.title  pages.id
 0                                       Harper_&_Brothers_books  37833610
 1                            American_novels_adapted_into_films  28078409
 2              Pages_using_citations_with_accessdate

In [None]:
# Split successful search from non-succesfull (two columns)
# Split successful into separate columns (search term, article title, articleID, path, parents)

In [32]:
def explodeCatData(a, b):
    if len(a) >= b+1:
        return a[b]

In [42]:
use = t_data.iloc[9::600]

In [43]:
len(use)

108

In [44]:
%%time
# Run time for 7 rows: 38,2 seconds
# Run time for 65 rows: 7min 31s
# Run time for 108 rows: 13min 49s
use["wikipediaOutput"] = use["searchTerms"].apply(lambda x: findQuestionCategories(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  workingDF["mostSimilar"] = "False"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  workingDF["mostSimilar"] = "False"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  workingDF["mostSimilar"] = "False"


Wall time: 13min 49s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [45]:
use["searchTermUsed"] = use["wikipediaOutput"].apply(lambda x: explodeCatData(x, 0))
use["wikipediaArticleTitle"] = use["wikipediaOutput"].apply(lambda x: explodeCatData(x, 1))
use["wikipediaArticleID"] = use["wikipediaOutput"].apply(lambda x: explodeCatData(x, 2))
use["wikipediaCategoryInfo"] = use["wikipediaOutput"].apply(lambda x: explodeCatData(x, 3))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  use["searchTermUsed"] = use["wikipediaOutput"].apply(lambda x: explodeCatData(x, 0))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  use["wikipediaArticleTitle"] = use["wikipediaOutput"].apply(lambda x: explodeCatData(x, 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  use["wikipediaArticleID"] = 

In [46]:
use["nextToMtcCategory"] = use["wikipediaCategoryInfo"].apply(lambda x: x.iloc[-2,0] if x is not None else None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  use["impliedCategory"] = use["wikipediaCategoryInfo"].apply(lambda x: x.iloc[-2,0] if x is not None else None)


In [47]:
use.drop(columns=["wikipediaOutput"], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [48]:
use.to_pickle("../workproduct-files/TEST-categorySearch.pkl")

In [54]:
use.columns

Index(['CONS_id', 'CONS_question', 'CONS_answer', 'CONS_alt answers',
       'CONS_category', 'CONS_alt categories - NOT USED',
       'CONS_type-formulation', 'CONS_type-multipleChoice', 'ORIG_id',
       'ORIG_question', 'ORIG_answer', 'ORIG_alt answers', 'ORIG_category',
       'ORIG_alt categories', 'ORIG_difficulty', 'ORIG_type', 'Source',
       'Duplicate_removed', 'namedEntities', 'nouns', 'searchTerms',
       'searchTermUsed', 'wikipediaArticleTitle', 'wikipediaArticleID',
       'wikipediaCategoryInfo', 'impliedCategory'],
      dtype='object')

In [61]:
#use.loc[:, ["CONS_question", "CONS_answer", "CONS_category", "impliedCategory"]]
use.loc[:,["CONS_question", "CONS_answer", "CONS_category", "impliedCategory", 'namedEntities', "searchTerms", 'searchTermUsed', 'wikipediaCategoryInfo']]

Unnamed: 0,CONS_question,CONS_answer,CONS_category,impliedCategory,namedEntities,searchTerms,searchTermUsed,wikipediaCategoryInfo
9,"A European style developed in France in the late eleventh century. Its sculpture is ornamental, stylized and complex.",Romanesque,Art and literature,Arts,"[(European, NORP), (France, GPE), (the late eleventh century, DATE)]","[sculpture, complex, century, style]",sculpture,pages.title pages.id intersection union jaccard depth \ 0 Sculpture_techniques 2380541 NaN NaN NaN 4 1 Artistic_techniques_by_art 50520052 0.0 57.0 0.0 3 2 Artistic_techniques 891248 0.0 183.0 0.0 2 3 Arts 4892515 0.0 270.0 0.0 1 4 Main_topic_classifications 7345184 0.0 116.0 0.0 0 similarityBAC mostSimilar comment 0 NaN NaN NaN 1 3.0 True Lowest similarityBAC 2 2.0 True Lowest similarityBAC 3 1.0 True Lowest similarityBAC 4 0.0 False
609,In this 1968 film the husband of an unsuspecting young wife becomes involved with a witch's coven.,Rosemary's Baby,Entertainment,Religion,"[(1968, DATE)]","[coven, witch, husband, wife, film]",coven,pages.title pages.id intersection union jaccard \ 0 Witchcraft 774158 NaN NaN NaN 1 Magic_(paranormal) 16567661 17.0 344.0 0.049419 2 Occult 956011 6.0 228.0 0.026316 3 Esotericism 2255400 21.0 177.0 0.118644 4 Spiritualism 7114272 4.0 152.0 0.026316 5 Religious_belief_and_doctrine 20813658 1.0 125.0 0.008000 6 Religion 692694 0.0 161.0 0.000000 7 Main_topic_classifications 7345184 0.0 110.0 0.000000 depth similarityBAC mostSimilar comment 0 4 NaN NaN NaN 1 3 -5.5 True Lowest similarityBAC 2 3 0.0 True Lowest similarityBAC 3 ...
1209,Who's first release was 'Talking Heads 77'?,Psycho Killer,Entertainment,,"[(first, ORDINAL)]",[s first release was ],NO WIKIPEDIA LINK FOUND,
1809,What Is The Most Expensive Spice?,Saffron,Sports and leisure,,[],[spice],NO WIKIPEDIA LINK FOUND,
2409,In which ocean or sea are the Seychelles?,Indian Ocean,Geography and places,Nature,[],"[ocean, sea]",ocean,pages.title pages.id intersection union jaccard depth \ 0 Oceanography 716910 NaN NaN NaN 4 1 Physical_geography 2153059 5.0 442.0 0.011312 3 2 Earth_sciences 716903 11.0 256.0 0.042969 3 3 Earth 754965 2.0 182.0 0.010989 2 4 Nature 696603 0.0 102.0 0.000000 1 5 Main_topic_classifications 7345184 1.0 57.0 0.017544 0 similarityBAC mostSimilar comment 0 NaN NaN NaN 1 0.5 True Lowest similarityBAC; Lowest depth 2 -2.5 True Lowest similarityBAC 3 1.0 True Lowest similarityBAC 4 1.0 True Lowest similarityBAC 5 -0.5 True ...
3009,What is the basic unit of currency for Saudi Arabia?,Riyal,Geography and places,Society,"[(Saudi Arabia, GPE)]","[currency, unit]",currency,pages.title pages.id intersection union jaccard depth \ 0 Foreign_exchange_market 1655020 NaN NaN NaN 6 1 International_finance 5138124 9.0 209.0 0.043062 5 2 International_business 3236392 2.0 126.0 0.015873 4 3 Business 771152 0.0 93.0 0.000000 3 4 Economy 47397287 0.0 67.0 0.000000 2 5 Society 1633936 0.0 60.0 0.000000 1 6 Main_topic_classifications 7345184 0.0 62.0 0.000000 0 similarityBAC mostSimilar \ 0 NaN NaN 1 0.5 True 2 3.0 True 3 3.0 True 4 2.0 True 5 1.0 True 6 0.0 True comment 0 ...
3609,Which element makes up 2.83% of the earth's crust?,Sodium,Geography and places,Nature,"[(2.83%, PERCENT), (earth, LOC)]","[earth, crust, element, earth]",earth,pages.title pages.id intersection union jaccard depth \ 0 Earth 754965 NaN NaN NaN 2 1 Nature 696603 0.0 102.0 0.000000 1 2 Main_topic_classifications 7345184 1.0 57.0 0.017544 0 similarityBAC mostSimilar comment 0 NaN NaN NaN 1 1.0 True Lowest similarityBAC 2 -0.5 True Lowest similarityBAC
4209,In which 1947 Christmas film do a lovestruck couple pretend that the snowman is Parson Brow?,Winter Wonderland,History and society,Humanities,"[(1947, DATE), (Christmas, DATE), (Parson Brow, PERSON)]","[Parson Brow, snowman, pretend, couple, film]",snowman,pages.title pages.id intersection union jaccard depth \ 0 Christmas_characters 1051624 NaN NaN NaN 5 1 Holiday_characters 9651111 5.0 104.0 0.048077 4 2 Legendary_creatures 706923 0.0 65.0 0.000000 3 3 Folklore 728925 2.0 222.0 0.009009 2 4 Traditional_stories 694908 13.0 227.0 0.057269 3 5 Traditions 696248 2.0 128.0 0.015625 2 6 Cultural_anthropology 715355 6.0 242.0 0.024793 2 7 Anthropology 694863 12.0 545.0 0.022018 2 8 Humanities 1004110 2.0 477.0 0.004193 1 9 Main_topic_classifications 7345184 6.0 100.0 0.060000 0 similarityBAC mostSimilar comment 0 NaN NaN ...
4809,What type of animal is a caribou?,A Reindeer,History and society,Nature,[],"[caribou, animal, type]",caribou,pages.title pages.id intersection union jaccard \ 0 IUCN_Red_List_vulnerable_species 19281764 NaN NaN NaN 1 Vulnerable_species 7899403 0.0 8305.0 0.000000 2 Biota_by_conservation_status 19281484 1.0 45.0 0.022222 3 Environmental_conservation 35366151 9.0 156.0 0.057692 4 Conservation 737124 0.0 140.0 0.000000 5 Ecology 691262 0.0 364.0 0.000000 6 Natural_environment 3103170 1.0 374.0 0.002674 7 Nature 696603 1.0 54.0 0.018519 8 Main_topic_classifications 7345184 1.0 57.0 0.017544 depth similarityBAC mostSimilar \ 0 6 NaN NaN 1 5 5.0 True 2 4 3.5 True 3 5 0.5 ...
5409,"Who Described Russia As (A Riddle Wrapped In A Mystery, Inside An Enigma)?",Winston Churchill,History and society,Entertainment,"[(Russia, GPE), (Enigma, PRODUCT)]","[enigma, riddle, mystery]",enigma,pages.title pages.id intersection union jaccard depth \ 0 Riddles 3884134 NaN NaN NaN 5 1 Puzzles 1013641 0.0 186.0 0.000000 4 2 Toys_by_type 28596601 0.0 157.0 0.000000 3 3 Toys 778323 1.0 53.0 0.018868 2 4 Entertainment 693016 0.0 150.0 0.000000 1 5 Main_topic_classifications 7345184 0.0 148.0 0.000000 0 similarityBAC mostSimilar comment 0 NaN NaN NaN 1 4.0 True Lowest similarityBAC 2 3.0 True Lowest similarityBAC 3 1.5 True Lowest similarityBAC 4 1.0 True Lowest similarityBAC 5 0.0 False


In [81]:
articleID = categoryByTitle("Dallas_Stars")
childPages(articleID.iloc[0,1])

Unnamed: 0,pages.title,pages.id
0,Dallas_Stars_minor_league_affiliates,44888329
1,Dallas_Stars_draft_picks,6786253
2,Minnesota_North_Stars,1291349
3,Dallas_Stars_templates,43251887
4,Dallas_Stars,73135
5,Dallas_Stars_broadcasters,24358568
6,Dallas_Stars_seasons,8599536
7,Dallas_Stars_lists,44913547
8,Dallas_Stars_personnel,40381465
9,Paul_Fioroni,40260405


In [91]:
%%time
getWikipediaInfo("Tutankhamun")

Wall time: 5.59 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  workingDF["mostSimilar"] = "False"


['Tutankhamun',
 30437,
                    pages.title  pages.id  intersection  union   jaccard  \
 0                Amarna_Period   2285457           NaN    NaN       NaN   
 1  Eighteenth_Dynasty_of_Egypt  35203613           0.0   94.0  0.000000   
 2   Dynasties_of_ancient_Egypt   7869122           1.0   98.0  0.010204   
 3                Ancient_Egypt    722461           1.0  161.0  0.006211   
 4                Civilizations   3255048           0.0  148.0  0.000000   
 5        Cultural_anthropology    715355           2.0  222.0  0.009009   
 6                 Anthropology    694863          12.0  545.0  0.022018   
 7                   Humanities   1004110           2.0  477.0  0.004193   
 8   Main_topic_classifications   7345184           6.0  100.0  0.060000   
 
    depth  similarityBAC mostSimilar  \
 0      7            NaN         NaN   
 1      6            6.0        True   
 2      5            4.5        True   
 3      4            3.5        True   
 4      3     

In [78]:
articleByTitle("Dallas_Stars")

Unnamed: 0,articles.title,articles.id,ID(articles)
0,Dallas_Stars,73135,199548


In [86]:
categoryByTitle("New_York")

Unnamed: 0,categories.title,categories.id,ID(categories)
0,New_York,54615728,7336710


In [96]:
chosenPathUpToMTC(19022484)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  workingDF["mostSimilar"] = "False"


Unnamed: 0,pages.title,pages.id,intersection,union,jaccard,depth,similarityBAC,mostSimilar,comment
0,States_by_power_status,12318463,4,82,0.04878,3,1.0,True,Lowest similarityBAC
1,Independence,40105099,6,58,0.103448,3,0.0,True,Lowest similarityBAC; Lowest depth
2,Sovereignty,33542662,8,118,0.067797,4,0.0,True,Lowest similarityBAC
3,International_law,700660,15,418,0.035885,3,-4.5,True,Lowest similarityBAC
4,International_relations,918476,10,568,0.017606,4,-1.0,True,Lowest similarityBAC
5,Foreign_policy,19025423,11,280,0.039286,4,-1.5,True,Lowest similarityBAC
6,Public_policy,19025271,3,122,0.02459,3,1.5,True,Lowest similarityBAC
7,Government,697609,3,231,0.012987,2,0.5,True,Lowest similarityBAC; Lowest depth
8,Politics,695027,3,234,0.012821,1,-0.5,True,Lowest similarityBAC
9,Main_topic_classifications,7345184,0,115,0.0,0,0.0,False,


In [93]:
parentCategories(722461)

Unnamed: 0,pages.title,pages.id
0,Superpowers,19022484
1,Ancient_Africa,41255972
2,Iron_Age_countries,55467935
3,Civilizations,3255048
4,History_of_Egypt_by_period,9444719
5,Bronze_Age_countries,55467948
6,African_civilizations,11277046
7,Ancient_history_of_North_Africa,48859080
8,Ancient_history_by_country,10803059


In [94]:
parentSimilarities(722461)

Unnamed: 0,pages.title,pages.id,intersection,union,jaccard,depth,similarityBAC
0,Civilizations,3255048,0,148,0.0,3,3.0
1,Superpowers,19022484,1,122,0.008197,4,3.5
2,History_of_Egypt_by_period,9444719,1,108,0.009259,4,3.5
3,African_civilizations,11277046,1,116,0.008621,4,3.5
4,Ancient_history_by_country,10803059,0,141,0.0,4,4.0
5,Ancient_Africa,41255972,0,96,0.0,5,5.0
6,Iron_Age_countries,55467935,0,88,0.0,5,5.0
7,Bronze_Age_countries,55467948,0,88,0.0,5,5.0
8,Ancient_history_of_North_Africa,48859080,0,101,0.0,5,5.0
