In [1]:
## Sequence
# Build database from file
# Add isCategory property
# Add Exclude label based on matching lists
# Drop nodes marked for exclusion
# Add Exclude-Belongs_to label to unwanted rels (that shouldn't be deleted totally) (e.g. to node "People")
# Remove Belongs_to from above mentioned rels

In [2]:
# Next:
# Clean non-wanted nodes from graph (container categories etc.)
#    Try: https://neo4j.com/docs/graph-algorithms/current/projected-graph-model/cypher-projection/
# Write custom similarity calculation function
# Write iterator function

In [3]:
# NOTE
# cypher.forbid_exhaustive_shortestpath=true set in neo4j conf file
# https://neo4j.com/docs/operations-manual/current/configuration/neo4j-conf/

In [4]:
from py2neo import *
import pandas as pd

In [5]:
# Raise the pandas display limits so query results are shown without truncation
for optionName, optionValue in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.max_colwidth", 1000),
):
    pd.set_option(optionName, optionValue)

In [6]:
# Database handle shared by every query helper in this notebook. No arguments
# are passed, so py2neo falls back to its default connection settings.
graph = Graph()

## FUNCTIONS

##### Return similarity statistics for two sets (intersection, union, Jaccard coefficient)

In [7]:
def similarityStats(a, b):
    """Compute overlap statistics for two sets.

    Parameters
    ----------
    a, b : set
        Sets to compare (the notebook passes sets of wikipedia page ids).

    Returns
    -------
    tuple
        ``(intersection size, union size, Jaccard coefficient)``. When both
        sets are empty the union is empty and the ratio is undefined; the
        coefficient is then reported as ``0.0``.
    """
    intSize = len(a.intersection(b))
    unionSize = len(a.union(b))
    # Guard the empty-union case instead of raising ZeroDivisionError
    jaccard = intSize / unionSize if unionSize else 0.0
    return (intSize, unionSize, jaccard)

##### Return identifying information of parent categories of chosen article or category as pandas dataframe

In [8]:
def parentCategories(a):
    """Look up the categories that page ``a`` belongs to.

    ``a`` is the wikipedia id of an article or category, given as an integer
    (e.g. category "Finland" = 693995). The query result is returned as a
    pandas dataframe with the title and wikipedia id of every parent category.
    """
    cypherQuery = 'MATCH (pages:Category:Page) \
                <-[:BELONGS_TO]- \
                (:Page {id: %s}) \
                RETURN pages.title, pages.id' % (a)
    queryResult = graph.run(cypherQuery)
    return queryResult.to_data_frame()


##### Return identifying information of children (both category and article) of chosen category as pandas dataframe

In [9]:
def childPages(a):
    """Look up the pages (articles and categories) that belong to category ``a``.

    ``a`` is the wikipedia id of the category, given as an integer
    (e.g. category "Finland" = 693995). The query result is returned as a
    pandas dataframe with the title and wikipedia id of every child page.
    """
    cypherQuery = 'MATCH (pages:Page) \
                -[:BELONGS_TO]-> \
                (:Page {id: %s}) \
                RETURN pages.title, pages.id' % (a)
    queryResult = graph.run(cypherQuery)
    return queryResult.to_data_frame()

##### Return dataframe with all articles (but not categories) with given title [only one result is expected]

In [10]:
def articleByTitle(a):
    """Find article pages (pages without the Category label) by title.

    Parameters
    ----------
    a : str
        Wikipedia title to look up.

    Returns
    -------
    pandas.DataFrame
        One row per match with the columns ``articles.title``,
        ``articles.id`` (wikipedia id) and ``ID(articles)`` (neo4j id).
        Only one match is expected.
    """
    # The title travels as a query parameter instead of being formatted into
    # the Cypher text, so titles containing quotes or backslashes are matched
    # literally and cannot break (or inject into) the statement.
    commandToRun = ('MATCH (articles:Page {title: $title}) '
                    'WHERE NOT articles:Category '
                    'RETURN articles.title, articles.id, ID(articles)')
    return graph.run(commandToRun, {"title": a}).to_data_frame()

##### Return dataframe with all categories (but not articles) with given title [only one result is expected]

In [11]:
def categoryByTitle(a):
    """Find category pages by title.

    Parameters
    ----------
    a : str
        Wikipedia title to look up.

    Returns
    -------
    pandas.DataFrame
        One row per match with the columns ``categories.title``,
        ``categories.id`` (wikipedia id) and ``ID(categories)`` (neo4j id).
        Only one match is expected.
    """
    # The title travels as a query parameter instead of being formatted into
    # the Cypher text, so titles containing quotes or backslashes are matched
    # literally and cannot break (or inject into) the statement.
    commandToRun = ('MATCH (categories:Category:Page {title: $title}) '
                    'RETURN categories.title, categories.id, ID(categories)')
    return graph.run(commandToRun, {"title": a}).to_data_frame()

##### Return dataframe containing shortest path between input node (article or category) and the Main_topic_classifications category

In [12]:
def shortestPathToMTC(a, root=7345184, maxDepth=10):
    """Return the shortest BELONGS_TO path from page ``a`` up to a root category.

    Parameters
    ----------
    a : int
        Wikipedia id of the start page (article or category).
    root : int, optional
        Wikipedia id of the target category. Defaults to 7345184, the
        Main_topic_classifications category node.
    maxDepth : int, optional
        Upper bound on the number of BELONGS_TO hops searched (default 10).

    Returns
    -------
    pandas.DataFrame
        One row per node on the path (``pages.title``, ``pages.id``,
        ``ID(pages)``), built from the query's records. Callers in this
        notebook use ``len(result) - 1`` as the depth, so a result without
        rows yields depth -1 ("not connected").
    """
    commandToRun = ('MATCH path=shortestPath('
                    '(:Page {id: %s})-[:BELONGS_TO*0..%s]->(:Page {id: %s})) '
                    'UNWIND nodes(path) AS pages '
                    'RETURN pages.title, pages.id, ID(pages)' % (a, maxDepth, root))

    return pd.DataFrame(graph.run(commandToRun).data())

##### Return similarity value between two categories as defined by Biuk-Aghai & Cheang (2011)

In [13]:
# Take as input tuple containing depth of category to compare to as well as intersection of children between two categories

'''
Given parent category p and child category c,
and given a root category node r, we calculate the category similarity
S_{p,c} as: S_{p,c} = D_c - C_{p,c} / k, where D_c is the depth of category
c in the category graph, i.e. the shortest distance from the root
category node r; C_{p,c} is the number of co-assigned articles of categories
p and c; and k is a constant that is empirically determined.
Through experimentation we have found that a value of k = 2 produces
the best results, i.e. results that agree with human intuition as
to similarity of a given pair of categories. A smaller value of S_{p,c}
indicates a greater similarity (i.e. a smaller distance between the
nodes). The number of co-assigned articles C_{p,c} of parent category
p and child category c is simply the cardinality of the intersection
of their assigned article sets: C_{p,c} = |A_p ∩ A_c|, where A_p and A_c
are the sets of articles assigned to categories p and c, respectively.
'''
# Depth is calculated for the parent when going bottom-up in the graph
# C is calculated using intersection of both child articles and sub-categories

def similarityBAC(a, k=2):
    """Category similarity as defined by Biuk-Aghai & Cheang (2011).

    Parameters
    ----------
    a : tuple
        ``(depth, co-assigned count)``: the depth of the category compared
        against and the size of the child intersection of the two categories.
    k : number, optional
        Empirical scaling constant of the formula; defaults to 2.

    Returns
    -------
    float
        ``depth - co-assigned count / k``. Smaller values mean greater
        similarity.
    """
    depth, coAssigned = a[0], a[1]
    return depth - (coAssigned / k)

##### Return dataframe containing all parent categories of category a and similarity statistics to each as well as parent depth to MTC

In [14]:
def parentSimilarities(a):
    """Score every parent category of page ``a`` against ``a`` itself.

    Parameters
    ----------
    a : int
        Wikipedia id of the page (e.g. category "Finland" = 693995).

    Returns
    -------
    pandas.DataFrame
        One row per parent category with the columns ``pages.title``,
        ``pages.id``, ``intersection``, ``union``, ``jaccard`` (child-set
        overlap between ``a`` and the parent), ``depth`` (steps from the parent
        to the Main_topic_classifications node, -1 when no path was found) and
        ``similarityBAC``. Rows are sorted by ascending ``similarityBAC`` with
        a fresh 0..n-1 index. A frame with these columns and no rows is
        returned when ``a`` has no parent categories.
    """
    resultColumns = ["pages.title", "pages.id", "intersection", "union",
                     "jaccard", "depth", "similarityBAC"]

    def idSet(frame):
        # A query without result rows may come back as a frame without columns
        return set(frame["pages.id"]) if "pages.id" in frame.columns else set()

    parents = parentCategories(a)
    if parents.empty or "pages.id" not in parents.columns:
        return pd.DataFrame(columns=resultColumns)

    ownChildren = idSet(childPages(a))
    parentIds = parents["pages.id"].tolist()

    # Overlap statistics between the children of `a` and of each parent
    stats = pd.DataFrame(
        [similarityStats(ownChildren, idSet(childPages(pid))) for pid in parentIds],
        columns=["intersection", "union", "jaccard"],
        index=parents.index,
    )
    parents[["intersection", "union", "jaccard"]] = stats

    # Parent category depth (steps to the Main_topic_classifications node)
    parents["depth"] = [len(shortestPathToMTC(pid)) - 1 for pid in parentIds]

    # similarityBAC is computed from (parent depth, intersection size)
    parents["similarityBAC"] = [
        similarityBAC(pair)
        for pair in zip(parents["depth"], parents["intersection"])
    ]

    # Most similar (lowest value) first
    return parents.sort_values(by="similarityBAC", ascending=True).reset_index(drop=True)

##### Return node based on neo4j database ID [NOTE: not same as wikipedia ID used elsewhere]

In [15]:
def getWithNeoID(a):
    """Fetch the node whose neo4j database id is ``a`` (not the wikipedia id)."""
    matcher = NodeMatcher(graph)
    return matcher.get(a)

##### Return dataframe containing info of what category to choose from parentSimilarities() output

In [16]:
'''
we choose which parent link to keep according to
following rules: (1) Choose the parent whose similarity value S_{p,c}
is lower; (2) If S_{p1,c} = S_{p2,c}, choose the parent whose depth D is
lower; (3) If D_{p1} = D_{p2}, choose the parent with the larger value
of C_{p,c}; (4) If C_{p1,c} = C_{p2,c}, choose the parent with the lower
page ID.
'''
def chooseCategoryPath(a):
    """Mark which row of a parentSimilarities() frame should be followed.

    Parameters
    ----------
    a : pandas.DataFrame
        Output of parentSimilarities() (or a frame with the same columns:
        ``similarityBAC``, ``depth``, ``intersection``, ``pages.id``).
        The frame is modified in place: it is re-sorted by
        ``similarityBAC`` then ``depth`` and gains two columns.

    Returns
    -------
    pandas.DataFrame
        The same frame ``a`` with

        * ``mostSimilar``: the string "True" for the chosen row(s), "False"
          otherwise (strings, not booleans);
        * ``comment``: "Not connected" for rows with depth -1, otherwise the
          "; "-joined list of selection rules the row survived.

    Rows with depth -1 are never chosen. Among the rest, candidates are
    narrowed by lowest similarityBAC, then lowest depth, then highest
    intersection, then lowest wikipedia id; each later rule is applied only
    while more than one candidate remains.
    """
    a.sort_values(by=["similarityBAC", "depth"], ascending=True, inplace=True)
    a["mostSimilar"] = "False"
    a["comment"] = ""

    # depth == -1 means no connection to MTC; such rows are excluded up front
    a.loc[a["depth"] == -1, "comment"] = "Not connected"
    candidates = a.index[a["comment"] != "Not connected"]

    # (column, keep lowest value?, text recorded in the comment column)
    selectionRules = [
        ("similarityBAC", True, "Lowest similarityBAC"),
        ("depth", True, "Lowest depth"),
        ("intersection", False, "Highest intersection"),
        ("pages.id", True, "Lowest wikipedia id"),
    ]

    for step, (column, keepLowest, note) in enumerate(selectionRules):
        # The first rule always applies; later ones only break remaining ties
        if step > 0 and len(candidates) <= 1:
            break

        values = a.loc[candidates, column]
        best = values.min() if keepLowest else values.max()
        candidates = values[values == best].index

        if step == 0:
            a.loc[candidates, "comment"] = note
        else:
            a.loc[candidates, "comment"] = a.loc[candidates, "comment"] + "; " + note

    a.loc[candidates, "mostSimilar"] = "True"

    return a

## Adding node labels and properties

#### Database edits (run commands commented out to prevent accidental runs)

In [17]:
# 1. Add isCategory property to each node (1 / 0)
# 2. Drop nodes to exclude

In [18]:
%%time
# Add pageType property with value "Category" to all category pages
# (nodes carrying both the Category and Page labels)

commandToRun = 'MATCH (pages:Category:Page) \
                SET pages.pageType = "Category"'

# Execution is left commented out so re-running the notebook does not modify the database
#graph.run(commandToRun)

Wall time: 0 ns


In [19]:
%%time
# Add pageType property with value "Article" to all article pages
# (Page nodes that do not carry the Category label)

commandToRun = 'MATCH (pages:Page) \
                WHERE NONE(art IN [pages] WHERE art:Category) \
                SET pages.pageType = "Article"'

# Execution is left commented out so re-running the notebook does not modify the database
#graph.run(commandToRun)

Wall time: 0 ns


In [20]:
%%time
# Remove pageType property from category pages
# NOTE!! Ran out of allocated memory when run on all nodes (cat + art),
# which is why categories and articles are handled in separate statements

commandToRun = "MATCH (pages:Category:Page) \
                REMOVE pages.pageType"

# Execution is left commented out so re-running the notebook does not modify the database
#graph.run(commandToRun)

Wall time: 0 ns


In [21]:
%%time
# Remove pageType property from article pages
# (Page nodes that do not carry the Category label)

commandToRun = "MATCH (pages:Page) \
                WHERE NONE(art IN [pages] WHERE art:Category) \
                REMOVE pages.pageType"

# Execution is left commented out so re-running the notebook does not modify the database
#graph.run(commandToRun)

Wall time: 0 ns


In [22]:
%%time
# Add isCategory property with value 1 to all category pages
# (nodes carrying both the Category and Page labels)

commandToRun = 'MATCH (pages:Category:Page) \
                SET pages.isCategory = 1'

# Execution is left commented out so re-running the notebook does not modify the database
#graph.run(commandToRun)

Wall time: 0 ns


In [23]:
%%time
# Add isCategory property with value 0 to all article pages
# (Page nodes that do not carry the Category label)

commandToRun = 'MATCH (pages:Page) \
                WHERE NONE(art IN [pages] WHERE art:Category) \
                SET pages.isCategory = 0'

# Execution is left commented out so re-running the notebook does not modify the database
#graph.run(commandToRun)

Wall time: 0 ns


In [24]:
%%time
# Add label "Include" to all pages (Wall time: 24min 41s)
# Every Page node gets the label here; a separate statement removes it again
# from the categories that should be excluded.

commandToRun = "MATCH (pages:Page) \
                SET pages:Include"

# Execution is left commented out so re-running the notebook does not modify the database
#graph.run(commandToRun)

Wall time: 0 ns


In [25]:
%%time
# Remove label "Include" from pages included in list of categories to exclude (Wall time: 1min 33s)
# A category matches when its title
#   - starts with 'Wikipedia_', 'List_of', 'All_articles' or 'Articles_',
#   - starts with a digit (0-9), or
#   - contains 'by_year', 'of_the_year' or '_in_'.
# NOTE(review): the same WHERE clause is repeated in the statement that adds
# the "Exclude" label; keep the two lists in sync when editing either one.

commandToRun = "MATCH (pages:Category:Page) \
                WHERE \
                pages.title STARTS WITH 'Wikipedia_' \
                OR pages.title STARTS WITH '1' \
                OR pages.title STARTS WITH '2' \
                OR pages.title STARTS WITH '3' \
                OR pages.title STARTS WITH '4' \
                OR pages.title STARTS WITH '5' \
                OR pages.title STARTS WITH '6' \
                OR pages.title STARTS WITH '7' \
                OR pages.title STARTS WITH '8' \
                OR pages.title STARTS WITH '9' \
                OR pages.title STARTS WITH '0' \
                OR pages.title STARTS WITH 'List_of' \
                OR pages.title STARTS WITH 'All_articles' \
                OR pages.title STARTS WITH 'Articles_' \
                OR pages.title CONTAINS 'by_year' \
                OR pages.title CONTAINS 'of_the_year' \
                OR pages.title CONTAINS '_in_' \
                REMOVE pages:Include"

# Execution is left commented out so re-running the notebook does not modify the database
#graph.run(commandToRun)

Wall time: 0 ns


In [26]:
%%time
# Add label "Exclude" to all pages to exclude (Wall time: 1min 31s)
# A category matches when its title
#   - starts with 'Wikipedia_', 'List_of', 'All_articles' or 'Articles_',
#   - starts with a digit (0-9), or
#   - contains 'by_year', 'of_the_year' or '_in_'.
# NOTE(review): the same WHERE clause is repeated in the statement that removes
# the "Include" label; keep the two lists in sync when editing either one.

commandToRun = "MATCH (pages:Category:Page) \
                WHERE \
                pages.title STARTS WITH 'Wikipedia_' \
                OR pages.title STARTS WITH '1' \
                OR pages.title STARTS WITH '2' \
                OR pages.title STARTS WITH '3' \
                OR pages.title STARTS WITH '4' \
                OR pages.title STARTS WITH '5' \
                OR pages.title STARTS WITH '6' \
                OR pages.title STARTS WITH '7' \
                OR pages.title STARTS WITH '8' \
                OR pages.title STARTS WITH '9' \
                OR pages.title STARTS WITH '0' \
                OR pages.title STARTS WITH 'List_of' \
                OR pages.title STARTS WITH 'All_articles' \
                OR pages.title STARTS WITH 'Articles_' \
                OR pages.title CONTAINS 'by_year' \
                OR pages.title CONTAINS 'of_the_year' \
                OR pages.title CONTAINS '_in_' \
                SET pages:Exclude"

# Execution is left commented out so re-running the notebook does not modify the database
#graph.run(commandToRun)

Wall time: 0 ns


In [27]:
%%time
# Add label "Exclude" to further categories to exclude (Wall time: 6.69 s):
# titles containing '_categories' and the single category
# 'Webarchive_template_wayback_links'

commandToRun = "MATCH (pages:Category:Page) \
                WHERE \
                pages.title CONTAINS '_categories' \
                OR pages.title = 'Webarchive_template_wayback_links' \
                SET pages:Exclude"

# Execution is left commented out so re-running the notebook does not modify the database
#graph.run(commandToRun)

Wall time: 0 ns


In [28]:
%%time
# DETACH and DELETE all nodes with label Exclude, iterate using APOC (Wall time: 39min 21s)
# https://neo4j.com/developer/kb/large-delete-transaction-best-practices-in-neo4j/
# apoc.periodic.iterate is given the node-selecting query, the per-node action
# and a batchSize of 1000; the statement returns the batch and total counts.

commandToRun = "CALL apoc.periodic.iterate('MATCH (pages:Exclude) \
                RETURN pages', \
                'DETACH DELETE pages', \
                {batchSize:1000}) \
                YIELD batches, total \
                RETURN batches, total"

# Destructive: execution is left commented out to prevent accidental runs
#graph.run(commandToRun)


Wall time: 0 ns


## Step B
##### Calculate similarity between two categories (Note: not an article and a category)

In [None]:
# Next: write iteration bottom-up
# Write breadth first traversal top-down

In [36]:
# Look up the article by title and take its wikipedia id
# (row 0, column 1 of the articleByTitle() result = articles.id),
# then list the categories the article belongs to
aID = articleByTitle("The_Legend_of_Zelda").iloc[0,1]
parentCategories(aID)

Unnamed: 0,pages.title,pages.id
0,CS1_Japanese-language_sources_(ja),43966204
1,Video_game_franchises,9251765
2,Action-adventure_games,722690
3,Video_games_produced_by_Shigeru_Miyamoto,36086819
4,CS1_Swedish-language_sources_(sv),43966316
5,CS1_Portuguese-language_sources_(pt),43966290
6,Use_mdy_dates_from_February_2018,56454802
7,Nintendo_franchises,2678276
8,Action-adventure_video_games_by_series,57239507
9,The_Legend_of_Zelda,33893326


In [29]:
# Show the wikipedia id and neo4j id of the "Video_games" category
categoryByTitle('Video_games')

Unnamed: 0,categories.title,categories.id,ID(categories)
0,Video_games,3291649,5849882


In [31]:
# Score the parents of one page and mark which parent link would be kept.
# NOTE(review): 53895843 is a wikipedia id whose title is not shown here.
startCategory = 53895843
a = parentSimilarities(startCategory)
# chooseCategoryPath() modifies `a` in place and returns it, so `test` and `a`
# refer to the same dataframe
test = chooseCategoryPath(a)
test

Unnamed: 0,pages.title,pages.id,intersection,union,jaccard,depth,similarityBAC,mostSimilar,comment
0,Gaming,11608552,0,71,0.0,2,2.0,True,Lowest similarityBAC
1,Video,7513737,0,117,0.0,4,4.0,False,
2,Computing,700305,0,95,0.0,5,5.0,False,


In [33]:
def chosenPathUpToMTC(a, root=7345184):
    """Follow the chosen parent link step by step until the root is reached.

    Starting at ``a``, parentSimilarities() and chooseCategoryPath() are applied
    repeatedly; at each step the parent marked ``mostSimilar == "True"`` becomes
    the next page. The walk ends as soon as the root category appears among
    the parents of the current page.

    Parameters
    ----------
    a : int
        Wikipedia id of the start category.
    root : int, optional
        Wikipedia id of the root category. Defaults to 7345184, the
        "Main_topic_classifications" category.

    Returns
    -------
    pandas.DataFrame
        The chosen row of every step, in walk order, ending with the row of
        the root category; indexed 0..n-1.

    Raises
    ------
    ValueError
        If a step offers no selectable parent, or if the walk returns to a
        page it has already visited (it would otherwise never terminate).
    """
    chosenRows = []
    visited = set()
    nextStep = a

    while True:
        if nextStep in visited:
            raise ValueError("Cycle detected: page %s was already visited" % (nextStep,))
        visited.add(nextStep)

        allParents = chooseCategoryPath(parentSimilarities(nextStep))

        # Stop as soon as the root is a direct parent of the current page
        rootRows = allParents.loc[allParents["pages.id"] == root]
        if not rootRows.empty:
            chosenRows.append(rootRows)
            break

        chosen = allParents.loc[allParents["mostSimilar"] == "True"]
        if chosen.empty:
            raise ValueError("Page %s has no parent connected to the root" % (nextStep,))

        chosenRows.append(chosen)
        nextStep = int(chosen["pages.id"].iloc[0])

    # Rows are collected in a list and concatenated once
    # (DataFrame.append was removed in pandas 2.0)
    return pd.concat(chosenRows, ignore_index=True)

In [44]:
%%time
# Walk from the "Literature" category up to Main_topic_classifications;
# .iloc[0,1] picks categories.id (wikipedia id) of the first match
chosenPathUpToMTC(categoryByTitle('Literature').iloc[0,1])

Wall time: 196 ms


Unnamed: 0,pages.title,pages.id,intersection,union,jaccard,depth,similarityBAC,mostSimilar,comment
0,Narratology,866705,6,310,0.019355,2,-1.0,True,Lowest similarityBAC
1,Entertainment,693016,2,316,0.006329,1,0.0,True,Lowest similarityBAC
2,Main_topic_classifications,7345184,0,148,0.0,0,0.0,False,


In [144]:
# List the direct children of the root category
# (7345184 = Main_topic_classifications)
childPages(7345184)

Unnamed: 0,pages.title,pages.id
0,History,693555
1,World,3260154
2,Reference,722196
3,People,691008
4,Law,691928
5,Religion,692694
6,Education,696763
7,Philosophy,691810
8,Politics,695027
9,Science_and_technology,47642171


In [89]:
# Scratch check of the "root found" condition used in chosenPathUpToMTC().
# NOTE(review): the recorded output (True) does not match the `test` frame
# displayed under In [31], which has no row with id 7345184; `test` was
# presumably reassigned in a cell that is not part of this notebook any more.
len(test.loc[test["pages.id"] == 7345184]) == 1

True

In [119]:
# Scratch check of the "next step" lookup used in chosenPathUpToMTC():
# wikipedia id of the row marked mostSimilar == "True".
# NOTE(review): the recorded output (737235) is not an id in the `test` frame
# displayed under In [31], so this cell also ran against an earlier `test`.
int(test.loc[test["mostSimilar"] == "True", "pages.id"])

737235

Unnamed: 0,pages.title,pages.id,intersection,union,jaccard,depth,similarityBAC,mostSimilar,comment


## Step A
##### Article - Category relationship strength (TBD)