In [1]:
# Next:
# Clean unwanted nodes from the graph (container categories etc.)
#    Try: https://neo4j.com/docs/graph-algorithms/current/projected-graph-model/cypher-projection/
# Write custom similarity calculation function
# Write iterator function

In [2]:
# NOTE
# cypher.forbid_exhaustive_shortestpath=true set in neo4j conf file
# https://neo4j.com/docs/operations-manual/current/configuration/neo4j-conf/

In [3]:
from py2neo import *
import pandas as pd

In [4]:
# Raise the pandas display limits (rows, columns, column width) for notebook output.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.max_colwidth = 1000

In [5]:
# Shared py2neo connection used by the query helpers in this notebook.
# Graph() is called without arguments, so py2neo's default connection settings apply.
graph = Graph()

## FUNCTIONS

##### Return similarity statistics for two sets (intersection, union, Jaccard coefficient)

In [6]:
# compute similarity statistics
def similarityStats(a,b):
    """Return overlap statistics for two sets.

    Parameters
    ----------
    a, b : set
        Collections to compare; they must provide ``intersection`` and ``union``.

    Returns
    -------
    tuple
        ``(intersection size, union size, Jaccard coefficient)``. When both
        inputs are empty the union is empty as well, and the coefficient is
        reported as ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    intSize = len(a.intersection(b))
    unionSize = len(a.union(b))
    # Two empty sets have an empty union; guard the division.
    jaccard = intSize / unionSize if unionSize else 0.0
    return (intSize, unionSize, jaccard)

##### Return identifying information of parent categories of chosen article or category as pandas dataframe

In [7]:
# Give the Wikipedia id as an integer as the function argument (e.g. category "Finland" = 693995)
def parentCategories(a):
    """Return the categories that a page belongs to as a pandas DataFrame.

    Parameters
    ----------
    a : int
        Wikipedia id of an article or category (e.g. category "Finland" = 693995).
        Any value accepted by ``int()`` works, including numpy integers read
        from a DataFrame.

    Returns
    -------
    pandas.DataFrame
        One row per matched category, with the query columns ``pages.title``
        and ``pages.id``.
    """
    # The id is sent as a Cypher parameter instead of being formatted into the
    # query text; int() hands the driver a plain Python integer and rejects
    # non-numeric input before any query is run.
    commandToRun = ('MATCH (pages:Category:Page) '
                    '<-[:BELONGS_TO]- '
                    '(:Page {id: $wikiID}) '
                    'RETURN pages.title, pages.id')

    return graph.run(commandToRun, wikiID=int(a)).to_data_frame()


##### Return identifying information of children (both category and article) of chosen category as pandas dataframe

In [8]:
# Give the Wikipedia id as an integer as the function argument (e.g. category "Finland" = 693995)
def childPages(a):
    """Return the pages (categories and articles) that belong to a page.

    Parameters
    ----------
    a : int
        Wikipedia id of the containing page (e.g. category "Finland" = 693995).
        Any value accepted by ``int()`` works, including numpy integers read
        from a DataFrame.

    Returns
    -------
    pandas.DataFrame
        One row per matched page, with the query columns ``pages.title``
        and ``pages.id``.
    """
    # The id is sent as a Cypher parameter instead of being formatted into the
    # query text; int() hands the driver a plain Python integer and rejects
    # non-numeric input before any query is run.
    commandToRun = ('MATCH (pages:Page) '
                    '-[:BELONGS_TO]-> '
                    '(:Page {id: $wikiID}) '
                    'RETURN pages.title, pages.id')

    return graph.run(commandToRun, wikiID=int(a)).to_data_frame()

##### Return dataframe with all articles (but not categories) with given title [only one result is expected]

In [9]:
# Give wikipedia title as string as function argument
def articleByTitle(a):
    """Return the non-category pages with the given title as a pandas DataFrame.

    Parameters
    ----------
    a : str
        Wikipedia page title, e.g. ``'The_Good,_the_Bad_and_the_Ugly'``.

    Returns
    -------
    pandas.DataFrame
        Rows with the query columns ``articles.title``, ``articles.id`` and
        ``ID(articles)`` (the Neo4j internal node id).
    """
    # The title is passed as a Cypher parameter, so titles containing double
    # quotes or backslashes can no longer break (or alter) the query text.
    commandToRun = ('MATCH (articles:Page {title: $title}) '
                    'WHERE NOT articles:Category '
                    'RETURN articles.title, articles.id, ID(articles)')
    return graph.run(commandToRun, title=a).to_data_frame()

##### Return dataframe with all categories (but not articles) with given title [only one result is expected]

In [10]:
# Give wikipedia title as string as function argument
def categoryByTitle(a):
    """Return the category pages with the given title as a pandas DataFrame.

    Parameters
    ----------
    a : str
        Wikipedia category title, e.g. ``'Contents'``.

    Returns
    -------
    pandas.DataFrame
        Rows with the query columns ``categories.title``, ``categories.id``
        and ``ID(categories)`` (the Neo4j internal node id).
    """
    # The title is passed as a Cypher parameter, so titles containing double
    # quotes or backslashes can no longer break (or alter) the query text.
    commandToRun = ('MATCH (categories:Category:Page {title: $title}) '
                    'RETURN categories.title, categories.id, ID(categories)')
    return graph.run(commandToRun, title=a).to_data_frame()

##### Return dataframe containing all parent categories of category a and similarity statistics to each

In [11]:
# Give the Wikipedia id as an integer as the function argument (e.g. category "Finland" = 693995)
def parentSimilarities(a):
    """Return the parent categories of ``a`` with similarity statistics for each.

    For every parent category, the set of child page ids of ``a`` is compared
    with the set of child page ids of that parent using ``similarityStats``.

    Parameters
    ----------
    a : int
        Wikipedia id of the category to analyse (e.g. category "Finland" = 693995).

    Returns
    -------
    pandas.DataFrame
        The parent categories (``pages.title``, ``pages.id``) extended with
        the columns ``intersection``, ``union`` and ``jaccard``, sorted by
        ``jaccard`` in descending order. If ``a`` has no parent categories an
        empty frame with these columns is returned.
    """
    statColumns = ["intersection", "union", "jaccard"]

    def _idSet(frame):
        # A query without results produces a DataFrame without any columns,
        # so the id column has to be checked before it is read.
        return set(frame["pages.id"]) if "pages.id" in frame.columns else set()

    parents = parentCategories(a)
    if parents.empty:
        return pd.DataFrame(columns=["pages.title", "pages.id"] + statColumns)

    # The child set of ``a`` is the same for every parent; build it only once.
    ownChildren = _idSet(childPages(a))
    stats = [similarityStats(ownChildren, _idSet(childPages(parentID)))
             for parentID in parents["pages.id"]]
    statsFrame = pd.DataFrame(stats, columns=statColumns, index=parents.index)

    return pd.concat([parents, statsFrame], axis=1).sort_values(by="jaccard", ascending=False)

##### Return dataframe containing the shortest path between the input node (article or category) and the Main_topic_classifications category

In [12]:
def shortestPathToMTC(a, target=7345184):
    """Return the shortest BELONGS_TO path from a page to a target category.

    Parameters
    ----------
    a : int
        Wikipedia id of the start page (article or category).
    target : int, optional
        Wikipedia id of the end page. The default, 7345184, is the
        Main_topic_classifications category.

    Returns
    -------
    pandas.DataFrame
        One row per node on the path, in path order, with the query columns
        ``pages.title``, ``pages.id`` and ``ID(pages)`` (Neo4j internal id).
        The frame is empty when the query returns no path.

    Notes
    -----
    The relationship pattern has no direction and is limited to 10 hops, so
    the path may follow BELONGS_TO relationships both upwards and downwards.
    """
    # Both ids are sent as Cypher parameters instead of being formatted into
    # the query text; int() hands the driver plain Python integers.
    commandToRun = ('MATCH path=shortestPath( '
                    '(:Page {id: $source})-[:BELONGS_TO*0..10]-(:Page {id: $target})) '
                    'UNWIND nodes(path) AS pages '
                    'RETURN pages.title, pages.id, ID(pages)')

    return pd.DataFrame(graph.run(commandToRun, source=int(a), target=int(target)).data())

##### Return node based on neo4j database ID [NOTE: not same as wikipedia ID used elsewhere]

In [13]:
def getWithNeoID(a):
    """Fetch a node by its Neo4j internal id (not the Wikipedia ``id`` property)."""
    nodeLookup = NodeMatcher(graph)
    return nodeLookup.get(a)

## Step A
##### Article - Category relationship strength (TBD)

## Step B
##### Calculate similarity between two categories (Note: not an article and a category)

In [11]:
%%time
# function to calculate similarity between Category and all parent categories
# Run-time for Category "Finland" on first run = 2.51 s
chosenCategory = "History"

chosenCatID = categoryByTitle(chosenCategory).iloc[0,1]
parentSimilarities(chosenCatID)

Wall time: 2.51 s


Unnamed: 0,pages.title,pages.id,intersection,union,jaccard
2,Humanities,1004110,2,128,0.015625
0,Past,40822338,1,65,0.015385
1,Main_topic_classifications,7345184,1,68,0.014706
3,Change,51137756,0,76,0.0
4,Social_sciences,695042,0,125,0.0


In [None]:
# Next: write iteration bottom-up
# Write breadth first traversal top-down

In [73]:
%%time
# (:Page {id: 7345184}) is Main_topics_classifications category node

inputNode = 1017420

commandToRun = 'MATCH path=shortestPath( \
                (:Page {id: %s})-[:BELONGS_TO*0..10]-(:Page {id: 7345184})) \
                UNWIND nodes(path) AS pages \
                RETURN pages.title, pages.id, ID(pages)' % (inputNode)

'''cursor = graph.run(commandToRun)
while cursor.forward():
    print(cursor.current)'''

shortestPath = pd.DataFrame(graph.run(commandToRun).data())
shortestPath

Wall time: 73.3 ms


Unnamed: 0,pages.title,pages.id,ID(pages)
0,Presidents_of_Finland,1017420,5548753
1,Wikipedia_categories_named_after_Presidents_of_Finland,54302930,7278282
2,Container_categories,30176254,6523822
3,Main_topic_classifications,7345184,5614592


In [84]:
%%time
# Shortest-path lookup through the helper for the page with Wikipedia id 31557.
shortestPathToMTC(31557)

Wall time: 1.08 s


Unnamed: 0,pages.title,pages.id,ID(pages)
0,"The_Good,_the_Bad_and_the_Ugly",31557,61550
1,Wikipedia_articles_with_GND_identifiers,38547977,6639522
2,Law,18949668,2369701
3,Law,691928,5214980
4,Main_topic_classifications,7345184,5614592


In [None]:
# MATCH path=shortestPath((station_44:STATION {id:44})-[*0..10]-(station_46:STATION {id:46}))
# RETURN path

In [98]:
# Look up the category page titled 'Articles'.
categoryByTitle('Articles')

Unnamed: 0,categories.title,categories.id,ID(categories)
0,Articles,14104879,5721289


In [83]:
# Look up the non-category page with this title.
articleByTitle('The_Good,_the_Bad_and_the_Ugly')

Unnamed: 0,articles.title,articles.id,ID(articles)
0,"The_Good,_the_Bad_and_the_Ugly",31557,61550


In [116]:
# 61550 is a Neo4j internal node id (the ID(articles) value), not a Wikipedia id.
getWithNeoID(61550)

(_61550:Page {id: 31557, isNew: false, isRedirect: false, title: 'The_Good,_the_Bad_and_the_Ugly'})

In [106]:
# Pages that belong to the page with Wikipedia id 36.
childPages(36)

In [105]:
%%time
# Categories that the page with Wikipedia id 36 belongs to.
parentCategories(36)

Wall time: 23.9 ms


In [None]:
%%time
# Pages that belong to Wikipedia id 693995 (category "Finland" according to the helper comments).
childPages(693995)

In [104]:
%%time
# Return Category with specific title only if is a category

CategoryToFind = 'Contents'
commandToRun = 'MATCH (categories:Category:Page {title: "%s"}) \
                RETURN categories.title, categories.id, ID(categories)' % (CategoryToFind)
namedCategory = graph.run(commandToRun).to_data_frame()
namedCategory

Wall time: 19.5 ms


Unnamed: 0,categories.title,categories.id,ID(categories)
0,Contents,14105005,5721293


In [None]:
# NOTE(review): parentCats is not assigned in any cell visible here; unless it is
# defined elsewhere, this raises NameError on a fresh kernel. Confirm its source.
set(parentCats["pages.id"])

In [None]:
# Perform similarity calculation on each parent

In [None]:
# Select highest similarity

# Test stuff -->

In [None]:
# Clean unwanted nodes
# Return nodes with title containing unwanted strings
# Create graph excluding these

In [None]:
# Title patterns that mark unwanted categories:
# "by_year", "of_the_year", "List_of", or "_in_"
# 'Wikipedia_'

In [None]:
# Cypher WHERE-clause example with boolean operators (reference only, not Python):
# MATCH (n:Person)
# WHERE n.name = 'Peter' XOR (n.age < 30 AND n.name = 'Timothy') OR NOT (n.name = 'Timothy' OR n.name = 'Peter')
# RETURN n.name, n.age

In [33]:
%%time
# Collect the categories treated as unwanted: titles that start with
# 'Wikipedia_', a digit (0-9) or 'List_of', or that contain 'by_year',
# 'of_the_year' or '_in_'.
# The trailing backslashes continue the string literal, so the whole query is
# one Python string.
commandToRun = "MATCH (pages:Category:Page) \
                WHERE \
                pages.title STARTS WITH 'Wikipedia_' \
                OR pages.title STARTS WITH '1' \
                OR pages.title STARTS WITH '2' \
                OR pages.title STARTS WITH '3' \
                OR pages.title STARTS WITH '4' \
                OR pages.title STARTS WITH '5' \
                OR pages.title STARTS WITH '6' \
                OR pages.title STARTS WITH '7' \
                OR pages.title STARTS WITH '8' \
                OR pages.title STARTS WITH '9' \
                OR pages.title STARTS WITH '0' \
                OR pages.title STARTS WITH 'List_of' \
                OR pages.title CONTAINS 'by_year' \
                OR pages.title CONTAINS 'of_the_year' \
                OR pages.title CONTAINS '_in_' \
                RETURN pages.title, pages.id \
                "
# Materialise the matching titles and Wikipedia ids as a DataFrame.
nonWanted = graph.run(commandToRun).to_data_frame()

Wall time: 14.5 s


In [34]:
# Optional spot check for a single title (kept disabled):
#nonWanted.loc[nonWanted["pages.title"] == "Wikipedia_articles_with_GND_identifiers", :]
# Display the result of the unwanted-category query.
nonWanted

Unnamed: 0,pages.title,pages.id
0,A-League_PFA_team_of_the_year_navigational_boxes,42801306
1,American_college_men\'s_basketball_conference_coach_of_the_year_navigational_boxes,37812788
2,American_college_men\'s_basketball_conference_player_of_the_year_navigational_boxes,37812791
3,American_college_men\'s_soccer_conference_coach_of_the_year_navigational_boxes,53300185
4,American_college_men\'s_soccer_conference_player_of_the_year_navigational_boxes,52779522
...,...,...
720646,1′E_h4v_locomotives,58041219
720647,1′E_locomotives,57798512
720648,1′Eh2_locomotives,57798510
720649,1′Eo1′_locomotives,57546804


In [17]:
# Recorded result size for the "STARTS WITH 'Wikipedia_'" condition: 28776 rows
# (presumably from a run that used only that condition - confirm).
nonWanted

Unnamed: 0,pages.title,pages.id
0,Wikipedia_1.0_Arts_articles_by_quality,6024513
1,Wikipedia_1.0_assessments,4887735
2,Wikipedia_1.0_historical_pages,28286276
3,Wikipedia_10,38308958
4,Wikipedia_15,48576743
...,...,...
28771,Wikipedia_warning_essays,42046692
28772,Wikipedia_women,57968295
28773,Wikipedia_workpages,11376362
28774,Wikipedia_workshops_in_Mumbai,40149560


In [None]:
%%time
# Category pages that belong to the page titled "Books".
booksC = graph.run('MATCH (pages:Category:Page)-[:BELONGS_TO]->(:Page {title: "Books"}) RETURN pages.title, pages.id').to_data_frame()

In [None]:
%%time
# All pages (no label filter beyond Page) that belong to the page titled "Books".
booksAll = graph.run('MATCH (pages:Page)-[:BELONGS_TO]->(:Page {title: "Books"}) RETURN pages.title, pages.id').to_data_frame()

In [None]:
%%time
# Pages belonging to the page titled "Books" that do not carry the Category label,
# i.e. the articles directly in that category.
booksArt = graph.run('MATCH (pages:Page)-[:BELONGS_TO]->(:Page {title: "Books"}) WHERE NONE(page IN [pages] WHERE page:Category) RETURN pages.title, pages.id').to_data_frame()

In [None]:
%%time
# Pages that "Jacob_Abbot_Cummings" belongs to (the relationship points from that
# page to the returned pages); whole nodes are returned rather than single properties.
booksAll2 = graph.run('MATCH (pages:Page)<-[:BELONGS_TO]-(:Page {title: "Jacob_Abbot_Cummings"}) RETURN pages').to_data_frame()

In [None]:
# Row counts of the three "Books" result frames, one per line.
print(len(booksC), len(booksArt), len(booksAll), sep="\n")

In [None]:
# Display the pages found for "Jacob_Abbot_Cummings".
booksAll2

## Step 1
##### Take article title as input and return dataframe with identifying information

In [9]:
%%time
# Return page with specific title only if is not category
# Without WHERE clause, will return both article and category of same name if it exists

ArticleToFind = 'Cat'
commandToRun = 'MATCH (articles:Page {title: "%s"}) \
                WHERE NONE(art IN [articles] WHERE art:Category) \
                RETURN articles.title, articles.id, ID(articles)' % (ArticleToFind)
namedArticle = graph.run(commandToRun).to_data_frame()
namedArticle

Wall time: 87.2 ms


Unnamed: 0,articles.title,articles.id,ID(articles)
0,Cat,6678,7953


In [None]:
# Approach 2 - NOT NECESSARY

In [107]:
# Node matcher bound to the shared graph connection; reused by the cells below.
matcher = NodeMatcher(graph)

In [None]:
# Despite the name, this reads the "ID(articles)" column, i.e. the Neo4j internal
# node id of the first result row, not the Wikipedia id.
ArtWikiID = namedArticle["ID(articles)"][0]

In [None]:
# Number of BELONGS_TO relationships matched with that node as the first element
# of the node pair and the second element left open (None).
len(graph.match((matcher.get(ArtWikiID), None ), "BELONGS_TO"))

In [None]:
# The same relationship match, materialised as a list for inspection.
list(graph.match((matcher.get(ArtWikiID), None ), "BELONGS_TO"))

In [111]:
# Fetch the node whose Neo4j internal id is 12.
matcher.get(12)

(_12:Page {id: 332, isNew: false, isRedirect: false, title: 'Animalia_(book)'})