In [None]:
## Sequence
# Build database from file
# Add isCategory property
# Add Exclude label based on matching lists
# Drop nodes marked for exclusion
# Add Exclude-Belongs_to label to unwanted rels (that shouldn't be deleted totally) (e.g. to node "People")
# Remove Belongs_to from above mentioned rels

In [1]:
# Next:
# Clean non-wanted nodes from graph (container categories etc.)
#    Try: https://neo4j.com/docs/graph-algorithms/current/projected-graph-model/cypher-projection/
# Write custom similarity calculation function
# Write iterator function

In [2]:
# NOTE
# cypher.forbid_exhaustive_shortestpath=true set in neo4j conf file
# https://neo4j.com/docs/operations-manual/current/configuration/neo4j-conf/

In [3]:
from py2neo import *
import pandas as pd

In [4]:
# Widen the pandas display limits so query results are not truncated too early
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.max_colwidth = 1000

In [5]:
# Shared py2neo handle used by every query in this notebook.
# NOTE(review): no connection arguments are passed, so this presumably relies on
# py2neo's default connection settings - confirm for the target environment.
graph = Graph()

## FUNCTIONS

##### Return similarity statistics for two sets (intersection, union, Jaccard coefficient)

In [6]:
# Compute similarity statistics for two sets
def similarityStats(a, b):
    """Return overlap statistics for two sets.

    Parameters
    ----------
    a, b : set
        The two collections to compare.

    Returns
    -------
    tuple
        ``(intersection size, union size, Jaccard coefficient)``. The Jaccard
        coefficient is reported as 0.0 when both sets are empty.
    """
    intSize = len(a.intersection(b))
    unionSize = len(a.union(b))
    # Two empty sets have an empty union; guard against dividing by zero.
    jaccard = intSize / unionSize if unionSize else 0.0
    return (intSize, unionSize, jaccard)

##### Return identifying information of parent categories of chosen article or category as pandas dataframe

In [7]:
# Give wikipedia id as integer as function argument (e.g. category "Finland" = 693995)
def parentCategories(a):
    """Return the categories that page ``a`` belongs to.

    Parameters
    ----------
    a : int
        Wikipedia ID of an article or category.

    Returns
    -------
    pandas.DataFrame
        One row per parent category with columns ``pages.title`` and ``pages.id``.
    """
    commandToRun = '''
        MATCH (pages:Category:Page)<-[:BELONGS_TO]-(:Page {id: $wikiID})
        RETURN pages.title, pages.id
    '''
    # The ID is passed as a query parameter instead of being formatted into the
    # Cypher text. int() normalises IDs read from a DataFrame (numpy integers),
    # which the driver may not accept as parameter values.
    return graph.run(commandToRun, {"wikiID": int(a)}).to_data_frame()


##### Return identifying information of children (both category and article) of chosen category as pandas dataframe

In [8]:
# Give wikipedia id as integer as function argument (e.g. category "Finland" = 693995)
def childPages(a):
    """Return every page (article or category) that belongs to category ``a``.

    Parameters
    ----------
    a : int
        Wikipedia ID of the category.

    Returns
    -------
    pandas.DataFrame
        One row per child page with columns ``pages.title`` and ``pages.id``.
    """
    commandToRun = '''
        MATCH (pages:Page)-[:BELONGS_TO]->(:Page {id: $wikiID})
        RETURN pages.title, pages.id
    '''
    # The ID is passed as a query parameter instead of being formatted into the
    # Cypher text. int() normalises IDs read from a DataFrame (numpy integers),
    # which the driver may not accept as parameter values.
    return graph.run(commandToRun, {"wikiID": int(a)}).to_data_frame()

##### Return dataframe with all articles (but not categories) with given title [only one result is expected]

In [9]:
# Give wikipedia title as string as function argument
def articleByTitle(a):
    """Look up article pages (not categories) by exact title.

    Parameters
    ----------
    a : str
        Wikipedia page title.

    Returns
    -------
    pandas.DataFrame
        Columns ``articles.title``, ``articles.id`` (Wikipedia ID) and
        ``ID(articles)`` (internal Neo4j ID).
    """
    # The title is sent as a query parameter, so titles containing quotes or
    # backslashes no longer break (or alter) the Cypher statement.
    commandToRun = '''
        MATCH (articles:Page {title: $title})
        WHERE NOT articles:Category
        RETURN articles.title, articles.id, ID(articles)
    '''
    return graph.run(commandToRun, {"title": a}).to_data_frame()

##### Return dataframe with all categories (but not articles) with given title [only one result is expected]

In [10]:
# Give wikipedia title as string as function argument
def categoryByTitle(a):
    """Look up category pages (not articles) by exact title.

    Parameters
    ----------
    a : str
        Wikipedia category title.

    Returns
    -------
    pandas.DataFrame
        Columns ``categories.title``, ``categories.id`` (Wikipedia ID) and
        ``ID(categories)`` (internal Neo4j ID).
    """
    # The title is sent as a query parameter, so titles containing quotes or
    # backslashes no longer break (or alter) the Cypher statement.
    commandToRun = '''
        MATCH (categories:Category:Page {title: $title})
        RETURN categories.title, categories.id, ID(categories)
    '''
    return graph.run(commandToRun, {"title": a}).to_data_frame()

##### Return dataframe containing all parent categories of category a and similarity statistics to each

In [11]:
# Give wikipedia id as integer as function argument (e.g. category "Finland" = 693995)
def parentSimilarities(a):
    """Compare the children of category ``a`` with the children of each parent.

    Parameters
    ----------
    a : int
        Wikipedia ID of the category.

    Returns
    -------
    pandas.DataFrame
        One row per parent category with columns ``pages.title``, ``pages.id``,
        ``intersection``, ``union`` and ``jaccard``, sorted by ``jaccard`` in
        descending order. Empty (with the same columns) when there are no parents.
    """
    statColumns = ["intersection", "union", "jaccard"]

    def idSet(frame):
        # A query without rows presumably comes back as a frame without
        # columns, so check before indexing.
        return set(frame["pages.id"]) if "pages.id" in frame.columns else set()

    parents = parentCategories(a)
    if parents.empty:
        return pd.DataFrame(columns=["pages.title", "pages.id"] + statColumns)

    # Fetched once: the child set of `a` is the same for every parent.
    childIDs = idSet(childPages(a))
    stats = [
        similarityStats(childIDs, idSet(childPages(parentID)))
        for parentID in parents["pages.id"]
    ]
    statsFrame = pd.DataFrame(stats, columns=statColumns, index=parents.index)

    return pd.concat([parents, statsFrame], axis=1).sort_values(by="jaccard", ascending=False)

##### Return dataframe containing shortest path between input node (article or category) and the Main_topic_classifications category

In [12]:
def shortestPathToMTC(a, targetID=7345184):
    """Return the shortest BELONGS_TO path from page ``a`` to a target page.

    Parameters
    ----------
    a : int
        Wikipedia ID of the starting article or category.
    targetID : int, optional
        Wikipedia ID of the end node. Defaults to 7345184, the
        Main_topic_classifications category.

    Returns
    -------
    pandas.DataFrame
        One row per node on the path, in path order, with columns
        ``pages.title``, ``pages.id`` and ``ID(pages)``.
    """
    # NOTE(review): the relationship is matched without direction, so the path
    # may also walk from a category down to its members - confirm this is intended.
    commandToRun = '''
        MATCH path = shortestPath(
            (:Page {id: $sourceID})-[:BELONGS_TO*0..10]-(:Page {id: $targetID}))
        UNWIND nodes(path) AS pages
        RETURN pages.title, pages.id, ID(pages)
    '''
    # IDs are passed as query parameters; int() normalises numpy integers.
    parameters = {"sourceID": int(a), "targetID": int(targetID)}

    return pd.DataFrame(graph.run(commandToRun, parameters).data())

##### Return node based on neo4j database ID [NOTE: not same as wikipedia ID used elsewhere]

In [13]:
def getWithNeoID(a):
    """Fetch a node by its internal Neo4j ID (not the Wikipedia ID)."""
    nodeLookup = NodeMatcher(graph)
    return nodeLookup.get(a)

## Adding node labels and properties

#### Database edits (run commands commented out to prevent accidental runs)

In [None]:
# 1. Add pageType property to each node (Category / Article)
# 2. Add Include label to each node not on notWanted list
# 3. Create native projection based on Include label
# 4. Test shortestPath and similarity algorithms on native projection

In [None]:
%%time
# Add pageType property with value "Category" to all category pages
# (nodes carrying both the Category and Page labels).

commandToRun = 'MATCH (pages:Category:Page) \
                SET pages.pageType = "Category"'

# Left commented out to prevent accidental database edits.
#graph.run(commandToRun)

In [None]:
%%time
# Add pageType property with value "Article" to all article pages.
# The NONE(...) predicate keeps only Page nodes that do not carry the Category label.

commandToRun = 'MATCH (pages:Page) \
                WHERE NONE(art IN [pages] WHERE art:Category) \
                SET pages.pageType = "Article"'

# Left commented out to prevent accidental database edits.
#graph.run(commandToRun)

In [None]:
%%time
# Remove pageType property from category pages
# NOTE!! Ran out of allocated memory when run on all nodes (categories + articles),
# so categories and articles are handled in separate statements.

commandToRun = "MATCH (pages:Category:Page) \
                REMOVE pages.pageType"

# Left commented out to prevent accidental database edits.
#graph.run(commandToRun)

In [None]:
%%time
# Remove pageType property from article pages
# (Page nodes that do not carry the Category label).

commandToRun = "MATCH (pages:Page) \
                WHERE NONE(art IN [pages] WHERE art:Category) \
                REMOVE pages.pageType"

# Left commented out to prevent accidental database edits.
#graph.run(commandToRun)

In [None]:
%%time
# Add isCategory property with value 1 to all category pages
# (nodes carrying both the Category and Page labels).

commandToRun = 'MATCH (pages:Category:Page) \
                SET pages.isCategory = 1'

# Left commented out to prevent accidental database edits.
#graph.run(commandToRun)

In [None]:
%%time
# Add isCategory property with value 0 to all article pages
# (Page nodes that do not carry the Category label).

commandToRun = 'MATCH (pages:Page) \
                WHERE NONE(art IN [pages] WHERE art:Category) \
                SET pages.isCategory = 0'

# Left commented out to prevent accidental database edits.
#graph.run(commandToRun)

In [None]:
%%time
# Add label "Include" to all pages (Wall time: 24min 41s)
# Every Page node gets the label; unwanted pages have it removed in a later step.

commandToRun = "MATCH (pages:Page) \
                SET pages:Include"

# Left commented out to prevent accidental database edits.
#graph.run(commandToRun)

In [None]:
%%time
# Remove label "Include" from pages included in list of categories to exclude (Wall time: 1min 33s)
# A category is excluded when its title starts with 'Wikipedia_', a digit,
# 'List_of', 'All_articles' or 'Articles_', or contains 'by_year',
# 'of_the_year' or '_in_'.

commandToRun = "MATCH (pages:Category:Page) \
                WHERE \
                pages.title STARTS WITH 'Wikipedia_' \
                OR pages.title STARTS WITH '1' \
                OR pages.title STARTS WITH '2' \
                OR pages.title STARTS WITH '3' \
                OR pages.title STARTS WITH '4' \
                OR pages.title STARTS WITH '5' \
                OR pages.title STARTS WITH '6' \
                OR pages.title STARTS WITH '7' \
                OR pages.title STARTS WITH '8' \
                OR pages.title STARTS WITH '9' \
                OR pages.title STARTS WITH '0' \
                OR pages.title STARTS WITH 'List_of' \
                OR pages.title STARTS WITH 'All_articles' \
                OR pages.title STARTS WITH 'Articles_' \
                OR pages.title CONTAINS 'by_year' \
                OR pages.title CONTAINS 'of_the_year' \
                OR pages.title CONTAINS '_in_' \
                REMOVE pages:Include"

# Left commented out to prevent accidental database edits.
#graph.run(commandToRun)

In [None]:
%%time
# Add label "Exclude" to all pages to exclude (Wall time: 1min 31s)
# Uses the same title filter as the "Include" removal: titles starting with
# 'Wikipedia_', a digit, 'List_of', 'All_articles' or 'Articles_', or
# containing 'by_year', 'of_the_year' or '_in_'.

commandToRun = "MATCH (pages:Category:Page) \
                WHERE \
                pages.title STARTS WITH 'Wikipedia_' \
                OR pages.title STARTS WITH '1' \
                OR pages.title STARTS WITH '2' \
                OR pages.title STARTS WITH '3' \
                OR pages.title STARTS WITH '4' \
                OR pages.title STARTS WITH '5' \
                OR pages.title STARTS WITH '6' \
                OR pages.title STARTS WITH '7' \
                OR pages.title STARTS WITH '8' \
                OR pages.title STARTS WITH '9' \
                OR pages.title STARTS WITH '0' \
                OR pages.title STARTS WITH 'List_of' \
                OR pages.title STARTS WITH 'All_articles' \
                OR pages.title STARTS WITH 'Articles_' \
                OR pages.title CONTAINS 'by_year' \
                OR pages.title CONTAINS 'of_the_year' \
                OR pages.title CONTAINS '_in_' \
                SET pages:Exclude"

# Left commented out to prevent accidental database edits.
#graph.run(commandToRun)

In [31]:
%%time
# Add label "Exclude" to all pages to exclude (Wall time: 6.69 s)
# Second pass: category titles containing '_categories' plus the single
# category 'Webarchive_template_wayback_links'.

commandToRun = "MATCH (pages:Category:Page) \
                WHERE \
                pages.title CONTAINS '_categories' \
                OR pages.title = 'Webarchive_template_wayback_links' \
                SET pages:Exclude"

# Left commented out to prevent accidental database edits.
#graph.run(commandToRun)

Wall time: 6.69 s


<py2neo.database.Cursor at 0x1db4100e850>

In [32]:
%%time
# DETACH and DELETE all nodes with label Exclude, iterate using APOC (Wall time: 39min 21s)
# https://neo4j.com/developer/kb/large-delete-transaction-best-practices-in-neo4j/
# apoc.periodic.iterate takes the node-selecting query, the per-node action and a
# config map; here the deletes are issued with batchSize 1000.

commandToRun = "CALL apoc.periodic.iterate('MATCH (pages:Exclude) \
                RETURN pages', \
                'DETACH DELETE pages', \
                {batchSize:1000}) \
                YIELD batches, total \
                RETURN batches, total"

# Destructive: left commented out to prevent accidental database edits.
#graph.run(commandToRun)


Wall time: 15.3 s


<py2neo.database.Cursor at 0x1db41925100>

In [None]:
# Above started 13:45

In [None]:
# Reference snippet only: template for a batched delete with apoc.periodic.iterate.
# The string is never passed to graph.run.
'''
call apoc.periodic.iterate("MATCH (n:Foo) where n.foo='bar' return n", "DETACH DELETE n", {batchSize:1000})
yield batches, total return batches, total
'''

#### Testing for database edits

In [None]:
# Look up the article (not the category) titled "Umeå"
articleByTitle("Umeå")

In [None]:
# Show the node(s) titled "1980_births" together with their labels
# (no article/category filter, so both kinds are returned).
ArticleToFind = "1980_births"
# The title is sent as a query parameter rather than formatted into the
# Cypher text, so quotes in a title cannot break the statement.
commandToRun = '''
    MATCH (articles:Page {title: $title})
    RETURN articles, labels(articles)
'''
graph.run(commandToRun, {"title": ArticleToFind}).to_data_frame()

In [None]:
# Show the node(s) with this title together with their labels; the title both
# starts with a digit and contains "_in_".
ArticleToFind = "1930–31_in_Mandatory_Palestine_football"
# The title is sent as a query parameter rather than formatted into the
# Cypher text, so quotes in a title cannot break the statement.
commandToRun = '''
    MATCH (articles:Page {title: $title})
    RETURN articles, labels(articles)
'''
graph.run(commandToRun, {"title": ArticleToFind}).to_data_frame()

In [None]:
# Show the labels of the article (not the category) titled "Umeå"
ArticleToFind = "Umeå"
# The title is sent as a query parameter rather than formatted into the
# Cypher text, so quotes in a title cannot break the statement.
commandToRun = '''
    MATCH (articles:Page {title: $title})
    WHERE NOT articles:Category
    RETURN labels(articles)
'''
graph.run(commandToRun, {"title": ArticleToFind}).to_data_frame()

In [None]:
%%time
# Add label "Test1" to article "Umeå" {'isRedirect': False, 'id': 1017521, 'isNew': False, 'title': 'Umeå'}

ArticleId = 1017521
# The node is matched with the :Page label like the other queries in this
# notebook; an unlabelled pattern has to inspect every node in the database.
# The ID is passed as a query parameter.
commandToRun = '''
    MATCH (articles:Page {id: $articleId})
    SET articles:Test1
    RETURN articles.title, labels(articles) AS labels
'''
graph.run(commandToRun, {"articleId": ArticleId}).to_data_frame()

In [None]:
%%time
# Remove label "Test1" from article "Umeå" {'isRedirect': False, 'id': 1017521, 'isNew': False, 'title': 'Umeå'}

ArticleId = 1017521
# The node is matched with the :Page label like the other queries in this
# notebook; an unlabelled pattern has to inspect every node in the database.
# The ID is passed as a query parameter.
commandToRun = '''
    MATCH (articles:Page {id: $articleId})
    REMOVE articles:Test1
    RETURN articles.title, labels(articles) AS labels
'''
graph.run(commandToRun, {"articleId": ArticleId}).to_data_frame()

In [None]:
%%time
# Add property "type" with value "Article" to article "Umeå" {'isRedirect': False, 'id': 1017521, 'isNew': False, 'title': 'Umeå'}

ArticleId = 1017521
# The node is matched with the :Page label like the other queries in this
# notebook; an unlabelled pattern has to inspect every node in the database.
# The ID is passed as a query parameter.
commandToRun = '''
    MATCH (articles:Page {id: $articleId})
    SET articles.type = "Article"
    RETURN articles.title, articles.id, articles.type
'''
graph.run(commandToRun, {"articleId": ArticleId}).to_data_frame()

In [None]:
%%time
# Remove property "type" from article "Umeå" {'isRedirect': False, 'id': 1017521, 'isNew': False, 'title': 'Umeå'}

ArticleId = 1017521
# The node is matched with the :Page label like the other queries in this
# notebook; an unlabelled pattern has to inspect every node in the database.
# The ID is passed as a query parameter. articles.type is still returned so
# the output shows that the property is gone.
commandToRun = '''
    MATCH (articles:Page {id: $articleId})
    REMOVE articles.type
    RETURN articles.title, articles.id, articles.type
'''
graph.run(commandToRun, {"articleId": ArticleId}).to_data_frame()

In [None]:
%%time
# Add property "test" with values "Test2" to parent categories of "Umeå" {'isRedirect': False, 'id': 1017521, 'isNew': False, 'title': 'Umeå'}
wikiID = 1017521
# The ID is passed as a query parameter. A RETURN clause is included (as in the
# matching removal cell) so the resulting frame lists the categories that were
# updated instead of being empty.
commandToRun = '''
    MATCH (pages:Category:Page)<-[:BELONGS_TO]-(:Page {id: $wikiID})
    SET pages.test = "Test2"
    RETURN pages.title, pages.id, pages.test
'''

graph.run(commandToRun, {"wikiID": wikiID}).to_data_frame()

In [None]:
%%time
# Remove property "test" from parent categories of "Umeå" {'isRedirect': False, 'id': 1017521, 'isNew': False, 'title': 'Umeå'}
wikiID = 1017521
# The ID is passed as a query parameter. pages.test is still returned so the
# output shows that the property is gone.
commandToRun = '''
    MATCH (pages:Category:Page)<-[:BELONGS_TO]-(:Page {id: $wikiID})
    REMOVE pages.test
    RETURN pages.title, pages.id, pages.test
'''

graph.run(commandToRun, {"wikiID": wikiID}).to_data_frame()

In [None]:
# Show the node(s) with this title together with their labels; the title
# starts with "Articles_".
ArticleToFind = "Articles_with_hAudio_microformats"
# The title is sent as a query parameter rather than formatted into the
# Cypher text, so quotes in a title cannot break the statement.
commandToRun = '''
    MATCH (articles:Page {title: $title})
    RETURN articles, labels(articles)
'''
graph.run(commandToRun, {"title": ArticleToFind}).to_data_frame()

## Step A
##### Article - Category relationship strength (TBD)

## Step B
##### Calculate similarity between two categories (Note: not an article and a category)

In [60]:
%%time
# Similarity between the chosen category and each of its parent categories
# Run-time for Category "Finland" on first run = 2.51 s
chosenCategory = "History"

# First result row, second column (categories.id, i.e. the Wikipedia ID)
chosenCatID = categoryByTitle(chosenCategory).iat[0, 1]
parentSimilarities(chosenCatID)

Wall time: 289 ms


Unnamed: 0,pages.title,pages.id,intersection,union,jaccard
0,Past,40822338,1,63,0.015873
2,Humanities,1004110,2,126,0.015873
1,Main_topic_classifications,7345184,1,66,0.015152
3,Change,51137756,0,74,0.0
4,Social_sciences,695042,0,121,0.0


In [None]:
# Next: write iteration bottom-up
# Write breadth first traversal top-down

In [68]:
%%time
# (:Page {id: 7345184}) is the Main_topic_classifications category node.
# Directed variant of the shortest-path query: BELONGS_TO is only followed
# from a page towards the categories it belongs to.

inputNode = 4689264

# The start ID is passed as a query parameter instead of being formatted into
# the Cypher text.
commandToRun = '''
    MATCH path = shortestPath(
        (:Page {id: $sourceID})-[:BELONGS_TO*0..10]->(:Page {id: 7345184}))
    UNWIND nodes(path) AS pages
    RETURN pages.title, pages.id, ID(pages)
'''

# Alternative: walk the cursor instead of building a DataFrame
# cursor = graph.run(commandToRun, {"sourceID": inputNode})
# while cursor.forward():
#     print(cursor.current)

shortestPath = pd.DataFrame(graph.run(commandToRun, {"sourceID": inputNode}).data())
shortestPath

Wall time: 11 ms


Unnamed: 0,pages.title,pages.id,ID(pages)
0,Australia,4689264,1238991
1,Australia,693538,5215234
2,Island_countries,7213567,5603775
3,Countries,691232,5214889
4,World,3260154,5846218
5,Main_topic_classifications,7345184,5614592


In [None]:
%%time
# Shortest path from the page with Wikipedia ID 31557 to the main-topic category
shortestPathToMTC(31557)

In [None]:
# MATCH path=shortestPath((station_44:STATION {id:44})-[*0..10]-(station_46:STATION {id:46}))
# RETURN path

In [None]:
# Look up the category (not the article) titled "Articles"
categoryByTitle('Articles')

In [67]:
# Look up the article (not the category) titled "Australia"
articleByTitle('Australia')

Unnamed: 0,articles.title,articles.id,ID(articles)
0,Australia,4689264,1238991


In [None]:
# Fetch a node by its internal Neo4j ID (not the Wikipedia ID)
getWithNeoID(2012406)

In [None]:
# Pages that belong to the page with Wikipedia ID 36
childPages(36)

In [None]:
%%time
# Categories that the page with Wikipedia ID 36 belongs to
parentCategories(36)

In [None]:
%%time
# Pages that belong to category "Finland" (Wikipedia ID 693995)
childPages(693995)

In [None]:
%%time
# Return Category with specific title only if is a category

CategoryToFind = 'Contents'
# Reuse the helper defined earlier in the notebook instead of repeating its query inline.
namedCategory = categoryByTitle(CategoryToFind)
namedCategory

In [None]:
# Wikipedia IDs of the parent categories, as a set.
# NOTE(review): `parentCats` was not assigned anywhere in the notebook, so this
# cell failed on a fresh kernel. It is assumed to be the parents of the category
# chosen in Step B (chosenCatID) - confirm.
parentCats = parentCategories(chosenCatID)
set(parentCats["pages.id"])

In [None]:
# Perform similarity calculation on each parent

In [None]:
# Select highest similarity

In [None]:
%%time
# Query giving all category pages (title and Wikipedia ID of every node
# carrying both the Category and Page labels)
commandToRun = 'MATCH (pages:Category:Page) \
                RETURN pages.title, pages.id'

allCategories = graph.run(commandToRun).to_data_frame()

In [None]:
%%time
# Query giving all article pages (title and Wikipedia ID of every Page node
# that does not carry the Category label)
commandToRun = 'MATCH (pages:Page) \
                WHERE NONE(art IN [pages] WHERE art:Category) \
                RETURN pages.title, pages.id'

allArticles = graph.run(commandToRun).to_data_frame()

In [None]:
%%time
# Query giving all Include pages (title and Wikipedia ID of every node
# carrying the Include label)
commandToRun = 'MATCH (pages:Include) \
                RETURN pages.title, pages.id'

allInclude = graph.run(commandToRun).to_data_frame()

In [None]:
# Display the Include pages fetched in the previous cell
allInclude

# Test stuff -->

In [None]:
# Clean unwanted nodes
# Return nodes with title containing unwanted strings
# Create graph excluding these

In [None]:
# Title patterns marking unwanted pages: "by_year", "of_the_year", "List_of", or "_in_"
# Also: 'Wikipedia_'

In [None]:
# Cypher reference example (combining boolean operators in WHERE):
# MATCH (n:Person)
# WHERE n.name = 'Peter' XOR (n.age < 30 AND n.name = 'Timothy') OR NOT (n.name = 'Timothy' OR n.name = 'Peter')
# RETURN n.name, n.age

In [None]:
%%time
# Query returning notWanted category nodes.
# A category is unwanted when its title starts with 'Wikipedia_', a digit,
# 'List_of', 'All_articles' or 'Articles_', or contains 'by_year',
# 'of_the_year' or '_in_'. Read-only: returns title and Wikipedia ID.
commandToRun = "MATCH (pages:Category:Page) \
                WHERE \
                pages.title STARTS WITH 'Wikipedia_' \
                OR pages.title STARTS WITH '1' \
                OR pages.title STARTS WITH '2' \
                OR pages.title STARTS WITH '3' \
                OR pages.title STARTS WITH '4' \
                OR pages.title STARTS WITH '5' \
                OR pages.title STARTS WITH '6' \
                OR pages.title STARTS WITH '7' \
                OR pages.title STARTS WITH '8' \
                OR pages.title STARTS WITH '9' \
                OR pages.title STARTS WITH '0' \
                OR pages.title STARTS WITH 'List_of' \
                OR pages.title STARTS WITH 'All_articles' \
                OR pages.title STARTS WITH 'Articles_' \
                OR pages.title CONTAINS 'by_year' \
                OR pages.title CONTAINS 'of_the_year' \
                OR pages.title CONTAINS '_in_' \
                RETURN pages.title, pages.id \
                "
nonWanted = graph.run(commandToRun).to_data_frame()

In [None]:
# Spot-check a window of 150 rows from the unwanted-category result.
# Alternative: look up one specific title instead.
#nonWanted.loc[nonWanted["pages.title"] == "Wikipedia_articles_with_GND_identifiers", :]
nonWanted[100000:100150]

In [None]:
# Display the unwanted-category result.
# Recorded count for the "STARTS WITH 'Wikipedia_'" condition alone: 28776
nonWanted

In [None]:
%%time
# Categories that belong to the page titled "Books"
booksC = (
    graph
    .run('MATCH (pages:Category:Page)-[:BELONGS_TO]->(:Page {title: "Books"}) RETURN pages.title, pages.id')
    .to_data_frame()
)

In [None]:
%%time
# All pages (articles and categories) that belong to the page titled "Books"
booksAll = (
    graph
    .run('MATCH (pages:Page)-[:BELONGS_TO]->(:Page {title: "Books"}) RETURN pages.title, pages.id')
    .to_data_frame()
)

In [None]:
%%time
# Articles only (Category-labelled pages filtered out) that belong to "Books"
booksArt = (
    graph
    .run('MATCH (pages:Page)-[:BELONGS_TO]->(:Page {title: "Books"}) WHERE NONE(page IN [pages] WHERE page:Category) RETURN pages.title, pages.id')
    .to_data_frame()
)

In [None]:
%%time
# Pages that the page titled "Jacob_Abbot_Cummings" belongs to (full nodes returned)
booksAll2 = (
    graph
    .run('MATCH (pages:Page)<-[:BELONGS_TO]-(:Page {title: "Jacob_Abbot_Cummings"}) RETURN pages')
    .to_data_frame()
)

In [None]:
# Row counts, one per line: category children, article children, all children of "Books"
print(len(booksC), len(booksArt), len(booksAll), sep="\n")

In [None]:
# Display the pages that "Jacob_Abbot_Cummings" belongs to
booksAll2

## Step 1
##### Take article title as input and return dataframe with identifying information

In [None]:
%%time
# Return page with specific title only if is not category
# Without the category filter, both the article and the category of the same
# name would be returned if both exist

ArticleToFind = 'Cat'
# Reuse the helper defined earlier in the notebook instead of repeating its query inline.
namedArticle = articleByTitle(ArticleToFind)
namedArticle

In [None]:
# Approach 2 - NOT NECESSARY

In [None]:
# Node matcher bound to the notebook's graph; used below to fetch nodes by internal ID
matcher = NodeMatcher(graph)

In [None]:
# NOTE: despite the name, this is the internal Neo4j ID (column "ID(articles)")
# of the first matching article, not its Wikipedia ID.
ArtWikiID = namedArticle["ID(articles)"][0]

In [None]:
# Number of BELONGS_TO relationships that start at the selected article node
# (the end node is left open with None)
len(graph.match((matcher.get(ArtWikiID), None ), "BELONGS_TO"))

In [None]:
# The same BELONGS_TO relationships, materialised as a list
list(graph.match((matcher.get(ArtWikiID), None ), "BELONGS_TO"))

In [None]:
# Fetch the node with internal Neo4j ID 12
matcher.get(12)