# Neo4j database setup

The Wikipedia metadata previously extracted to the categories CSV file is loaded into a Neo4j graph database. The graph, which has 8 million nodes and 30 million edges, is too large to analyze in RAM.

In this file, the graph database is cleaned up for analysis.

In [None]:
# NOTE:
# cypher.forbid_exhaustive_shortestpath=true is set in the Neo4j configuration file (neo4j.conf).
# https://neo4j.com/docs/operations-manual/current/configuration/neo4j-conf/

In [None]:
from py2neo import *

In [None]:
# Connection handle referenced by the (commented-out) graph.run(...) calls in
# the cells of this notebook. No URI or credentials are passed here, so
# whatever py2neo uses by default applies.
# NOTE(review): presumably this targets a local Neo4j instance — confirm.
graph = Graph()

#### Database edits (run commands commented out to prevent accidental runs)

##### Marking categories to drop with label ":Exclude"

In [None]:
%%time
# Tag category pages for removal: every (:Category:Page) node whose title
# matches one of the patterns listed below additionally receives the label
# "Exclude" (recorded wall time: 1min 31s).
#
# The statement is assembled from one clause per list entry. The clauses are
# joined with a fixed run of 17 spaces so that the query stays on a single
# line and its text matches what was previously run against the database.

commandToRun = (" " * 17).join([
    "MATCH (pages:Category:Page)",
    "WHERE",
    # Title prefixes: Wikipedia-internal pages and titles starting with a digit.
    "pages.title STARTS WITH 'Wikipedia_'",
    "OR pages.title STARTS WITH '1'",
    "OR pages.title STARTS WITH '2'",
    "OR pages.title STARTS WITH '3'",
    "OR pages.title STARTS WITH '4'",
    "OR pages.title STARTS WITH '5'",
    "OR pages.title STARTS WITH '6'",
    "OR pages.title STARTS WITH '7'",
    "OR pages.title STARTS WITH '8'",
    "OR pages.title STARTS WITH '9'",
    "OR pages.title STARTS WITH '0'",
    # Title prefixes: list and article-collection pages.
    "OR pages.title STARTS WITH 'List_of'",
    "OR pages.title STARTS WITH 'All_articles'",
    "OR pages.title STARTS WITH 'Articles_'",
    # Substrings matched anywhere in the title.
    "OR pages.title CONTAINS 'by_year'",
    "OR pages.title CONTAINS 'of_the_year'",
    "OR pages.title CONTAINS '_in_'",
    "SET pages:Exclude",
])

# Deliberately left commented out so the database is not modified by accident.
#graph.run(commandToRun)

In [None]:
%%time
# Tag further category pages with the label "Exclude": titles containing
# '_categories' and the single page 'Webarchive_template_wayback_links'
# (recorded wall time: 6.69 s).
#
# Clauses are joined with a fixed run of 17 spaces so the query stays on a
# single line and its text matches what was previously run.

commandToRun = (" " * 17).join([
    "MATCH (pages:Category:Page)",
    "WHERE",
    "pages.title CONTAINS '_categories'",
    "OR pages.title = 'Webarchive_template_wayback_links'",
    "SET pages:Exclude",
])

# Deliberately left commented out so the database is not modified by accident.
#graph.run(commandToRun)

In [None]:
%%time
# Tag the single category page titled 'People_by_status' with the label
# "Exclude" (wall time noted as 6.69 s).
# NOTE(review): the same wall time appears on neighbouring cells, so this
# figure may have been copy-pasted — confirm before relying on it.
#
# Clauses are joined with a fixed run of 17 spaces so the query stays on a
# single line and its text matches what was previously run.

commandToRun = (" " * 17).join([
    "MATCH (pages:Category:Page)",
    "WHERE",
    "pages.title = 'People_by_status'",
    "SET pages:Exclude",
])

# Deliberately left commented out so the database is not modified by accident.
#graph.run(commandToRun)

In [None]:
%%time
# Tag the single category page titled 'Categories_by_language' with the label
# "Exclude" (no wall time was recorded for this cell).
#
# Clauses are joined with a fixed run of 17 spaces so the query stays on a
# single line and its text matches what was previously run.

commandToRun = (" " * 17).join([
    "MATCH (pages:Category:Page)",
    "WHERE",
    "pages.title = 'Categories_by_language'",
    "SET pages:Exclude",
])

# Deliberately left commented out so the database is not modified by accident.
#graph.run(commandToRun)

In [None]:
%%time
# Tag the single category page titled 'Sources' with the label "Exclude"
# (wall time noted as 6.69 s).
# NOTE(review): the same wall time appears on neighbouring cells, so this
# figure may have been copy-pasted — confirm before relying on it.
#
# Clauses are joined with a fixed run of 17 spaces so the query stays on a
# single line and its text matches what was previously run.

commandToRun = (" " * 17).join([
    "MATCH (pages:Category:Page)",
    "WHERE",
    "pages.title = 'Sources'",
    "SET pages:Exclude",
])

# Deliberately left commented out so the database is not modified by accident.
#graph.run(commandToRun)

##### Dropping categories with label ":Exclude"

In [None]:
%%time
# Remove every node labelled "Exclude" together with its relationships
# (DETACH DELETE). The work is handed to apoc.periodic.iterate with
# batchSize 1000, and the call's batch count and total are returned
# (recorded wall time: 39min 21s).
# Background on large deletes:
# https://neo4j.com/developer/kb/large-delete-transaction-best-practices-in-neo4j/
#
# Pieces are joined with a fixed run of 17 spaces so the query stays on a
# single line and its text matches what was previously run.

commandToRun = (" " * 17).join([
    # First argument: the statement that selects the nodes to process.
    "CALL apoc.periodic.iterate('MATCH (pages:Exclude)",
    "RETURN pages',",
    # Second argument: the statement applied to each selected node.
    "'DETACH DELETE pages',",
    # Third argument: configuration map.
    "{batchSize:1000})",
    "YIELD batches, total",
    "RETURN batches, total",
])

# Deliberately left commented out so the database is not modified by accident.
#graph.run(commandToRun)

##### Changing relations to optimize category tree

In [None]:
%%time
# Add a [BELONGS_TO_CUT] relationship pointing from People to MTC.
# MTC wikipedia-id: 7345184, People wikipedia-id: 691008
#
# MERGE is used instead of CREATE so the statement is idempotent: if the
# relationship already exists (e.g. the cell was run before), no duplicate
# relationship is added. On a first run the result is the same as CREATE.
commandToRun = (
    "MATCH (MTC:Page {id: 7345184}), (People:Page {id: 691008}) "
    "MERGE (MTC) <-[:BELONGS_TO_CUT]- (People)"
)
# Deliberately left commented out so the database is not modified by accident.
# graph.run(commandToRun).data()

In [None]:
%%time
# Delete the [BELONGS_TO] relationship that points from People to MTC.
# MTC wikipedia-id: 7345184, People wikipedia-id: 691008
#
# The two clauses are joined with a fixed run of 17 spaces so the query stays
# on a single line and its text matches what was previously run.
commandToRun = (" " * 17).join([
    "MATCH (MTC:Page {id: 7345184}) <-[r:BELONGS_TO]- (People:Page {id: 691008})",
    "DELETE r",
])
# Deliberately left commented out so the database is not modified by accident.
# graph.run(commandToRun).data()

In [None]:
%%time
# Add a [BELONGS_TO_CUT] relationship pointing from World to MTC.
# MTC wikipedia-id: 7345184, World wikipedia-id: 3260154
#
# MERGE is used instead of CREATE so the statement is idempotent: if the
# relationship already exists (e.g. the cell was run before), no duplicate
# relationship is added. On a first run the result is the same as CREATE.
commandToRun = (
    "MATCH (MTC:Page {id: 7345184}), (World:Page {id: 3260154}) "
    "MERGE (MTC) <-[:BELONGS_TO_CUT]- (World)"
)
# Deliberately left commented out so the database is not modified by accident.
#graph.run(commandToRun).data()

In [None]:
%%time
# Delete the [BELONGS_TO] relationship that points from World to MTC.
# MTC wikipedia-id: 7345184, World wikipedia-id: 3260154
#
# The two clauses are joined with a fixed run of 17 spaces so the query stays
# on a single line and its text matches what was previously run.
commandToRun = (" " * 17).join([
    "MATCH (MTC:Page {id: 7345184}) <-[r:BELONGS_TO]- (World:Page {id: 3260154})",
    "DELETE r",
])
# Deliberately left commented out so the database is not modified by accident.
#graph.run(commandToRun).data()

In [None]:
%%time
# Add a [BELONGS_TO] relationship pointing from Entertainment to MTC.
# MTC wikipedia-id: 7345184, Entertainment wikipedia-id: 693016
#
# MERGE is used instead of CREATE so the statement is idempotent: if a
# [BELONGS_TO] relationship between these two nodes already exists (e.g. the
# cell was run before), no duplicate relationship is added. On a first run
# the result is the same as CREATE.
commandToRun = (
    "MATCH (MTC:Page {id: 7345184}), (Entertainment:Page {id: 693016}) "
    "MERGE (MTC) <-[:BELONGS_TO]- (Entertainment)"
)
# Deliberately left commented out so the database is not modified by accident.
#graph.run(commandToRun).data()

In [None]:
%%time
# Add a [BELONGS_TO_ADDED] relationship pointing from Entertainment to MTC.
# MTC wikipedia-id: 7345184, Entertainment wikipedia-id: 693016
#
# MERGE is used instead of CREATE so the statement is idempotent: if the
# relationship already exists (e.g. the cell was run before), no duplicate
# relationship is added. On a first run the result is the same as CREATE.
commandToRun = (
    "MATCH (MTC:Page {id: 7345184}), (Entertainment:Page {id: 693016}) "
    "MERGE (MTC) <-[:BELONGS_TO_ADDED]- (Entertainment)"
)
# Deliberately left commented out so the database is not modified by accident.
#graph.run(commandToRun).data()