In [None]:
#STEP 1 : Generate fake data using GraphAware graphgen
# https://github.com/graphaware/neo4j-graphgen-procedure
# You will need to compile the graphgen .jar file, add it to Neo4j/plugins and restart Neo4j
# (tip: update to JDK 8)

#!pip install neo4j-driver

import time

from neo4j.v1 import GraphDatabase, basic_auth, TRUST_ON_FIRST_USE, CypherError

driver = GraphDatabase.driver("bolt://localhost",
                              auth=basic_auth("neo4j", "neo4j"),
                              encrypted=False,
                              trust=TRUST_ON_FIRST_USE)

# Asks the graphgen procedure for 1,000,000 :Organization nodes, each with a
# `name` and a `country` property. The literal must start on the same line as
# the assignment; a bare `generate1 =` followed by a newline is a SyntaxError.
generate1 = '''
CALL generate.nodes('Organization', '{name: companyName, country: country}', 1000000)
'''

# Open exactly one session and always close it, even when the statement fails.
session = driver.session()
try:
    t0 = time.time()
    print("processing...")
    result = session.run(generate1)
    # consume() drains the result and returns the summary with update counters.
    summary = result.consume()
    counters = summary.counters
    print(summary)
    print(counters)
    print(round((time.time() - t0)*1000,1), " ms elapsed time")
    print('-----------------')
finally:
    session.close()




In [8]:
#STEP 2:  Set index on child property and constraint on parent category

#!pip install neo4j-driver

import time

from neo4j.v1 import GraphDatabase, basic_auth, TRUST_ON_FIRST_USE, CypherError

driver = GraphDatabase.driver("bolt://localhost",
                              auth=basic_auth("neo4j", "neo4j"),
                              encrypted=False,
                              trust=TRUST_ON_FIRST_USE)

# Index on the child property used to look Organizations up by country.
index1 = '''
CREATE INDEX ON :Organization(country)
'''

# Uniqueness constraint on the parent category key.
constraint1 = '''
CREATE CONSTRAINT ON (n:Country) ASSERT n.countryName IS UNIQUE;
'''

# Run each schema statement in its own session, in the original order
# (index first, then constraint). Every session is closed in `finally`, so
# none is left open when a statement fails or when the name is rebound.
for statement in (index1, constraint1):
    session = driver.session()
    try:
        t0 = time.time()
        print("processing...")
        result = session.run(statement)
        # consume() drains the result and returns the summary with update counters.
        summary = result.consume()
        counters = summary.counters
        print(summary)
        print(counters)
        print(round((time.time() - t0)*1000,1), " ms elapsed time")
        print('-----------------')
    finally:
        session.close()



processing...
<neo4j.v1.session.ResultSummary object at 0x10936a8d0>
{}
5.2  ms elapsed time
-----------------
processing...
<neo4j.v1.session.ResultSummary object at 0x10936ab00>
{'constraints_added': 1}
323.8  ms elapsed time
-----------------


In [9]:
#STEP 4: (Optional)  Warmup Page Cache, helpful for large graphs
# requires APOC procedures
# https://neo4j-contrib.github.io/neo4j-apoc-procedures/

#!pip install neo4j-driver

import time

from neo4j.v1 import GraphDatabase, basic_auth, TRUST_ON_FIRST_USE, CypherError

driver = GraphDatabase.driver("bolt://localhost",
                              auth=basic_auth("neo4j", "neo4j"),
                              encrypted=False,
                              trust=TRUST_ON_FIRST_USE)

# The name must match the one passed to session.run() below; the earlier
# spelling `warmpup1` made the cell fail with NameError on a fresh kernel.
warmup1 = '''
CALL apoc.warmup.run();
'''

session = driver.session()
try:
    t0 = time.time()
    print("processing...")

    result = session.run(warmup1)

    # Print every record the warmup procedure yields (page/node/relationship
    # statistics). print(record) avoids %-formatting, which would try to
    # unpack the record if it were a tuple subclass.
    for record in result:
        print(record)

    summary = result.consume()
    counters = summary.counters
    print(counters)

    print(round((time.time() - t0)*1000,1), " ms elapsed time")
    print('-----------------')
finally:
    # Close the session even when the procedure is missing or the query fails.
    session.close()

processing...
<Record pageSize=8192 nodesPerPage=546 nodesTotal=1000244 nodesLoaded=1831 nodesTime=0 relsPerPage=240 relsTotal=0 relsLoaded=0 relsTime=0 totalTime=0>
{}
113.0  ms elapsed time
-----------------


In [49]:
# STEP 5: Extract and create parent Category nodes

import time

from neo4j.v1 import GraphDatabase, basic_auth, TRUST_ON_FIRST_USE, CypherError

driver = GraphDatabase.driver("bolt://localhost",
                              auth=basic_auth("neo4j", "neo4j"),
                              encrypted=False,
                              trust=TRUST_ON_FIRST_USE)

# Remove every existing :Country node together with its relationships so the
# cell can be re-run from a clean state.
cleanup1 = '''
MATCH (n:Country) DETACH DELETE n
'''

# extract from a 5% random sample
# NOTE: a country whose organizations all fall outside the sample gets no
# :Country node; use extractCategory2 when complete coverage is required.
extractCategory1 = '''
MATCH (n:Organization) WHERE rand() < 0.05
WITH COLLECT(DISTINCT n.country) AS names
FOREACH (name IN names |
 MERGE (:Country {countryName: name}))
RETURN names
'''


# or extract from a full scan (defined as an alternative; not run below)
extractCategory2 = '''
MATCH (n:Organization)
WITH COLLECT(DISTINCT n.country) AS names
FOREACH (name IN names |
 MERGE (:Country {countryName: name}))
RETURN names
'''


# Run the cleanup, then the sampled extraction, each in its own session.
# Every session is closed in `finally`, so none is left open when a statement
# fails or when the name is rebound.
for statement in (cleanup1, extractCategory1):
    session = driver.session()
    try:
        t0 = time.time()
        print("processing...")
        result = session.run(statement)
        # consume() drains the result and returns the summary with update counters.
        summary = result.consume()
        counters = summary.counters
        print(summary)
        print(counters)
        print(round((time.time() - t0)*1000,1), " ms elapsed time")
        print('-----------------')
    finally:
        session.close()



processing...
<neo4j.v1.session.ResultSummary object at 0x1093c9908>
{'relationships_deleted': 1000000, 'nodes_deleted': 244}
16084.0  ms elapsed time
-----------------
processing...
<neo4j.v1.session.ResultSummary object at 0x109374208>
{'labels_added': 244, 'properties_set': 244, 'nodes_created': 244}
552.6  ms elapsed time
-----------------


In [50]:
# Refactor Organizations to Country, computed batches
#
# For every distinct Organization.country that still lacks a HAS_LOCATION
# relationship, the number of batches is computed up front from the node count
# and the linking statement is run that many times.

import time

from neo4j.v1 import GraphDatabase, basic_auth, TRUST_ON_FIRST_USE, CypherError

driver = GraphDatabase.driver("bolt://localhost",
                              auth=basic_auth("neo4j", "neo4j"),
                              encrypted=False,
                              trust=TRUST_ON_FIRST_USE)

# Distinct country values of not-yet-linked Organizations, with their counts.
childProps1 = '''
MATCH (n:Organization)
WHERE NOT ((n)-[:HAS_LOCATION]-())
RETURN n.country AS pname, count(n) AS ncount
'''

# Links at most {limit} unlinked Organizations of one country to its Country
# node. The statement has no RETURN clause, so it yields no records; progress
# is only visible through the summary counters.
indexRefactor1 = '''
MATCH (c:Country {countryName:{pname}}), (n:Organization {country:{pname}})
WHERE NOT ((n)-[:HAS_LOCATION]-())
WITH c,n LIMIT {limit} MERGE (c)<-[r:HAS_LOCATION]-(n)
'''
ntotal = 0
batchSize = 2000

session = driver.session()
# Started before the try block so the `finally` clause can always report it.
tjob = time.time()

try:
    print("processing query1 ---> getting child property list for refactoring")

    # Materialize the rows first so the outer result is fully read before
    # further statements are run on the same session.
    propertyList = list(session.run(childProps1))

    print('-----------------')
    print("processing query2 ---> starting refactoring to parent category")
    print ("%s %s" % ("using batch size: ", batchSize))
    print('-----------------')

    for row in propertyList:
        pname = row["pname"]
        ncount = row["ncount"]
        print("%s %s %s" % (pname, "nodes:", ncount))
        t0 = time.time()

        # Ceiling division: an exact multiple of batchSize must not schedule
        # an extra, empty batch (int(ncount/batchSize)+1 did).
        batches = (ncount + batchSize - 1) // batchSize

        for batch in range(batches):

            print("%s %s %s %s" % ("batch: ", batch+1 , " of ", batches))

            # One auto-commit statement per batch; each batch is committed as
            # soon as its result has been consumed.
            result = session.run(indexRefactor1, {"pname": pname, "limit": batchSize})
            summary = result.consume()
            counters = summary.counters
            print(counters)

        print(round((time.time() - t0)*1000,1), " ms elapsed time")
        ntotal = ntotal + ncount
        print ("%s %s" % ("------------------------------> total refactored nodes: ", ntotal))


except CypherError as e:
    # Every batch runs as its own auto-commit statement, so there is no open
    # transaction to roll back; batches that already completed stay committed.
    # Other exception types propagate unchanged instead of being masked by a
    # call to a rollback method the session does not provide.
    print('*** Got exception',e)
    print('*** Not rolling back')

finally:
    session.close()
    print('Done!')
    print(round((time.time() - tjob)/60,1), " minutes elapsed time")

processing query1 ---> getting child property list for refactoring
-----------------
processing query2 ---> starting refactoring to parent category
using batch size:  2000
-----------------
Azerbaijan nodes: 3985
batch:  1  of  2
{'relationships_created': 2000}
batch:  2  of  2
{'relationships_created': 1985}
120.4  ms elapsed time
------------------------------> total refactored nodes:  3985
Saint Kitts and Nevis nodes: 4161
batch:  1  of  3
{'relationships_created': 2000}
batch:  2  of  3
{'relationships_created': 2000}
batch:  3  of  3
{'relationships_created': 161}
126.9  ms elapsed time
------------------------------> total refactored nodes:  8146
Egypt nodes: 3980
batch:  1  of  2
{'relationships_created': 2000}
batch:  2  of  2
{'relationships_created': 1980}
122.5  ms elapsed time
------------------------------> total refactored nodes:  12126
Saudi Arabia nodes: 4131
batch:  1  of  3
{'relationships_created': 2000}
batch:  2  of  3
{'relationships_created': 2000}
batch:  3  of 

In [48]:
# Refactor Organizations to Country, implicit batches
#
# For every distinct Organization.country that still lacks a HAS_LOCATION
# relationship, the linking statement is re-run until the running total of
# requested batch slots reaches the node count reported for that country.

import time

from neo4j.v1 import GraphDatabase, basic_auth, TRUST_ON_FIRST_USE, CypherError

driver = GraphDatabase.driver("bolt://localhost",
                              auth=basic_auth("neo4j", "neo4j"),
                              encrypted=False,
                              trust=TRUST_ON_FIRST_USE)

# Distinct country values of not-yet-linked Organizations, with their counts.
childProps1 = '''
MATCH (n:Organization)
WHERE NOT ((n)-[:HAS_LOCATION]-())
RETURN n.country AS pname, count(n) AS ncount
'''

# Links at most {limit} unlinked Organizations of one country to its Country node.
indexRefactor1 = '''
MATCH (c:Country {countryName:{pname}}), (n:Organization {country:{pname}})
WHERE NOT ((n)-[:HAS_LOCATION]-())
WITH c,n LIMIT {limit} MERGE (c)<-[r:HAS_LOCATION]-(n)
'''
ntotal = 0
batchSize = 2000

session = driver.session()
# Started before the try block so the `finally` clause can always report it.
tjob = time.time()

try:
    print("processing ---> getting child property list for refactoring")

    # Materialize the rows first so the outer result is fully read before
    # further statements are run on the same session.
    propertyList = list(session.run(childProps1))

    print('-----------------')
    print("processing ---> starting refactoring to parent category")
    print ("%s %s" % ("using batch size: ", batchSize))
    print('-----------------')

    for row in propertyList:
        pname = row["pname"]
        ncount = row["ncount"]
        print("%s %s %s" % (pname, "nodes:", ncount))
        t0 = time.time()
        btotal = 0

        while True:
            # One auto-commit statement per batch; each batch is committed as
            # soon as its result has been consumed.
            result = session.run(indexRefactor1, {"pname": pname, "limit": batchSize})
            btotal = btotal + batchSize
            summary = result.consume()
            counters = summary.counters
            print(counters)

            # Termination is driven by the count from childProps1, not by the
            # query result, so the loop always ends after a bounded number of runs.
            if btotal < ncount:
                print('--next batch--')

            else:
                print('-----done-----')
                break

        print(round((time.time() - t0)*1000,1), " ms elapsed time")
        ntotal = ntotal + ncount
        print ("%s %s" % ("------------------------------> total refactored nodes: ", ntotal))


except CypherError as e:
    # Every batch runs as its own auto-commit statement, so there is no open
    # transaction to roll back; batches that already completed stay committed.
    # Other exception types propagate unchanged instead of being masked by a
    # call to a rollback method the session does not provide.
    print('*** Got exception',e)
    print('*** Not rolling back')

finally:
    session.close()
    print('Done!')
    print(round((time.time() - tjob)/60,1), " minutes elapsed time")

processing query1 ---> getting child property list for refactoring
-----------------
processing query2 ---> starting refactoring to parent category
using batch size:  2000
-----------------
Azerbaijan nodes: 3985
{'relationships_created': 2000}
--next batch--
{'relationships_created': 1985}
-----done-----
180.6  ms elapsed time
------------------------------> total refactored nodes:  3985
Saint Kitts and Nevis nodes: 4161
{'relationships_created': 2000}
--next batch--
{'relationships_created': 2000}
--next batch--
{'relationships_created': 161}
-----done-----
157.5  ms elapsed time
------------------------------> total refactored nodes:  8146
Egypt nodes: 3980
{'relationships_created': 2000}
--next batch--
{'relationships_created': 1980}
-----done-----
124.0  ms elapsed time
------------------------------> total refactored nodes:  12126
Saudi Arabia nodes: 4131
{'relationships_created': 2000}
--next batch--
{'relationships_created': 2000}
--next batch--
{'relationships_created': 131}
-