In [54]:
#STEP 1 : Generate fake data using GraphAware graphgen
# https://github.com/graphaware/neo4j-graphgen-procedure
# you will need to compile the graphgen .jar file and add it to Neo4j/plugins and restart Neo4j
# (tip: update to JDK 8)

#!pip install neo4j-driver

import time

from neo4j.v1 import GraphDatabase, basic_auth, TRUST_ON_FIRST_USE, CypherError

# Connect to the local Neo4j instance over Bolt (user/password neo4j/neo4j,
# encryption disabled).
driver = GraphDatabase.driver("bolt://localhost",
                              auth=basic_auth("neo4j", "neo4j"),
                              encrypted=False,
                              trust=TRUST_ON_FIRST_USE)

# graphgen procedure call: asks for 1,000,000 :Organization nodes, each with a
# generated `name` (companyName) and `country` property.
generate1 = '''
CALL generate.nodes('Organization', '{name: companyName, country: country}', 1000000)
'''

# Open a single session for the statement. (Previously a second session was
# opened on top of an unused first one, which was then never closed.)
session = driver.session()
try:
    t0 = time.time()
    print("processing...")
    result = session.run(generate1)
    # consume() drains the result and hands back the summary of the statement.
    summary = result.consume()
    counters = summary.counters
    print(summary)
    print(counters)
    print(round((time.time() - t0)*1000,1), " ms elapsed time")
    print('-----------------')
finally:
    # Release the session even when the procedure call fails.
    session.close()




processing...
<neo4j.v1.session.ResultSummary object at 0x17fb0c048>
{}
94686.1  ms elapsed time
-----------------


In [55]:
#STEP 2:  Set index on child property and constraint on parent category

#!pip install neo4j-driver

import time

from neo4j.v1 import GraphDatabase, basic_auth, TRUST_ON_FIRST_USE, CypherError

# Connect to the local Neo4j instance over Bolt (user/password neo4j/neo4j,
# encryption disabled).
driver = GraphDatabase.driver("bolt://localhost",
                              auth=basic_auth("neo4j", "neo4j"),
                              encrypted=False,
                              trust=TRUST_ON_FIRST_USE)

# Index on the child property that later steps match on.
index1 = '''
CREATE INDEX ON :Organization(country)
'''

# Uniqueness constraint on the parent category's key property.
constraint1 = '''
CREATE CONSTRAINT ON (n:Country) ASSERT n.countryName IS UNIQUE;
'''

# Run each schema statement in its own session and report summary, counters
# and timing. One loop replaces the two copy-pasted stanzas, and the extra
# session that used to be opened up front (and never closed) is gone.
for statement in (index1, constraint1):
    session = driver.session()
    try:
        t0 = time.time()
        print("processing...")
        result = session.run(statement)
        summary = result.consume()
        counters = summary.counters
        print(summary)
        print(counters)
        print(round((time.time() - t0)*1000,1), " ms elapsed time")
        print('-----------------')
    finally:
        # Release the session even when the statement fails.
        session.close()



processing...
<neo4j.v1.session.ResultSummary object at 0x1078017f0>
{'indexes_added': 1}
42.2  ms elapsed time
-----------------
processing...
<neo4j.v1.session.ResultSummary object at 0x107801b00>
{'constraints_added': 1}
990.2  ms elapsed time
-----------------


In [57]:
#STEP 3: (Optional)  Warmup Page Cache, helpful for large graphs
# requires APOC procedures
# https://neo4j-contrib.github.io/neo4j-apoc-procedures/

#!pip install neo4j-driver

import time

from neo4j.v1 import GraphDatabase, basic_auth, TRUST_ON_FIRST_USE, CypherError

# Connect to the local Neo4j instance over Bolt (user/password neo4j/neo4j,
# encryption disabled).
driver = GraphDatabase.driver("bolt://localhost",
                              auth=basic_auth("neo4j", "neo4j"),
                              encrypted=False,
                              trust=TRUST_ON_FIRST_USE)

session = driver.session()

# APOC procedure call used to warm up the page cache (see the step header).
warmup1 = '''
CALL apoc.warmup.run();
'''

try:
    t0 = time.time()
    print("processing...")

    result = session.run(warmup1)

    # Print every record the procedure returns. The one-element tuple keeps
    # the %-formatting correct even if a record is itself tuple-like.
    for record in result:
        print("%s" % (record,))

    summary = result.consume()
    counters = summary.counters
    print(counters)

    print(round((time.time() - t0)*1000,1), " ms elapsed time")
    print('-----------------')
finally:
    # Release the session even when the call fails (e.g. APOC not installed).
    session.close()

processing...
<Record pageSize=8192 nodesPerPage=546 nodesTotal=1000000 nodesLoaded=1832 nodesTime=0 relsPerPage=240 relsTotal=0 relsLoaded=0 relsTime=0 totalTime=0>
{}
245.3  ms elapsed time
-----------------


In [58]:
# STEP 4: Extract and create parent Category nodes

import time

from neo4j.v1 import GraphDatabase, basic_auth, TRUST_ON_FIRST_USE, CypherError

# Connect to the local Neo4j instance over Bolt (user/password neo4j/neo4j,
# encryption disabled).
driver = GraphDatabase.driver("bolt://localhost",
                              auth=basic_auth("neo4j", "neo4j"),
                              encrypted=False,
                              trust=TRUST_ON_FIRST_USE)

# Remove any existing :Country nodes (and their relationships) so the
# extraction below starts from a clean slate.
cleanup1 = '''
MATCH (n:Country) DETACH DELETE n
'''

# extract from a 5% random sample
extractCategory1 = '''
MATCH (n:Organization) WHERE rand() < 0.05
WITH COLLECT(DISTINCT n.country) AS names
FOREACH (name IN names |
 MERGE (:Country {countryName: name}))
RETURN names
'''


# or extract from a full scan
# (not executed below; swap it in for extractCategory1 to scan every node)
extractCategory2 = '''
MATCH (n:Organization)
WITH COLLECT(DISTINCT n.country) AS names
FOREACH (name IN names |
 MERGE (:Country {countryName: name}))
RETURN names
'''


# Run the cleanup, then the sampled extraction, each in its own session, and
# report summary, counters and timing. One loop replaces the two copy-pasted
# stanzas, and the extra session that used to be opened up front (and never
# closed) is gone.
for statement in (cleanup1, extractCategory1):
    session = driver.session()
    try:
        t0 = time.time()
        print("processing...")
        result = session.run(statement)
        summary = result.consume()
        counters = summary.counters
        print(summary)
        print(counters)
        print(round((time.time() - t0)*1000,1), " ms elapsed time")
        print('-----------------')
    finally:
        # Release the session even when the statement fails.
        session.close()



processing...
<neo4j.v1.session.ResultSummary object at 0x1093abda0>
{}
42.3  ms elapsed time
-----------------
processing...
<neo4j.v1.session.ResultSummary object at 0x109360588>
{'labels_added': 244, 'properties_set': 244, 'nodes_created': 244}
751.6  ms elapsed time
-----------------


In [59]:
# STEP 5: Refactor Children to Parent Category, in batches by indexes on both
# Bolt variant: using session, implicit transactions

import time

from neo4j.v1 import GraphDatabase, basic_auth, TRUST_ON_FIRST_USE, CypherError

# Connect to the local Neo4j instance over Bolt (user/password neo4j/neo4j,
# encryption disabled).
driver = GraphDatabase.driver("bolt://localhost",
                              auth=basic_auth("neo4j", "neo4j"),
                              encrypted=False,
                              trust=TRUST_ON_FIRST_USE)
session = driver.session()

# get counts of child nodes by property that needs refactoring
# (only Organization nodes that have no HAS_LOCATION relationship yet)
childProps1 = '''
MATCH (n:Organization)
WHERE NOT ((n)-[:HAS_LOCATION]-())
RETURN n.country AS pname, count(n) AS ncount
'''

# refactor, passing in the property value to both parent and child as
# parameters to restrict the cartesian product; each run links at most
# {limit} still-unlinked children to their parent
indexRefactor1 = '''
MATCH (c:Country {countryName:{pname}}), (n:Organization {country:{pname}})
WHERE NOT ((n)-[:HAS_LOCATION]-())
WITH c,n LIMIT {limit} MERGE (c)<-[r:HAS_LOCATION]-(n)
'''

ntotal = 0

# configure batch size
batchSize = 2000

# Job start time; set before the try so the timing report in `finally` can
# always read it.
tjob = time.time()

try:
    #get category and child node counts
    print("processing ---> getting child property list for refactoring")

    # Pull the whole count list into memory first, so the batch statements
    # below never run while this result is still being streamed.
    propertyList = list(session.run(childProps1))

    print('-----------------')

    # refactor, stepping through category values in batches per child node counts
    print("processing ---> starting refactoring to parent category")

    print ("%s %s" % ("using batch size: ", batchSize))

    print('-----------------')

    for row in propertyList:
        pname = row["pname"]
        ncount = row["ncount"]
        print("%s %s %s" % (pname, "nodes:", ncount))
        t0 = time.time()
        btotal = 0

        while True:
            # refactor, passing in the property value to match both parent and child on indexes
            result = session.run(indexRefactor1, {"pname": pname, "limit": batchSize})
            btotal = btotal + batchSize
            summary = result.consume()
            counters = summary.counters
            print(counters)

            # compare running total of batched nodes to total nodes
            if btotal < ncount:
                print('--next batch--')

            # end batches, go to next parent category value
            else:
                print('-----done-----')
                break

        print(round((time.time() - t0)*1000,1), " ms elapsed time")
        ntotal = ntotal + ncount
        print ("%s %s" % ("------------------------------> total refactored nodes: ", ntotal))

except Exception as e:
    # Every statement above went through session.run() without an explicit
    # transaction, so there is nothing to roll back here. The former
    # session.rollback() call had no transaction to act on and would itself
    # fail, hiding the original error. Both queries skip nodes that already
    # have a HAS_LOCATION relationship, so re-running this cell continues
    # with whatever is still unlinked.
    print('*** Got exception',e)
    print('*** Not rolling back')

finally:
    print('Done!')
    print(round((time.time() - tjob)/60,1), " minutes elapsed time")
    # Release the session; it was previously left open.
    session.close()

processing ---> getting child property list for refactoring
-----------------
processing ---> starting refactoring to parent category
using batch size:  2000
-----------------
Azerbaijan nodes: 3948
{'relationships_created': 2000}
--next batch--
{'relationships_created': 1948}
-----done-----
523.9  ms elapsed time
------------------------------> total refactored nodes:  3948
Saint Kitts and Nevis nodes: 3987
{'relationships_created': 2000}
--next batch--
{'relationships_created': 1987}
-----done-----
195.0  ms elapsed time
------------------------------> total refactored nodes:  7935
Egypt nodes: 4167
{'relationships_created': 2000}
--next batch--
{'relationships_created': 2000}
--next batch--
{'relationships_created': 167}
-----done-----
213.1  ms elapsed time
------------------------------> total refactored nodes:  12102
Saudi Arabia nodes: 4088
{'relationships_created': 2000}
--next batch--
{'relationships_created': 2000}
--next batch--
{'relationships_created': 88}
-----done-----
1