## Useful links
https://neo4j.com/developer/cypher-query-language/  
https://neo4j.com/docs/operations-manual/current/introduction/  
http://py2neo.org/v3/index.html

Commands to start Neo4j on Linux (which one works depends on how it was installed):  
sudo systemctl {start|stop|restart} neo4j  
sudo service neo4j start  
sudo neo4j start

## Create CSV files for `LOAD CSV`

In [None]:
import json
import csv
import gzip
import time
import re
import ast
from py2neo import Graph

In [None]:
# choose which file to use
fname = "data/kcore_5.json.gz"


def _clean_field(value):
    """Flatten a free-text value so it stays on one simple CSV row.

    Newlines become spaces; backslashes and commas are removed.
    ``None`` (key absent from the record) is passed through unchanged.
    """
    if value is None:
        return None
    return value.replace("\n", " ").replace("\\", "").replace(",", "")


# create sets for ensuring only unique items
pplSet = set()

# Read the gzipped review dump (one JSON object per line) and write
# data/people.csv (one row per distinct reviewer) and data/reviews.csv
# (one row per review).
sttime = time.time()  # time the process
count = 0
# The input is opened first so a missing dump does not truncate existing CSVs.
# newline='' is what the csv module asks for on files handed to csv.writer;
# the encoding is pinned so the output does not depend on the platform default.
with gzip.open(fname, "r") as f, \
        open('data/people.csv', 'w', newline='', encoding='utf-8') as csvPeople, \
        open('data/reviews.csv', 'w', newline='', encoding='utf-8') as csvReviews:
    # create csv writers
    ppl = csv.writer(csvPeople)
    ppl.writerow(["reviewerID", "name"])

    rev = csv.writer(csvReviews)
    rev.writerow(["reviewerID", "score", "reviewText", "summary", "helpful0",
                  "helpful1", "ts", "asin"])

    for line in f:
        # utf-8 is a superset of ascii, so pure-ASCII dumps decode as before
        # while lines carrying raw non-ASCII bytes no longer abort the run.
        d = json.loads(line.decode("utf-8"))
        reviewerID = d.get("reviewerID")

        # add person (only the first time this reviewer is seen)
        if reviewerID not in pplSet:
            pplSet.add(reviewerID)
            ppl.writerow([reviewerID, _clean_field(d.get("reviewerName"))])

        # add review; a record without "helpful" yields two empty cells
        # instead of raising TypeError on None[0]
        helpful = d.get("helpful") or [None, None]
        rev.writerow([reviewerID, d.get("overall"),
                      _clean_field(d.get("reviewText")),
                      _clean_field(d.get("summary")),
                      helpful[0], helpful[1],
                      d.get("unixReviewTime"), d.get("asin")])
        count += 1
        if count % 100000 == 0:
            print(count)
#         if count > 10000: break
print(count, time.time()-sttime)

In [None]:
# create product csv
fname = "data/metadata.json.gz"
prodSet = set()


def _clean_field(value):
    """Flatten a free-text value so it stays on one simple CSV row.

    Newlines become spaces; backslashes and commas are removed.
    ``None`` (key absent from the record) is passed through unchanged.
    """
    if value is None:
        return None
    return value.replace("\n", " ").replace("\\", "").replace(",", "")


# Read the gzipped product metadata (one record per line) and write
# data/products.csv with one row per distinct asin.
sttime = time.time()  # time the process
count = 0
# The input is opened first so a missing dump does not truncate an existing CSV.
# newline='' is what the csv module asks for on files handed to csv.writer;
# the encoding is pinned so the output does not depend on the platform default.
with gzip.open(fname, "r") as f, \
        open('data/products.csv', 'w', newline='', encoding='utf-8') as csvProducts:
    prod = csv.writer(csvProducts)
    prod.writerow(["asin", "name", "price", "imUrl", "brand", "categories", "rankCat", "rank"])

    for line in f:
        # utf-8 is a superset of ascii, so pure-ASCII dumps decode as before
        ln = line.decode("utf-8").replace("\n", "")
        # Records are parsed as Python literals rather than JSON; literal_eval
        # only accepts literals, so unlike eval it cannot execute code.
        d = ast.literal_eval(ln)
        asin = d.get("asin")
        if asin not in prodSet:
            prodSet.add(asin)
            # Only the first salesRank entry is exported (key -> rankCat,
            # value -> rank); missing or empty salesRank becomes ("NA", 0).
            sr = d.get("salesRank") or {"NA": 0}
            rankCat, rank = next(iter(sr.items()))
            # titles additionally lose both kinds of quote characters
            ti = _clean_field(d.get("title"))
            if ti is not None:
                ti = ti.replace("\"", "").replace("\'", "")
            prod.writerow([_clean_field(asin), ti, d.get("price"), d.get("imUrl"),
                           d.get("brand"), d.get("categories"),
                           rankCat, rank])
        count += 1
        if count % 100000 == 0:
            print(count)
#         if count > 10000: break
print(count, time.time()-sttime)


In [None]:
# possible regex for above
#         print(d)
#         print(ln)
#         ln = re.sub("'(.+?)'([:|,|\}|\]])", '"\\1"\\2', ln)
#         ln = ln.replace("\"", "'")
#         ln = re.sub("'(.+?)'([:|,|\}|\]])", "\"\\1\"\\2", ln)
        
#         ln = re.sub("([a-zA-Z])\'([a-zA-Z])", "\\1\\2", ln)
#         ln = ln.replace("'", "\"")
#         d = json.loads(ln)
# the creator's way of doing things
# def parse(path):
#   g = gzip.open(path, 'r')
#   for l in g:
#     yield eval(l)

## Import data with load csv

In [None]:
import time
from py2neo import Graph
# Connect to the Neo4j HTTP endpoint on localhost:7474; the user name and
# password are embedded in the URL.
# NOTE(review): credentials are hard-coded here and in the commented-out
# remote URL below — consider loading them from the environment instead.
graph = Graph("http://neo4j:bigdata@localhost:7474/db/data/")
# graph = Graph("http://neo4j:AgentSmith@34.236.229.56:7474/db/data/")
# @ whatever ip address amazon is on
# Uncomment to wipe the whole database before re-importing:
# graph.delete_all()

In [None]:
# write query statements
# Each LOAD CSV statement below is a Cypher string with the CSV file URL
# substituted in via %-formatting; nothing is sent to the database here.
# fLoc = "file:///Users/danny/Repos/csci5980_graph_database/data/"
fLoc = "file:////home/danny/Repos/csci5980_graph_database/data/"
# fLoc = "file:///"

# statements asserting uniqueness
q1 = """
create constraint on (pe:Person) assert pe.id is unique;
"""
q2 = """
create constraint on (pr:Product) assert pr.id is unique;
"""

# load people
# Uses MATCH (not MERGE): it only fills in names on Person nodes that
# already exist, so it has to run after the reviews load has created them.
fnPpl = fLoc + "people.csv"
# fnPpl = "https://dl.dropbox.com/s/8wnfq7c7ppkvxbc/people.csv"
qPpl = """
using periodic commit 10000
load csv with headers from "%s" as row
match(person:Person {id:row.reviewerID})
set person.name = row.name;
""" % fnPpl

# load products
# Same pattern as people: MATCH on existing Product nodes, then set details.
fnProd = fLoc + "products.csv"
# fnProd = "https://dl.dropbox.com/s/scvk789n2xx0wcx/products.csv"
qProd = """
using periodic commit 10000
load csv with headers from "%s" as row
match(product:Product {id:row.asin})
set product.name = row.name, product.price = row.price, product.imUrl = row.imUrl,
    product.brand = row.brand, product.rankCat = row.rankCat, product.rank = row.rank,
    product.categories = row.categories;
""" % fnProd

# load reviews
# MERGE creates the Person/Product nodes on first sight, then one Reviewed
# relationship is created per CSV row.
# The row fields must match the reviews.csv header ("score", "ts"): the
# original JSON key names ("overall", "unixReviewTime") are not columns in
# that file, so referencing them leaves the properties unset.
fnRev = fLoc + "reviews.csv"
# fnRev = "https://dl.dropbox.com/s/14eomoh7y229tjb/reviews.csv"
qRev = """
using periodic commit 10000
load csv with headers from "%s" as row
merge (person:Person {id:row.reviewerID})
merge (product:Product {id:row.asin})
create (person)-[:Reviewed {ts:row.ts, reviewText:row.reviewText, score:row.score, summary:row.summary}]->(product);
""" % fnRev

In [None]:
# Execute everything against the database, printing the cumulative
# elapsed seconds after each bulk load.
sttime = time.time()

# uniqueness constraints first
for constraintQuery in (q1, q2):
    graph.run(constraintQuery)

# Reviews are loaded before people and products: the review load creates
# the nodes that the other two statements then match and decorate.
for stage, loadQuery in (("reviews", qRev), ("people", qPpl), ("products", qProd)):
    graph.run(loadQuery)
    print(stage, time.time()-sttime)

## Playing with py2neo Node creation

In [None]:
# simple example of adding nodes
from py2neo import Graph, Path, Node, NodeSelector, Relationship
import numpy as np
# Connect to a remote Neo4j HTTP endpoint; the commented-out line is the
# localhost alternative.
# NOTE(review): user name and password are hard-coded in the URL —
# consider loading them from the environment instead.
# graph = Graph("http://neo4j:bigdata@localhost:7474/db/data/")
graph = Graph("http://neo4j:AgentSmith@34.236.229.56:7474/db/data/")
# Uncomment to wipe the whole database first:
# graph.delete_all()

In [None]:
# Toy example: create 26 "peasant" person nodes plus one archduke, then
# link every other person node to the archduke with a pays_taxes_to
# relationship.
peasantNames = [
    "alex", "bob", "carol", "devon", "emily", "frank", "greg", "hank",
    "ingrid", "jerrodd", "kelly", "louis", "molly", "nick", "ollie",
    "patricia", "quinn", "rachel", "steve", "tom", "ulrich", "vance",
    "wendy", "xander", "yolanda", "zed",
]
for name in peasantNames:
    per = Node("person", name=name, title="peasant")
    graph.create(per)

Sir_Daniel = Node("person", name="Sir Daniel Jon Hanson III", title="Archduke of Stuffington")
graph.create(Sir_Daniel)
archdukeName = Sir_Daniel.get("name")

# select every node labelled "person" that is currently in the graph
selector = NodeSelector(graph)
people = selector.select("person")
for person in people:
    # the archduke does not pay taxes to himself
    if person.get("name") == archdukeName:
        continue
    rel = Relationship(person, "pays_taxes_to", Sir_Daniel)
    # amount is stored as a one-element list: a single uniform draw from [1, 100)
    rel["amount"] = list(np.random.uniform(1, 100, 1))
    graph.create(rel)

In [None]:
# the slow python way of importing everything
def createReview(line):
    """Turn one JSON-encoded review into a py2neo ``reviewed`` relationship.

    The relationship runs from a ``person`` node (``id`` and ``name`` taken
    from ``reviewerID`` / ``reviewerName``) to a ``product`` node (``asin``).
    It carries ``ts``, ``reviewText``, ``score`` and ``summary``. Keys that
    are absent from the record are read as ``None``.

    Nothing is written to the database; the caller merges or creates the
    returned relationship.
    """
    record = json.loads(line)

    reviewer = Node("person", id=record.get("reviewerID"))
    reviewer["name"] = record.get("reviewerName")

    product = Node("product", asin=record.get("asin"))

    review = Relationship(reviewer, "reviewed", product)
    # (relationship property, source key in the review record)
    fieldMap = (
        ("ts", "unixReviewTime"),
        ("reviewText", "reviewText"),
        ("score", "overall"),
        ("summary", "summary"),
    )
    for propName, sourceKey in fieldMap:
        review[propName] = record.get(sourceKey)
    return review

# Baseline timing run: merge every review in its own transaction.
fname = "data/Musical_Instruments_5.json"
sttime = time.time()
with open(fname, "r") as revFile:
    count = 0
    for line in revFile:
        rev = createReview(line)
        # One begin/merge/commit per review. A transaction is opened only
        # when there is something to put in it, so no empty transaction is
        # ever committed and none is left open after the last line.
        tx = graph.begin()
        tx.merge(rev)
        tx.commit()

        count += 1
        # progress report, printed once per 1000 reviews
        if count % 1000 == 0:
            print(count, time.time()-sttime)
print("Done", time.time()-sttime)

# Batched timing run: collect reviews in memory and merge them 10000 at a
# time inside a single transaction.
sttime = time.time()
with open(fname, "r") as revFile:
    count = 0
    revList = []
    for line in revFile:
        revList.append(createReview(line))
        count += 1
        if not count % 1000:
            print(count, time.time()-sttime)
        if count % 10000:
            # batch not full yet
            continue
        # full batch: merge everything collected so far in one transaction
        tx = graph.begin()
        for pending in revList:
            tx.merge(pending)
        print("Before commit", count, time.time()-sttime)
        tx.commit()
        print("After commit", count, time.time()-sttime)
        revList = []
    # flush the final, partially filled batch
    if revList:
        tx = graph.begin()
        for pending in revList:
            tx.merge(pending)
        print("Before commit", count, time.time()-sttime)
        tx.commit()
        revList = []
print("Done", time.time()-sttime)

## Create csv for neo4j-admin import

In [None]:
# create csv files for neo4j-admin import statement
import json
import csv
import gzip
import time
import re

# choose which file to use
# fname = "data/reviews_Patio_Lawn_and_Garden_5.json.gz"
# fname = "data/reviews_Pet_Supplies_5.json.gz"
fname = "data/reviews_Health_and_Personal_Care_5.json.gz"


def _clean_field(value):
    """Flatten a free-text value so it stays on one simple CSV row.

    Newlines become spaces; backslashes and commas are removed.
    ``None`` (key absent from the record) is passed through unchanged.
    """
    if value is None:
        return None
    return value.replace("\n", " ").replace("\\", "").replace(",", "")


# create sets for ensuring only unique items
pplSet = set()
prodSet = set()

# Read the gzipped review dump (one JSON object per line) and write three
# CSVs whose headers use the :ID / :LABEL / :START_ID / :END_ID / :TYPE
# column markers: people (nodes), products (nodes), reviews (relationships).
sttime = time.time()  # time the process
count = 0
# The input is opened first so a missing dump does not truncate existing CSVs.
# newline='' is what the csv module asks for on files handed to csv.writer;
# the encoding is pinned so the output does not depend on the platform default.
with gzip.open(fname, "r") as f, \
        open('data/people.csv', 'w', newline='', encoding='utf-8') as csvPeople, \
        open('data/products.csv', 'w', newline='', encoding='utf-8') as csvProducts, \
        open('data/reviews.csv', 'w', newline='', encoding='utf-8') as csvReviews:
    # create csv writers
    ppl = csv.writer(csvPeople)
    ppl.writerow(["reviewerID:ID", "name", ":LABEL"])

    prod = csv.writer(csvProducts)
    prod.writerow(["asin:ID", ":LABEL"])

    rev = csv.writer(csvReviews)
    rev.writerow([":START_ID", "score", "reviewText", "summary", "helpful0",
                  "helpful1", "ts", ":END_ID", ":TYPE"])

    for line in f:
        # utf-8 is a superset of ascii, so pure-ASCII dumps decode as before
        # while lines carrying raw non-ASCII bytes no longer abort the run.
        d = json.loads(line.decode("utf-8"))
        reviewerID = d.get("reviewerID")
        asin = d.get("asin")

        # add person (only the first time this reviewer is seen)
        if reviewerID not in pplSet:
            pplSet.add(reviewerID)
            ppl.writerow([reviewerID, _clean_field(d.get("reviewerName")), "Person"])
        # add product (temporary - use metadata later)
        if asin not in prodSet:
            prodSet.add(asin)
            prod.writerow([asin, "Product"])
        # add review; a record without "helpful" yields two empty cells
        # instead of raising TypeError on None[0]
        helpful = d.get("helpful") or [None, None]
        rev.writerow([reviewerID, d.get("overall"),
                      _clean_field(d.get("reviewText")),
                      _clean_field(d.get("summary")),
                      helpful[0], helpful[1],
                      d.get("unixReviewTime"), asin,
                      "Reviewed"])
        count += 1
print(count, time.time()-sttime)

### Official Neo4j Python driver

In [None]:
from neo4j.v1 import GraphDatabase, basic_auth

# Minimal round trip over bolt: create one Person node, then query it back
# by name and print its title and name.
driver = GraphDatabase.driver("bolt://localhost:7687", auth=basic_auth("neo4j", "bigdata"))
try:
    # The with-block closes the session even when a query raises.
    with driver.session() as session:
        session.run("CREATE (a:Person {name: {name}, title: {title}})",
                    {"name": "Arthur", "title": "King"})

        result = session.run("MATCH (a:Person) WHERE a.name = {name} "
                             "RETURN a.name AS name, a.title AS title",
                             {"name": "Arthur"})
        for record in result:
            print("%s %s" % (record["title"], record["name"]))
finally:
    # release the driver's connections as well, not just the session
    driver.close()