# Load dataset to server

## Set up a couple variables
- doLocally determines whether to run against the local server or against the server specified in the graphpass file
- startFresh determines whether to delete all the nodes before running the code
- fLoc determines where to get the files from (more details below)
- there are three csv files and you can either use local files or dropbox files (I'll update later to hide the files)

In [1]:
# ---- graph setup switches ----
# True: use the local server; False: use the server listed in graphpass.txt
doLocally = False
# True: delete everything in the graph before loading
startFresh = False

# ---- input location ----
# fLoc is put in front of each csv name when the load queries are built.
# Alternatives for csv files kept on disk:
#   fLoc = "file:///Users/danny/Repos/csci5980_graph_database/data/"   # windows
#   fLoc = "file:////home/danny/Repos/csci5980_graph_database/data/"   # linux
#   fLoc = "file:///"                                                  # with standard options
# Left empty here because the dropbox names below are already complete URLs.
fLoc = ""

# ---- input csv files ----
# Names to use together with one of the on-disk fLoc values:
#   productCsvName = "products.csv"
#   reviewCsvName = "reviews.csv"
#   peopleCsvName = "people.csv"
# Dropbox copies (active):
productCsvName = "https://dl.dropbox.com/s/fv6b0vob2sbwlai/products.csv"
reviewCsvName = "https://dl.dropbox.com/s/orm7em01c27mmgi/reviews.csv"
peopleCsvName = "https://dl.dropbox.com/s/wepmkxqzt6xk0nk/people.csv"


## Import the necessary libraries and start the graph connection

In [2]:
import time
from py2neo import Graph

# Choose the server URL: the local default, or whatever graphpass.txt contains.
if doLocally:
    server = "http://neo4j:bigdata@localhost:7474/db/data/"
else:
    with open("graphpass.txt") as f:
        # strip() drops the trailing newline/whitespace a text file usually
        # ends with, so it does not become part of the URL
        server = f.read().strip()

# The URL embeds the username and password, so only show the part after the
# last "@" (host, port and path). A URL without credentials is shown unchanged.
print(server.rpartition("@")[2])

graph = Graph(server)
if startFresh:
    # wipe the whole graph before loading
    graph.delete_all()

http://neo4j:<redacted>@34.195.7.211:7474/db/data/


## Write the strings for the load queries

In [4]:
# Cypher text for every statement the loader sends to the server.

# Uniqueness constraints: one per node label, both on the id property.
q1 = "\ncreate constraint on (pe:Person) assert pe.id is unique;\n"
q2 = "\ncreate constraint on (pr:Product) assert pr.id is unique;\n"

# Full csv locations: the location prefix followed by the file name.
fnRev = fLoc + reviewCsvName
fnPpl = fLoc + peopleCsvName
fnProd = fLoc + productCsvName

# Reviews: merge the Person and Product node for each row, then create a
# Reviewed relationship between them carrying the review fields.
qRev = "\n".join([
    "",
    "using periodic commit 1000",
    'load csv with headers from "%s" as row' % fnRev,
    "merge (person:Person {id:row.reviewerID})",
    "merge (product:Product {id:row.asin})",
    "create (person)-[:Reviewed {ts:row.ts, reviewText:row.reviewText, "
    "score:row.score, summary:row.summary, "
    "helpful0:row.helpful0, helpful1:row.helpful1}]->(product);",
    "",
])

# People: match Person nodes by id and set their name.
qPpl = "\n".join([
    "",
    "using periodic commit 1000",
    'load csv with headers from "%s" as row' % fnPpl,
    "match(person:Person {id:row.reviewerID})",
    "set person.name = row.name;",
    "",
])

# Products: match Product nodes by id and set the descriptive properties.
qProd = "\n".join([
    "",
    "using periodic commit 1000",
    'load csv with headers from "%s" as row' % fnProd,
    "match(product:Product {id:row.asin})",
    "set product.name = row.name, product.price = row.price, product.imUrl = row.imUrl,",
    "    product.brand = row.brand, product.rankCat = row.rankCat, product.rank = row.rank,",
    "    product.categories = row.categories;",
    "",
])

## Run the queries
If using the whole k-cores dataset (40 million reviews, 9 million products), it should take roughly 3 hours - or at least, it took that long on my i5-processor desktop.

In [5]:
# run all the queries


def _runTimed(query):
    """Run one query on the graph and return how many seconds it took."""
    started = time.time()
    graph.run(query)
    return time.time() - started


def _recordElapsed(label, elapsed, doneFile, total=None):
    """Print the elapsed seconds for a step and write them to a marker file.

    label    -- name printed in front of the elapsed time
    elapsed  -- seconds the step took
    doneFile -- path of the marker file to (over)write
    total    -- optional overall duration, written on a second line

    The same elapsed value is printed and written, so the two always agree.
    A marker file that cannot be written is reported but does not stop the
    load: the markers are only progress notes, and the load itself can take
    hours.
    """
    print(label, elapsed)
    try:
        with open(doneFile, "w") as f:
            f.write(str(elapsed))
            if total is not None:
                f.write("\n")
                f.write(str(total))
    except OSError as err:
        print("could not write", doneFile, err)


# run constraints
graph.run(q1)
graph.run(q2)

# to trace the driver traffic while debugging:
# from py2neo import watch
# watch("neo4j.bolt")

# run reviews first - it creates all the nodes
inittime = time.time()
revSecs = _runTimed(qRev)
_recordElapsed("reviews", revSecs, "/home/danny/Dropbox/bigdata/revdone.txt")

# add names to people
pplSecs = _runTimed(qPpl)
_recordElapsed("people", pplSecs, "/home/danny/Dropbox/bigdata/ppldone.txt")

# add misc information to products
prodSecs = _runTimed(qProd)
# overall duration of the three load queries (constraints not included)
finalleng = time.time() - inittime
_recordElapsed("products", prodSecs, "/home/danny/Dropbox/bigdata/proddone.txt",
               total=finalleng)

ConnectionAbortedError: [WinError 10053] An established connection was aborted by the software in your host machine