# Convert dataset to csv

In [None]:
import json
import csv
import gzip
import time
import re
import ast
from py2neo import Graph

In [None]:
# create product csv
fname = "data/metadata.json.gz"
prodSet = set()  # asins already written, so each product is emitted once


def _scrub(text, drop="\\,"):
    """Return text with newlines turned into spaces and every char in drop removed.

    None is passed through unchanged so missing fields stay empty in the CSV.
    """
    if text is None:
        return None
    table = {ord(ch): None for ch in drop}
    table[ord("\n")] = " "
    return text.translate(table)


# open gzip json and write
sttime = time.time() # time the process
count = 0
# newline='' is what the csv module requires of the file it writes to
# (otherwise rows get an extra blank line on Windows); utf-8 keeps any
# non-ASCII characters produced by escape sequences writable.
# The with-block also closes the CSV if a record fails to parse.
with open('data/products.csv', 'w', newline='', encoding='utf-8') as csvProducts, \
        gzip.open(fname, "r") as f:
    prod = csv.writer(csvProducts)
    prod.writerow(["asin", "name", "price", "imUrl", "brand", "categories", "rankCat", "rank"])
    for line in f:
        ln = line.decode("ascii").replace("\n", "")
        # NOTE(review): literal_eval rather than json.loads -- presumably the
        # records are Python dict literals, not strict JSON; confirm.
        d = ast.literal_eval(ln)
        asin = d.get("asin")
        if asin not in prodSet:
            prodSet.add(asin)
            # keep only the first sales-rank entry; fall back to a placeholder
            # when the record has none
            sr = d.get("salesRank") or {"NA": 0}
            rankCat, rank = next(iter(sr.items()))
            # titles additionally lose both quote characters
            prod.writerow([_scrub(asin), _scrub(d.get("title"), "\\,\"'"),
                           d.get("price"), d.get("imUrl"),
                           d.get("brand"), d.get("categories"),
                           rankCat, rank])
        count += 1
        if count % 100000 == 0:
            print(count)
            # csv.writer objects have no flush(); flush the underlying file
            csvProducts.flush()
#         if count > 10000: break
print(count, time.time()-sttime)


In [None]:
# choose which file to use
fname = "data/kcore_5.json.gz"

# create sets for ensuring only unique items
pplSet = set()


def _scrub(text):
    """Return text with newlines turned into spaces and backslashes/commas removed.

    None is passed through unchanged so missing fields stay empty in the CSV.
    """
    if text is None:
        return None
    return text.translate({ord("\n"): " ", ord("\\"): None, ord(","): None})


# open gzip json and write
sttime = time.time() # time the process
count = 0
# newline='' is what the csv module requires of the files it writes to;
# utf-8 keeps non-ASCII characters decoded from JSON \u escapes writable.
# The with-block closes both CSVs even if a record fails to parse.
with open('data/people.csv', 'w', newline='', encoding='utf-8') as csvPeople, \
        open('data/reviews.csv', 'w', newline='', encoding='utf-8') as csvReviews, \
        gzip.open(fname, "r") as f:
    # create csv writers
    ppl = csv.writer(csvPeople)
    ppl.writerow(["reviewerID", "name"])
    rev = csv.writer(csvReviews)
    rev.writerow(["reviewerID", "score", "reviewText", "summary", "helpful0",
                  "helpful1", "ts", "asin"])

    for line in f:
        d = json.loads(line.decode("ascii"))
        reviewer = d.get("reviewerID")

        # add person (first occurrence of each reviewerID only)
        if reviewer not in pplSet:
            pplSet.add(reviewer)
            ppl.writerow([reviewer, _scrub(d.get("reviewerName"))])

        # add review
        # a record without "helpful" yields two empty cells instead of a TypeError
        helpful = d.get("helpful") or [None, None]
        rev.writerow([reviewer, d.get("overall"),
                      _scrub(d.get("reviewText")), _scrub(d.get("summary")),
                      helpful[0], helpful[1],
                      d.get("unixReviewTime"), d.get("asin")])
        count += 1
        if count % 100000 == 0:
            print(count)
            # csv.writer objects have no flush(); flush the underlying files
            csvReviews.flush()
            csvPeople.flush()
#         if count > 10000: break
print(count, time.time()-sttime)

In [None]:
import shutil

# Copy each generated CSV into the Dropbox folder (people, products, reviews in that order).
for csvPath in ("data/people.csv", "data/products.csv", "data/reviews.csv"):
    shutil.copy(csvPath, "C:/Users/danny/Dropbox/bigdata/")

# Load dataset to server

In [None]:
# Connect to the local Neo4j server, then empty it before loading.
# Remote alternative (@ whatever ip address amazon is on):
# graph = Graph("http://neo4j:AgentSmith@34.236.229.56:7474/db/data/")
neo4jUri = "http://neo4j:bigdata@localhost:7474/db/data/"
graph = Graph(neo4jUri)
graph.delete_all()  # destructive: clears the existing graph contents

In [None]:
# write query statements
# Each load query is a Cypher template with the CSV location substituted in.
# The column names used on `row` must match the header rows of the CSV files.
# fLoc = "file:///Users/danny/Repos/csci5980_graph_database/data/"
fLoc = "file:///"
# statements asserting uniqueness
q1 = """
create constraint on (pe:Person) assert pe.id is unique;
"""
q2 = """
create constraint on (pr:Product) assert pr.id is unique;
"""

# load people
# match (not merge): only sets names on Person nodes that already exist
# fnPpl = fLoc + "people.csv"
fnPpl = "https://dl.dropbox.com/s/8wnfq7c7ppkvxbc/people.csv"
qPpl = """
using periodic commit 10000
load csv with headers from "%s" as row
match(person:Person {id:row.reviewerID})
set person.name = row.name;
""" % fnPpl

# load products
# match (not merge): only decorates Product nodes that already exist
# fnProd = fLoc + "products.csv"
fnProd = "https://dl.dropbox.com/s/scvk789n2xx0wcx/products.csv"
qProd = """
using periodic commit 10000
load csv with headers from "%s" as row
match(product:Product {id:row.asin})
set product.name = row.name, product.price = row.price, product.imUrl = row.imUrl,
    product.brand = row.brand, product.rankCat = row.rankCat, product.rank = row.rank,
    product.categories = row.categories;
""" % fnProd

# load reviews
# merge creates the Person/Product nodes on first sight, then one Reviewed
# relationship is created per CSV row.
# The reviews CSV header names the timestamp and rating columns "ts" and
# "score"; referencing the raw JSON keys here would yield null for both.
# fnRev = fLoc + "reviews.csv"
fnRev = "https://dl.dropbox.com/s/14eomoh7y229tjb/reviews.csv"
qRev = """
using periodic commit 10000
load csv with headers from "%s" as row
with row limit 10000000
merge (person:Person {id:row.reviewerID})
merge (product:Product {id:row.asin})
create (person)-[:Reviewed {ts:row.ts, reviewText:row.reviewText, score:row.score, summary:row.summary}]->(product);
""" % fnRev

In [None]:
# run all the queries
# Constraints go first; reviews are loaded before people and products,
# and the cumulative elapsed seconds are printed after each load.
sttime = time.time()
for constraint in (q1, q2):
    graph.run(constraint)

for label, query in (("reviews", qRev), ("people", qPpl), ("products", qProd)):
    graph.run(query)
    print(label, time.time()-sttime)

In [None]:
# with open("data/products_old.csv", "r") as f:
#     re = csv.reader(f)
#     count = 0
#     for line in re:
#         count += 1
# #         print(line)
# #         if count > 10: break
#         if "LG C" in line:
#             print(line)
# #             break
# print(count)