# Convert dataset to csv

## Input and output file names
- Change the variables below to match the location of data files of interest.
- The cell below should also be where you set the working directory, if necessary
- productInFile should be the metadata file and reviewInFile should be the k-cores file.  
- productCsvName should be the wherever the products file should go (about 3Gb), reviewCsvName should be wherever the reviews go (about 24Gb), and peopleCsvName should be wherever the people file should go (about 90Mb).

In [None]:
# Gzipped source files, read one record per line by the cells below
productInFile = "data/metadata.json.gz"
reviewInFile = "data/kcore_5.json.gz"

# Destination paths for the generated CSV files
productCsvName = "data/products.csv"
peopleCsvName = "data/people.csv"
reviewCsvName = "data/reviews.csv"

## Load libraries

In [None]:
import json
import csv
import gzip
import time
import re
import ast
from py2neo import Graph

## Create product csv
Everything here should be ready to go. If you want to test it out on a smaller set uncomment the line near the end with the "break" statement.

In [None]:
# Stream the gzipped product metadata (productInFile) and write one CSV row
# per distinct asin to productCsvName.


def _clean_product_text(value, strip_quotes=False):
    """Return value made safe for a single CSV cell.

    Newlines become spaces; backslashes and commas are removed. When
    strip_quotes is true, single and double quote characters are removed
    as well. None is passed through unchanged so a missing field is
    written as an empty cell.
    """
    if value is None:
        return None
    cleaned = value.replace("\n", " ").replace("\\", "").replace(",", "")
    if strip_quotes:
        cleaned = cleaned.replace("\"", "").replace("'", "")
    return cleaned


sttime = time.time()  # time the process
count = 0
# newline="" is what the csv module asks for on files handed to csv.writer;
# without it every row is followed by a blank line on Windows. The explicit
# encoding keeps non-ASCII titles from depending on the platform default.
with gzip.open(productInFile, "r") as f, \
        open(productCsvName, "w", newline="", encoding="utf-8") as csvProducts:
    # asins already written, so each product appears only once
    prodSet = set()
    prod = csv.writer(csvProducts)
    prod.writerow(["asin", "name", "price", "imUrl", "brand", "categories", "rankCat", "rank"])
    for line in f:
        # utf-8 decodes every line that ascii did, plus valid non-ASCII input
        ln = line.decode("utf-8").replace("\n", "")
        # each line is parsed as a Python literal rather than as JSON
        d = ast.literal_eval(ln)
        asin = d.get("asin")
        if asin not in prodSet:
            prodSet.add(asin)
            # a missing or empty salesRank falls back to a placeholder pair
            sr = d.get("salesRank") or {"NA": 0}
            # only the first (category, rank) entry is exported
            rankCat, rank = next(iter(sr.items()))
            prod.writerow([_clean_product_text(asin),
                           _clean_product_text(d.get("title"), strip_quotes=True),
                           d.get("price"), d.get("imUrl"),
                           d.get("brand"), d.get("categories"),
                           rankCat, rank])
        # count every input line, duplicates included
        count += 1
        if count % 100000 == 0:
            print(count)
#         if count > 10000: break
print(count, time.time()-sttime)


## Create people and review csvs
Everything here should be ready to go. If you want to test it out on a smaller set uncomment the line near the end with the "break" statement.

In [None]:
# Stream the gzipped review file (reviewInFile) and write two CSVs:
# one row per distinct reviewer to peopleCsvName and one row per review
# to reviewCsvName.


def _clean_review_text(value):
    """Return value made safe for a single CSV cell.

    Newlines become spaces; backslashes and commas are removed. None is
    passed through unchanged so a missing field is written as an empty cell.
    """
    if value is None:
        return None
    return value.replace("\n", " ").replace("\\", "").replace(",", "")


sttime = time.time()  # time the process
count = 0
# newline="" is what the csv module asks for on files handed to csv.writer;
# without it every row is followed by a blank line on Windows. The explicit
# encoding keeps non-ASCII review text from depending on the platform default.
with gzip.open(reviewInFile, "r") as f, \
        open(peopleCsvName, "w", newline="", encoding="utf-8") as csvPeople, \
        open(reviewCsvName, "w", newline="", encoding="utf-8") as csvReviews:
    # reviewer ids already written, so each person appears only once
    pplSet = set()

    # create csv writers
    ppl = csv.writer(csvPeople)
    ppl.writerow(["reviewerID", "name"])

    rev = csv.writer(csvReviews)
    rev.writerow(["reviewerID", "score", "reviewText", "summary", "helpful0",
                  "helpful1", "ts", "asin"])
    for line in f:
        # utf-8 decodes every line that ascii did, plus valid non-ASCII input
        d = json.loads(line.decode("utf-8"))
        reviewerID = d.get("reviewerID")

        # add person (first occurrence only)
        if reviewerID not in pplSet:
            pplSet.add(reviewerID)
            ppl.writerow([reviewerID, _clean_review_text(d.get("reviewerName"))])

        # add review; a missing or null "helpful" entry yields two empty
        # cells instead of raising TypeError on the subscript
        helpful = d.get("helpful") or [None, None]
        rev.writerow([reviewerID, d.get("overall"),
                      _clean_review_text(d.get("reviewText")),
                      _clean_review_text(d.get("summary")),
                      helpful[0], helpful[1],
                      d.get("unixReviewTime"), d.get("asin")])
        count += 1
        if count % 100000 == 0:
            print(count)
#         if count > 10000: break
print(count, time.time()-sttime)

## Copy the data to a Dropbox folder
The files are specified, you just need to change the location of the output folder (dropLocs - yes it's play on words)

In [None]:
import shutil

# Folder that receives the copies; change this to your own location.
dropLocs = "C:/Users/danny/Dropbox/bigdata/"

# Copy the products, reviews and people CSVs, in that order.
for csvPath in (productCsvName, reviewCsvName, peopleCsvName):
    shutil.copy(csvPath, dropLocs)