## Download the Amazon review data

In [4]:
# Get the Amazon reviews data
# More info at https://registry.opendata.aws/amazon-reviews/ and https://s3.amazonaws.com/amazon-reviews-pds/tsv/index.txt
! wget https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Video_Games_v1_00.tsv.gz && gunzip amazon_reviews_us_Video_Games_v1_00.tsv.gz

--2018-11-01 07:05:07--  https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Video_Games_v1_00.tsv.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.1.147
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.1.147|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 475199894 (453M) [application/x-gzip]
Saving to: ‘amazon_reviews_us_Video_Games_v1_00.tsv.gz’


2018-11-01 07:05:20 (36.8 MB/s) - ‘amazon_reviews_us_Video_Games_v1_00.tsv.gz’ saved [475199894/475199894]



## Convert the review data into nodes and edges files for importing into Neptune

In [7]:
import csv
from collections import namedtuple, defaultdict

# Decompressed review TSV produced by the download cell (wget + gunzip).
review_file = "amazon_reviews_us_Video_Games_v1_00.tsv"
# Output CSVs (vertices and edges) written by a later cell and then copied to S3.
# NOTE(review): open(..., "w") does not create directories, so /mnt/data/reviews
# must already exist before the write cell runs.
review_vertexes_file = "/mnt/data/reviews/video_games_nodes.csv"
review_edges_file = "/mnt/data/reviews/video_games_edges.csv"

# S3 bucket the CSVs are uploaded to (under its reviews/ prefix).
bucket_name = "gabehol-graph-datasets"

In [2]:
# Record types for the graph entities. They are plain tuples, so two records
# with the same field values hash equal and collapse to one entry in a set.
Customer = namedtuple('Customer', 'id')
Product = namedtuple('Product', 'id title category')
Review = namedtuple('Review', 'id customer_id product_id star_rating date')

# Accumulators for the distinct records found while parsing the review file.
customers, products, reviews = set(), set(), set()

In [4]:
# Parse the review TSV and collect the distinct customers, products and reviews.
# newline="" leaves line-ending handling to the csv module, as its documentation
# asks for file objects passed to a reader. The encoding is pinned so parsing
# does not depend on the kernel's locale (assumes the dataset is UTF-8 — TODO confirm).
with open(review_file, newline="", encoding="utf-8") as rf:
    # QUOTE_NONE: quote characters in the data are read as ordinary text
    # rather than as field quoting.
    reader = csv.DictReader(rf, delimiter="\t", quotechar='"', quoting=csv.QUOTE_NONE)

    for row in reader:
        # Vertex ids carry a one-letter prefix: "c" for customers, "p" for products.
        customers.add(Customer("c" + row["customer_id"]))

        products.add(
            Product(
                "p" + row["product_id"],
                row["product_title"],
                row["product_category"],
            )
        )

        # Reviews keep the raw customer/product ids; the "c"/"p" prefixes are
        # applied when the edges file is written.
        reviews.add(
            Review(
                row["review_id"],
                row["customer_id"],
                row["product_id"],
                row["star_rating"],
                row["review_date"],
            )
        )

In [5]:
# Write out the nodes and edges files for importing into Neptune
# More info on this format at https://docs.aws.amazon.com/neptune/latest/userguide/bulk-load-tutorial-format-gremlin.html

# Vertex file: one row per customer and one per product. Customers have no
# title/category, so those two property columns are written empty for them.
# newline="" stops the platform from translating the "\r\n" terminators the csv
# writer already emits (see the csv module docs); UTF-8 is set explicitly
# because the Neptune bulk loader expects UTF-8 encoded files.
with open(review_vertexes_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow([
        "~id",
        "~label",
        "product_title:String",
        "product_category:String",
    ])

    writer.writerows(
        [customer.id, "customer", "", ""]
        for customer in customers
    )

    # NOTE(review): products are de-duplicated on (id, title, category), so a
    # product id seen with two different titles yields two rows with the same
    # ~id — confirm that is acceptable for the loader.
    writer.writerows(
        [product.id, "product", product.title, product.category]
        for product in products
    )
        
# Edge file: one "review" edge per review, pointing from the customer vertex to
# the product vertex, with the star rating and review date as edge properties.
# newline="" stops the platform from translating the "\r\n" terminators the csv
# writer already emits (see the csv module docs); UTF-8 is set explicitly
# because the Neptune bulk loader expects UTF-8 encoded files.
with open(review_edges_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow([
        "~id",
        "~from",
        "~to",
        "~label",
        "star_rating:Int",
        "date:Date",
    ])

    # The "c"/"p" prefixes must match the vertex ids written to the nodes file;
    # "r" prefixes the edge's own id.
    writer.writerows(
        [
            "r" + review.id,
            "c" + review.customer_id,
            "p" + review.product_id,
            "review",
            review.star_rating,
            review.date,
        ]
        for review in reviews
    )

## Load the data into Neptune

In [6]:
# Upload nodes file to S3 for loading into Neptune.
# {review_vertexes_file} and {bucket_name} are Python variables that IPython
# substitutes into the shell command before it runs the AWS CLI.
! aws s3 cp {review_vertexes_file} s3://{bucket_name}/reviews/video_games_nodes.csv


upload: reviews/video_games_nodes.csv to s3://gabehol-graph-datasets/reviews/video_games_nodes.csv


In [7]:
# Upload edges file to S3 for loading into Neptune
! aws s3 cp {review_edges_file} s3://{bucket_name}/reviews/video_game_edges.csv

upload: reviews/video_games_edges.csv to s3://gabehol-graph-datasets/reviews/video_game_edges.csv


In [28]:
# Info about our neptune cluster
# NOTE(review): these values are specific to one AWS account and cluster;
# replace them with your own before running the load request.
# Cluster endpoint hostname; the load request is sent to port 8182 on this host.
neptune_host='neptunedbcluster-umztptccpt2t.cluster-ct6mcfcdqyrw.us-east-1.neptune.amazonaws.com'
# IAM role ARN sent as "iamRoleArn" in the load request
# (presumably the role Neptune uses to read from S3 — confirm).
neptune_iam_role='arn:aws:iam::541003905521:role/NeptuneQuickStart-NeptuneSta-NeptuneLoadFromS3Role-T4SA5MUD9VX7'
# S3 prefix that the nodes/edges CSVs were uploaded under; sent as "source".
neptune_load_src=f's3://{bucket_name}/reviews'
# Sent as "region" in the load request.
neptune_region='us-east-1'

In [33]:
%%bash -s {neptune_host} {neptune_load_src} {neptune_iam_role} {neptune_region}
# Start a Neptune bulk-load job by POSTing a JSON request to the cluster's
# /loader endpoint on port 8182.
# Positional parameters are the arguments given after -s on the %%bash line:
#   $1 = neptune_host, $2 = neptune_load_src, $3 = neptune_iam_role, $4 = neptune_region
# The heredoc delimiter is unquoted, so bash expands $2..$4 inside the JSON body;
# "-d @-" makes curl read that body from stdin.
curl -X POST -H 'Content-Type: application/json' http://$1:8182/loader -d @- << EOF
{
  "source" : "$2",
  "format" : "csv",  
  "iamRoleArn" : "$3", 
  "region" : "$4", 
  "failOnError" : "FALSE"
}
EOF

{
    "status" : "200 OK",
    "payload" : {
        "loadId" : "f11d0b7b-80ba-4d6e-baf6-6a1ca8399156"
    }
}

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   346  100   110  100   236    462    991 --:--:-- --:--:-- --:--:--   995
