<a href="https://colab.research.google.com/github/chanhodchang/Amazon_Reviews_ETL/blob/master/AWS_ME_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

In [2]:
# Download the PostgreSQL JDBC driver jar (version 42.2.9) into the working directory.
# No database connection is opened here; the jar is only fetched so that it can be
# placed on the Spark driver classpath when the SparkSession is created.
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

--2020-03-17 23:05:45--  https://jdbc.postgresql.org/download/postgresql-42.2.9.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 914037 (893K) [application/java-archive]
Saving to: ‘postgresql-42.2.9.jar’


2020-03-17 23:05:46 (4.63 MB/s) - ‘postgresql-42.2.9.jar’ saved [914037/914037]



In [0]:
# Initialise findspark first so that the pyspark package can be imported
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

# Create (or reuse) the session, with the PostgreSQL JDBC jar on the driver classpath
spark = (
    SparkSession.builder
    .appName("AWS_Mobile_Reviews")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.9.jar")
    .getOrCreate()
)

In [0]:
!wget -q https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Mobile_Electronics_v1_00.tsv.gz
!gunzip amazon_reviews_us_Mobile_Electronics_v1_00.tsv.gz

In [5]:
# Spark SQL column helper and functions namespace
from pyspark.sql.functions import col
import pyspark.sql.functions as F

from pyspark import SparkFiles

# Register the remote archive with the Spark context, then read the fetched copy
# as a tab-separated file whose first row holds the column names.
url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Mobile_Electronics_v1_00.tsv.gz"
spark.sparkContext.addFile(url)
mobile_electronics_df = spark.read.csv(
    SparkFiles.get("amazon_reviews_us_Mobile_Electronics_v1_00.tsv.gz"),
    header=True,
    sep="\t",
)

# Preview the first rows
mobile_electronics_df.show()

+-----------+-----------+--------------+----------+--------------+--------------------+------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|  product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   20422322| R8MEA6IGAHO0B|B00MC4CED8|     217304173|BlackVue DR600GW-PMP|Mobile_Electronics|          5|            0|          0|   N|                Y|         Very Happy!|As advertised. Ev...| 2015-08-31|
|         US|   40835037|R31LOQ8JGLPRLK|B00OQMFG1Q|     137313254|GENSSI GSM / GPS ...|Mobile_Electronics|      

In [0]:
# Import struct fields that we can use
from pyspark.sql.types import StructField, StringType, IntegerType, StructType, DateType

In [7]:
# (column name, Spark type) pairs in the same order as the columns of the TSV
column_types = [
    ('marketplace', StringType()),
    ('customer_id', IntegerType()),
    ('review_id', StringType()),
    ('product_id', StringType()),
    ('product_parent', IntegerType()),
    ('product_title', StringType()),
    ('product_category', StringType()),
    ('star_rating', IntegerType()),
    ('helpful_votes', IntegerType()),
    ('total_votes', IntegerType()),
    ('vine', StringType()),
    ('verified_purchase', StringType()),
    ('review_headline', StringType()),
    ('review_body', StringType()),
    ('review_date', DateType()),
]

# One nullable StructField per column
schema = [StructField(name, dtype, True) for name, dtype in column_types]
schema

[StructField(marketplace,StringType,true),
 StructField(customer_id,IntegerType,true),
 StructField(review_id,StringType,true),
 StructField(product_id,StringType,true),
 StructField(product_parent,IntegerType,true),
 StructField(product_title,StringType,true),
 StructField(product_category,StringType,true),
 StructField(star_rating,IntegerType,true),
 StructField(helpful_votes,IntegerType,true),
 StructField(total_votes,IntegerType,true),
 StructField(vine,StringType,true),
 StructField(verified_purchase,StringType,true),
 StructField(review_headline,StringType,true),
 StructField(review_body,StringType,true),
 StructField(review_date,DateType,true)]

In [8]:
# Wrap the field list in a StructType and re-read the TSV with explicit column types
final = StructType(schema)
me_df = spark.read.csv(
    SparkFiles.get("amazon_reviews_us_Mobile_Electronics_v1_00.tsv.gz"),
    header=True,
    sep="\t",
    schema=final,
)
me_df.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: integer (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: date (nullable = true)



In [0]:
# Keep only rows that have no null in any column
clean_mobile = me_df.na.drop()

In [10]:
# Per-review identifiers: the review, its customer and product, and the review date
review_id_table = clean_mobile.select(
    'review_id',
    'customer_id',
    'product_id',
    'product_parent',
    'review_date',
)
review_id_table.show()

+--------------+-----------+----------+--------------+-----------+
|     review_id|customer_id|product_id|product_parent|review_date|
+--------------+-----------+----------+--------------+-----------+
| R8MEA6IGAHO0B|   20422322|B00MC4CED8|     217304173| 2015-08-31|
|R31LOQ8JGLPRLK|   40835037|B00OQMFG1Q|     137313254| 2015-08-31|
|R2Y0MM9YE6OP3P|   51469641|B00QERR5CY|      82850235| 2015-08-31|
| RRB9C05HDOD4O|    4332923|B00QUFTPV4|     221169481| 2015-08-31|
|R26I2RI1GFV8QG|   44855305|B0067XVNTG|     563475445| 2015-08-31|
| RY8DDL22YG4R5|    7846966|B00KA6CCVY|     713636156| 2015-08-31|
|R2AT2426ZHFUHH|   21299354|B00MJCDPM2|     754171872| 2015-08-31|
|R3RRXU2R23NMQ9|   28902968|B00ET5AWBY|     508373185| 2015-08-31|
|R250PR8VJUZ62F|    2074438|B00Z9K379C|     759387370| 2015-08-31|
| RBEMQ29WJBHYG|   48701722|B00YO3UYXW|     913911909| 2015-08-31|
|R19VVIUT4BZCMT|    2990247|B011W26BU2|     938399424| 2015-08-31|
|R1DT8JJUQHUKUL|   52946117|B00H8PI78C|     661515294| 2015-08

In [11]:
# One row per product_id with a title taken from one of its reviews
products = (
    clean_mobile
    .select('product_id', 'product_title')
    .dropDuplicates(['product_id'])
)
products.show()

+----------+--------------------+
|product_id|       product_title|
+----------+--------------------+
|B0006BFQUM|Eminence American...|
|B0009YV3U8|Pyle PLCAP60HB 6 ...|
|B000A50URS|Panasonic Digital...|
|B000AAK0TG|iSkin eVo2 (Vamp)...|
|B000BM9FLC|Visual Land 1 GB ...|
|B000EK5VJ6|8 Co-Axial MARINE...|
|B000FT66RM|Blue Rebel Case H...|
|B000GTPNF2|1.5" LCD 1 GB MP3...|
|B000J3A5EO|Memorex MMP8750 5...|
|B000KPP4MY|Technical Pro TP ...|
|B000LU9ROE|PACIFIC ACC CORP....|
|B000VUPSGU|Car Vent Swivel M...|
|B000WOEONS|HX750s Floating H...|
|B000ZIKFNY|PCMICROSTORE Bran...|
|B00104XCRS|eForCity White Tr...|
|B0012GYX7C|KICKER 07DS650 DS...|
|B0014XVHJK|Naxa 2 Piece USB ...|
|B00164H7MS|Ge 51135 Z-Wave -...|
|B00166KTIU|Microsoft Zune 30...|
|B0018MHSDG|I.SOUND DGIPOD-17...|
+----------+--------------------+
only showing top 20 rows



In [12]:
# Number of reviews written by each customer, exposed as customer_count
customers = (
    clean_mobile
    .groupBy('customer_id')
    .count()
    .withColumnRenamed('count', 'customer_count')
)
customers.show()

+-----------+--------------+
|customer_id|customer_count|
+-----------+--------------+
|     176073|             1|
|   20992492|             1|
|    1397686|             1|
|   50371142|             1|
|     235096|             1|
|   12320806|             1|
|   19707203|             1|
|   15207172|             1|
|     538703|             1|
|   11223135|             1|
|   31447151|             1|
|   35961247|             1|
|   17835519|             1|
|   15404885|             1|
|   10182610|             1|
|   52399092|             1|
|   17133428|             1|
|   27870423|             1|
|   11574168|             1|
|   30685607|             1|
+-----------+--------------+
only showing top 20 rows



In [14]:
# Rating and vote columns per review, together with the vine flag
vine_table = clean_mobile.select(
    'review_id',
    'star_rating',
    'helpful_votes',
    'total_votes',
    'vine',
)
vine_table.show()

+--------------+-----------+-------------+-----------+----+
|     review_id|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
| R8MEA6IGAHO0B|          5|            0|          0|   N|
|R31LOQ8JGLPRLK|          5|            0|          1|   N|
|R2Y0MM9YE6OP3P|          5|            0|          0|   N|
| RRB9C05HDOD4O|          4|            0|          0|   N|
|R26I2RI1GFV8QG|          2|            0|          0|   N|
| RY8DDL22YG4R5|          3|            0|          1|   N|
|R2AT2426ZHFUHH|          3|            0|          1|   N|
|R3RRXU2R23NMQ9|          5|            0|          0|   N|
|R250PR8VJUZ62F|          4|            0|          2|   N|
| RBEMQ29WJBHYG|          5|          164|        168|   N|
|R19VVIUT4BZCMT|          5|            2|          2|   N|
|R1DT8JJUQHUKUL|          3|            0|          1|   N|
| RLIL6S9OGM3YA|          5|            0|          0|   N|
|R34EZZ68VYPHO0|          1|            

In [15]:
# Reviews whose vine flag is 'N', previewed from most to fewest helpful votes
non_vine_df = vine_table.where(col('vine') == 'N')
(
    non_vine_df
    .orderBy('helpful_votes', ascending=False)
    .select('star_rating', 'helpful_votes', 'total_votes')
    .show()
)

+-----------+-------------+-----------+
|star_rating|helpful_votes|total_votes|
+-----------+-------------+-----------+
|          5|          769|        791|
|          3|          435|        448|
|          5|          425|        429|
|          1|          415|        427|
|          1|          323|        337|
|          5|          310|        318|
|          5|          306|        314|
|          1|          293|        301|
|          4|          288|        294|
|          4|          284|        291|
|          4|          280|        284|
|          1|          280|        287|
|          5|          268|        278|
|          1|          257|        311|
|          1|          257|        268|
|          4|          252|        262|
|          5|          250|        293|
|          4|          248|        255|
|          4|          246|        294|
|          5|          237|        246|
+-----------+-------------+-----------+
only showing top 20 rows



In [16]:
# Reviews whose vine flag is 'Y', previewed from most to fewest helpful votes
paid_vine_df = vine_table.where(col('vine') == 'Y')
(
    paid_vine_df
    .orderBy('helpful_votes', ascending=False)
    .select('star_rating', 'helpful_votes', 'total_votes')
    .show()
)

+-----------+-------------+-----------+
|star_rating|helpful_votes|total_votes|
+-----------+-------------+-----------+
|          3|          396|        445|
|          5|          242|        281|
|          4|           42|         55|
|          4|           31|         41|
|          4|           11|         14|
|          5|            8|         11|
|          5|            6|         10|
|          4|            5|          9|
|          2|            4|          7|
|          5|            3|          6|
|          3|            2|         18|
|          5|            2|          5|
|          4|            1|          2|
|          3|            1|          2|
|          3|            0|          0|
|          5|            0|          0|
|          4|            0|          0|
|          2|            0|          3|
+-----------+-------------+-----------+



In [17]:
# Number of 5-star reviews among the non-Vine reviews
non_vine_df.where(col('star_rating') == 5).select('helpful_votes').count()

52247

In [18]:
# Number of 5-star reviews among the Vine reviews
paid_vine_df.where(col('star_rating') == 5).select('helpful_votes').count()

6

In [19]:
# Per-star-rating averages for non-Vine reviews, shown from 5 stars down
non_vine_avg = non_vine_df.groupBy('star_rating').avg()
(
    non_vine_avg
    .orderBy('star_rating', ascending=False)
    .select('star_rating', 'avg(helpful_votes)', 'avg(total_votes)')
    .show()
)

+-----------+------------------+------------------+
|star_rating|avg(helpful_votes)|  avg(total_votes)|
+-----------+------------------+------------------+
|          5| 1.180909908702892|1.4250770379160527|
|          4|1.2654573609114037|1.4955757106514767|
|          3|1.1515930113052415| 1.532168550873587|
|          2| 1.169380216171843|1.7451087700095773|
|          1|1.4519504151029228|2.2470146707608323|
+-----------+------------------+------------------+



In [20]:
# Per-star-rating averages for Vine reviews, shown from the highest rating down.
paid_vine_avg = paid_vine_df.groupBy('star_rating').avg()
# Sort by this DataFrame's own star_rating column. The previous version passed
# non_vine_avg['star_rating'] (a column of a different DataFrame), which made the
# cell depend on the non-Vine cell having been run first.
paid_vine_avg.orderBy(paid_vine_avg['star_rating'].desc()).select('star_rating','avg(helpful_votes)','avg(total_votes)').show()

+-----------+------------------+------------------+
|star_rating|avg(helpful_votes)|  avg(total_votes)|
+-----------+------------------+------------------+
|          5|              43.5|52.166666666666664|
|          4|              15.0|20.166666666666668|
|          3|             99.75|            116.25|
|          2|               2.0|               5.0|
+-----------+------------------+------------------+



In [23]:
# Total number of non-Vine reviews
non_vine_df.select(col('review_id')).count()

104954

In [24]:
# Total number of Vine reviews
paid_vine_df.select(col('review_id')).count()

18

In [25]:
# Row count of the helpful_votes column for non-Vine reviews (counts rows, not votes)
non_vine_df.select(col('helpful_votes')).count()

104954

In [26]:
# Row count of the helpful_votes column for Vine reviews (counts rows, not votes)
paid_vine_df.select(col('helpful_votes')).count()

18

In [0]:
# Configure settings for the RDS PostgreSQL target used by the JDBC writes.
import os
from getpass import getpass

# 'append' adds rows to the target tables on every run.
mode = 'append'
# NOTE(review): port 5431 differs from PostgreSQL's default 5432 — confirm it matches the instance.
jdbc_url="jdbc:postgresql://dataviz.cbp360boojkd.us-east-2.rds.amazonaws.com:5431/AWS_Mobile_Reviews"
# Credentials must not be stored in the notebook. The user name comes from RDS_USER
# (default "postgres"); the password comes from RDS_PASSWORD, or is prompted for
# without echo when that variable is unset.
# NOTE(review): the password that was previously committed here should be rotated.
config = {"user": os.environ.get("RDS_USER", "postgres"),
          "password": os.environ.get("RDS_PASSWORD") or getpass("RDS password: "),
          "driver":"org.postgresql.Driver"}

In [0]:
# Send the review_id_table DataFrame to the review_id_table table in RDS over JDBC
review_id_table.write.jdbc(
    url=jdbc_url,
    table='review_id_table',
    mode=mode,
    properties=config,
)

In [0]:
# Send the products DataFrame to the products table in RDS over JDBC
products.write.jdbc(
    url=jdbc_url,
    table='products',
    mode=mode,
    properties=config,
)

In [0]:
# Send the customers DataFrame to the customers table in RDS over JDBC
customers.write.jdbc(
    url=jdbc_url,
    table='customers',
    mode=mode,
    properties=config,
)

In [0]:
# Send the vine_table DataFrame to the vine_table table in RDS over JDBC
vine_table.write.jdbc(
    url=jdbc_url,
    table='vine_table',
    mode=mode,
    properties=config,
)