In [1]:
import os
# Find the latest version of spark 3.2  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
spark_version = 'spark-3.2.2'
# spark_version = 'spark-<enter version>'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (185.10% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (185.10% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com] [Connecting to                                                                               Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com] [Connecting to                                                                               Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease [1,581 B]
Hit:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:6 http://security.ubuntu.com/ubuntu b

In [2]:
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

--2022-11-22 19:23:05--  https://jdbc.postgresql.org/download/postgresql-42.2.9.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 914037 (893K) [application/java-archive]
Saving to: ‘postgresql-42.2.9.jar’


2022-11-22 19:23:05 (4.57 MB/s) - ‘postgresql-42.2.9.jar’ saved [914037/914037]



In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Books").config("spark.driver.extraClassPath","/content/postgresql-42.2.9.jar").getOrCreate()

In [4]:
from pyspark import SparkFiles
url = 'https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Books_v1_00.tsv.gz'
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get('amazon_reviews_us_Books_v1_00.tsv.gz'), sep='\t', header=True)

In [5]:
df.show(10)

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   25933450| RJOVP071AVAJO|0439873800|      84656342|There Was an Old ...|           Books|          5|            0|          0|   N|                Y|          Five Stars|I love it and so ...| 2015-08-31|
|         US|    1801372|R1ORGBETCDW3AI|1623953553|     729938122|      I Saw a Friend|           Books|          5|    

In [6]:
size = df.count()
print(f"Number of rows in DataFrame : {size}")

Number of rows in DataFrame : 10319090


In [7]:
df = df.dropDuplicates().dropna()

In [8]:
size = df.count()
print(f"Number of rows in DataFrame : {size}")

Number of rows in DataFrame : 10316622


In [9]:
# Examine schema
df.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- helpful_votes: string (nullable = true)
 |-- total_votes: string (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: string (nullable = true)



In [10]:
# Drop unnused columns
df = df.drop('marketplace','product_category','verified_purchase','review_headline','review_body')

In [11]:
# Convert numerical strings to integers to match schema
from pyspark.sql.types import IntegerType
df = df.withColumn("customer_id", df["customer_id"].cast(IntegerType()))
df = df.withColumn("product_parent", df["product_parent"].cast(IntegerType()))
df = df.withColumn("star_rating", df["star_rating"].cast(IntegerType()))
df = df.withColumn("helpful_votes", df["helpful_votes"].cast(IntegerType()))
df = df.withColumn("total_votes", df["total_votes"].cast(IntegerType()))

In [12]:
# Convert date strings to date types to match schema
from pyspark.sql.functions import col, to_date
df = df.withColumn("review_date",to_date(col("review_date"),"yyyy-mm-dd"))

In [13]:
df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: integer (nullable = true)
 |-- product_title: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- vine: string (nullable = true)
 |-- review_date: date (nullable = true)



In [14]:
review_id_df = df.select(["review_id", "customer_id","product_id","product_parent","review_date"])
review_id_df.show()

+--------------+-----------+----------+--------------+-----------+
|     review_id|customer_id|product_id|product_parent|review_date|
+--------------+-----------+----------+--------------+-----------+
|R10014V9AVMQ73|   28941543|0137150830|     854491944| 2014-01-18|
|R1004FYSAYGYNZ|      28650|8183860664|     188251931| 2015-01-19|
|R10086W0U9OHY1|   35531538|1421695103|     828616333| 2012-01-28|
|R100LDX2L70YXF|   37127528|0439919045|     372719167| 2014-01-25|
|R100OI6P955T8N|   18451121|0764585924|     843831426| 2014-01-12|
|R100QVQSCIC23H|   27809831|1450252001|     483095575| 2014-01-11|
|R100REDE8CZG1Z|   13544187|0955176247|     348681061| 2013-01-20|
|R100U5WN59IKMO|   38788167|1561706124|     436984722| 2015-01-03|
|R100WKCXPC71SI|    9246053|0528011499|     607239732| 2015-01-07|
|R100WYUUD0OU9H|   44142581|0692445374|     272422978| 2015-01-01|
|R1011S5OVDI8X4|   24486045|1591030250|     596523810| 2012-01-21|
|R10137BN9ZX753|   18634997|1780974574|     448101930| 2014-01

In [15]:
products_df = df.dropDuplicates(["product_id"]).select("product_id","product_title")
products_df.show()

+----------+--------------------+
|product_id|       product_title|
+----------+--------------------+
|0001064487|Celebremos Su Gloria|
|0001203088|Hilda Boswell's O...|
|0001380753|The Great Big Pad...|
|0001383647|Sesame Street Pre...|
|0001384198|The Little Engine...|
|0001622137|Tom Swift and His...|
|0001632132| Shadow the Sheepdog|
|0001711237|Little Black Goes...|
|0001711415|Tubby and the Lan...|
|0001713256|Go, Dog. Go! (Beg...|
|0001720279|Hooray for Diffen...|
|0001821121|Paddington Bear (...|
|0001839225|Spring Story (Bra...|
|0001839233|Summer Story (Bra...|
|0001845357|    Ballad of Favour|
|0001850164| The Rainbow Serpent|
|0001855085|The Book of 1000 ...|
|0001856871|           Dear Olly|
|0001857010|The Lion, the Wit...|
|0001857169|A Treasury of Narnia|
+----------+--------------------+
only showing top 20 rows



In [16]:
customers_df = df.groupBy("customer_id").count().withColumnRenamed("count","customer_count")
customers_df.show()

+-----------+--------------+
|customer_id|customer_count|
+-----------+--------------+
|   52942195|             2|
|   34912862|             3|
|   52013099|            31|
|   12325199|             1|
|   50785793|            59|
|   34302706|             1|
|   38047559|            19|
|   20366955|             1|
|   18989639|             7|
|   12294767|             4|
|   13238638|             1|
|   18950477|            46|
|    5334838|            26|
|   30655165|            10|
|   36925110|             6|
|   38086969|            15|
|   49657151|             2|
|   11914419|             7|
|   34608604|            80|
|   42177283|             1|
+-----------+--------------+
only showing top 20 rows



In [17]:
vine_df = df.select("review_id","star_rating","helpful_votes","total_votes","vine")
vine_df.show()

+--------------+-----------+-------------+-----------+----+
|     review_id|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
|R10014V9AVMQ73|          5|            1|          1|   N|
|R1004FYSAYGYNZ|          4|            1|          1|   N|
|R10086W0U9OHY1|          5|            0|          0|   N|
|R100LDX2L70YXF|          5|            0|          0|   N|
|R100OI6P955T8N|          5|            1|          1|   N|
|R100QVQSCIC23H|          3|            0|          0|   N|
|R100REDE8CZG1Z|          5|            2|          3|   N|
|R100U5WN59IKMO|          5|            1|          1|   N|
|R100WKCXPC71SI|          5|            0|          0|   N|
|R100WYUUD0OU9H|          5|            0|          0|   N|
|R1011S5OVDI8X4|          5|            3|          3|   N|
|R10137BN9ZX753|          5|            0|          1|   N|
|R101LVIUDBR3DQ|          5|            1|          1|   N|
|R101N0ZLJNPWNA|          4|            

In [18]:
# Configuration for RDS instance
mode="append"
jdbc_url = "jdbc:postgresql://database-1.cnudimefwqi6.us-west-1.rds.amazonaws.com:5432/my_data_class_db2"
config = {"user":"root",
          "password": "goceltics!",
          "driver":"org.postgresql.Driver"}

In [19]:
# Write DataFrames to RDS
review_id_df.write.jdbc(url=jdbc_url, table='review_id_table', mode=mode, properties=config)

In [20]:
products_df.write.jdbc(url=jdbc_url, table='products', mode=mode, properties=config)


In [21]:
customers_df.write.jdbc(url=jdbc_url, table='customers', mode=mode, properties=config)

In [22]:
vine_df.write.jdbc(url=jdbc_url, table='vine_table', mode=mode, properties=config)