# Goal

The goal of this notebook is to query the Amazon Video Downloads review data stored in a PostgreSQL database on AWS RDS, using PySpark over JDBC.

# Environment Setup and Dependencies

In [None]:
# Import AWS RDS configuration
# Upload config.py through the Colab file picker and store it under a fixed
# name: Colab may rename repeated uploads (e.g. "config (3).py"), which would
# otherwise not be importable as `config`.
from google.colab import files
src = list(files.upload().values())[0]
# A context manager guarantees the bytes are flushed and the handle is closed
# before the module is imported on the next line.
with open('config.py', 'wb') as config_file:
    config_file.write(src)
from config import username, password, rds_url

Saving config.py to config (3).py


In [None]:
# Dependencies
import os

# set spark version
spark_version = 'spark-3.0.3'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark


# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [Waiting for headers] [Waiting for headers] [Connected to cloud.r-project.or0% [1 InRelease gpgv 1,581 B] [Waiting for headers] [Waiting for headers] [Wait                                                                               Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease
0% [1 InRelease gpgv 1,581 B] [Waiting for headers] [Waiting for headers] [Wait                                                                               Hit:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
0% [1 InRelease gpgv 1,581 B] [Waiting for headers] [Waiting for headers] [Conn                                                                               Get:4 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [1 InRelease gpgv 1,581 B] [Waiting for headers] [4 InRelease 14.2 kB/88.7 k                             

In [None]:
# postgres connection
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

--2022-08-03 15:56:52--  https://jdbc.postgresql.org/download/postgresql-42.2.9.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 914037 (893K) [application/java-archive]
Saving to: ‘postgresql-42.2.9.jar.3’


2022-08-03 15:56:53 (10.7 MB/s) - ‘postgresql-42.2.9.jar.3’ saved [914037/914037]



In [None]:
# Create (or reuse) the Spark session used for all queries below
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("AmazonQuery")
    # put the downloaded PostgreSQL JDBC jar on the driver classpath
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.9.jar")
    .getOrCreate()
)

# Query Postgres Database

In [None]:
# RDS connection settings, built from the values imported from config.py
jdbc_url = "jdbc:postgresql://{}".format(rds_url)

# Connection properties handed to the JDBC reader
config = dict(
    user=username,
    password=password,
    driver="org.postgresql.Driver",
)

# Direct SQL Query

In [None]:
# Parenthesized subquery with an alias; it is passed as the `table` argument
# of spark.read.jdbc in the next cell.
query = (
    "(select * from vine_table) as Vine"
)

In [None]:
# Build a DataFrame backed by the aliased subquery, read over JDBC
df2 = spark.read.jdbc(
    url=jdbc_url,
    table=query,
    properties=config,
)

In [None]:
# Print the leading rows of the subquery result as a text table
df2.show()

+--------------+-----------+-------------+-----------+----+
|     review_id|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
|R1006IKFAOWPPC|          5|            0|          0|   N|
|R1006V1JWI9VYM|          4|            0|          0|   N|
|R100DP5939VLI1|          4|            0|          0|   N|
|R100SNQ08T1G0W|          3|            1|          1|   N|
|R100XEX1Y95QEO|          5|            0|          0|   N|
|R100ZVN2C953CB|          4|            0|          0|   N|
|R1011QRZ1IRBX3|          5|            3|          3|   N|
|R101EJU3ET0EZ8|          4|            0|          0|   N|
|R101KJVKVYEVGA|          4|            0|          0|   N|
|R101O01CUFADI0|          5|            0|          2|   N|
|R101OFEXKHXSGX|          5|            0|          0|   N|
|R101TKMOB0YTJ5|          4|            0|          0|   N|
|R101W2SHXLSF95|          5|            0|          0|   N|
|R1022V48MM5TGY|          5|            

# Table-only Query

In [None]:
# Read vine_table from RDS into a DataFrame (whole table, no SQL subquery)

df = (
    spark.read.format("jdbc")
    .option("url", jdbc_url)
    .option("driver", "org.postgresql.Driver")
    .option("dbtable", "vine_table")
    .option("user", username)
    .option("password", password)
    .load()
)

In [None]:
# Print the leading rows of vine_table as a text table
df.show()

+--------------+-----------+-------------+-----------+----+
|     review_id|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
|R1006IKFAOWPPC|          5|            0|          0|   N|
|R1006V1JWI9VYM|          4|            0|          0|   N|
|R100DP5939VLI1|          4|            0|          0|   N|
|R100SNQ08T1G0W|          3|            1|          1|   N|
|R100XEX1Y95QEO|          5|            0|          0|   N|
|R100ZVN2C953CB|          4|            0|          0|   N|
|R1011QRZ1IRBX3|          5|            3|          3|   N|
|R101EJU3ET0EZ8|          4|            0|          0|   N|
|R101KJVKVYEVGA|          4|            0|          0|   N|
|R101O01CUFADI0|          5|            0|          2|   N|
|R101OFEXKHXSGX|          5|            0|          0|   N|
|R101TKMOB0YTJ5|          4|            0|          0|   N|
|R101W2SHXLSF95|          5|            0|          0|   N|
|R1022V48MM5TGY|          5|            

# Comments

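*  Additional filters (transformations) can be chained onto the DataFrame; Spark evaluates them lazily, so nothing is executed until an action is called, such as .show(), count(), collect(), take(n), top(), countByValue(), reduce(), fold(), aggregate(), or foreach().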

_https://data-flair.training/blogs/spark-rdd-operations-transformations-actions/_


# Recreate Original Dataframe