# PySpark Demo Notebook
## Steps
1. Deploy Docker stack
2. Create PostgreSQL table and sample data using sql file
3. Run Jupyter Notebook to test Docker stack

In [1]:
from pyspark.sql import SparkSession, DataFrameReader
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DataType

In [2]:
# Build (or reuse) the Spark session for this demo. The PostgreSQL JDBC
# driver jar is added to the driver classpath via spark.driver.extraClassPath.
spark = (
    SparkSession.builder
    .appName('pyspark_demo_app')
    .config(
        'spark.driver.extraClassPath',
        '/home/garystafford/work/postgresql-42.2.5.jar',
    )
    .getOrCreate()
)

# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext

### Existing Data
Read existing data from PostgreSQL database

In [3]:
# JDBC connection settings; `url` and `properties` are also used by the
# write cells further down the notebook.
properties = {'driver': 'org.postgresql.Driver'}
url = 'jdbc:postgresql://postgres:5432/demo'

# DataFrame backed by the bakery_basket table, read through JDBC.
df1 = (
    spark.read
    .format('jdbc')
    .option('url', url)
    .option('user', 'postgres')
    .option('password', 'postgres1234')
    .option('driver', properties['driver'])
    .option('dbtable', 'bakery_basket')
    .load()
)

In [4]:
# Print the current table contents (up to 20 rows, the default limit).
df1.show(n=20)

+---+----------+--------+-----------+-------------+
| id|      date|    time|transaction|         item|
+---+----------+--------+-----------+-------------+
|  1|2016-10-30|09:58:11|          1|        Bread|
|  2|2016-10-30|10:05:34|          2| Scandinavian|
|  3|2016-10-30|10:07:57|          3|Hot chocolate|
+---+----------+--------+-----------+-------------+



### Add Row to Table
Add new row of data to PostgreSQL database

In [5]:
# Column layout shared by the hand-made row below and the CSV import later on.
# There is no `id` field here; presumably the database fills it in -- confirm
# against the table definition in the SQL file.
schema = StructType([
    StructField(column_name, column_type, True)  # True -> column is nullable
    for column_name, column_type in (
        ('date', StringType()),
        ('time', StringType()),
        ('transaction', IntegerType()),
        ('item', StringType()),
    )
])

# A single new basket row to append to the table.
data = [('2016-10-30', '10:13:27', 2, 'Pastry')]

df2 = spark.createDataFrame(data, schema)

In [6]:
# Append the new row to the existing bakery_basket table over JDBC.
(
    df2.write
    .format('jdbc')
    .option('url', url)
    .option('user', 'postgres')
    .option('password', 'postgres1234')
    .option('driver', properties['driver'])
    .option('dbtable', 'bakery_basket')
    .mode('append')
    .save()
)

In [7]:
# Showing df1 again re-runs the JDBC read, so the row appended above appears.
df1.show(n=20)

+---+----------+--------+-----------+-------------+
| id|      date|    time|transaction|         item|
+---+----------+--------+-----------+-------------+
|  1|2016-10-30|09:58:11|          1|        Bread|
|  2|2016-10-30|10:05:34|          2| Scandinavian|
|  3|2016-10-30|10:07:57|          3|Hot chocolate|
|  1|2016-10-30|10:13:27|          2|       Pastry|
+---+----------+--------+-----------+-------------+



### Read CSV Data and Write to Database
Read in Kaggle dataset from CSV file and append to existing PostgreSQL data

In [8]:
# Load the Kaggle CSV. The first line is treated as a header, and the explicit
# schema supplies the column names and types.
kaggle_data = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("BreadBasket_DMS.csv", schema=schema)
)

In [9]:
# Preview the first 10 rows of the CSV data.
kaggle_data.show(n=10)

+----------+--------+-----------+-------------+
|      date|    time|transaction|         item|
+----------+--------+-----------+-------------+
|2016-10-30|09:58:11|          1|        Bread|
|2016-10-30|10:05:34|          2| Scandinavian|
|2016-10-30|10:05:34|          2| Scandinavian|
|2016-10-30|10:07:57|          3|Hot chocolate|
|2016-10-30|10:07:57|          3|          Jam|
|2016-10-30|10:07:57|          3|      Cookies|
|2016-10-30|10:08:41|          4|       Muffin|
|2016-10-30|10:13:03|          5|       Coffee|
|2016-10-30|10:13:03|          5|       Pastry|
|2016-10-30|10:13:03|          5|        Bread|
+----------+--------+-----------+-------------+
only showing top 10 rows



In [10]:
# Append every CSV row to the bakery_basket table over JDBC.
(
    kaggle_data.write
    .format('jdbc')
    .option('url', url)
    .option('user', 'postgres')
    .option('password', 'postgres1234')
    .option('driver', properties['driver'])
    .option('dbtable', 'bakery_basket')
    .mode('append')
    .save()
)

In [11]:
# Preview the first 10 rows of the table after the CSV append.
df1.show(n=10)

+---+----------+--------+-----------+-------------+
| id|      date|    time|transaction|         item|
+---+----------+--------+-----------+-------------+
|  1|2016-10-30|09:58:11|          1|        Bread|
|  2|2016-10-30|10:05:34|          2| Scandinavian|
|  3|2016-10-30|10:07:57|          3|Hot chocolate|
|  1|2016-10-30|10:13:27|          2|       Pastry|
|  2|2016-10-30|09:58:11|          1|        Bread|
|  3|2016-10-30|10:05:34|          2| Scandinavian|
|  4|2016-10-30|10:05:34|          2| Scandinavian|
|  5|2016-10-30|10:07:57|          3|Hot chocolate|
|  6|2016-10-30|10:07:57|          3|          Jam|
|  7|2016-10-30|10:07:57|          3|      Cookies|
+---+----------+--------+-----------+-------------+
only showing top 10 rows



### Analyze Data with Spark SQL
Analyze bakery data using Spark SQL

In [12]:
# Expose the JDBC-backed DataFrame to Spark SQL under the name "bakery_table".
df1.createOrReplaceTempView("bakery_table")

# Bind the query result to its own name instead of overwriting df1.
# df1 is what this cell registers as "bakery_table"; rebinding it here would
# make a re-run of the cell register a different DataFrame under that name.
bakery_sorted = spark.sql("SELECT * FROM bakery_table ORDER BY date, time")
bakery_sorted.show(n=15)

+---+----------+--------+-----------+-------------+
| id|      date|    time|transaction|         item|
+---+----------+--------+-----------+-------------+
|  1|2016-10-30|09:58:11|          1|        Bread|
|  2|2016-10-30|09:58:11|          1|        Bread|
|  3|2016-10-30|10:05:34|          2| Scandinavian|
|  4|2016-10-30|10:05:34|          2| Scandinavian|
|  2|2016-10-30|10:05:34|          2| Scandinavian|
|  3|2016-10-30|10:07:57|          3|Hot chocolate|
|  7|2016-10-30|10:07:57|          3|      Cookies|
|  6|2016-10-30|10:07:57|          3|          Jam|
|  5|2016-10-30|10:07:57|          3|Hot chocolate|
|  8|2016-10-30|10:08:41|          4|       Muffin|
| 11|2016-10-30|10:13:03|          5|        Bread|
|  9|2016-10-30|10:13:03|          5|       Coffee|
| 10|2016-10-30|10:13:03|          5|       Pastry|
|  1|2016-10-30|10:13:27|          2|       Pastry|
| 12|2016-10-30|10:16:55|          6|    Medialuna|
+---+----------+--------+-----------+-------------+
only showing top 15 rows

In [13]:
# Count how many rows each item has in bakery_table, most frequent first.
# The result gets its own name rather than overwriting df1: the earlier cell
# registers df1 as "bakery_table", and replacing df1 with this two-column
# aggregate would make that cell's "ORDER BY date, time" query fail on re-run.
item_counts = spark.sql(
    "SELECT item, count(*) as count FROM bakery_table GROUP BY item ORDER BY count DESC"
)
item_counts.show(n=10)

+-------------+-----+
|         item|count|
+-------------+-----+
|       Coffee| 5471|
|        Bread| 3326|
|          Tea| 1435|
|         Cake| 1025|
|       Pastry|  857|
|         NONE|  786|
|     Sandwich|  771|
|    Medialuna|  616|
|Hot chocolate|  591|
|      Cookies|  540|
+-------------+-----+
only showing top 10 rows

