Charlie Perez - DS 2002 SP23 - Final Project

This is my final project. It mainly builds upon my midterm project, using much of the same source data.

Import necessary libraries

In [0]:
import os
import json
import pymongo
import pyspark.pandas as pd  # This uses Koalas that is included in PySpark version 3.2 or newer.
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, BinaryType
from pyspark.sql.types import ByteType, ShortType, IntegerType, LongType, FloatType, DecimalType
import datetime

Create global variables

In [0]:
# Notebook-wide configuration: source-system connection settings, lakehouse paths,
# and a clean-slate reset of files left behind by a previous run.

# Azure MySQL Server Connection Information ###################
# NOTE(review): credentials are hard-coded here and repeated in the %sql / %scala cells below;
# consider a Databricks secret scope or environment variables before sharing this notebook.
# NOTE(review): jdbc_hostname, jdbc_port, src_database and connection_properties are not
# referenced by any later cell visible in this notebook -- the %sql cells repeat the literals.
jdbc_hostname = "cwp5xyj-mysql.mysql.database.azure.com"
jdbc_port = 3306
src_database = "sakila_datawarehouse"

connection_properties = {
  "user" : "cwp5xyj",
  "password" : "Azuresql1",
  "driver" : "org.mariadb.jdbc.Driver"
}

# MongoDB Atlas Connection Information ########################
atlas_database_name = "mflix"
atlas_cluster_name = "cluster0.ynxna8b"
atlas_user_name = "cwp5xyj"
atlas_password = "nMW0O0gwwFHpBjZT"

# Name of the Databricks metadata database created later in the notebook.
dst_database = "sakila_dlh"

# Root DBFS folder for this project; the database's files live directly beneath it.
base_dir = "dbfs:/FileStore/tables/sakila-project"
database_dir = f"{base_dir}/{dst_database}"

# TODO: Return to this portion before trying to run anything -- need to update the destinations based on what I use

# Folder that the streaming (hot-path) rental files are read from.
rentals_stream_dir = f"dbfs:/FileStore/tables/source_data/stream/rentals"

# Per-layer folders for the rentals fact stream (used below for schema and checkpoint locations).
# NOTE(review): rentals_output_gold is not referenced by any later cell visible in this notebook.
rentals_output_bronze = f"{database_dir}/fact_rentals/bronze"
rentals_output_silver = f"{database_dir}/fact_rentals/silver"
rentals_output_gold   = f"{database_dir}/fact_rentals/gold"


# Delete the Streaming Files ################################## 
# Recursive delete (second argument True) of the fact_rentals folder so the streams start fresh.
dbutils.fs.rm(f"{database_dir}/fact_rentals", True) 

# Delete the Database Files ###################################
# database_dir is the parent of the fact_rentals folder removed above, so this call also covers it.
dbutils.fs.rm(database_dir, True)


Out[2]: False

Define global function (don't need to set a dataframe because the data is already in Mongo)

In [0]:
# ######################################################################################################################
# Use this Function to Fetch a DataFrame from the MongoDB Atlas database server Using PyMongo.
# ######################################################################################################################
def get_mongo_dataframe(user_id, pwd, cluster_name, db_name, collection, conditions, projection, sort):
    '''Fetch documents from a MongoDB Atlas collection into a DataFrame.

    Args:
        user_id: Atlas user name placed in the connection URI.
        pwd: Atlas password placed in the connection URI (inserted as-is; it is not
            percent-escaped here).
        cluster_name: Cluster host prefix, i.e. the part before ".mongodb.net".
        db_name: Database to query.
        collection: Name of the collection to read.
        conditions: Query filter passed to find(); None or an empty dict matches every document.
        projection: Projection passed to find(); None or empty returns whole documents.
        sort: Sort specification passed to Cursor.sort(); None or empty leaves the order unchanged.

    Returns:
        A DataFrame (built with the notebook's `pd` alias) with one row per returned document.

    Each of conditions, projection and sort is applied independently whenever it is supplied.
    Previously they were only honoured in specific combinations, so a filter given without a
    projection (or a sort given without both) was silently ignored.
    '''
    mongo_uri = f"mongodb+srv://{user_id}:{pwd}@{cluster_name}.mongodb.net/{db_name}"

    client = pymongo.MongoClient(mongo_uri)
    try:
        # find({}, None) is equivalent to find() with no arguments, so the
        # "nothing supplied" case still returns the full collection.
        cursor = client[db_name][collection].find(conditions or {}, projection or None)
        if sort:
            cursor = cursor.sort(sort)
        dframe = pd.DataFrame(list(cursor))
    finally:
        # Release the connection even when the query or DataFrame construction fails.
        client.close()

    return dframe

Create new Databricks metadata database

In [0]:
%sql
DROP DATABASE IF EXISTS sakila_dlh CASCADE;

In [0]:
%sql
CREATE DATABASE IF NOT EXISTS sakila_dlh
COMMENT "Sakila Final Project Database"
LOCATION "dbfs:/FileStore/tables/sakila-project/sakila_dlh"
WITH DBPROPERTIES (contains_pii = true, purpose = "Final for DS-2002");

Create date dimension table from MySQL database

In [0]:
%sql
CREATE OR REPLACE TEMPORARY VIEW view_date
USING org.apache.spark.sql.jdbc
OPTIONS (
  url "jdbc:mysql://cwp5xyj-mysql.mysql.database.azure.com:3306/sakila_datawarehouse",
  dbtable "dim_date",
  user "cwp5xyj",
  password "Azuresql1"
)

In [0]:
%sql
USE DATABASE sakila_dlh;

CREATE OR REPLACE TABLE sakila_dlh.dim_date
COMMENT "Date Dimension Table"
LOCATION "dbfs:/FileStore/tables/sakila-project/sakila_dlh/dim_date"
AS SELECT * FROM view_date

num_affected_rows,num_inserted_rows


In [0]:
%sql
SELECT * FROM sakila_dlh.dim_date LIMIT 5

date_key,full_date,date_name,date_name_us,date_name_eu,day_of_week,day_name_of_week,day_of_month,day_of_year,weekday_weekend,week_of_year,month_name,month_of_year,is_last_day_of_month,calendar_quarter,calendar_year,calendar_year_month,calendar_year_qtr,fiscal_month_of_year,fiscal_quarter,fiscal_year,fiscal_year_month,fiscal_year_qtr
20000101,2000-01-01,2000/01/01,01/01/2000,01/01/2000,7,Saturday,1,1,Weekend,52,January,1,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3
20000102,2000-01-02,2000/01/02,01/02/2000,02/01/2000,1,Sunday,2,2,Weekend,52,January,1,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3
20000103,2000-01-03,2000/01/03,01/03/2000,03/01/2000,2,Monday,3,3,Weekday,1,January,1,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3
20000104,2000-01-04,2000/01/04,01/04/2000,04/01/2000,3,Tuesday,4,4,Weekday,1,January,1,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3
20000105,2000-01-05,2000/01/05,01/05/2000,05/01/2000,4,Wednesday,5,5,Weekday,1,January,1,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3


Create new table that sources inventory data from sakila_datawarehouse

In [0]:
%sql
CREATE OR REPLACE TEMPORARY VIEW view_inventory
USING org.apache.spark.sql.jdbc
OPTIONS (
  url "jdbc:mysql://cwp5xyj-mysql.mysql.database.azure.com:3306/sakila_datawarehouse",
  dbtable "dim_inventory",
  user "cwp5xyj",
  password "Azuresql1"
)

In [0]:
%sql
USE DATABASE sakila_dlh;
CREATE OR REPLACE TABLE sakila_dlh.dim_inventory
COMMENT "Inventory Dimension Table"
LOCATION "dbfs:/FileStore/tables/sakila-project/sakila_dlh/dim_inventory"
AS SELECT * FROM view_inventory

num_affected_rows,num_inserted_rows


In [0]:
%sql
DESCRIBE EXTENDED sakila_dlh.dim_inventory;

col_name,data_type,comment
inventory_key,bigint,
movie_key,bigint,
store_id,bigint,
,,
# Detailed Table Information,,
Catalog,spark_catalog,
Database,sakila_dlh,
Table,dim_inventory,
Type,EXTERNAL,
Comment,Inventory Dimension Table,


In [0]:
%sql
SELECT * FROM sakila_dlh.dim_inventory LIMIT 5

inventory_key,movie_key,store_id
1,1,1
2,1,1
3,1,1
4,1,1
5,1,2


Create new customers table that sources data from sakila_datawarehouse

In [0]:
%sql
CREATE OR REPLACE TEMPORARY VIEW view_customers
USING org.apache.spark.sql.jdbc
OPTIONS (
  url "jdbc:mysql://cwp5xyj-mysql.mysql.database.azure.com:3306/sakila_datawarehouse",
  dbtable "dim_customers",
  user "cwp5xyj",
  password "Azuresql1"
)

In [0]:
%sql
USE DATABASE sakila_dlh;
CREATE OR REPLACE TABLE sakila_dlh.dim_customers
COMMENT "Customers Dimension Table"
LOCATION "dbfs:/FileStore/tables/sakila-project/sakila_dlh/dim_customers"
AS SELECT * FROM view_customers

num_affected_rows,num_inserted_rows


In [0]:
%sql
DESCRIBE EXTENDED sakila_dlh.dim_customers;

col_name,data_type,comment
customer_key,bigint,
first_name,string,
last_name,string,
company,string,
city,string,
country,string,
phone1,string,
phone2,string,
email,string,
,,


In [0]:
%sql
SELECT * FROM sakila_dlh.dim_customers LIMIT 5

customer_key,first_name,last_name,company,city,country,phone1,phone2,email
1,Andrew,Goodman,Stewart-Flynn,Rowlandberg,Macao,846-790-4623x4715,(422)787-2331x71127,marieyates@gomez-spencer.info
2,Alvin,Lane,"Terry, Proctor and Lawrence",Bethside,Papua New Guinea,124-597-8652x05682,321.441.0588x6218,alexandra86@mccoy.com
3,Jenna,Harding,Bailey Group,Moniquemouth,China,(335)987-3085x3780,001-680-204-8312,justincurtis@pierce.org
4,Fernando,Ford,Moss-Maxwell,Leeborough,Macao,(047)752-3122,048.779.5035x9122,adeleon@hubbard.org
5,Kara,Woods,Mccarthy-Kelley,Port Jacksonland,Nepal,+1-360-693-4419x19272,163-627-2565,jesus90@roberson.info


Upload the JSON file containing the revamped data (see the included Jupyter Notebook file for reference) to MongoDB

In [0]:
# ######################################################################################################################
# Use this Function to Create New Collections by Uploading JSON file(s) to the MongoDB Atlas server.
# ######################################################################################################################
def set_mongo_collection(user_id, pwd, cluster_name, db_name, src_file_path, json_files):
    '''Replace MongoDB Atlas collections with the contents of local JSON files.

    Args:
        user_id: Atlas user name placed in the connection URI.
        pwd: Atlas password placed in the connection URI.
        cluster_name: Cluster host prefix, i.e. the part before ".mongodb.net".
        db_name: Database that receives the collections.
        src_file_path: Directory containing the JSON files.
        json_files: Mapping of collection name -> JSON file name (relative to src_file_path).
            Each file is loaded with json.load() and handed to insert_many().

    Returns:
        The insert_many() result for the last collection loaded, or None when json_files
        is empty (previously an empty mapping raised UnboundLocalError).

    Raises:
        Whatever open(), json.load() or the MongoDB driver raise; the client connection is
        closed before the exception propagates.
    '''
    mongo_uri = f"mongodb+srv://{user_id}:{pwd}@{cluster_name}.mongodb.net/{db_name}"
    client = pymongo.MongoClient(mongo_uri)
    result = None
    try:
        db = client[db_name]
        for collection_name, file_name in json_files.items():
            json_path = os.path.join(src_file_path, file_name)
            # Parse the file BEFORE dropping the collection so that a missing or malformed
            # file does not leave the existing collection deleted.
            with open(json_path, 'r') as openfile:
                documents = json.load(openfile)

            db.drop_collection(collection_name)
            result = db[collection_name].insert_many(documents)
    finally:
        client.close()

    return result

In [0]:
source_dir = '/dbfs/FileStore/tables/source_data/batch'
json_files = {"movie_info": 'dim_movies.json'}

set_mongo_collection(atlas_user_name, atlas_password, atlas_cluster_name, atlas_database_name, source_dir, json_files)

Out[18]: <pymongo.results.InsertManyResult at 0x7fd471464440>

Import movie information from mflix database on MongoDB Atlas

In [0]:
%scala
import com.mongodb.spark._

val df_movie_info = spark.read.format("com.mongodb.spark.sql.DefaultSource").option("uri", "mongodb+srv://cwp5xyj:nMW0O0gwwFHpBjZT@cluster0.ynxna8b.mongodb.net")
.option("database", "mflix").option("collection", "movie_info").load()
.select("genres", "title", "released", "rated", "imdb_rating", "imdb_votes")

display(df_movie_info)

genres,title,released,rated,imdb_rating,imdb_votes
"Biography, Crime, Drama",Regeneration,1915-09-13 00:00:00,PASSED,6.8,626.0
Drama,Where Are My Children?,1916-05-01 00:00:00,APPROVED,5.9,247.0
"Short, Comedy",One Week,1920-09-01 00:00:00,TV-G,8.3,3942.0
"Adventure, Drama",The Chechahcos,1924-05-15 00:00:00,UNRATED,6.6,167.0
"Adventure, Fantasy, Family",Peter Pan,1924-12-29 00:00:00,,7.4,589.0
"Action, Adventure, Drama",Beau Geste,1926-08-25 00:00:00,,6.9,222.0
"Comedy, Drama, Romance",Lonesome,1929-02-01 00:00:00,NOT RATED,8.0,1264.0
"Drama, Romance, Western",The Wind,1928-11-23 00:00:00,NOT RATED,8.4,4291.0
"Drama, Romance, Adventure",Tabu: A Story of the South Seas,1931-03-19 00:00:00,TV-PG,7.7,3036.0
"Comedy, Musical",è Nous la Libertè,1931-12-31 00:00:00,APPROVED,7.7,2900.0


Use the Spark DataFrame to create a new movie info dimension in the Databricks Metadata Database

In [0]:
%scala
df_movie_info.write.format("delta").mode("overwrite").saveAsTable("sakila_dlh.dim_movie_info")

In [0]:
%sql
DESCRIBE EXTENDED sakila_dlh.dim_movie_info

col_name,data_type,comment
genres,string,
title,string,
released,string,
rated,string,
imdb_rating,string,
imdb_votes,string,
,,
# Detailed Table Information,,
Catalog,spark_catalog,
Database,sakila_dlh,


In [0]:
%sql
SELECT * FROM sakila_dlh.dim_movie_info LIMIT 5

genres,title,released,rated,imdb_rating,imdb_votes
"Biography, Crime, Drama",Regeneration,1915-09-13 00:00:00,PASSED,6.8,626
Drama,Where Are My Children?,1916-05-01 00:00:00,APPROVED,5.9,247
"Short, Comedy",One Week,1920-09-01 00:00:00,TV-G,8.3,3942
"Adventure, Drama",The Chechahcos,1924-05-15 00:00:00,UNRATED,6.6,167
"Adventure, Fantasy, Family",Peter Pan,1924-12-29 00:00:00,,7.4,589


Reference data (cold path) should now be fully integrated.
The next section reads in the hot-path data from the .json files created by the SQL script.

In [0]:
# Stream the rental JSON files with Auto Loader and expose them as a temp view.
#
# Reader options are keyed by name, so calling .option("cloudFiles.schemaHints", ...) once per
# column keeps only the LAST call ("return_date_key BIGINT"). All hints are therefore
# passed as a single comma-separated string.
#
# Type corrections relative to the original per-column hints:
#   * released    -- was DATETIME, which is not a Spark SQL type.
#   * imdb_rating -- was INT, but the values are decimals such as 7.7.
#   * imdb_votes  -- was INT.
# The schema reported for this stream shows imdb_rating and imdb_votes as string, so the three
# columns are declared STRING to match what the pipeline has actually been producing; cast them
# downstream if numeric or timestamp types are needed.
# NOTE(review): the BIGINT hints assume the key columns are unquoted numbers in the JSON
# files -- confirm against a sample file.
rentals_schema_hints = ", ".join([
    "rental_key BIGINT",
    "inventory_key BIGINT",
    "customer_key BIGINT",
    "movie_key BIGINT",
    "store_id BIGINT",
    "first_name STRING",
    "last_name STRING",
    "company STRING",
    "city STRING",
    "country STRING",
    "phone1 STRING",
    "phone2 STRING",
    "email STRING",
    "genres STRING",
    "rated STRING",
    "title STRING",
    "released STRING",
    "imdb_rating STRING",
    "imdb_votes STRING",
    "rental_date_key BIGINT",
    "return_date_key BIGINT",
])

(spark.readStream
 .format("cloudFiles")
 .option("cloudFiles.format", "json")
 .option("cloudFiles.schemaHints", rentals_schema_hints)
 .option("cloudFiles.schemaLocation", rentals_output_bronze)
 .option("cloudFiles.inferColumnTypes", "true")
 .option("multiLine", "true")
 .load(rentals_stream_dir)
 .createOrReplaceTempView("rentals_raw_tempview"))

In [0]:
%sql
CREATE OR REPLACE TEMPORARY VIEW rentals_bronze_tempview AS (
  SELECT *, current_timestamp() receipt_time, input_file_name() source_file
  FROM rentals_raw_tempview
)

In [0]:
%sql
SELECT * FROM rentals_bronze_tempview

city,company,country,customer_key,email,first_name,genres,imdb_rating,imdb_votes,inventory_key,last_name,movie_key,phone1,phone2,rated,released,rental_date_key,rental_key,return_date_key,store_id,title,_rescued_data,receipt_time,source_file
East Paulaville,Tanner LLC,American Samoa,46,debbie56@baker-olsen.com,Stephanie,"Drama, Romance",7.7,11918,2179,Bradshaw,471,+1-934-296-1820x843,(364)992-5769x31100,PASSED,1939-04-07 00:00:00,20050527,401,20050529,1,Wuthering Heights,,2023-05-12T01:26:34.456+0000,dbfs:/FileStore/tables/source_data/stream/rentals/sakila_rentals_3.json
New Gerald,Melton and Sons,Martinique,354,lorettamoreno@kerr.net,Jillian,"Comedy, Western",7.3,2258,461,Mccullough,101,(020)498-7132,(902)288-2803x637,NOT RATED,1925-11-01 00:00:00,20050527,402,20050530,2,Go West,,2023-05-12T01:26:34.456+0000,dbfs:/FileStore/tables/source_data/stream/rentals/sakila_rentals_3.json
South Robertbury,Sosa LLC,Equatorial Guinea,424,ezamora@burke.org,Troy,"Comedy, Musical",6.1,87,3983,Elliott,868,(105)480-0123x26243,5301277009,,1950-09-21 00:00:00,20050527,403,20050529,1,Kaunis Veera eli ballaadi Saimaalta,,2023-05-12T01:26:34.456+0000,dbfs:/FileStore/tables/source_data/stream/rentals/sakila_rentals_3.json
South Marciafurt,"Barry, Thomas and Oconnor",Morocco,168,masonadriana@price.com,Angel,Drama,6.9,424,1293,Park,285,001-879-705-2671x02795,036.127.3806x095,PASSED,1933-10-06 00:00:00,20050527,404,20050530,2,The Power and the Glory,,2023-05-12T01:26:34.456+0000,dbfs:/FileStore/tables/source_data/stream/rentals/sakila_rentals_3.json
Deniseburgh,Rush-Melton,Tokelau,272,aaronmorse@shepard.org,Jesus,"Comedy, Sci-Fi, Sport",7.0,1278,4090,Cox,891,(804)948-3991,8181831524,APPROVED,1949-06-01 00:00:00,20050527,405,20050605,2,It Happens Every Spring,,2023-05-12T01:26:34.456+0000,dbfs:/FileStore/tables/source_data/stream/rentals/sakila_rentals_3.json
West Tammieport,"Graves, Hardin and Cummings",Liechtenstein,381,shawriley@rasmussen.com,Max,"Comedy, Drama, Romance",8.0,14275,2136,Rasmussen,462,001-928-549-3512x57008,(944)703-2933x5609,APPROVED,1936-04-12 00:00:00,20050527,406,20050530,2,Mr. Deeds Goes to Town,,2023-05-12T01:26:34.456+0000,dbfs:/FileStore/tables/source_data/stream/rentals/sakila_rentals_3.json
West Henry,"Logan, Boyle and Villegas",Panama,44,mjackson@david.com,Krystal,"Drama, Romance, War",8.2,194570,1077,Mendoza,240,001-116-661-7356,001-155-646-7234x7293,G,1940-01-17 00:00:00,20050527,407,20050531,2,Gone with the Wind,,2023-05-12T01:26:34.456+0000,dbfs:/FileStore/tables/source_data/stream/rentals/sakila_rentals_3.json
Welchburgh,"Wu, Strong and Flynn",Suriname,84,ortegashane@li.com,Sydney,"Animation, Comedy, Family",7.2,389,1438,Solis,314,591-223-5142x5192,+1-720-105-4622,APPROVED,1935-06-26 00:00:00,20050527,408,20050528,2,Who Killed Cock Robin?,,2023-05-12T01:26:34.456+0000,dbfs:/FileStore/tables/source_data/stream/rentals/sakila_rentals_3.json
Patelhaven,Maynard LLC,Senegal,220,erikdalton@hines.org,Sara,"Comedy, Musical, Romance",6.4,852,3652,Vargas,797,410.944.5826,1762680733,,1949-06-01 00:00:00,20050527,409,20050602,2,Neptune's Daughter,,2023-05-12T01:26:34.456+0000,dbfs:/FileStore/tables/source_data/stream/rentals/sakila_rentals_3.json
Phillipsbury,Hull Inc,Malawi,506,susan95@burgess.com,Belinda,"Comedy, Romance",6.3,240,4010,Kaiser,873,(088)972-6248,010-132-8522,PASSED,1952-04-07 00:00:00,20050527,410,20050602,1,Too Young to Kiss,,2023-05-12T01:26:34.456+0000,dbfs:/FileStore/tables/source_data/stream/rentals/sakila_rentals_3.json


In [0]:
# Append the bronze view's rows to the fact_rentals_bronze Delta table as a streaming write;
# the checkpoint folder sits under the bronze output path.
bronze_checkpoint_dir = f"{rentals_output_bronze}/_checkpoint"

bronze_writer = spark.table("rentals_bronze_tempview").writeStream.format("delta")
bronze_writer = bronze_writer.option("checkpointLocation", bronze_checkpoint_dir)
bronze_writer = bronze_writer.outputMode("append")

# Last expression of the cell: starts the query and displays the StreamingQuery handle.
bronze_writer.table("fact_rentals_bronze")

Out[24]: <pyspark.sql.streaming.query.StreamingQuery at 0x7fd4702ead30>

Silver: connect to reference data

In [0]:
# Re-open the bronze table as a streaming source and register it as the silver stage's input view.
bronze_stream_df = spark.readStream.table("fact_rentals_bronze")
bronze_stream_df.createOrReplaceTempView("rentals_silver_tempview")

In [0]:
%sql
SELECT * FROM rentals_silver_tempview

city,company,country,customer_key,email,first_name,genres,imdb_rating,imdb_votes,inventory_key,last_name,movie_key,phone1,phone2,rated,released,rental_date_key,rental_key,return_date_key,store_id,title,_rescued_data,receipt_time,source_file
Dudleyfurt,Medina-Castro,Ethiopia,130,ygarcia@andrade.com,Stacey,"Animation, Family, Comedy",7.7,4230,367,Travis,80,835-675-9702x438,+1-421-986-8630,,1930-08-11 00:00:00,20050524,1,20050526,1,Steamboat Willie,,2023-05-12T01:27:15.032+0000,dbfs:/FileStore/tables/source_data/stream/rentals/sakila_rentals_1.json
Fordborough,Morse Group,Djibouti,459,aaroncollins@nunez.com,Chelsey,"Comedy, Musical",7.2,518,1525,Mcknight,333,+1-949-959-6027x4955,(590)275-0778x25496,,1935-03-24 00:00:00,20050524,2,20050528,2,Moscow Laughs,,2023-05-12T01:27:15.032+0000,dbfs:/FileStore/tables/source_data/stream/rentals/sakila_rentals_1.json
Coxburgh,"Jenkins, Clarke and Faulkner",Rwanda,408,allenwalter@escobar.biz,Jimmy,"Adventure, Drama",5.6,371,1711,Heath,373,288.812.1579x011,583.956.9435x265,NOT RATED,1935-06-26 00:00:00,20050524,3,20050601,2,Sanders of the River,,2023-05-12T01:27:15.032+0000,dbfs:/FileStore/tables/source_data/stream/rentals/sakila_rentals_1.json
West Debrahaven,Costa and Sons,Palau,333,traciliu@forbes.com,Peter,"Film-Noir, Mystery, Thriller",8.3,102915,2452,Moore,535,+1-092-294-1882x3430,989-768-1517,NOT RATED,1949-08-31 00:00:00,20050524,4,20050603,1,The Third Man,,2023-05-12T01:27:15.032+0000,dbfs:/FileStore/tables/source_data/stream/rentals/sakila_rentals_1.json
Santosport,"Nguyen, Ruiz and Finley",Heard Island and McDonald Islands,222,davidchoi@kim.net,Jordan,"Comedy, Family, Romance",8.0,5732,2079,Hanna,450,+1-407-894-3981,+1-304-766-6187x26637,NOT RATED,1925-03-11 00:00:00,20050524,5,20050602,2,Seven Chances,,2023-05-12T01:27:15.032+0000,dbfs:/FileStore/tables/source_data/stream/rentals/sakila_rentals_1.json
New Michellefort,"Rice, Simpson and Russell",Slovakia (Slovak Republic),549,gregorymcfarland@rodriguez.com,Henry,"Biography, Drama, History",7.3,4244,2792,Contreras,613,355-017-0172x376,0746855808,NOT RATED,1946-06-17 00:00:00,20050524,6,20050527,1,The Chronicle History of King Henry the Fift with His Battell Fought at Agincourt in France,,2023-05-12T01:27:15.032+0000,dbfs:/FileStore/tables/source_data/stream/rentals/sakila_rentals_1.json
East Darin,"Fuentes, Park and Poole",Tokelau,269,jacksondana@baird.com,James,"Drama, Horror, Sci-Fi",6.5,763,3995,Washington,870,9506037850,529.913.4727,APPROVED,1951-04-25 00:00:00,20050524,7,20050529,2,Five,,2023-05-12T01:27:15.032+0000,dbfs:/FileStore/tables/source_data/stream/rentals/sakila_rentals_1.json
North Sydney,Reed-Tucker,San Marino,239,amckenzie@leonard-newman.com,Caroline,"Comedy, Drama, History",7.3,1708,2346,Clarke,510,+1-917-288-8837x28207,725-182-3978x780,APPROVED,1954-01-05 00:00:00,20050524,8,20050527,1,The Golden Coach,,2023-05-12T01:27:15.032+0000,dbfs:/FileStore/tables/source_data/stream/rentals/sakila_rentals_1.json
South Caseyside,"Oneal, Barker and Kaufman",Timor-Leste,126,mercedes83@gill.org,Yvonne,"Drama, Film-Noir, Mystery",7.4,7467,2580,Jordan,565,+1-587-011-4054,001-735-509-2475x253,APPROVED,1950-04-30 00:00:00,20050525,9,20050528,1,D.O.A.,,2023-05-12T01:27:15.032+0000,dbfs:/FileStore/tables/source_data/stream/rentals/sakila_rentals_1.json
South Tonya,"Myers, Krueger and Sampson",Nauru,399,xholden@walsh.com,Samuel,Drama,6.5,609,1824,Drake,396,301.235.8441x37411,6213378153,PASSED,1933-09-29 00:00:00,20050525,10,20050531,2,The Emperor Jones,,2023-05-12T01:27:15.032+0000,dbfs:/FileStore/tables/source_data/stream/rentals/sakila_rentals_1.json


In [0]:
%sql
DESCRIBE EXTENDED rentals_silver_tempview

col_name,data_type,comment
city,string,
company,string,
country,string,
customer_key,bigint,
email,string,
first_name,string,
genres,string,
imdb_rating,string,
imdb_votes,string,
inventory_key,bigint,


In [0]:
%sql
-- Silver layer: enrich each streamed rental row (r) with attributes from the reference tables
-- and expose the result as a temporary view for the silver write stream.
--   * store_id comes from dim_inventory (i).
--   * customer attributes come from dim_customers (c).
--   * movie attributes come from dim_movie_info (m); movie_key and title stay from the stream row.
--   * dim_date is joined twice: od for the rental date, rd for the return date.
CREATE OR REPLACE TEMPORARY VIEW fact_rentals_silver_tempview AS (
  SELECT r.rental_key,
  r.inventory_key,
  i.store_id,
  r.customer_key,
  c.first_name,
  c.last_name,
  c.company,
  c.city,
  c.country,
  c.phone1,
  c.phone2,
  c.email,
  r.movie_key,
  r.title,
  m.genres,
  m.rated,
  m.released,
  m.imdb_rating,
  m.imdb_votes,
  r.rental_date_key,
  od.day_name_of_week AS rental_day_name_of_week,
  od.day_of_month AS rental_day_of_month,
  od.weekday_weekend AS rental_weekday_weekend,
  od.month_name AS rental_month_name,
  od.calendar_quarter AS rental_calendar_quarter,
  od.calendar_year AS rental_calendar_year,
  r.return_date_key,
  rd.day_name_of_week AS returned_day_name_of_week,
  rd.day_of_month AS returned_day_of_month,
  rd.weekday_weekend AS returned_weekday_weekend,
  rd.month_name AS returned_month_name,
  rd.calendar_quarter AS returned_calendar_quarter,
  rd.calendar_year AS returned_calendar_year
FROM rentals_silver_tempview AS r
-- NOTE(review): movies are matched on title because dim_movie_info, as loaded earlier in this
-- notebook, has no movie_key column. If two movies share a title, this join will duplicate
-- rental rows -- confirm titles are unique or carry a key into dim_movie_info.
INNER JOIN sakila_dlh.dim_movie_info AS m
ON m.title = r.title
-- The INNER joins drop rentals that have no matching movie, customer or inventory row.
INNER JOIN sakila_dlh.dim_customers AS c
ON c.customer_key = r.customer_key
INNER JOIN sakila_dlh.dim_inventory AS i
ON i.inventory_key = r.inventory_key
-- The LEFT joins keep rentals whose date keys are missing from dim_date (date attributes become NULL).
LEFT OUTER JOIN sakila_dlh.dim_date AS od
ON od.date_key = r.rental_date_key
LEFT OUTER JOIN sakila_dlh.dim_date AS rd
ON rd.date_key = r.return_date_key
)

In [0]:
# Append the enriched silver view's rows to the fact_rentals_silver Delta table as a streaming
# write; the checkpoint folder sits under the silver output path.
silver_checkpoint_dir = f"{rentals_output_silver}/_checkpoint"

silver_writer = spark.table("fact_rentals_silver_tempview").writeStream.format("delta")
silver_writer = silver_writer.option("checkpointLocation", silver_checkpoint_dir)
silver_writer = silver_writer.outputMode("append")

# Last expression of the cell: starts the query and displays the StreamingQuery handle.
silver_writer.table("fact_rentals_silver")

Out[34]: <pyspark.sql.streaming.query.StreamingQuery at 0x7fd469f563d0>

In [0]:
%sql
SELECT * FROM fact_rentals_silver

rental_key,inventory_key,store_id,customer_key,first_name,last_name,company,city,country,phone1,phone2,email,movie_key,title,genres,rated,released,imdb_rating,imdb_votes,rental_date_key,rental_day_name_of_week,rental_day_of_month,rental_weekday_weekend,rental_month_name,rental_calendar_quarter,rental_calendar_year,return_date_key,returned_day_name_of_week,returned_day_of_month,returned_weekday_weekend,returned_month_name,returned_calendar_quarter,returned_calendar_year
361,6,2,587,Jean,Mclaughlin,"Rivas, Frey and Figueroa",Thomasstad,Lao People's Democratic Republic,316-885-2486x37693,(074)599-4682,jeffrey93@russell.biz,1,Regeneration,"Biography, Crime, Drama",PASSED,1915-09-13 00:00:00,6.8,626,20050527,Friday,27,Weekday,May,2,2005,20050531,Tuesday,31,Weekday,May,2,2005
465,20,2,261,Natasha,Schmitt,Russo PLC,New Tammy,Iceland,051.545.2869x0567,+1-118-630-5686x211,kelliewaters@fox.com,4,The Chechahcos,"Adventure, Drama",UNRATED,1924-05-15 00:00:00,6.6,167,20050527,Friday,27,Weekday,May,2,2005,20050602,Thursday,2,Weekday,June,2,2005
552,23,2,106,Pam,Crane,Patton-English,East Taylorborough,Cameroon,679.659.0893,(695)869-8220x5302,luisreynolds@caldwell.com,5,Peter Pan,"Adventure, Fantasy, Family",,1924-12-29 00:00:00,7.4,589,20050528,Saturday,28,Weekend,May,2,2005,20050604,Saturday,4,Weekend,June,2,2005
317,26,1,391,Timothy,Chambers,Alexander-Farrell,Pammouth,Afghanistan,001-963-502-7652x654,+1-197-026-8212x6146,sean10@fletcher.com,6,Beau Geste,"Action, Adventure, Drama",,1926-08-25 00:00:00,6.9,222,20050526,Thursday,26,Weekday,May,2,2005,20050601,Wednesday,1,Weekday,June,2,2005
259,30,2,482,Audrey,Goodman,Jenkins-Murillo,South Rogerhaven,Yemen,2530282881,119-162-3054x983,martinjeanette@petersen.com,6,Beau Geste,"Action, Adventure, Drama",,1926-08-25 00:00:00,6.9,222,20050526,Thursday,26,Weekday,May,2,2005,20050604,Saturday,4,Weekend,June,2,2005
120,37,2,365,Virginia,Goodman,Salazar Ltd,Jameston,Morocco,139-882-2263x6505,(223)999-5846,dawn64@brock.com,8,The Wind,"Drama, Romance, Western",NOT RATED,1928-11-23 00:00:00,8.4,4291,20050525,Wednesday,25,Weekday,May,2,2005,20050601,Wednesday,1,Weekday,June,2,2005
563,53,1,324,Shirley,Bowman,Roman LLC,East Dianeport,Reunion,135.578.2855,(079)827-8903x8117,nicholasvaldez@wall.com,11,Broken Lullaby,Drama,,1932-01-24 00:00:00,7.6,499,20050528,Saturday,28,Weekend,May,2,2005,20050606,Monday,6,Weekday,June,2,2005
505,71,1,111,Darrell,Small,Nicholson LLC,Lake Warrenmouth,Reunion,(962)163-3676,(852)503-0393,boyerjoy@gross-meadows.info,15,Zoo in Budapest,"Drama, Romance",PASSED,1933-04-28 00:00:00,7.0,295,20050528,Saturday,28,Weekend,May,2,2005,20050529,Sunday,29,Weekend,May,2,2005
481,72,1,445,Sheri,Perez,"Velasquez, Haynes and Parks",East Krystalland,United States Virgin Islands,137.658.0730x2226,(076)326-7301,kyle22@robbins-trevino.net,15,Zoo in Budapest,"Drama, Romance",PASSED,1933-04-28 00:00:00,7.0,295,20050527,Friday,27,Weekday,May,2,2005,20050530,Monday,30,Weekday,May,2,2005
64,79,2,368,Sally,Hinton,Glover-Mccoy,Guyside,Montserrat,265.484.0824,164.346.9633,rose26@navarro.biz,16,The Thin Man,"Comedy, Crime, Mystery",NOT RATED,1934-05-25 00:00:00,8.2,19369,20050525,Wednesday,25,Weekday,May,2,2005,20050603,Friday,3,Weekday,June,2,2005


In [0]:
%sql
DESCRIBE EXTENDED sakila_dlh.fact_rentals_silver

col_name,data_type,comment
rental_key,bigint,
inventory_key,bigint,
store_id,bigint,
customer_key,bigint,
first_name,string,
last_name,string,
company,string,
city,string,
country,string,
phone1,string,


Gold: perform aggregation

Group rentals by customer, reporting each customer's number of rentals and the average IMDb rating of the movies they rented

In [0]:
%sql
-- Gold-level summary: one row per customer with the number of rentals (non-NULL titles) and the
-- average IMDb rating of the rented movies, most active customers first.
-- NOTE(review): imdb_rating is a string column in dim_movie_info (the source of the silver
-- column), so AVG() presumably relies on an implicit string-to-number cast -- confirm, or cast
-- explicitly.
-- NOTE(review): the result is only displayed; no gold table is written by this cell.
SELECT customer_key AS customerID,
first_name AS customerFirst,
last_name AS customerLast,
AVG(imdb_rating) AS avg_imdb_rating,
COUNT(title) AS total_movies
FROM sakila_dlh.fact_rentals_silver
GROUP BY customerID, customerFirst, customerLast
ORDER BY total_movies DESC;

customerID,customerFirst,customerLast,avg_imdb_rating,total_movies
142,Autumn,Cuevas,7.5,9
251,Jackson,Grimes,6.6625000000000005,8
274,Paul,Meyers,6.8125,8
469,Travis,Duran,7.071428571428571,7
516,Andrea,Dennis,7.428571428571429,7
19,Joann,Finley,7.25,6
110,Julie,Montgomery,6.9,6
468,Joe,Cabrera,7.766666666666666,6
176,Bradley,Blair,6.400000000000001,6
249,Alisha,Gallegos,6.933333333333334,6


Clean up file system

In [0]:
%fs rm -r /FileStore/tables/sakila-project/