### Notebook Set-Up

#### Import Required Modules

In [1]:
import os
import json
import duckdb

from typing import List, Dict
from dotenv import load_dotenv
from datetime import datetime

import numpy as np
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.window import Window

#### Load Secrets as Environment Variables

In [33]:
# Load variables from a local .env file into the process environment
# (POSTGRES_PASSWORD is read from os.environ further down in this notebook).
load_dotenv()

True

#### Get or Create SparkSession

In [29]:
# Reuse the active SparkSession if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()
# Restrict Spark's console logging to error-level messages.
spark.sparkContext.setLogLevel("error")
# Bare expression: show the session summary as the cell output.
spark

#### Create a DuckDB Connection

In [4]:
# Open the application's DuckDB database file (path is relative to this notebook).
cnx = duckdb.connect('../app/database.duckdb')
# Bare expression: display the connection object as the cell output.
cnx

<duckdb.duckdb.DuckDBPyConnection at 0x122fceef0>

### Populate the Database with the Static Kaggle Data

#### Populate the `movies` Table

In [5]:
# Fetch the column names of the `movies` table in table order, so the Spark
# frame can be projected to the same order before the positional
# `INSERT ... SELECT *` further down.
# (Plain string literal: the statement has no placeholders, so no f-prefix.)
result = cnx.execute("PRAGMA table_info('movies');").fetchall()
columns = [row[1] for row in result]  # row[1] is the column name
columns

['tmdb_id',
 'title',
 'release_date',
 'runtime',
 'genres',
 'keywords',
 'overview',
 'budget',
 'revenue',
 'popularity',
 'vote_average',
 'vote_count',
 'updated_at']

In [6]:
# Load the cleaned movies, stamp every row with the load time, and project to
# the destination table's column order.
movies_spark = (
    spark.read.parquet("../data/clean/movies")
    .withColumn("updated_at", f.lit(datetime.now()))
    .select(columns)
)
movies_spark.printSchema()
movies_spark.show(n=1, vertical=True)

                                                                                

root
 |-- tmdb_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- release_date: date (nullable = true)
 |-- runtime: double (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- overview: string (nullable = true)
 |-- budget: integer (nullable = true)
 |-- revenue: integer (nullable = true)
 |-- popularity: double (nullable = true)
 |-- vote_average: double (nullable = true)
 |-- vote_count: integer (nullable = true)
 |-- updated_at: timestamp (nullable = false)



[Stage 1:>                                                          (0 + 1) / 1]

-RECORD 0----------------------------
 tmdb_id      | 10010                
 title        | Brother Bear 2       
 release_date | 2006-08-17           
 runtime      | 73.0                 
 genres       | [Adventure, Anima... 
 keywords     | [grizzly bear, hu... 
 overview     | Kenai finds his c... 
 budget       | null                 
 revenue      | null                 
 popularity   | 10.861154            
 vote_average | 6.3                  
 vote_count   | 318                  
 updated_at   | 2023-10-21 21:33:... 
only showing top 1 row



                                                                                

In [7]:
# Collect the Spark frame to the driver as a pandas DataFrame for the DuckDB load.
movies_pandas = movies_spark.toPandas()
# Inspect dtypes and null counts. The nullable Spark integer columns (budget,
# revenue, vote_count) come back as float64 after the conversion.
movies_pandas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6918 entries, 0 to 6917
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   tmdb_id       6918 non-null   object        
 1   title         6918 non-null   object        
 2   release_date  6918 non-null   object        
 3   runtime       6918 non-null   float64       
 4   genres        6918 non-null   object        
 5   keywords      6674 non-null   object        
 6   overview      6918 non-null   object        
 7   budget        4252 non-null   float64       
 8   revenue       4252 non-null   float64       
 9   popularity    6918 non-null   float64       
 10  vote_average  6914 non-null   float64       
 11  vote_count    6916 non-null   float64       
 12  updated_at    6918 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(6), object(6)
memory usage: 702.7+ KB


In [8]:
# Full refresh of `movies`: empty the table, expose the pandas frame to DuckDB
# under the name `movies_pandas`, copy it in, then checkpoint.
# Each call returns the connection, so the steps are written as one chain; the
# chain's value (the connection) is the cell output.
(
    cnx.execute("DELETE FROM movies")
    .register("movies_pandas", movies_pandas)
    .execute("INSERT INTO movies SELECT * FROM movies_pandas")
    .execute("CHECKPOINT")
)

<duckdb.duckdb.DuckDBPyConnection at 0x122fceef0>

#### Populate the `ratings` Table

In [9]:
# Fetch the column names of the `ratings` table in table order, so the Spark
# frame can be projected to the same order before the positional
# `INSERT ... SELECT *` further down.
# (Plain string literal: the statement has no placeholders, so no f-prefix.)
result = cnx.execute("PRAGMA table_info('ratings');").fetchall()
columns = [row[1] for row in result]  # row[1] is the column name
columns

['user_id', 'tmdb_id', 'rating', 'updated_at']

In [10]:
# Rows sharing a (user_id, tmdb_id) pair are numbered in a seeded random order;
# keeping only rank 1 leaves a single rating per pair.
duplicate_window = Window.partitionBy("user_id", "tmdb_id").orderBy(f.rand(seed=1492))

ratings_spark = (
    spark.read.parquet("../data/clean/ratings")
    .withColumn("updated_at", f.lit(datetime.now()))
    .withColumn("rank", f.row_number().over(duplicate_window))
    .where(f.col("tmdb_id").isNotNull())  # drop ratings with no TMDB id
    .where(f.col("rank") == 1)
    .select(columns)  # destination table order; also drops the helper `rank`
)

ratings_spark.printSchema()
ratings_spark.show(10)

root
 |-- user_id: string (nullable = true)
 |-- tmdb_id: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- updated_at: timestamp (nullable = false)



[Stage 6:>                                                          (0 + 1) / 1]

+-------+-------+------+--------------------+
|user_id|tmdb_id|rating|          updated_at|
+-------+-------+------+--------------------+
|      1|   9685|   5.0|2023-10-21 21:33:...|
|    100|    765|   3.5|2023-10-21 21:33:...|
|  10000|  82690|   5.0|2023-10-21 21:33:...|
| 100000|    280|   4.0|2023-10-21 21:33:...|
| 100001|  10158|   4.0|2023-10-21 21:33:...|
| 100002|  10923|   3.5|2023-10-21 21:33:...|
| 100002|  11529|   4.0|2023-10-21 21:33:...|
| 100002|    120|   4.0|2023-10-21 21:33:...|
| 100002|    121|   4.0|2023-10-21 21:33:...|
| 100002|  15472|   5.0|2023-10-21 21:33:...|
+-------+-------+------+--------------------+
only showing top 10 rows



                                                                                

In [12]:
# Split the ratings into ten roughly equal chunks that are loaded one at a time
# (presumably to keep each `toPandas()` small enough for the driver).
#
# `ratings_spark` picks one row per (user, movie) with `f.rand`, and each split
# re-evaluates that plan when it is collected. Caching pins a single evaluation
# so all ten splits partition the same rows instead of possibly overlapping or
# dropping some. The explicit seed makes the split assignment repeatable.
ratings_splits = ratings_spark.cache().randomSplit([0.1] * 10, seed=1492)

In [13]:
# Full refresh of `ratings`, loaded one split at a time.
cnx.execute("DELETE FROM ratings")
for split_index, split_frame in enumerate(ratings_splits):
    print(f"split={split_index}")
    ratings_pandas = split_frame.toPandas()
    # Re-register under the same view name so it points at the current chunk,
    # then append that chunk to the table.
    cnx.register("ratings_pandas", ratings_pandas).execute(
        "INSERT INTO ratings SELECT * FROM ratings_pandas"
    )
cnx.execute("CHECKPOINT")

split=0


                                                                                

split=1


                                                                                

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

split=2


                                                                                

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

split=3


                                                                                

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

split=4


                                                                                

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

split=5


                                                                                

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

split=6


                                                                                

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

split=7


                                                                                

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

split=8


                                                                                

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

split=9


                                                                                

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x122fceef0>

#### Populate the `users` Table

In [14]:
# Fetch the column names of the `users` table in table order, so the Spark
# frame can be projected to the same order before the positional
# `INSERT ... SELECT *` further down.
# (Plain string literal: the statement has no placeholders, so no f-prefix.)
result = cnx.execute("PRAGMA table_info('users');").fetchall()
columns = [row[1] for row in result]  # row[1] is the column name
columns

['user_id', 'fname', 'lname', 'email', 'updated_at']

In [17]:
# One placeholder user row per distinct user_id found in the ratings; the name
# and e-mail fields are filled with fixed ANONYMOUS values.
users_spark = (
    ratings_spark.select("user_id")
    .distinct()
    .select(
        "user_id",
        f.lit("ANONYMOUS").alias("fname"),
        f.lit("ANONYMOUS").alias("lname"),
        f.lit("ANONYMOUS@ANONYMOUS.COM").alias("email"),
        f.lit(datetime.now()).alias("updated_at"),
    )
    .select(columns)  # destination table order
)

users_spark.show(10)
users_spark.count()

                                                                                

+-------+---------+---------+--------------------+--------------------+
|user_id|    fname|    lname|               email|          updated_at|
+-------+---------+---------+--------------------+--------------------+
| 100010|ANONYMOUS|ANONYMOUS|ANONYMOUS@ANONYMO...|2023-10-21 21:50:...|
| 100140|ANONYMOUS|ANONYMOUS|ANONYMOUS@ANONYMO...|2023-10-21 21:50:...|
| 100227|ANONYMOUS|ANONYMOUS|ANONYMOUS@ANONYMO...|2023-10-21 21:50:...|
| 100263|ANONYMOUS|ANONYMOUS|ANONYMOUS@ANONYMO...|2023-10-21 21:50:...|
| 100320|ANONYMOUS|ANONYMOUS|ANONYMOUS@ANONYMO...|2023-10-21 21:50:...|
| 100553|ANONYMOUS|ANONYMOUS|ANONYMOUS@ANONYMO...|2023-10-21 21:50:...|
| 100704|ANONYMOUS|ANONYMOUS|ANONYMOUS@ANONYMO...|2023-10-21 21:50:...|
| 100735|ANONYMOUS|ANONYMOUS|ANONYMOUS@ANONYMO...|2023-10-21 21:50:...|
| 100768|ANONYMOUS|ANONYMOUS|ANONYMOUS@ANONYMO...|2023-10-21 21:50:...|
|  10096|ANONYMOUS|ANONYMOUS|ANONYMOUS@ANONYMO...|2023-10-21 21:50:...|
+-------+---------+---------+--------------------+--------------

                                                                                

270888

In [18]:
# Collect the placeholder users to the driver as a pandas DataFrame for the DuckDB load.
users_pandas = users_spark.toPandas()
# Inspect dtypes and null counts before inserting.
users_pandas.info()

                                                                                

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270888 entries, 0 to 270887
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   user_id     270888 non-null  object        
 1   fname       270888 non-null  object        
 2   lname       270888 non-null  object        
 3   email       270888 non-null  object        
 4   updated_at  270888 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(4)
memory usage: 10.3+ MB


In [19]:
# Full refresh of `users`: empty the table, expose the pandas frame to DuckDB
# under the name `users_pandas`, copy it in, then checkpoint.
# Each call returns the connection, so the steps are written as one chain; the
# chain's value (the connection) is the cell output.
(
    cnx.execute("DELETE FROM users")
    .register("users_pandas", users_pandas)
    .execute("INSERT INTO users SELECT * FROM users_pandas")
    .execute("CHECKPOINT")
)

<duckdb.duckdb.DuckDBPyConnection at 0x122fceef0>

#### Close the Connection to Flush Changes to the Database

In [20]:
# Close the DuckDB connection opened at the top of the notebook; per the
# section heading, this is what flushes the loaded data to the database file.
cnx.close()

### Test Out Query Functionality Using SQLAlchemy

In [22]:
from sqlalchemy import create_engine, MetaData, Table, select, text

In [28]:
# Smoke test: read the freshly loaded DuckDB file back through SQLAlchemy.
# (Plain string literal: the URL has no placeholders, so no f-prefix.)
engine = create_engine("duckdb:///../app/database.duckdb")
# NOTE: this rebinds `cnx`, which until now named the (already closed) raw
# DuckDB connection. The `with` block closes the SQLAlchemy connection on exit.
with engine.connect() as cnx:
    result = cnx.execute(text("SELECT * FROM movies LIMIT 5"))
    for row in result:
        print(row)

('10010', 'Brother Bear 2', datetime.date(2006, 8, 17), 73.0, ['Adventure', 'Animation', 'Family'], ['grizzly bear', 'human animal relationship', 'forest', 'hibernation', 'moose'], "Kenai finds his childhood human friend Nita and the two embark on a journey to burn the amulet he gave to her before he was a bear, much to Koda's dismay.", None, None, 10.861154, 6.3, 318, datetime.datetime(2023, 10, 21, 21, 33, 48, 25511))
('10012', 'Cursed', datetime.date(2005, 2, 25), 97.0, ['Horror', 'Comedy'], ['brother sister relationship', 'bite', 'transformation', 'supernatural powers', 'werewolf'], 'A werewolf loose in Los Angeles changes the lives of three young adults, who, after being mauled by the beast, learn they must kill their attacker if they hope to change their fate to avoid becoming werewolves too.', 35000000, 19294901, 8.949722, 5.1, 168, datetime.datetime(2023, 10, 21, 21, 33, 48, 25511))
('100402', 'Captain America: The Winter Soldier', datetime.date(2014, 3, 20), 136.0, ['Action', 

In [26]:
# The `with engine.connect()` block above has already closed `cnx`, so closing
# it again would do nothing. Dispose of the engine instead, which closes the
# connections still held in its pool and so releases the DuckDB file.
engine.dispose()

In [31]:
import os
from sqlalchemy import MetaData, Table, Column, PrimaryKeyConstraint, Engine, create_engine
from sqlalchemy.types import ARRAY, Date, DateTime, Double, Integer, Text
from google.cloud.sql.connector import Connector
from dotenv import load_dotenv
from pg8000 import Connection

# Shared Cloud SQL connector, created lazily on first use. Reusing one
# Connector avoids building (and never closing) a new one for every connection
# the engine's pool asks for.
_connector = None


def make_connection() -> Connection:
    """Open a new pg8000 connection to the Cloud SQL Postgres instance.

    Intended as the `creator` callable of a SQLAlchemy engine, so it may be
    called many times; all calls share a single `Connector`.

    Returns:
        A pg8000 connection to database "app" as user "postgres".

    Raises:
        KeyError: if the POSTGRES_PASSWORD environment variable is not set.
    """
    global _connector
    if _connector is None:
        _connector = Connector()

    project = "robot-ebert"
    region = "us-west1"
    instance = "robot-ebert"
    # Cloud SQL instances are addressed as "<project>:<region>:<instance>".
    instance_connection_string = f"{project}:{region}:{instance}"

    cnx = _connector.connect(
        instance_connection_string=instance_connection_string,
        driver="pg8000",
        user="postgres",
        password=os.environ["POSTGRES_PASSWORD"],
        db="app"
    )
    return cnx


def get_engine(echo: bool = False) -> Engine:
    """Build a SQLAlchemy Engine whose connections come from `make_connection`.

    Args:
        echo: passed straight through to `create_engine`.

    Returns:
        A new Engine for the "postgresql+pg8000://" URL; the URL carries no
        host or credentials because `make_connection` supplies the connections.
    """
    return create_engine(
        "postgresql+pg8000://",
        echo=echo,
        creator=make_connection,
    )



In [38]:
# Build the Cloud SQL (Postgres) engine with echo turned off. This rebinds
# `engine`, which previously referred to the DuckDB engine created above.
engine = get_engine(echo=False)

In [39]:
# Smoke test against Postgres: print the row count of `movies`.
with engine.connect() as cnx:
    for row in cnx.execute(text("SELECT COUNT(*) FROM movies")):
        print(row)

(0,)
