### Notebook Set-Up

#### Import Required Modules

In [None]:
# %pip install python-dotenv sqlalchemy duckdb duckdb-engine cloud-sql-python-connector[pg8000]

In [3]:
import os
import csv
import json
# import duckdb

from typing import List, Dict
from dotenv import load_dotenv
from datetime import datetime

import numpy as np
import pandas as pd

# from pyspark.sql import SparkSession
# from pyspark.sql import functions as f
# from pyspark.sql.window import Window

#### Load Secrets

In [7]:
# Load environment variables from the `../.env` file (path is relative to the working directory).
load_dotenv("../.env")

True

#### Get or Create SparkSession

In [None]:
# Reuse the active SparkSession if there is one, otherwise create it.
# NOTE(review): `SparkSession` is only imported in a commented-out line of the
# imports cell, so on a fresh kernel this raises NameError unless the runtime
# provides the name — confirm and restore the import.
spark = SparkSession.builder.getOrCreate()
# Limit Spark's log output to errors.
spark.sparkContext.setLogLevel("error")
# Show the session as the cell output.
spark

#### Set Workflow Constants

In [None]:
# Root of the cleaned Parquet datasets; read below through spark.read.parquet.
CLEAN_PATH = "dbfs:/FileStore/data/clean"
# Directory for the JSONLines exports; written and read below through pandas.
JSON_PATH = "/dbfs/FileStore/data/json"

### Load the Clean Data

#### Import Clean Movies Data

In [None]:
# Read the clean movies dataset from Parquet into a Spark DataFrame.
movies = spark.read.parquet(os.path.join(CLEAN_PATH, "movies"))
# Preview a single record, laid out vertically and without truncating long values.
movies.show(1, truncate=False, vertical=True)
# Print the column names and types.
movies.printSchema()
# Row count, displayed as the cell output.
movies.count()

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 budget        | 4000000                                                                                                                                                                                                                                       
 cast          | [Tim Roth, Jennifer Beals, Antonio Banderas, Valeria Golino, David Proval]                                                                                                                                                                    
 director      | Allison Anders                                                                                                                                                                                                         

#### Import Clean Ratings Data

In [None]:
# Read the clean ratings dataset from Parquet into a Spark DataFrame.
ratings = spark.read.parquet(os.path.join(CLEAN_PATH, "ratings"))
# Preview the first five rows without truncating values.
ratings.show(5, truncate=False)
# Row count, displayed as the cell output.
ratings.count()

+-------+-------+------+-------------------+
|user_id|tmdb_id|rating|timestamp          |
+-------+-------+------+-------------------+
|304    |1572   |4.0   |1998-03-29 12:12:08|
|469    |2108   |5.0   |2000-08-04 21:46:05|
|247    |98     |4.0   |2016-07-04 14:55:39|
|307    |1592   |4.0   |2007-10-29 13:08:15|
|477    |2048   |3.5   |2008-01-21 18:45:47|
+-------+-------+------+-------------------+
only showing top 5 rows

Out[59]: 81116

### Populate the ApplicationDB Tables

In [None]:
# One timestamp for the whole run; it is written to the `updated_at` column of
# every formatted table below.
# NOTE(review): datetime.now() is timezone-naive, while the frames read back
# from JSON further down display this value with a +00:00 offset — confirm the
# intended timezone.
updated_at = datetime.now()
updated_at

Out[60]: datetime.datetime(2023, 10, 24, 22, 21, 59, 26204)

#### Format the `Users` Table

In [None]:
# Build the Users table: one placeholder account per distinct `user_id` found
# in the ratings data, collected into a pandas DataFrame.
users_pandas = (
    ratings
    .select("user_id")
    .distinct()
    # Every user shares the same placeholder identity.
    .withColumn("email", f.lit("ANONYMOUS@ANONYMOUS.COM"))
    # SHA-256 hex digest of the `email` column.
    .withColumn("hashed_password", f.sha2("email", 256))
    .withColumn("fname", f.lit("ANONYMOUS"))
    .withColumn("lname", f.lit("ANONYMOUS"))
    # Stamp each row with the run-level timestamp.
    .withColumn("updated_at", f.lit(updated_at))
    .select(
        "user_id",
        "email",
        "hashed_password",
        "fname",
        "lname",
        "updated_at",
    )
    .toPandas()
)

users_pandas.head()

Unnamed: 0,user_id,email,hashed_password,fname,lname,updated_at
0,467,ANONYMOUS@ANONYMOUS.COM,37dac418e085726bf77f285c48558dcf9b3486ed184386...,ANONYMOUS,ANONYMOUS,2023-10-24 22:21:59.026204
1,296,ANONYMOUS@ANONYMOUS.COM,37dac418e085726bf77f285c48558dcf9b3486ed184386...,ANONYMOUS,ANONYMOUS,2023-10-24 22:21:59.026204
2,125,ANONYMOUS@ANONYMOUS.COM,37dac418e085726bf77f285c48558dcf9b3486ed184386...,ANONYMOUS,ANONYMOUS,2023-10-24 22:21:59.026204
3,451,ANONYMOUS@ANONYMOUS.COM,37dac418e085726bf77f285c48558dcf9b3486ed184386...,ANONYMOUS,ANONYMOUS,2023-10-24 22:21:59.026204
4,51,ANONYMOUS@ANONYMOUS.COM,37dac418e085726bf77f285c48558dcf9b3486ed184386...,ANONYMOUS,ANONYMOUS,2023-10-24 22:21:59.026204


#### Format the `Movies` Table

In [None]:
# Build the Movies table: drop `status`, rename `cast` to `actors`, stamp the
# run-level timestamp, fix the column order and collect into pandas.
movies_pandas = (
    movies
    .drop("status")
    .withColumnRenamed("cast", "actors")
    .withColumn("updated_at", f.lit(updated_at))
    .select(
        "tmdb_id",
        "tmdb_homepage",
        "title",
        "language",
        "release_date",
        "runtime",
        "director",
        "actors",
        "genres",
        "keywords",
        "overview",
        "budget",
        "revenue",
        "popularity",
        "vote_average",
        "vote_count",
        "updated_at",
    )
    .toPandas()
)
movies_pandas.head()

Unnamed: 0,tmdb_id,tmdb_homepage,title,language,release_date,runtime,director,actors,genres,keywords,overview,budget,revenue,popularity,vote_average,vote_count,updated_at
0,5,https://www.themoviedb.org/movie/5,Four Rooms,en,1995-12-09,98,Allison Anders,"[Tim Roth, Jennifer Beals, Antonio Banderas, V...",[Comedy],"[hotel, new year's eve, witch, bet, hotel room...",It's Ted the Bellhop's first night on the job....,4000000,4257354,21.138,5.789,2443,2023-10-24 22:21:59.026204
1,11,https://www.themoviedb.org/movie/11,Star Wars,en,1977-05-25,121,George Lucas,"[Mark Hamill, Harrison Ford, Carrie Fisher, Pe...","[Adventure, Action, Science Fiction]","[android, galaxy, hermit, smuggling (contraban...",Princess Leia is captured and held hostage by ...,11000000,775398007,88.613,8.204,19236,2023-10-24 22:21:59.026204
2,12,https://www.themoviedb.org/movie/12,Finding Nemo,en,2003-05-30,100,Andrew Stanton,"[Albert Brooks, Ellen DeGeneres, Alexander Gou...","[Animation, Family]","[parent child relationship, sydney, australia,...","Nemo, an adventurous young clownfish, is unexp...",94000000,940335536,92.459,7.824,18132,2023-10-24 22:21:59.026204
3,13,https://www.themoviedb.org/movie/13,Forrest Gump,en,1994-06-23,142,Robert Zemeckis,"[Tom Hanks, Robin Wright, Gary Sinise, Sally F...","[Comedy, Drama, Romance]","[vietnam veteran, post-traumatic stress disord...",A man with a low IQ has accomplished great thi...,55000000,677387716,84.723,8.477,25544,2023-10-24 22:21:59.026204
4,14,https://www.themoviedb.org/movie/14,American Beauty,en,1999-09-15,122,Sam Mendes,"[Kevin Spacey, Annette Bening, Thora Birch, We...",[Drama],"[adultery, age difference, parent child relati...","Lester Burnham, a depressed suburban father in...",15000000,356296601,35.477,8.021,11317,2023-10-24 22:21:59.026204


#### Format the `Ratings` Table

In [None]:
# Build the Ratings table: drop the original `timestamp` column, stamp the
# run-level timestamp and collect into pandas.
ratings_pandas = (
    ratings
    .drop('timestamp')
    .withColumn("updated_at", f.lit(updated_at))
    .toPandas()
)
ratings_pandas.head()

Unnamed: 0,user_id,tmdb_id,rating,updated_at
0,304,1572,4.0,2023-10-24 22:21:59.026204
1,469,2108,5.0,2023-10-24 22:21:59.026204
2,247,98,4.0,2023-10-24 22:21:59.026204
3,307,1592,4.0,2023-10-24 22:21:59.026204
4,477,2048,3.5,2023-10-24 22:21:59.026204


#### Save the Formatted Data as JSONLines

In [None]:
# Write the Users table as JSONLines: one record per line, dates in ISO format.
users_pandas.to_json(
    os.path.join(JSON_PATH, "users.json"),
    orient="records",
    lines=True,
    date_format="iso",
)

In [None]:
# Write the Movies table as JSONLines: one record per line, dates in ISO format.
movies_pandas.to_json(
    os.path.join(JSON_PATH, "movies.json"),
    orient="records",
    lines=True,
    date_format="iso",
)

In [None]:
# Write the Ratings table as JSONLines: one record per line, dates in ISO format.
ratings_pandas.to_json(
    os.path.join(JSON_PATH, "ratings.json"),
    orient="records",
    lines=True,
    date_format="iso",
)

#### Read the Formatted Data as JSONLines

In [None]:
# Load the Users JSONLines export back into a pandas DataFrame and preview it.
users = pd.read_json(
    os.path.join(JSON_PATH, "users.json"),
    orient="records",
    lines=True,
)
users.head()

Unnamed: 0,user_id,email,hashed_password,fname,lname,updated_at
0,467,ANONYMOUS@ANONYMOUS.COM,37dac418e085726bf77f285c48558dcf9b3486ed184386...,ANONYMOUS,ANONYMOUS,2023-10-24 22:21:59.026000+00:00
1,296,ANONYMOUS@ANONYMOUS.COM,37dac418e085726bf77f285c48558dcf9b3486ed184386...,ANONYMOUS,ANONYMOUS,2023-10-24 22:21:59.026000+00:00
2,125,ANONYMOUS@ANONYMOUS.COM,37dac418e085726bf77f285c48558dcf9b3486ed184386...,ANONYMOUS,ANONYMOUS,2023-10-24 22:21:59.026000+00:00
3,451,ANONYMOUS@ANONYMOUS.COM,37dac418e085726bf77f285c48558dcf9b3486ed184386...,ANONYMOUS,ANONYMOUS,2023-10-24 22:21:59.026000+00:00
4,51,ANONYMOUS@ANONYMOUS.COM,37dac418e085726bf77f285c48558dcf9b3486ed184386...,ANONYMOUS,ANONYMOUS,2023-10-24 22:21:59.026000+00:00


In [None]:
# Load the Movies JSONLines export back into a pandas DataFrame and preview it.
# NOTE(review): this rebinds `movies`, which previously held the Spark DataFrame
# read from Parquet; re-running the Spark formatting cell after this one would
# operate on the pandas frame instead.
movies = pd.read_json(
    os.path.join(JSON_PATH, "movies.json"),
    orient="records",
    lines=True,
)
movies.head()

Unnamed: 0,tmdb_id,tmdb_homepage,title,language,release_date,runtime,director,actors,genres,keywords,overview,budget,revenue,popularity,vote_average,vote_count,updated_at
0,5,https://www.themoviedb.org/movie/5,Four Rooms,en,1995-12-09,98,Allison Anders,"[Tim Roth, Jennifer Beals, Antonio Banderas, V...",[Comedy],"[hotel, new year's eve, witch, bet, hotel room...",It's Ted the Bellhop's first night on the job....,4000000,4257354,21.138,5.789,2443,2023-10-24 22:21:59.026000+00:00
1,11,https://www.themoviedb.org/movie/11,Star Wars,en,1977-05-25,121,George Lucas,"[Mark Hamill, Harrison Ford, Carrie Fisher, Pe...","[Adventure, Action, Science Fiction]","[android, galaxy, hermit, smuggling (contraban...",Princess Leia is captured and held hostage by ...,11000000,775398007,88.613,8.204,19236,2023-10-24 22:21:59.026000+00:00
2,12,https://www.themoviedb.org/movie/12,Finding Nemo,en,2003-05-30,100,Andrew Stanton,"[Albert Brooks, Ellen DeGeneres, Alexander Gou...","[Animation, Family]","[parent child relationship, sydney, australia,...","Nemo, an adventurous young clownfish, is unexp...",94000000,940335536,92.459,7.824,18132,2023-10-24 22:21:59.026000+00:00
3,13,https://www.themoviedb.org/movie/13,Forrest Gump,en,1994-06-23,142,Robert Zemeckis,"[Tom Hanks, Robin Wright, Gary Sinise, Sally F...","[Comedy, Drama, Romance]","[vietnam veteran, post-traumatic stress disord...",A man with a low IQ has accomplished great thi...,55000000,677387716,84.723,8.477,25544,2023-10-24 22:21:59.026000+00:00
4,14,https://www.themoviedb.org/movie/14,American Beauty,en,1999-09-15,122,Sam Mendes,"[Kevin Spacey, Annette Bening, Thora Birch, We...",[Drama],"[adultery, age difference, parent child relati...","Lester Burnham, a depressed suburban father in...",15000000,356296601,35.477,8.021,11317,2023-10-24 22:21:59.026000+00:00


In [None]:
# Load the Ratings JSONLines export back into a pandas DataFrame and preview it.
# NOTE(review): this rebinds `ratings`, which previously held the Spark DataFrame
# read from Parquet; re-running the Spark formatting cells after this one would
# operate on the pandas frame instead.
ratings = pd.read_json(
    os.path.join(JSON_PATH, "ratings.json"),
    orient="records",
    lines=True,
)
ratings.head()

Unnamed: 0,user_id,tmdb_id,rating,updated_at
0,304,1572,4.0,2023-10-24 22:21:59.026000+00:00
1,469,2108,5.0,2023-10-24 22:21:59.026000+00:00
2,247,98,4.0,2023-10-24 22:21:59.026000+00:00
3,307,1592,4.0,2023-10-24 22:21:59.026000+00:00
4,477,2048,3.5,2023-10-24 22:21:59.026000+00:00


#### Create a DB Engine for SQLAlchemy Insertion

In [None]:
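# FIXME: create the SQLAlchemy `engine` that the insertion cells below expect; none is defined at this point in the notebook.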

#### Insert Into the Database

In [None]:
# users.to_sql(name="users", con=engine, index=False, if_exists="append", chunksize=1000, method="multi")

In [None]:
# movies.to_sql(name="movies", con=engine, index=False, if_exists="append", chunksize=1000, method="multi")

In [None]:
# ratings.to_sql(name="ratings", con=engine, index=False, if_exists="append", chunksize=1000, method="multi")

In [5]:
from sqlalchemy import insert, select, update, delete
from app import database

ModuleNotFoundError: No module named 'app'

In [None]:
# The except clause below needs this name; it was not imported anywhere before.
from sqlalchemy.exc import DatabaseError

# Insert the movies one row at a time inside a single outer transaction, logging
# rows the database rejects and reporting progress every 10 rows.
# NOTE(review): `engine` is only created in the sandbox section further down —
# run that cell first on a fresh kernel.
with engine.begin() as cnx:
    for m, movie in enumerate(movies.to_dict(orient="records")):
        try:
            # Wrap each row in a SAVEPOINT so a failed INSERT is rolled back on
            # its own. Without it the outer transaction is left in a failed
            # state and every later INSERT in the loop fails as well.
            with cnx.begin_nested():
                cnx.execute(insert(database.movies).values(**movie))
        except DatabaseError:
            print(f"error with tmdb_id={movie['tmdb_id']}")
        if m % 10 == 0:
            print(f"iteration={m} tmdb_id={movie['tmdb_id']}")

### Start Sandbox Code

In [10]:
import os

from argparse import ArgumentParser
from sqlalchemy import MetaData, Table, Column, PrimaryKeyConstraint, Engine, create_engine, text
from sqlalchemy.types import ARRAY, BIGINT, Date, DateTime, Double, Integer, Text
from google.cloud.sql.connector import Connector
from dotenv import load_dotenv
from pg8000 import Connection


# Load variables from a .env file into the environment; make_connection below
# reads POSTGRES_PASSWORD from os.environ.
load_dotenv()


def make_connection() -> Connection:
    """Open a new pg8000 connection to the application's CloudSQL instance.

    The instance is addressed as ``<project>:<region>:<instance>``. The
    connection authenticates as the ``postgres`` user against the ``app``
    database, taking the password from the ``POSTGRES_PASSWORD`` environment
    variable.

    A single ``Connector`` is created on first use and cached on the function
    object. Repeated calls (for example from a SQLAlchemy pool that uses this
    function as its ``creator``) then share it, instead of building a fresh
    ``Connector`` per connection that is never closed.

    Returns:
        The connection object returned by ``Connector.connect``.

    Raises:
        KeyError: if ``POSTGRES_PASSWORD`` is not set in the environment.
    """

    project = "robot-ebert"
    region = "us-west1"
    instance = "robot-ebert"
    instance_connection_string = f"{project}:{region}:{instance}"

    # Lazily create the shared Connector; re-defining this function (re-running
    # the cell) simply starts with a fresh cache.
    connector = getattr(make_connection, "_connector", None)
    if connector is None:
        connector = Connector()
        make_connection._connector = connector

    return connector.connect(
        instance_connection_string=instance_connection_string,
        driver="pg8000",
        user="postgres",
        password=os.environ["POSTGRES_PASSWORD"],
        db="app",
    )


def get_prod_engine(echo: bool = False) -> Engine:
    """Build a SQLAlchemy Engine for the application's CloudSQL database.

    The engine uses the ``postgresql+pg8000://`` dialect URL and obtains its
    DBAPI connections by calling ``make_connection`` (passed as ``creator``).

    Args:
        echo: forwarded to ``create_engine`` as its ``echo`` flag.

    Returns:
        The newly created Engine.
    """
    return create_engine(
        "postgresql+pg8000://",
        creator=make_connection,
        echo=echo,
    )

In [9]:
# Create the engine with echo enabled, so executed statements are logged in the cell output.
engine = get_prod_engine(echo=True)

In [15]:
# Fetch ten rows from the `movies` table and keep each one as a plain dict.
# NOTE(review): this rebinds `movies` to a list of dicts.
statement = text("SELECT * FROM movies LIMIT 10")
with engine.begin() as cnx:
    movies = [
        row._asdict()
        for row in cnx.execute(statement).all()
    ]

2023-12-24 17:05:01,051 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2023-12-24 17:05:01,057 INFO sqlalchemy.engine.Engine SELECT * FROM movies LIMIT 10
2023-12-24 17:05:01,059 INFO sqlalchemy.engine.Engine [cached since 147.8s ago] ()
2023-12-24 17:05:01,269 INFO sqlalchemy.engine.Engine COMMIT


In [16]:
# Display the rows fetched by the sample query.
movies

[{'tmdb_id': '5',
  'tmdb_homepage': 'https://www.themoviedb.org/movie/5',
  'title': 'Four Rooms',
  'language': 'en',
  'release_date': datetime.date(1995, 12, 9),
  'runtime': 98,
  'director': 'Allison Anders',
  'actors': ['Tim Roth',
   'Jennifer Beals',
   'Antonio Banderas',
   'Valeria Golino',
   'David Proval'],
  'genres': ['Comedy'],
  'keywords': ['hotel',
   "new year's eve",
   'witch',
   'bet',
   'hotel room',
   'sperm',
   'anthology',
   'los angeles, california',
   'hoodlum',
   'multiple storylines',
   'woman director'],
  'overview': "It's Ted the Bellhop's first night on the job...and the hotel's very unusual guests are about to place him in some outrageous predicaments. It seems that this evening's room service is serving up one unbelievable happening after another.",
  'budget': 4000000,
  'revenue': 4257354,
  'popularity': 21.138,
  'vote_average': 5.789,
  'vote_count': 2443,
  'updated_at': datetime.datetime(2023, 10, 24, 21, 38, 29, 594000)},
 {'tmdb_