### Notebook Set-Up

#### Import Required Modules

In [None]:
# %pip install sqlalchemy duckdb duckdb-engine cloud-sql-python-connector[pg8000] python-dotenv pandas numpy

In [1]:
import os
import sys
import csv
import json
import duckdb

from typing import List, Dict
from dotenv import load_dotenv
from datetime import datetime

import numpy as np
import pandas as pd

# from pyspark.sql import SparkSession
# from pyspark.sql import functions as f
# from pyspark.sql.window import Window

#### Load Secrets as Environment Variables

In [2]:
# Load secrets into the process environment; POSTGRES_PASSWORD is read from
# os.environ when the database connection is created further down.
load_dotenv()

True

#### Set Workflow Constants

In [3]:
# Locations of the raw and cleaned datasets.
# NOTE(review): the dbfs:/ scheme looks like Databricks file storage — confirm these are still used.
RAW_PATH = "dbfs:/FileStore/data/raw"
CLN_PATH = "dbfs:/FileStore/data/clean"
# Directory (relative to this notebook) holding users.json, movies.json and ratings.json.
JSON_PATH = "../data/json"

### Load Static Data into the Application Database

#### Import Static Data in JSON Lines Format

In [4]:
# Users export: one JSON record per line (lines=True).
users_file = os.path.join(JSON_PATH, "users.json")
users = pd.read_json(users_file, orient="records", lines=True)
users.head()

Unnamed: 0,user_id,email,hashed_password,fname,lname,updated_at
0,467,ANONYMOUS@ANONYMOUS.COM,37dac418e085726bf77f285c48558dcf9b3486ed184386...,ANONYMOUS,ANONYMOUS,2023-10-24 21:38:29.594000+00:00
1,296,ANONYMOUS@ANONYMOUS.COM,37dac418e085726bf77f285c48558dcf9b3486ed184386...,ANONYMOUS,ANONYMOUS,2023-10-24 21:38:29.594000+00:00
2,125,ANONYMOUS@ANONYMOUS.COM,37dac418e085726bf77f285c48558dcf9b3486ed184386...,ANONYMOUS,ANONYMOUS,2023-10-24 21:38:29.594000+00:00
3,451,ANONYMOUS@ANONYMOUS.COM,37dac418e085726bf77f285c48558dcf9b3486ed184386...,ANONYMOUS,ANONYMOUS,2023-10-24 21:38:29.594000+00:00
4,51,ANONYMOUS@ANONYMOUS.COM,37dac418e085726bf77f285c48558dcf9b3486ed184386...,ANONYMOUS,ANONYMOUS,2023-10-24 21:38:29.594000+00:00


In [5]:
# Movies export: one JSON record per line (lines=True).
movies_file = os.path.join(JSON_PATH, "movies.json")
movies = pd.read_json(movies_file, orient="records", lines=True)
movies.head()

Unnamed: 0,tmdb_id,tmdb_homepage,title,language,release_date,runtime,director,actors,genres,keywords,overview,budget,revenue,popularity,vote_average,vote_count,updated_at
0,5,https://www.themoviedb.org/movie/5,Four Rooms,en,1995-12-09,98,Allison Anders,"[Tim Roth, Jennifer Beals, Antonio Banderas, V...",[Comedy],"[hotel, new year's eve, witch, bet, hotel room...",It's Ted the Bellhop's first night on the job....,4000000,4257354,21.138,5.789,2443,2023-10-24 21:38:29.594000+00:00
1,11,https://www.themoviedb.org/movie/11,Star Wars,en,1977-05-25,121,George Lucas,"[Mark Hamill, Harrison Ford, Carrie Fisher, Pe...","[Adventure, Action, Science Fiction]","[android, galaxy, hermit, smuggling (contraban...",Princess Leia is captured and held hostage by ...,11000000,775398007,88.613,8.204,19236,2023-10-24 21:38:29.594000+00:00
2,12,https://www.themoviedb.org/movie/12,Finding Nemo,en,2003-05-30,100,Andrew Stanton,"[Albert Brooks, Ellen DeGeneres, Alexander Gou...","[Animation, Family]","[parent child relationship, sydney, australia,...","Nemo, an adventurous young clownfish, is unexp...",94000000,940335536,92.459,7.824,18132,2023-10-24 21:38:29.594000+00:00
3,13,https://www.themoviedb.org/movie/13,Forrest Gump,en,1994-06-23,142,Robert Zemeckis,"[Tom Hanks, Robin Wright, Gary Sinise, Sally F...","[Comedy, Drama, Romance]","[vietnam veteran, post-traumatic stress disord...",A man with a low IQ has accomplished great thi...,55000000,677387716,84.723,8.477,25544,2023-10-24 21:38:29.594000+00:00
4,14,https://www.themoviedb.org/movie/14,American Beauty,en,1999-09-15,122,Sam Mendes,"[Kevin Spacey, Annette Bening, Thora Birch, We...",[Drama],"[adultery, age difference, parent child relati...","Lester Burnham, a depressed suburban father in...",15000000,356296601,35.477,8.021,11317,2023-10-24 21:38:29.594000+00:00


In [6]:
# Ratings export: one JSON record per line (lines=True).
ratings_file = os.path.join(JSON_PATH, "ratings.json")
ratings = pd.read_json(ratings_file, orient="records", lines=True)
ratings.head()

Unnamed: 0,user_id,tmdb_id,rating,updated_at
0,304,1572,4.0,2023-10-24 21:38:29.594000+00:00
1,469,2108,5.0,2023-10-24 21:38:29.594000+00:00
2,247,98,4.0,2023-10-24 21:38:29.594000+00:00
3,307,1592,4.0,2023-10-24 21:38:29.594000+00:00
4,477,2048,3.5,2023-10-24 21:38:29.594000+00:00


#### Create a SQLAlchemy Database Engine

In [9]:
import os
from sqlalchemy import create_engine, Engine
from google.cloud.sql.connector import Connector
from dotenv import load_dotenv
from pg8000 import Connection
from pg8000 import DatabaseError

In [10]:
# Shared Cloud SQL connector, created on first use and reused for every
# connection. Creating a fresh Connector per connection (as before) left each
# one open for the life of the kernel, since nothing ever closed it.
_connector = None


def make_connection() -> Connection:
    """Open a new pg8000 connection to the application's CloudSQL instance.

    Used as the ``creator`` callback of the production SQLAlchemy engine, so it
    is called once for every connection the engine's pool opens.

    Returns:
        A pg8000 connection to the ``app`` database, authenticated as ``postgres``.

    Raises:
        KeyError: if the ``POSTGRES_PASSWORD`` environment variable is not set.
    """
    global _connector

    project = "robot-ebert"
    region = "us-west1"
    instance = "robot-ebert"
    instance_connection_string = f"{project}:{region}:{instance}"

    # Lazily build the connector so defining this function has no side effects.
    if _connector is None:
        _connector = Connector()

    cnx = _connector.connect(
        instance_connection_string=instance_connection_string,
        driver="pg8000",
        user="postgres",
        password=os.environ["POSTGRES_PASSWORD"],
        db="app"
    )
    return cnx


def get_prod_engine(echo: bool = False) -> Engine:
    """Build a SQLAlchemy Engine for the application's CloudSQL database.

    The URL carries no host or credentials; every connection is obtained by
    calling ``make_connection``.

    Args:
        echo: forwarded to ``create_engine`` as its ``echo`` flag.

    Returns:
        A new Engine using the ``postgresql+pg8000`` dialect.
    """

    return create_engine(
        "postgresql+pg8000://",
        creator=make_connection,
        echo=echo,
    )


def get_test_engine(echo: bool = False) -> Engine:
    """Build a SQLAlchemy Engine backed by a local DuckDB file for testing.

    Args:
        echo: forwarded to ``create_engine`` as its ``echo`` flag.

    Returns:
        A new Engine pointing at ``database.duckdb``, given as a relative path.
    """

    duckdb_url = "duckdb:///database.duckdb"
    return create_engine(duckdb_url, echo=echo)

In [11]:
# Engine used by every database cell below. This is the production CloudSQL
# engine, so the inserts that follow write to the live application database;
# get_test_engine() gives a local DuckDB engine instead.
engine = get_prod_engine(echo=False)
engine

Engine(postgresql+pg8000://)

#### Insert Static Data into the Application Database

In [12]:
# Append all user rows to the "users" table, 1000 rows per batch, each batch
# sent as one multi-row INSERT. Re-running this cell appends the rows again.
users.to_sql(
    name="users",
    con=engine,
    index=False,
    if_exists="append",
    chunksize=1000,
    method="multi",
)

610

In [21]:
# movies.to_sql(name="movies", con=engine, index=False, if_exists="append", chunksize=1000, method="multi")

In [14]:
# Append all rating rows to the "ratings" table, 1000 rows per batch, each
# batch sent as one multi-row INSERT. Re-running this cell appends the rows again.
ratings.to_sql(
    name="ratings",
    con=engine,
    index=False,
    if_exists="append",
    chunksize=1000,
    method="multi",
)

81116

#### Insert Static Data Using SQLAlchemy

In [15]:
# Inspect the record shape that will be passed to the SQLAlchemy insert.
# Only the first row is converted, rather than building a dict for every movie
# just to display one.
movies.head(1).to_dict(orient="records")[0]

{'tmdb_id': 5,
 'tmdb_homepage': 'https://www.themoviedb.org/movie/5',
 'title': 'Four Rooms',
 'language': 'en',
 'release_date': '1995-12-09',
 'runtime': 98,
 'director': 'Allison Anders',
 'actors': ['Tim Roth',
  'Jennifer Beals',
  'Antonio Banderas',
  'Valeria Golino',
  'David Proval'],
 'genres': ['Comedy'],
 'keywords': ['hotel',
  "new year's eve",
  'witch',
  'bet',
  'hotel room',
  'sperm',
  'anthology',
  'los angeles, california',
  'hoodlum',
  'multiple storylines',
  'woman director'],
 'overview': "It's Ted the Bellhop's first night on the job...and the hotel's very unusual guests are about to place him in some outrageous predicaments. It seems that this evening's room service is serving up one unbelievable happening after another.",
 'budget': 4000000,
 'revenue': 4257354,
 'popularity': 21.138,
 'vote_average': 5.789,
 'vote_count': 2443,
 'updated_at': Timestamp('2023-10-24 21:38:29.594000+0000', tz='UTC')}

In [16]:
# Make the parent directory importable so `from app import database` resolves.
# Guarded so that re-running the cell does not pile up duplicate entries.
if ".." not in sys.path:
    sys.path.append("..")

In [17]:
from sqlalchemy import insert, select, update, delete
from app import database

In [22]:
# SQLAlchemy re-raises driver errors as sqlalchemy.exc.DBAPIError subclasses,
# so catching pg8000's own DatabaseError here would never match.
from sqlalchemy.exc import DBAPIError

# tmdb_ids whose INSERT was rejected, kept for follow-up after the load.
failed_tmdb_ids = []

with engine.begin() as cnx:
    for m, movie in enumerate(movies.to_dict(orient="records")):
        try:
            # One SAVEPOINT per row: a rejected INSERT rolls back only itself.
            # Without it, PostgreSQL marks the whole outer transaction as
            # aborted and every later INSERT in this loop fails as well.
            with cnx.begin_nested():
                cnx.execute(insert(database.movies).values(**movie))
        except DBAPIError as err:
            failed_tmdb_ids.append(movie["tmdb_id"])
            print(f"error with tmdb_id={movie['tmdb_id']}: {err.orig}")
        if m % 10 == 0:
            print(f"iteration={m} tmdb_id={movie['tmdb_id']}")

iteration=0 tmdb_id=5
iteration=10 tmdb_id=24
iteration=20 tmdb_id=63
iteration=30 tmdb_id=75
iteration=40 tmdb_id=88
iteration=50 tmdb_id=103
iteration=60 tmdb_id=114
iteration=70 tmdb_id=129
iteration=80 tmdb_id=150
iteration=90 tmdb_id=165
iteration=100 tmdb_id=177
iteration=110 tmdb_id=196
iteration=120 tmdb_id=215
iteration=130 tmdb_id=238
iteration=140 tmdb_id=252
iteration=150 tmdb_id=278
iteration=160 tmdb_id=293
iteration=170 tmdb_id=310
iteration=180 tmdb_id=334
iteration=190 tmdb_id=378
iteration=200 tmdb_id=395
iteration=210 tmdb_id=409
iteration=220 tmdb_id=429
iteration=230 tmdb_id=473
iteration=240 tmdb_id=497
iteration=250 tmdb_id=526
iteration=260 tmdb_id=548
iteration=270 tmdb_id=564
iteration=280 tmdb_id=581
iteration=290 tmdb_id=592
iteration=300 tmdb_id=603
iteration=310 tmdb_id=616
iteration=320 tmdb_id=628
iteration=330 tmdb_id=643
iteration=340 tmdb_id=657
iteration=350 tmdb_id=673
iteration=360 tmdb_id=688
iteration=370 tmdb_id=703
iteration=380 tmdb_id=714
ite

### Test Out Query Functionality Using SQLAlchemy

In [71]:
from sqlalchemy import text

In [93]:
%time

with engine.connect() as cnx:
    result = cnx.execute(text("SELECT * FROM ratings WHERE user_id = '1'"))
    for row in result:
        print(row)

CPU times: user 5 µs, sys: 2 µs, total: 7 µs
Wall time: 13.8 µs
('1', '272', 4.0, datetime.datetime(2023, 10, 23, 18, 44, 55, 294000))
('1', '9685', 5.0, datetime.datetime(2023, 10, 23, 18, 44, 55, 294000))
('1', '2108', 4.0, datetime.datetime(2023, 10, 23, 18, 44, 55, 294000))
('1', '64678', 5.0, datetime.datetime(2023, 10, 23, 18, 44, 55, 294000))
('1', '84892', 5.0, datetime.datetime(2023, 10, 23, 18, 44, 55, 294000))
('1', '12444', 5.0, datetime.datetime(2023, 10, 23, 18, 44, 55, 294000))
('1', '13475', 5.0, datetime.datetime(2023, 10, 23, 18, 44, 55, 294000))
('1', '141', 5.0, datetime.datetime(2023, 10, 23, 18, 44, 55, 294000))
('1', '1726', 5.0, datetime.datetime(2023, 10, 23, 18, 44, 55, 294000))
('1', '9377', 5.0, datetime.datetime(2023, 10, 23, 18, 44, 55, 294000))
('1', '238', 5.0, datetime.datetime(2023, 10, 23, 18, 44, 55, 294000))
('1', '240', 5.0, datetime.datetime(2023, 10, 23, 18, 44, 55, 294000))
('1', '68718', 4.0, datetime.datetime(2023, 10, 23, 18, 44, 55, 294000))