In [7]:
import os
import re
from datetime import date
import ast

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine, text

# %load_ext nb_black


In [25]:
load_dotenv()

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_NAME = os.getenv("DB_NAME")

connection_string = f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}/{DB_NAME}"
engine = create_engine(f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}/{DB_NAME}", connect_args={'options': '-csearch_path=raw'})

%load_ext sql
%sql engine

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


# Loading data:

## Reading CSVs into dfs:

In [9]:
best_shows_df = pd.read_csv('/Users/bfaris96/Desktop/turing-proj/recommender_db/data/best_shows.csv')
best_movies_df = pd.read_csv('/Users/bfaris96/Desktop/turing-proj/recommender_db/data/Best_Movies.csv')
best_movies_yearly_df = pd.read_csv('/Users/bfaris96/Desktop/turing-proj/recommender_db/data/Best_Movie_Yearly.csv')
best_shows_yearly_df = pd.read_csv('/Users/bfaris96/Desktop/turing-proj/recommender_db/data/Best_Show_Yearly.csv')
raw_credits_df = pd.read_csv('/Users/bfaris96/Desktop/turing-proj/recommender_db/data/raw_credits.csv')
raw_titles_df = pd.read_csv('/Users/bfaris96/Desktop/turing-proj/recommender_db/data/raw_titles.csv')

Changing titles to lowercase:

In [26]:
all_dfs = [best_shows_df, best_movies_df, best_movies_yearly_df, best_shows_yearly_df, raw_credits_df, raw_titles_df]
for df in all_dfs:
    df.rename(columns=lambda x: x.lower(), inplace=True)

In [27]:
dfs = {
    'best_shows_df': best_shows_df,
    'best_movies_df': best_movies_df,
    'best_movies_yearly_df': best_movies_yearly_df,
    'best_shows_yearly_df': best_shows_yearly_df,
    'raw_credits_df': raw_credits_df,
    'raw_titles_df': raw_titles_df,
}

for df_name, df in dfs.items():
    print(f'{df_name}:')
    for col in df.columns:
        max_length = df[col].astype(str).apply(len).max()
        print(f'    {col}: max length is {max_length}')


best_shows_df:
    index: max length is 3
    title: max length is 48
    release_year: max length is 4
    score: max length is 3
    number_of_votes: max length is 7
    duration: max length is 3
    number_of_seasons: max length is 2
    main_genre: max length is 11
    main_production: max length is 2
best_movies_df:
    index: max length is 3
    title: max length is 104
    release_year: max length is 4
    score: max length is 3
    number_of_votes: max length is 7
    duration: max length is 3
    main_genre: max length is 11
    main_production: max length is 2
best_movies_yearly_df:
    index: max length is 2
    title: max length is 46
    release_year: max length is 4
    score: max length is 3
    main_genre: max length is 11
    main_production: max length is 2
best_shows_yearly_df:
    index: max length is 2
    title: max length is 28
    release_year: max length is 4
    score: max length is 3
    number_of_seasons: max length is 2
    main_genre: max length is 11
    

## Creating raw schema and creating raw db tables
I have not optimized data types, because this is not the actual database. This is just the raw data as requested by analysts. 

In [28]:
%%sql

DROP SCHEMA IF EXISTS raw CASCADE;
CREATE SCHEMA raw;

In [29]:
%%sql

DROP TABLE IF EXISTS raw.best_shows;
CREATE TABLE raw.best_shows (
    title VARCHAR(48),
    release_year INT,
    score INT,
    number_of_votes INT,
    duration INT,
    number_of_seasons INT,
    main_genre VARCHAR(11),
    main_production INT
);

DROP TABLE IF EXISTS raw.best_movies;
CREATE TABLE raw.best_movies (
    title VARCHAR(104),
    release_year INT,
    score INT,
    number_of_votes INT,
    duration INT,
    main_genre VARCHAR(11),
    main_production INT
);

DROP TABLE IF EXISTS raw.best_movies_yearly;
CREATE TABLE raw.best_movies_yearly (
    title VARCHAR(46),
    release_year INT,
    score INT,
    main_genre VARCHAR(11),
    main_production INT
);

DROP TABLE IF EXISTS raw.best_shows_yearly;
CREATE TABLE raw.best_shows_yearly (
    title VARCHAR(28),
    release_year INT,
    score INT,
    number_of_seasons INT,
    main_genre VARCHAR(11),
    main_production INT
);

DROP TABLE IF EXISTS raw.credits;
CREATE TABLE raw.credits (
    person_id INT,
    id INT,
    name VARCHAR(73),
    character VARCHAR(298),
    role VARCHAR(8)
);

DROP TABLE IF EXISTS raw.titles;
CREATE TABLE raw.titles (
    id INT,
    title VARCHAR(104),
    type VARCHAR(5),
    release_year INT,
    age_certification VARCHAR(5),
    runtime INT,
    genres VARCHAR(96),
    production_countries VARCHAR(42),
    seasons INT,
    imdb_id VARCHAR(10),
    imdb_score INT,
    imdb_votes INT
);




## Loading df data into tables

In [30]:
table_names = ['raw.best_shows', 'raw.best_movies', 'raw.best_movies_yearly', 'raw.best_shows_yearly', 'raw.credits', 'raw.titles']

for df, table_name in zip(all_dfs, table_names):
    df.rename(columns=lambda x: x.lower(), inplace=True)
    df.to_sql(table_name, engine, index=False, if_exists='append')

print("Data loaded successfully!")

Data loaded successfully!


## Creating read only user for analyst to view raw data:

In [31]:
"""
%%sql
REVOKE ALL PRIVILEGES ON DATABASE recommender FROM analyst_reader;
REVOKE ALL PRIVILEGES ON SCHEMA raw FROM analyst_reader;
REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA raw FROM analyst_reader;
DROP USER IF EXISTS analyst_reader;

CREATE USER analyst_reader WITH PASSWORD 'read_only';
GRANT CONNECT ON DATABASE recommender TO analyst_reader;
GRANT USAGE ON SCHEMA raw TO analyst_reader;
GRANT SELECT ON 
    raw.best_movies,
    raw.best_shows,
    raw.best_movies_yearly,
    raw.best_shows_yearly,
    raw.credits,
    raw.titles
TO analyst_reader;

ALTER DEFAULT PRIVILEGES 
FOR USER analyst_reader
IN SCHEMA raw
GRANT SELECT ON TABLES TO analyst_reader;
"""

SyntaxError: invalid syntax (2343554248.py, line 1)

In [33]:
%%sql

SELECT * FROM pg_catalog.pg_user;

usename,usesysid,usecreatedb,usesuper,userepl,usebypassrls,passwd,valuntil,useconfig
postgres,10,True,True,True,True,********,,
car_reader,17330,False,False,False,False,********,,
book_reader,26796,False,False,False,False,********,,
analyst_reader,129719,False,False,False,False,********,,
