In [15]:
import ast
import os
import re
from datetime import date
from urllib.parse import quote

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine, text

# %load_ext nb_black


<IPython.core.display.Javascript object>

In [16]:
load_dotenv()

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_NAME = os.getenv("DB_NAME")
DB_HOST = os.getenv("DB_HOST")

# %load_ext sql
%sql postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}/{DB_NAME}
engine = create_engine(f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}/{DB_NAME}")


<IPython.core.display.Javascript object>

# Loading data:

## Reading CSVs into dfs:

In [17]:
# Directory holding the source CSVs. The default is the original author's local
# path, so existing runs behave exactly as before; set RECOMMENDER_DATA_DIR in
# the environment to run the notebook from a different machine or checkout.
DATA_DIR = os.getenv(
    "RECOMMENDER_DATA_DIR",
    "/Users/bfaris96/Desktop/turing-proj/recommender_db/data",
)

# One frame per source file. File names are case-sensitive on most systems, so
# they are kept exactly as they appear on disk.
best_shows_df = pd.read_csv(os.path.join(DATA_DIR, 'best_shows.csv'))
best_movies_df = pd.read_csv(os.path.join(DATA_DIR, 'Best_Movies.csv'))
best_movies_yearly_df = pd.read_csv(os.path.join(DATA_DIR, 'Best_Movie_Yearly.csv'))
best_shows_yearly_df = pd.read_csv(os.path.join(DATA_DIR, 'Best_Show_Yearly.csv'))
raw_credits_df = pd.read_csv(os.path.join(DATA_DIR, 'raw_credits.csv'))
raw_titles_df = pd.read_csv(os.path.join(DATA_DIR, 'raw_titles.csv'))

<IPython.core.display.Javascript object>

Changing column names to lowercase:

In [18]:
# Every loaded frame, in the order the load step later pairs them with tables.
all_dfs = [
    best_shows_df,
    best_movies_df,
    best_movies_yearly_df,
    best_shows_yearly_df,
    raw_credits_df,
    raw_titles_df,
]

# Lowercase each frame's column labels in place so they line up with the
# lowercase column names used in the SQL tables.
for df in all_dfs:
    df.rename(columns=str.lower, inplace=True)

<IPython.core.display.Javascript object>

In [20]:
# Pair each frame with a printable label so the report can name it.
dfs = dict(
    best_shows_df=best_shows_df,
    best_movies_df=best_movies_df,
    best_movies_yearly_df=best_movies_yearly_df,
    best_shows_yearly_df=best_shows_yearly_df,
    raw_credits_df=raw_credits_df,
    raw_titles_df=raw_titles_df,
)

# For every column of every frame, print the length of its longest value once
# rendered as text. The VARCHAR sizes in the table definitions match these.
for label, df in dfs.items():
    print(f'{label}:')
    for column in df.columns:
        as_text = df[column].astype(str)
        longest = as_text.map(len).max()
        print(f'    {column}: max length is {longest}')


best_shows_df:
    index: max length is 3
    title: max length is 48
    release_year: max length is 4
    score: max length is 3
    number_of_votes: max length is 7
    duration: max length is 3
    number_of_seasons: max length is 2
    main_genre: max length is 11
    main_production: max length is 2
best_movies_df:
    index: max length is 3
    title: max length is 104
    release_year: max length is 4
    score: max length is 3
    number_of_votes: max length is 7
    duration: max length is 3
    main_genre: max length is 11
    main_production: max length is 2
best_movies_yearly_df:
    index: max length is 2
    title: max length is 46
    release_year: max length is 4
    score: max length is 3
    main_genre: max length is 11
    main_production: max length is 2
best_shows_yearly_df:
    index: max length is 2
    title: max length is 28
    release_year: max length is 4
    score: max length is 3
    number_of_seasons: max length is 2
    main_genre: max length is 11
    main_production: max length is 2
raw_credits_df:
    index: max length is 5
    person_id: max length is 7
    id: max length is 9
    name: max length is 73
    character: max length is 298
    role: max length is 8
raw_titles_df:
    index: max length is 4
    id: max length is 9
    title: max length is 104
    type: max length is 5
    release_year: max length is 4
    age_certification: max length is 5
    runtime: max length is 3
    genres: max length is 96
    production_countries: max length is 42
    seasons: max length is 4
    imdb_id: max length is 10
    imdb_score: max length is 3
    imdb_votes: max length is 9


<IPython.core.display.Javascript object>

## Creating db tables
I have not optimized data types, because this is not the actual database. This is just the raw data as requested by analysts. 

In [None]:
%%sql

DROP TABLE IF EXISTS raw_best_shows;
-- Raw landing tables for the six CSV files, one table per file.
-- VARCHAR sizes follow the max-length report printed earlier in this notebook.
-- NOTE(review) score and imdb_score are 3 characters wide in that report, which
-- looks like a one-decimal rating, so they are DOUBLE PRECISION rather than INT.
-- NOTE(review) main_production is 2 characters wide and presumably a country
-- code, so it is VARCHAR(2) rather than INT. Please confirm against the data.
-- NOTE(review) id in raw_credits and raw_titles is 9 characters wide and is
-- assumed to be a text key like imdb_id, so it is VARCHAR(9) rather than INT.
-- NOTE(review) the load step further down writes with if_exists=replace, which
-- recreates these tables with pandas-inferred types and discards this schema.
CREATE TABLE raw_best_shows (
    title VARCHAR(48),
    release_year INT,
    score DOUBLE PRECISION,
    number_of_votes INT,
    duration INT,
    number_of_seasons INT,
    main_genre VARCHAR(11),
    main_production VARCHAR(2)
);

DROP TABLE IF EXISTS raw_best_movies;
CREATE TABLE raw_best_movies (
    title VARCHAR(104),
    release_year INT,
    score DOUBLE PRECISION,
    number_of_votes INT,
    duration INT,
    main_genre VARCHAR(11),
    main_production VARCHAR(2)
);

DROP TABLE IF EXISTS raw_best_movies_yearly;
CREATE TABLE raw_best_movies_yearly (
    title VARCHAR(46),
    release_year INT,
    score DOUBLE PRECISION,
    main_genre VARCHAR(11),
    main_production VARCHAR(2)
);

DROP TABLE IF EXISTS raw_best_shows_yearly;
CREATE TABLE raw_best_shows_yearly (
    title VARCHAR(28),
    release_year INT,
    score DOUBLE PRECISION,
    number_of_seasons INT,
    main_genre VARCHAR(11),
    main_production VARCHAR(2)
);

DROP TABLE IF EXISTS raw_credits;
CREATE TABLE raw_credits (
    person_id INT,
    id VARCHAR(9),
    name VARCHAR(73),
    character VARCHAR(298),
    role VARCHAR(8)
);

DROP TABLE IF EXISTS raw_titles;
CREATE TABLE raw_titles (
    id VARCHAR(9),
    title VARCHAR(104),
    type VARCHAR(5),
    release_year INT,
    age_certification VARCHAR(5),
    runtime INT,
    genres VARCHAR(96),
    production_countries VARCHAR(42),
    seasons INT,
    imdb_id VARCHAR(10),
    imdb_score DOUBLE PRECISION,
    imdb_votes INT
);




 * postgresql://postgres:***@localhost/recommender
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.


[]

## Loading df data into tables

In [None]:
# Target table for each frame, in the same order as all_dfs.
table_names = ['raw_best_shows', 'raw_best_movies', 'raw_best_movies_yearly', 'raw_best_shows_yearly', 'raw_credits', 'raw_titles']

# zip() stops at the shorter input without complaint, so a frame added to
# all_dfs without a matching table name would silently never be loaded.
if len(all_dfs) != len(table_names):
    raise ValueError(
        f"{len(all_dfs)} dataframes but {len(table_names)} table names; "
        "all_dfs and table_names must line up one-to-one"
    )

for df, table_name in zip(all_dfs, table_names):
    # Lowercase the column labels on a copy instead of mutating the shared
    # frame again; this keeps the load step free of side effects on all_dfs.
    #
    # NOTE(review): if_exists='replace' drops any existing table and recreates
    # it from the frame, so pandas-inferred column types replace a hand-written
    # schema, every column of the frame is written, and privileges granted on
    # the old table do not carry over to the new one.
    df.rename(columns=str.lower).to_sql(table_name, engine, index=False, if_exists='replace')

print("Data loaded successfully!")

Data loaded successfully!


## Creating a read-only user for analysts to view the raw data:

In [None]:
# This cell is intentionally inert: the SQL is wrapped in a string literal so
# it is not executed on Run All. Remove the surrounding triple quotes to run it.
#
# It resets and recreates the read-only role movie_reader, grants it SELECT on
# the six raw tables, and sets default privileges for tables created later.
#
# Change from the earlier version: ALTER DEFAULT PRIVILEGES no longer says
# "FOR USER movie_reader". That clause selects the role whose newly created
# tables are affected, so it only covered tables created by movie_reader
# itself. Without it the statement applies to the role running this cell, so
# tables that role creates later in schema public are readable by movie_reader.
#
# NOTE(review): the REVOKE statements name movie_reader before the role is
# (re)created, so they are expected to fail on a database where the role does
# not exist yet - confirm before relying on this as a first-time setup script.
# NOTE(review): the role password is hard-coded below; consider reading it from
# the environment like the other credentials.
"""
%%sql
REVOKE ALL PRIVILEGES ON DATABASE recommender FROM movie_reader;
REVOKE ALL PRIVILEGES ON SCHEMA public FROM movie_reader;
REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA public FROM movie_reader;
DROP USER IF EXISTS movie_reader;


CREATE USER movie_reader WITH PASSWORD 'read_only';
GRANT CONNECT ON DATABASE recommender TO movie_reader;
GRANT USAGE ON SCHEMA public TO movie_reader;
GRANT SELECT ON 
    public.raw_best_movies,
    public.raw_best_shows,
    public.raw_best_movies_yearly,
    public.raw_best_shows_yearly,
    public.raw_credits,
    public.raw_titles
TO movie_reader;

ALTER DEFAULT PRIVILEGES 
IN SCHEMA public
GRANT SELECT ON TABLES TO movie_reader;
"""

In [None]:
%%sql

SELECT *
-- every column of pg_user, used here to check that movie_reader is listed
FROM pg_catalog.pg_user;

 * postgresql://postgres:***@localhost/recommender
4 rows affected.


usename,usesysid,usecreatedb,usesuper,userepl,usebypassrls,passwd,valuntil,useconfig
postgres,10,True,True,True,True,********,,
car_reader,17330,False,False,False,False,********,,
book_reader,26796,False,False,False,False,********,,
movie_reader,118700,False,False,False,False,********,,
