In [1]:
import os
from urllib.parse import quote

from dotenv import load_dotenv
import pandas as pd
from sqlalchemy import create_engine


In [2]:
# This is the ADMIN user connection script

load_dotenv()

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_NAME = os.getenv("DB_NAME")

connection_string = f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}/{DB_NAME}"
engine = create_engine(
    f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}/{DB_NAME}", 
    connect_args={'options': '-csearch_path=raw'}
    )

%load_ext sql
%sql engine

# Loading data:

## Reading CSVs into dfs:

In [3]:
# Directory holding the source CSV files, relative to this notebook.
# Change it here if the data moves; every read below is built from it.
DATA_DIR = '../../data'

best_shows_df = pd.read_csv(f'{DATA_DIR}/best_shows.csv')
best_movies_df = pd.read_csv(f'{DATA_DIR}/Best_Movies.csv')
best_movies_yearly_df = pd.read_csv(f'{DATA_DIR}/Best_Movie_Yearly.csv')
best_shows_yearly_df = pd.read_csv(f'{DATA_DIR}/Best_Show_Yearly.csv')
raw_credits_df = pd.read_csv(f'{DATA_DIR}/raw_credits.csv')
raw_titles_df = pd.read_csv(f'{DATA_DIR}/raw_titles.csv')

Changing column names to lowercase:

In [4]:
# Collect the six frames and lowercase their column labels.
# The labels are replaced on the existing objects, so the individual *_df
# variables and this list keep referring to the same, updated frames.
all_dfs = [
    best_shows_df,
    best_movies_df,
    best_movies_yearly_df,
    best_shows_yearly_df,
    raw_credits_df,
    raw_titles_df,
]
for df in all_dfs:
    df.columns = [label.lower() for label in df.columns]

In [5]:
# Printable label -> frame, used only to head each section of the report below.
dfs = {
    'best_shows_df': best_shows_df,
    'best_movies_df': best_movies_df,
    'best_movies_yearly_df': best_movies_yearly_df,
    'best_shows_yearly_df': best_shows_yearly_df,
    'raw_credits_df': raw_credits_df,
    'raw_titles_df': raw_titles_df,
}

# For every frame: remove the 'index' column, then print the longest string form
# found in each remaining column.
for df_name, df in dfs.items():
    # errors='ignore' keeps the cell re-runnable: after the first run the column
    # is already gone, and the default behaviour would raise a KeyError.
    df.drop(columns=['index'], inplace=True, errors='ignore')
    print(f'{df_name}:')
    for col in df.columns:
        # Every value is measured by the length of its string form, numbers included.
        max_length = df[col].astype(str).str.len().max()
        print(f'    {col}: max length is {max_length}')


best_shows_df:
    title: max length is 48
    release_year: max length is 4
    score: max length is 3
    number_of_votes: max length is 7
    duration: max length is 3
    number_of_seasons: max length is 2
    main_genre: max length is 11
    main_production: max length is 2
best_movies_df:
    title: max length is 104
    release_year: max length is 4
    score: max length is 3
    number_of_votes: max length is 7
    duration: max length is 3
    main_genre: max length is 11
    main_production: max length is 2
best_movies_yearly_df:
    title: max length is 46
    release_year: max length is 4
    score: max length is 3
    main_genre: max length is 11
    main_production: max length is 2
best_shows_yearly_df:
    title: max length is 28
    release_year: max length is 4
    score: max length is 3
    number_of_seasons: max length is 2
    main_genre: max length is 11
    main_production: max length is 2
raw_credits_df:
    person_id: max length is 7
    id: max length is 9
    

## Creating raw schema and creating raw db tables
I have not optimized the data types because this is not the actual database; it only holds the raw data requested by the analysts.

In [6]:
%%sql

-- Rebuild the raw schema from scratch.
-- CASCADE also drops every object contained in the schema, so any tables that were
-- created or loaded there before are removed and must be re-created afterwards.
DROP SCHEMA IF EXISTS raw CASCADE;
CREATE SCHEMA raw;

In [7]:
%%sql

-- Raw table for the best_shows data.
-- score is NUMERIC instead of INT: the length report printed earlier shows score
-- values that are 3 characters long (presumably one-decimal ratings such as 9.5;
-- confirm against the CSV). PostgreSQL rounds fractional values assigned to an INT
-- column, which would silently change the raw data; NUMERIC stores them unchanged.
DROP TABLE IF EXISTS best_shows;
CREATE TABLE best_shows (
    title VARCHAR(48),
    release_year INT,
    score NUMERIC,
    number_of_votes INT,
    duration INT,
    number_of_seasons INT,
    main_genre VARCHAR(11),
    main_production VARCHAR(3)
);

-- Raw table for the best_movies data.
-- score is NUMERIC instead of INT: the length report printed earlier shows score
-- values that are 3 characters long (presumably one-decimal ratings such as 9.5;
-- confirm against the CSV). PostgreSQL rounds fractional values assigned to an INT
-- column, which would silently change the raw data; NUMERIC stores them unchanged.
DROP TABLE IF EXISTS best_movies;
CREATE TABLE best_movies (
    title VARCHAR(104),
    release_year INT,
    score NUMERIC,
    number_of_votes INT,
    duration INT,
    main_genre VARCHAR(11),
    main_production VARCHAR(3)
);

-- Raw table for the best_movies_yearly data.
-- score is NUMERIC instead of INT: the length report printed earlier shows score
-- values that are 3 characters long (presumably one-decimal ratings such as 9.5;
-- confirm against the CSV). PostgreSQL rounds fractional values assigned to an INT
-- column, which would silently change the raw data; NUMERIC stores them unchanged.
DROP TABLE IF EXISTS best_movies_yearly;
CREATE TABLE best_movies_yearly (
    title VARCHAR(46),
    release_year INT,
    score NUMERIC,
    main_genre VARCHAR(11),
    main_production VARCHAR(3)
);

-- Raw table for the best_shows_yearly data.
-- score is NUMERIC instead of INT: the length report printed earlier shows score
-- values that are 3 characters long (presumably one-decimal ratings such as 9.5;
-- confirm against the CSV). PostgreSQL rounds fractional values assigned to an INT
-- column, which would silently change the raw data; NUMERIC stores them unchanged.
DROP TABLE IF EXISTS best_shows_yearly;
CREATE TABLE best_shows_yearly (
    title VARCHAR(28),
    release_year INT,
    score NUMERIC,
    number_of_seasons INT,
    main_genre VARCHAR(11),
    main_production VARCHAR(3)
);

-- Raw table for the credits data: one person/title pairing per row.
DROP TABLE IF EXISTS credits;
CREATE TABLE credits (
    person_id   INTEGER,
    id          VARCHAR(10),
    name        VARCHAR(73),
    character   VARCHAR(298),
    role        VARCHAR(8)
);

-- Raw table for the titles data.
-- NOTE(review): imdb_score is declared NUMERIC instead of INT on the assumption that
-- it holds fractional ratings, like the score columns of the best_* data; confirm
-- against raw_titles.csv. PostgreSQL rounds fractional values assigned to an INT
-- column, whereas NUMERIC stores whole and fractional values unchanged.
DROP TABLE IF EXISTS titles;
CREATE TABLE titles (
    id VARCHAR(10),
    title VARCHAR(104),
    type VARCHAR(5),
    release_year INT,
    age_certification VARCHAR(5),
    runtime INT,
    genres VARCHAR(96),
    production_countries VARCHAR(42),
    seasons INT,
    imdb_id VARCHAR(10),
    imdb_score NUMERIC,
    imdb_votes INT
);




## Loading df data into tables

In [8]:
# Target tables, listed in the same order as the frames in all_dfs.
table_names = ['best_shows', 'best_movies', 'best_movies_yearly', 'best_shows_yearly', 'credits', 'titles']

# zip() stops silently at the shorter sequence, so make sure the two lists still
# line up one-to-one before anything is written.
if len(all_dfs) != len(table_names):
    raise ValueError(
        f'Expected one table name per DataFrame, got {len(all_dfs)} frames and {len(table_names)} names'
    )

# Load all tables inside a single transaction: if any insert fails, everything is
# rolled back instead of leaving some tables loaded and others empty.
# The column labels were already lowercased where all_dfs was built, so no further
# renaming is needed here.
# NOTE: if_exists='append' adds rows to whatever the tables already contain, so
# re-running this cell after a successful load duplicates the data; re-create the
# tables first.
with engine.begin() as connection:
    for df, table_name in zip(all_dfs, table_names):
        df.to_sql(table_name, connection, index=False, if_exists='append')

print("Data loaded successfully!")

Data loaded successfully!
