## Create Sakila Dimension Tables Using MySQL

#### import libraries

In [1]:
import os

import numpy
import pandas as pd
from sqlalchemy import create_engine, text

#### declare and assign connection variables for MySQL

In [2]:
# Connection settings can be overridden with environment variables, so the
# notebook does not have to be edited (or a real password saved in it) to run
# against another server. The fallbacks are the original local values.
host_name = os.environ.get("SAKILA_DB_HOST", "localhost")  # local instance by default
port = os.environ.get("SAKILA_DB_PORT", "3306")
user_id = os.environ.get("SAKILA_DB_USER", "root")
# NOTE(review): the plaintext fallback is kept only so existing runs keep
# working; set SAKILA_DB_PWD and remove the literal before sharing this notebook.
pwd = os.environ.get("SAKILA_DB_PWD", "Passw0rd123")

src_dbname = "sakila"      # database the source tables are read from
dst_dbname = "sakila_etl"  # database the dimension tables are written to

#### functions for getting and setting database data

In [3]:
def get_dataframe(user_id, pwd, host_name, db_name, sql_query):
    """Run a SQL query against a MySQL database and return the result as a DataFrame.

    Args:
        user_id: MySQL user name placed in the connection URL.
        pwd: Password for ``user_id``.
        host_name: Host of the MySQL server.
        db_name: Database named in the connection URL.
        sql_query: SQL text handed to ``pandas.read_sql``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pandas.read_sql``.
    """
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    try:
        # The context manager closes the connection even if the query raises.
        with sqlEngine.connect() as connection:
            return pd.read_sql(sql_query, connection)
    finally:
        # A fresh engine is created on every call, so release its pool here
        # instead of leaving pooled connections open.
        sqlEngine.dispose()


def set_dataframe(user_id, pwd, host_name, db_name, df, table_name, pk_column, db_operation):
    """Write a DataFrame to a MySQL table.

    Args:
        user_id: MySQL user name placed in the connection URL.
        pwd: Password for ``user_id``.
        host_name: Host of the MySQL server.
        db_name: Database named in the connection URL.
        df: DataFrame to write; its index is not written.
        table_name: Name of the target table.
        pk_column: Column used in ``ADD PRIMARY KEY`` after an "insert".
        db_operation: ``"insert"`` replaces the table with ``df`` and then adds
            the primary key; ``"update"`` appends the rows of ``df``.

    Raises:
        ValueError: If ``db_operation`` is neither ``"insert"`` nor ``"update"``.
    """
    # Fail loudly on a mistyped operation instead of silently writing nothing.
    if db_operation not in ("insert", "update"):
        raise ValueError(
            f"db_operation must be 'insert' or 'update', got {db_operation!r}"
        )

    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    try:
        # The context manager closes the connection even if the load raises.
        with sqlEngine.connect() as connection:
            if db_operation == "insert":
                df.to_sql(table_name, con=connection, index=False, if_exists='replace')
                # Run the DDL on the same connection as the load. Engine.execute()
                # is unavailable in SQLAlchemy 2.0, and text() works on 1.4 and 2.0.
                connection.execute(
                    text(f"ALTER TABLE {table_name} ADD PRIMARY KEY ({pk_column});")
                )
            else:  # "update"
                df.to_sql(table_name, con=connection, index=False, if_exists='append')
    finally:
        # A fresh engine is created on every call, so release its pool here.
        sqlEngine.dispose()

#### drop old and connect to new ETL database

In [4]:
# Connect at server level (no database in the URL), because the destination
# database is dropped and recreated by this cell.
conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}"
sqlEngine = create_engine(conn_str, pool_recycle=3600)

# Run all three statements on ONE connection. Separate Engine.execute() calls
# may each use a different pooled connection (so USE would not carry over),
# and Engine.execute() is unavailable in SQLAlchemy 2.0.
with sqlEngine.begin() as connection:
    connection.execute(text(f"DROP DATABASE IF EXISTS `{dst_dbname}`;"))
    connection.execute(text(f"CREATE DATABASE `{dst_dbname}`;"))
    # USE only affects this connection; get_dataframe/set_dataframe select
    # their database through their own connection URL.
    connection.execute(text(f"USE `{dst_dbname}`;"))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1c5309d7b20>

#### dimension tables!

1) need date dimension, obtained by running SakilaDateDim.sql
2) date dimension will be integrated into fact table using date columns of fact table

#### dimension table part 1:  gather data

In [5]:
# Read the complete staff table from the source database and preview two rows.
# NOTE(review): SELECT * also returns the picture and password columns, so
# they appear in the preview below.
sql_staff = "SELECT * FROM sakila.staff;"
df_staff = get_dataframe(
    user_id=user_id,
    pwd=pwd,
    host_name=host_name,
    db_name=src_dbname,
    sql_query=sql_staff,
)
df_staff.head(2)

Unnamed: 0,staff_id,first_name,last_name,address_id,picture,email,store_id,active,username,password,last_update
0,1,Mike,Hillyer,3,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...,Mike.Hillyer@sakilastaff.com,1,1,Mike,8cb2237d0679ca88db6464eac60da96345513964,2006-02-15 03:57:16
1,2,Jon,Stephens,4,,Jon.Stephens@sakilastaff.com,2,1,Jon,,2006-02-15 03:57:16


In [6]:
# Read the complete film table from the source database and preview two rows.
sql_film = "SELECT * FROM sakila.film;"
df_film = get_dataframe(
    user_id=user_id,
    pwd=pwd,
    host_name=host_name,
    db_name=src_dbname,
    sql_query=sql_film,
)
df_film.head(2)

Unnamed: 0,film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,special_features,last_update
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,1,,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",2006-02-15 05:03:42
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,1,,3,4.99,48,12.99,G,"Trailers,Deleted Scenes",2006-02-15 05:03:42


#### dimension table part 2:  conform to data warehouse design

1) drop low-value columns, defined here as:  columns with many NULLs; columns with a lot of free text; columns with binary large object (BLOB) data, such as images; and columns that do not provide much analytic information
2) rename primary key of each dimension table to meaningful identifier

In [7]:
drop_cols = ['last_update', 'picture', 'username', 'password']

# Rebind instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError.
df_staff = (
    df_staff
    .drop(columns=drop_cols, errors="ignore")
    .rename(columns={"staff_id": "staff_key"})
)

# staff_key is used as the table's PRIMARY KEY when loaded, so it must be unique.
assert df_staff["staff_key"].is_unique, "staff_key must be unique"

df_staff.head(2)

Unnamed: 0,staff_key,first_name,last_name,address_id,email,store_id,active
0,1,Mike,Hillyer,3,Mike.Hillyer@sakilastaff.com,1,1
1,2,Jon,Stephens,4,Jon.Stephens@sakilastaff.com,2,1


In [8]:
drop_cols = ['last_update', 'description', 'original_language_id']

# Rebind instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError.
df_film = (
    df_film
    .drop(columns=drop_cols, errors="ignore")
    .rename(columns={"film_id": "film_key"})
)

# film_key is used as the table's PRIMARY KEY when loaded, so it must be unique.
assert df_film["film_key"].is_unique, "film_key must be unique"

df_film.head(2)

Unnamed: 0,film_key,title,release_year,language_id,rental_duration,rental_rate,length,replacement_cost,rating,special_features
0,1,ACADEMY DINOSAUR,2006,1,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes"
1,2,ACE GOLDFINGER,2006,1,3,4.99,48,12.99,G,"Trailers,Deleted Scenes"


#### dimension tables part 3:  create tables and load into data warehouse

In [9]:
# "insert" makes set_dataframe replace each table and then add its primary key.
db_operation = "insert"

# One entry per dimension: (target table name, dataframe to load, primary-key column)
tables = [
    ('dim_staff', df_staff, 'staff_key'),
    ('dim_film', df_film, 'film_key'),
]

In [10]:
# Write every dimension dataframe to its table in the destination database.
for table_name, dataframe, primary_key in tables:
    set_dataframe(
        user_id=user_id,
        pwd=pwd,
        host_name=host_name,
        db_name=dst_dbname,
        df=dataframe,
        table_name=table_name,
        pk_column=primary_key,
        db_operation=db_operation,
    )

#### dimension tables part 4:  queries to demonstrate functionality

must perform some type of aggregation and grouping

In [11]:
# This query counts staff by whether or not they belong to store 1.
# Since sakila is a toy dataset, there are only two staff members, one in
# store 1 and one in store 2 (check with df_staff.shape), so the query returns
# the expected values even though they seem small at first glance.

# Group by the derived label rather than by store_id, so every store other
# than store 1 is folded into a single 'no' row.
sql_dim_staff = """
    SELECT CASE store_id
                WHEN 1 THEN 'yes'
                ELSE 'no'
            END AS is_store_1
        , COUNT(*) AS store_1_count
    FROM sakila_etl.dim_staff
    GROUP BY is_store_1;
"""

# The dimension tables were written to the destination database, so connect there.
df_dim_staff = get_dataframe(user_id, pwd, host_name, dst_dbname, sql_dim_staff)
df_dim_staff

Unnamed: 0,is_store_1,store_1_count
0,yes,1
1,no,1


In [12]:
# This query finds movies with a higher-than-average replacement cost,
# ordered from highest to lowest cost.

sql_dim_film = """
    SELECT title
        , replacement_cost
    FROM sakila_etl.dim_film
    WHERE replacement_cost > (SELECT AVG(replacement_cost) FROM sakila_etl.dim_film)
    ORDER BY replacement_cost DESC;
"""

# The dimension tables were written to the destination database, so connect there.
df_dim_film = get_dataframe(user_id, pwd, host_name, dst_dbname, sql_dim_film)
df_dim_film

Unnamed: 0,title,replacement_cost
0,ARABIA DOGMA,29.99
1,BALLROOM MOCKINGBIRD,29.99
2,BLINDNESS GUN,29.99
3,BONNIE HOLOCAUST,29.99
4,CHARIOTS CONSPIRACY,29.99
...,...,...
531,VICTORY ACADEMY,19.99
532,WALLS ARTIST,19.99
533,WARDROBE PHANTOM,19.99
534,WEREWOLF LOLA,19.99
