In [1]:
import psycopg2
from dotenv import load_dotenv
import os
import pandas as pd
from prettytable import PrettyTable

# Step 1 : Connect to db

In [2]:
%load_ext dotenv
%dotenv

In [3]:
connection = psycopg2.connect(
        host=os.getenv('DATABASE_HOST'),
        port=os.getenv('DATABASE_PORT'),
        user=os.getenv('DATABASE_USER'),
        password=os.getenv('DATABASE_PASSWORD'),
        dbname=os.getenv('DATABASE_NAME'),
    )
connection.autocommit = True
cursor = connection.cursor()

In [4]:
def select_query(query):
    cursor.execute(query)
    
    rows = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]

    # format the output nicely
    table = PrettyTable()
    table.field_names = columns
    for row in rows:
        table.add_row(row)
                      
    return table

In [28]:
def other_queries(query):
    cursor.execute(query)

# Step 2 : Invistigate db

In [5]:
query = ("""
    SELECT table_schema, table_name
    FROM information_schema.tables
    WHERE table_type = 'BASE TABLE' AND table_schema NOT IN ('pg_catalog', 'information_schema')
    ORDER BY table_schema, table_name;
""")
select_query(query)

table_schema,table_name
public,actor
public,address
public,category
public,city
public,country
public,customer
public,film
public,film_actor
public,film_category
public,inventory


![](images/3NF.png)

# What data sizes are we locking at?

In [6]:
nStores = ("""
    select count(*) from store;
""")
nStores = select_query(nStores)

nFilms = ("""
    select count(*) from film;
""")
nFilms = select_query(nFilms)

nCustomers = ("""
    select count(*) from customer;
""")
nCustomers = select_query(nCustomers)

nRentals = ("""
    select count(*) from rental;
""")
nRentals = select_query(nRentals)

nPayments = ("""
    select count(*) from payment;
""")
nPayments = select_query(nPayments)

nStaff = ("""
    select count(*) from staff;
""")
nStaff = select_query(nStaff)

nCity = ("""
    select count(*) from city;
""")
nCity = select_query(nCity)

nCountries = ("""
    select count(*) from country;
""")
nCountries = select_query(nCountries)


print("nFilms = ", nFilms[0][0])
print("nCustomers = ", nCustomers[0][0])
print("nRentals = ", nRentals[0][0])
print("nPayments = ", nPayments[0][0])
print("nStaff = ", nStaff[0][0])
print("nStores = ", nStores[0][0])
print("nCity = ", nCity[0][0])
print("nCountries = ", nCountries[0][0])

nFilms =  +-------+
| count |
+-------+
|  1000 |
+-------+
nCustomers =  +-------+
| count |
+-------+
|  599  |
+-------+
nRentals =  +-------+
| count |
+-------+
| 16044 |
+-------+
nPayments =  +-------+
| count |
+-------+
| 16049 |
+-------+
nStaff =  +-------+
| count |
+-------+
|   2   |
+-------+
nStores =  +-------+
| count |
+-------+
|   2   |
+-------+
nCity =  +-------+
| count |
+-------+
|  600  |
+-------+
nCountries =  +-------+
| count |
+-------+
|  109  |
+-------+


# when? what time period are we talking about?

In [7]:
when = ("""
    select * from payment limit 5;
""")
select_query(when)

payment_id,customer_id,staff_id,rental_id,amount,payment_date
16050,269,2,7,1.99,2007-01-24 21:40:19.996577
16051,269,1,98,0.99,2007-01-25 15:16:50.996577
16052,269,2,678,6.99,2007-01-28 21:44:14.996577
16053,269,2,703,0.99,2007-01-29 00:58:02.996577
16054,269,1,750,4.99,2007-01-29 08:10:06.996577


In [8]:
when = ("""
    select min(payment_date) as start, max(payment_date) as end from payment;
""")
select_query(when)

start,end
2007-01-24 21:21:56.996577,2007-05-14 13:44:29.996577


# Where? where do events in this db occure?
How many events in each district ?

In [9]:
where = ("""
    select * from address limit 5;
""")
select_query(where)

address_id,address,address2,district,city_id,postal_code,phone,last_update
1,47 MySakila Drive,,Alberta,300,,,2006-02-15 09:45:30
2,28 MySQL Boulevard,,QLD,576,,,2006-02-15 09:45:30
3,23 Workhaven Lane,,Alberta,300,,14033335568.0,2006-02-15 09:45:30
4,1411 Lillydale Drive,,QLD,576,,6172235589.0,2006-02-15 09:45:30
5,1913 Hanoi Way,,Nagasaki,463,35200.0,28303384290.0,2006-02-15 09:45:30


In [10]:
where = ("""
    select district, sum(city_id) as n
    from address
    group by district
    order by n desc
    limit 10;
""")
select_query(where)

district,n
Shandong,3237
England,2974
So Paulo,2952
West Bengali,2623
Buenos Aires,2572
Uttar Pradesh,2462
California,2444
Southern Tagalog,1931
Tamil Nadu,1807
Hubei,1790


# Step 3 : Perform some simple data analsyis

## 3.1 Insight 1: Top Grossing Movies
- Payments amounts are in table `payment`
- Movies are in table `film`
- They are not directly linked, `payment` refers to a `rental`, `rental` refers to an `inventory` and `inventory` refers to a `film`
- `payment` -> `rental` -> `inventory` -> `film`

In [11]:
query = ("""
    select film_id, title, release_year, rental_rate, rating from film limit 5;
""")
select_query(query)

film_id,title,release_year,rental_rate,rating
1,ACADEMY DINOSAUR,2006,0.99,PG
2,ACE GOLDFINGER,2006,4.99,G
3,ADAPTATION HOLES,2006,2.99,NC-17
4,AFFAIR PREJUDICE,2006,2.99,G
5,AFRICAN EGG,2006,2.99,G


In [12]:
query = ("""
    select * from inventory limit 5;
""")
select_query(query)

inventory_id,film_id,store_id,last_update
1,1,1,2006-02-15 10:09:17
2,1,1,2006-02-15 10:09:17
3,1,1,2006-02-15 10:09:17
4,1,1,2006-02-15 10:09:17
5,1,2,2006-02-15 10:09:17


In [13]:
query = ("""
    select * from rental limit 5;
""")
select_query(query)

rental_id,rental_date,inventory_id,customer_id,return_date,staff_id,last_update
2,2005-05-24 22:54:33,1525,459,2005-05-28 19:40:33,1,2006-02-16 02:30:53
3,2005-05-24 23:03:39,1711,408,2005-06-01 22:12:39,1,2006-02-16 02:30:53
4,2005-05-24 23:04:41,2452,333,2005-06-03 01:43:41,2,2006-02-16 02:30:53
5,2005-05-24 23:05:21,2079,222,2005-06-02 04:33:21,1,2006-02-16 02:30:53
6,2005-05-24 23:08:07,2792,549,2005-05-27 01:32:07,1,2006-02-16 02:30:53


In [14]:
query = ("""
    select * from payment limit 5;
""")
select_query(query)

payment_id,customer_id,staff_id,rental_id,amount,payment_date
16050,269,2,7,1.99,2007-01-24 21:40:19.996577
16051,269,1,98,0.99,2007-01-25 15:16:50.996577
16052,269,2,678,6.99,2007-01-28 21:44:14.996577
16053,269,2,703,0.99,2007-01-29 00:58:02.996577
16054,269,1,750,4.99,2007-01-29 08:10:06.996577


### Get movies for every payment

In [15]:
query = ("""
    select f.film_id, f.title, p.payment_id, p.rental_id, i.inventory_id, p.customer_id, p.amount, p.payment_date
    from payment p
    join rental r on (p.rental_id = r.rental_id)
    join inventory i on (r.inventory_id = i.inventory_id)
    join film f on (i.film_id = f.film_id)
    limit 10;
""")
select_query(query)

film_id,title,payment_id,rental_id,inventory_id,customer_id,amount,payment_date
870,SWARM GOLD,16050,7,3995,269,1.99,2007-01-24 21:40:19.996577
651,PACKER MADIGAN,16051,98,2970,269,0.99,2007-01-25 15:16:50.996577
818,SOMETHING DUCK,16052,678,3741,269,6.99,2007-01-28 21:44:14.996577
249,DRACULA CRYSTAL,16053,703,1123,269,0.99,2007-01-29 00:58:02.996577
159,CLOSER BANG,16054,750,730,269,4.99,2007-01-29 08:10:06.996577
205,DANCES NONE,16055,1099,924,269,2.99,2007-01-31 12:23:14.996577
851,STRAIGHT HOURS,16056,193,3900,270,1.99,2007-01-26 05:10:14.996577
559,MARRIED GO,16057,1040,2546,270,4.99,2007-01-31 04:03:42.996577
237,DIVORCE SHINING,16058,1096,1066,271,8.99,2007-01-31 11:59:15.996577
367,GOLDMINE TYCOON,16059,33,1681,272,0.99,2007-01-25 02:47:17.996577


### Sum movies revenue

In [16]:
# Top 10 movies
query = ("""
    select f.film_id, f.title, sum(p.amount) as revenue
    from payment p
    join rental r on (p.rental_id = r.rental_id)
    join inventory i on (r.inventory_id = i.inventory_id)
    join film f on (i.film_id = f.film_id)
    group by f.film_id
    order by revenue desc
    limit 10;
""")
select_query(query)

film_id,title,revenue
879,TELEGRAPH VOYAGE,231.73
973,WIFE TURN,223.69
1000,ZORRO ARK,214.69
369,GOODFELLAS SALUTE,209.69
764,SATURDAY LAMBS,204.72
893,TITANS JERK,201.71
897,TORQUE BOUND,198.72
403,HARRY IDAHO,195.7
460,INNOCENT USUAL,191.74
444,HUSTLER PARTY,190.78


## 3.2 Insight 2: Top Grossing Cities
From which cities our customers by most
- payments amounts are in `payment` table
- cities are in `city` table
- `payment` -> `customer` -> `address` -> `city`

In [17]:
query = ("""
    select * from payment limit 5;
""")
select_query(query)

payment_id,customer_id,staff_id,rental_id,amount,payment_date
16050,269,2,7,1.99,2007-01-24 21:40:19.996577
16051,269,1,98,0.99,2007-01-25 15:16:50.996577
16052,269,2,678,6.99,2007-01-28 21:44:14.996577
16053,269,2,703,0.99,2007-01-29 00:58:02.996577
16054,269,1,750,4.99,2007-01-29 08:10:06.996577


In [18]:
query = ("""
    select * from customer limit 5;
""")
select_query(query)

customer_id,store_id,first_name,last_name,email,address_id,activebool,create_date,last_update,active
1,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,5,True,2006-02-14,2006-02-15 09:57:20,1
2,1,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,6,True,2006-02-14,2006-02-15 09:57:20,1
3,1,LINDA,WILLIAMS,LINDA.WILLIAMS@sakilacustomer.org,7,True,2006-02-14,2006-02-15 09:57:20,1
4,2,BARBARA,JONES,BARBARA.JONES@sakilacustomer.org,8,True,2006-02-14,2006-02-15 09:57:20,1
5,1,ELIZABETH,BROWN,ELIZABETH.BROWN@sakilacustomer.org,9,True,2006-02-14,2006-02-15 09:57:20,1


In [19]:
query = ("""
    select * from address limit 5;
""")
select_query(query)

address_id,address,address2,district,city_id,postal_code,phone,last_update
1,47 MySakila Drive,,Alberta,300,,,2006-02-15 09:45:30
2,28 MySQL Boulevard,,QLD,576,,,2006-02-15 09:45:30
3,23 Workhaven Lane,,Alberta,300,,14033335568.0,2006-02-15 09:45:30
4,1411 Lillydale Drive,,QLD,576,,6172235589.0,2006-02-15 09:45:30
5,1913 Hanoi Way,,Nagasaki,463,35200.0,28303384290.0,2006-02-15 09:45:30


In [20]:
query = ("""
    select * from city limit 5;
""")
select_query(query)

city_id,city,country_id,last_update
1,A Corua (La Corua),87,2006-02-15 09:45:25
2,Abha,82,2006-02-15 09:45:25
3,Abu Dhabi,101,2006-02-15 09:45:25
4,Acua,60,2006-02-15 09:45:25
5,Adana,97,2006-02-15 09:45:25


In [21]:
query = ("""
    select ci.city_id, ci.city, cu.customer_id, p.payment_id, p.amount, p.payment_date    
    from city ci
    join address ad on (ci.city_id = ad.city_id)
    join customer cu on (ad.address_id = cu.address_id)
    join payment p on (cu.customer_id = p.customer_id)
    order by p.payment_date desc
    limit 10;
""")
select_query(query)

city_id,city,customer_id,payment_id,amount,payment_date
69,Benin City,284,31925,0.0,2007-05-14 13:44:29.996577
69,Benin City,284,31924,5.98,2007-05-14 13:44:29.996577
446,Salinas,269,31920,0.0,2007-05-14 13:44:29.996577
514,Tabriz,279,31922,4.99,2007-05-14 13:44:29.996577
411,Ponce,282,31923,0.99,2007-05-14 13:44:29.996577
298,Lengshuijiang,267,31917,7.98,2007-05-14 13:44:29.996577
446,Salinas,269,31919,3.98,2007-05-14 13:44:29.996577
298,Lengshuijiang,267,31918,0.0,2007-05-14 13:44:29.996577
263,Karnal,274,31921,0.99,2007-05-14 13:44:29.996577
522,Tambaram,287,31926,0.99,2007-05-14 13:44:29.996577


In [22]:
query = ("""
    select ci.city_id, ci.city, sum(p.amount) as revenue   
    from city ci
    join address ad on (ci.city_id = ad.city_id)
    join customer cu on (ad.address_id = cu.address_id)
    join payment p on (cu.customer_id = p.customer_id)
    group by ci.city_id
    order by revenue desc
    limit 10;
""")
select_query(query)

city_id,city,revenue
101,Cape Coral,221.55
442,Saint-Denis,216.54
42,Aurora,198.5
340,Molodetno,195.58
29,Apeldoorn,194.61
456,Santa Brbara dOeste,194.61
423,Qomsheh,186.62
312,London,180.52
388,Ourense (Orense),177.6
78,Bijapur,175.61


## 3.3 Insight 3: Revenue of a movie by customer city and by month

### We don't have a column for `month` so will drive it using a sql function `EXTRACT` 
### Totoal revenue by month?

In [23]:
query = ("""
    select * from payment limit 5;
""")
select_query(query)

payment_id,customer_id,staff_id,rental_id,amount,payment_date
16050,269,2,7,1.99,2007-01-24 21:40:19.996577
16051,269,1,98,0.99,2007-01-25 15:16:50.996577
16052,269,2,678,6.99,2007-01-28 21:44:14.996577
16053,269,2,703,0.99,2007-01-29 00:58:02.996577
16054,269,1,750,4.99,2007-01-29 08:10:06.996577


In [24]:
query = ("""
    select amount, EXTRACT(month from payment_date) as month 
    from payment limit 10;
""")
select_query(query)

amount,month
1.99,1
0.99,1
6.99,1
0.99,1
4.99,1
2.99,1
1.99,1
4.99,1
8.99,1
0.99,1


### We should expect 4 rows here, because out db have records from Jan to April
- `EXTRACT` output months in order from 1 (Jan) to 12 (Dec)
- It appears that sold mostly in April

In [25]:
query = ("""
    select sum(amount) as revenue, EXTRACT(month from payment_date) as month 
    from payment
    group by month
    order by revenue desc;
""")
select_query(query)

revenue,month
28559.46,4
23886.56,3
9631.88,2
4824.43,1
514.18,5


In [26]:
query = ("""
    select f.film_id, f.title, ci.city_id, ci.city, cu.customer_id, p.payment_id, p.amount, p.payment_date, EXTRACT(month from p.payment_date) as month
    from city ci
    join address ad on (ci.city_id = ad.city_id)
    join customer cu on (ad.address_id = cu.address_id)
    join payment p on (cu.customer_id = p.customer_id)
    join rental r on (p.rental_id = r.rental_id)
    join inventory i on (r.inventory_id = i.inventory_id)
    join film f on (i.film_id = f.film_id)
    limit 10;
""")
select_query(query)

film_id,title,city_id,city,customer_id,payment_id,amount,payment_date,month
663,PATIENT SISTER,463,Sasebo,1,16677,2.99,2007-01-25 09:59:03.996577,1
875,TALENTED HOMICIDE,463,Sasebo,1,16678,0.99,2007-01-28 09:03:49.996577,1
611,MUSKETEERS WAIT,463,Sasebo,1,18495,5.99,2007-02-14 23:22:38.996577,2
228,DETECTIVE VISION,463,Sasebo,1,18496,0.99,2007-02-15 16:31:19.996577,2
308,FERRIS MOTHER,463,Sasebo,1,18497,9.99,2007-02-15 19:37:12.996577,2
159,CLOSER BANG,463,Sasebo,1,18498,4.99,2007-02-16 13:47:23.996577,2
44,ATTACKS HATE,463,Sasebo,1,18499,4.99,2007-02-18 07:10:14.996577,2
766,SAVANNAH TOWN,463,Sasebo,1,18500,0.99,2007-02-18 12:02:25.996577,2
997,YOUTH KICK,463,Sasebo,1,18501,3.99,2007-02-21 04:53:11.996577,2
3,ADAPTATION HOLES,463,Sasebo,1,22680,4.99,2007-03-01 07:19:30.996577,3


In [27]:
query = ("""
    select f.film_id, f.title, ci.city_id, ci.city, EXTRACT(month from p.payment_date) as month, sum(p.amount) as revenue
    from city ci
    join address ad on (ci.city_id = ad.city_id)
    join customer cu on (ad.address_id = cu.address_id)
    join payment p on (cu.customer_id = p.customer_id)
    join rental r on (p.rental_id = r.rental_id)
    join inventory i on (r.inventory_id = i.inventory_id)
    join film f on (i.film_id = f.film_id)
    group by (f.film_id, ci.city_id, month)
    order by month, revenue desc
    limit 10;
""")
select_query(query)

film_id,title,city_id,city,month,revenue
791,SHOW LORD,325,Mannheim,1,11.99
501,KISSING DOLLS,543,Toulon,1,10.99
21,AMERICAN CIRCUS,99,Callao,1,10.99
879,TELEGRAPH VOYAGE,351,Naala-Porto,1,10.99
126,CASUALTIES ENCINO,573,Warren,1,10.99
46,AUTUMN CROW,35,Ashgabat,1,9.99
575,MIDSUMMER GROUNDHOG,560,Vaduz,1,9.99
113,CALIFORNIA BIRDS,409,Plock,1,9.99
216,DAY UNFAITHFUL,61,Baybay,1,9.99
210,DARKO DORADO,74,Bhilwara,1,9.99


- note the how many joins we needed, 6!
- joins are expensive and slow
- that's why we need dwh and fact and dimensions
- the last table shows 3 dimensions and 1 fact (cube)

# Now let's gi with a better schema for analytics, a start schema
![](images/Star.png)

## Creating the dimensions and fact tables

In [66]:
query = ("""
    DROP TABLE IF EXISTS factSales;
    DROP TABLE IF EXISTS dimDate;
    DROP TABLE IF EXISTS dimCustomer;
    DROP TABLE IF EXISTS dimMovie;
    DROP TABLE IF EXISTS dimStore;
""")
other_queries(query)

In [67]:
query = ("""
    create table dimDate
    (
        date_key integer not null primary key,
        date date not null,
        year smallint not null,
        quarter smallint not null,
        month smallint not null,
        week smallint not null,
        day smallint not null,
        is_weekend boolean
    );
""")
other_queries(query)

In [68]:
query = ("""
    create table dimCustomer
    (
        customer_key    serial primary key,
        customer_id     smallint not null,
        first_name      varchar(45) not null,
        last_name       varchar(45) not null,
        email           varchar(45),
        address         varchar(45) not null,
        address2        varchar(45),
        district        varchar(45) not null,
        city            varchar(45) not null,
        country        varchar(45) not null,
        postal_code      varchar(10),
        phone           varchar(20) not null,
        active          smallint not null,
        create_date     timestamp not null,
        start_date      date not null,
        end_date        date not null
    );
""")
other_queries(query)

In [69]:
query = ("""
    create table dimMovie 
    (
        movie_key           serial PRIMARY KEY,
        film_id             smallint NOT NULL,
        title               varchar(255) NOT NULL,
        description         text,
        release_year        year,
        language            varchar(20) NOT NULL,
        original_language   varchar(20),
        rental_duration     smallint NOT NULL,
        length              smallint NOT NULL,
        rating              varchar(5) NOT NULL,
        special_features    varchar(60) NOT NULL
    );
""")
other_queries(query)

In [70]:
query = ("""
    CREATE TABLE dimStore 
    (
        store_key            SERIAL PRIMARY KEY,
        store_id             smallint NOT NULL,
        address              varchar(50) NOT NULL,
        address2             varchar(50),
        district             varchar(20) NOT NULL,
        city                 varchar(50) NOT NULL,
        country              varchar(50) NOT NULL,
        postal_code          varchar(10),
        manager_first_name   varchar(45) NOT NULL,
        manager_last_name    varchar(45) NOT NULL,
        start_date           date NOT NULL,
        end_date             date NOT NULL
    );
""")
other_queries(query)

In [71]:
query = ("""
    CREATE TABLE factSales 
    (
        sales_key     SERIAL PRIMARY KEY,
        date_key      INT NOT NULL REFERENCES dimDate(date_key),
        customer_key  INT NOT NULL REFERENCES dimCustomer(customer_key),
        movie_key     INT NOT NULL REFERENCES dimMovie(movie_key),
        store_key     INT NOT NULL REFERENCES dimStore(store_key),
        sales_amount  decimal(5,2) NOT NULL
    );

""")
other_queries(query)

## Inserting data into dimensions and fact tables

### 1. dimDate table

In [72]:
query = ("""
    select * from payment limit 5;
""")
select_query(query)

payment_id,customer_id,staff_id,rental_id,amount,payment_date
16050,269,2,7,1.99,2007-01-24 21:40:19.996577
16051,269,1,98,0.99,2007-01-25 15:16:50.996577
16052,269,2,678,6.99,2007-01-28 21:44:14.996577
16053,269,2,703,0.99,2007-01-29 00:58:02.996577
16054,269,1,750,4.99,2007-01-29 08:10:06.996577


In [74]:
query = ("""
    select distinct(to_char(payment_date::DATE, 'yyyyMMDD')::integer) as date_key,
           date(payment_date)                                         as date,
           extract(year from payment_date)                            as year,
           extract(quarter from payment_date)                         as quarter,
           extract(month from payment_date)                           as month,
           extract(week from payment_date)                            as week,
           extract(day from payment_date)                             as day,
           case 
               when extract(isodow from payment_date) in (6,7) then true 
               else false
           end
    from payment limit 10;
""")
select_query(query)

date_key,date,year,quarter,month,week,day,case
20070320,2007-03-20,2007,1,3,12,20,False
20070129,2007-01-29,2007,1,1,5,29,False
20070406,2007-04-06,2007,2,4,14,6,False
20070301,2007-03-01,2007,1,3,9,1,False
20070411,2007-04-11,2007,2,4,15,11,False
20070322,2007-03-22,2007,1,3,12,22,False
20070131,2007-01-31,2007,1,1,5,31,False
20070216,2007-02-16,2007,1,2,7,16,False
20070221,2007-02-21,2007,1,2,8,21,False
20070426,2007-04-26,2007,2,4,17,26,False


In [73]:
query = ("""
    insert into dimDate (date_key, date, year, quarter, month, week, day, is_weekend)
    
    select distinct(to_char(payment_date::DATE, 'yyyyMMDD')::integer) as date_key,
           date(payment_date)                                         as date,
           extract(year from payment_date)                            as year,
           extract(quarter from payment_date)                         as quarter,
           extract(month from payment_date)                           as month,
           extract(week from payment_date)                            as week,
           extract(day from payment_date)                             as day,
           case 
               when extract(isodow from payment_date) in (6,7) then true 
               else false
           end
    from payment;
""")
other_queries(query)

```sql
select distinct(to_char(payment_date::DATE, 'yyyyMMDD')::integer) as date_key,
```

This SQL statement extracts a unique list of date keys from the payment_date column by transforming the date into a specific integer format (yyyyMMDD)


1. `to_char(payment_date::DATE, 'yyyyMMDD')`

    This SQL statement extracts a unique list of date keys from the payment_date column by transforming the date 
    into a specific integer format (yyyyMMDD). Here's a breakdown of the line:
    - `to_char(payment_date::DATE, 'yyyyMMDD')`
        payment_date::DATE: This casts the payment_date column to a DATE data type (in case it is stored as a 
        timestamp or other format).
    
    - `to_char(..., 'yyyyMMDD')`: The to_char function formats the date into a string in the format yyyyMMDD.
    
    - For example:
        If payment_date is 2024-12-04, to_char will convert it to '20241204' (a string).
        
<br/>

2. `::integer`

    After converting payment_date to the string '20241204', the ::integer casts the string into an integer value, 
    resulting in 20241204 as an integer.
    
<br/>

3. `select distinct(...)`

    The DISTINCT keyword ensures that the query returns only unique date_key values, eliminating duplicates.
    

<br/>

Final result
```sql
date_key
--------
20241204
20241205
```
<br/>
This is commonly used in data warehouses to standardize dates for easier joining.

In [79]:
query = ("""
    select * from dimDate limit 5;
""")
select_query(query)

date_key,date,year,quarter,month,week,day,is_weekend
20070320,2007-03-20,2007,1,3,12,20,False
20070129,2007-01-29,2007,1,1,5,29,False
20070406,2007-04-06,2007,2,4,14,6,False
20070301,2007-03-01,2007,1,3,9,1,False
20070411,2007-04-11,2007,2,4,15,11,False


### 2. dimCustomer table

In [75]:
query = ("""
    select * from customer limit 5;
""")
select_query(query)

customer_id,store_id,first_name,last_name,email,address_id,activebool,create_date,last_update,active
1,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,5,True,2006-02-14,2006-02-15 09:57:20,1
2,1,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,6,True,2006-02-14,2006-02-15 09:57:20,1
3,1,LINDA,WILLIAMS,LINDA.WILLIAMS@sakilacustomer.org,7,True,2006-02-14,2006-02-15 09:57:20,1
4,2,BARBARA,JONES,BARBARA.JONES@sakilacustomer.org,8,True,2006-02-14,2006-02-15 09:57:20,1
5,1,ELIZABETH,BROWN,ELIZABETH.BROWN@sakilacustomer.org,9,True,2006-02-14,2006-02-15 09:57:20,1


In [76]:
query = ("""
    select cu.customer_id, cu.first_name, cu.last_name, cu.email, ad.address, ad.address2, ad.district, ci.city, co.country,
           ad.postal_code, ad.phone, cu.active, cu.create_date, now() as start_date, now() as end_date
    from customer cu
    join address ad on (cu.address_id = ad.address_id)
    join city ci on    (ad.city_id = ci.city_id)
    join country co on (ci.country_id = co.country_id)
    limit 5;
""")
select_query(query)

customer_id,first_name,last_name,email,address,address2,district,city,country,postal_code,phone,active,create_date,start_date,end_date
1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,1913 Hanoi Way,,Nagasaki,Sasebo,Japan,35200,28303384290,1,2006-02-14,2024-12-04 17:24:37.730735+00:00,2024-12-04 17:24:37.730735+00:00
2,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,1121 Loja Avenue,,California,San Bernardino,United States,17886,838635286649,1,2006-02-14,2024-12-04 17:24:37.730735+00:00,2024-12-04 17:24:37.730735+00:00
3,LINDA,WILLIAMS,LINDA.WILLIAMS@sakilacustomer.org,692 Joliet Street,,Attika,Athenai,Greece,83579,448477190408,1,2006-02-14,2024-12-04 17:24:37.730735+00:00,2024-12-04 17:24:37.730735+00:00
4,BARBARA,JONES,BARBARA.JONES@sakilacustomer.org,1566 Inegl Manor,,Mandalay,Myingyan,Myanmar,53561,705814003527,1,2006-02-14,2024-12-04 17:24:37.730735+00:00,2024-12-04 17:24:37.730735+00:00
5,ELIZABETH,BROWN,ELIZABETH.BROWN@sakilacustomer.org,53 Idfu Parkway,,Nantou,Nantou,Taiwan,42399,10655648674,1,2006-02-14,2024-12-04 17:24:37.730735+00:00,2024-12-04 17:24:37.730735+00:00


In [77]:
query = ("""
    insert into dimCustomer (customer_key, customer_id, first_name, last_name, email, address, address2, district, 
                             city, country, postal_code, phone, active, create_date, start_date, end_date)
                            
    
    select cu.customer_id as customer_key,
           cu.customer_id,
           cu.first_name as first_name,
           cu.last_name as last_name, 
           cu.email as email, 
           ad.address as address, 
           ad.address2 as address2, 
           ad.district as district, 
           ci.city as city, 
           co.country as country,
           ad.postal_code as postal_code, 
           ad.phone, 
           cu.active, 
           cu.create_date, 
           now() as start_date, 
           now() as end_date
    from customer cu
    join address ad on (cu.address_id = ad.address_id)
    join city ci on    (ad.city_id = ci.city_id)
    join country co on (ci.country_id = co.country_id);
""")
other_queries(query)

In [78]:
query = ("""
    select * from dimCustomer limit 5;
""")
select_query(query)

customer_key,customer_id,first_name,last_name,email,address,address2,district,city,country,postal_code,phone,active,create_date,start_date,end_date
1,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,1913 Hanoi Way,,Nagasaki,Sasebo,Japan,35200,28303384290,1,2006-02-14 00:00:00,2024-12-04,2024-12-04
2,2,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,1121 Loja Avenue,,California,San Bernardino,United States,17886,838635286649,1,2006-02-14 00:00:00,2024-12-04,2024-12-04
3,3,LINDA,WILLIAMS,LINDA.WILLIAMS@sakilacustomer.org,692 Joliet Street,,Attika,Athenai,Greece,83579,448477190408,1,2006-02-14 00:00:00,2024-12-04,2024-12-04
4,4,BARBARA,JONES,BARBARA.JONES@sakilacustomer.org,1566 Inegl Manor,,Mandalay,Myingyan,Myanmar,53561,705814003527,1,2006-02-14 00:00:00,2024-12-04,2024-12-04
5,5,ELIZABETH,BROWN,ELIZABETH.BROWN@sakilacustomer.org,53 Idfu Parkway,,Nantou,Nantou,Taiwan,42399,10655648674,1,2006-02-14 00:00:00,2024-12-04,2024-12-04


### 3. dimMovies table

In [81]:
query = ("""
    select * from film limit 2;
""")
select_query(query)

film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,last_update,special_features,fulltext
1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies,2006,1,,6,0.99,86,20.99,PG,2007-09-10 17:46:03.905795,"['Deleted Scenes', 'Behind the Scenes']",'academi':1 'battl':15 'canadian':20 'dinosaur':2 'drama':5 'epic':4 'feminist':8 'mad':11 'must':14 'rocki':21 'scientist':12 'teacher':17
2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrator And a Explorer who must Find a Car in Ancient China,2006,1,,3,4.99,48,12.99,G,2007-09-10 17:46:03.905795,"['Trailers', 'Deleted Scenes']",'ace':1 'administr':9 'ancient':19 'astound':4 'car':17 'china':20 'databas':8 'epistl':5 'explor':12 'find':15 'goldfing':2 'must':14


In [82]:
query = ("""
    select * from language limit 2;
""")
select_query(query)

language_id,name,last_update
1,English,2006-02-15 10:02:19
2,Italian,2006-02-15 10:02:19


In [86]:
query = ("""
    insert into dimMovie (movie_key, film_id, title, description, release_year, language, original_language, 
                             rental_duration, length, rating, special_features)
                            
    
    select f.film_id as movie_key,
           f.film_id,
           f.title,
           f.description,
           f.release_year,
           l.name as language,
           orig_lang.name as original_language,
           f.rental_duration, 
           f.length,
           f.rating,
           f.special_features
    from film f
    join language l on (f.language_id = l.language_id)
    left join language orig_lang on (f.language_id = orig_lang.language_id);
""")
other_queries(query)

```sql
left join language orig_lang on (f.language_id = orig_lang.language_id);
```

`orig_lang.name AS original_language`: The original language, using a LEFT JOIN because not all films may have an original_language_id, we did a left join to ensures that even if a film does not have an original_language_id, that film will still appear in the results.

In [87]:
query = ("""
    select * from dimMovie limit 2;
""")
select_query(query)

movie_key,film_id,title,description,release_year,language,original_language,rental_duration,length,rating,special_features
1,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies,2006,English,English,6,86,PG,"{""Deleted Scenes"",""Behind the Scenes""}"
2,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrator And a Explorer who must Find a Car in Ancient China,2006,English,English,3,48,G,"{Trailers,""Deleted Scenes""}"


### 4. dimStore table

In [88]:
query = ("""
    select * from store limit 5;
""")
select_query(query)

store_id,manager_staff_id,address_id,last_update
1,1,1,2006-02-15 09:57:12
2,2,2,2006-02-15 09:57:12


In [91]:
query = ("""
    select * from city limit 2;
""")
select_query(query)

city_id,city,country_id,last_update
1,A Corua (La Corua),87,2006-02-15 09:45:25
2,Abha,82,2006-02-15 09:45:25


In [103]:
query = ("""
    select * from country limit 2;
""")
select_query(query)

country_id,country,last_update
1,Afghanistan,2006-02-15 09:44:00
2,Algeria,2006-02-15 09:44:00


In [106]:
# picture column has the memory location of a corsponding picture, it's type can't be viewed with
# prettytable libray to display the table, so i changed the column type
query = ("""
    ALTER TABLE staff
    ALTER COLUMN picture TYPE VARCHAR(25); 
""")
other_queries(query)

In [107]:
query = ("""
    select * from staff limit 2;
""")
select_query(query)

staff_id,first_name,last_name,address_id,email,store_id,active,username,password,last_update,picture
1,Mike,Hillyer,3,Mike.Hillyer@sakilastaff.com,1,True,Mike,8cb2237d0679ca88db6464eac60da96345513964,2006-05-16 16:13:11.793280,\x89504e470d0a5a0a
2,Jon,Stephens,4,Jon.Stephens@sakilastaff.com,2,True,Jon,8cb2237d0679ca88db6464eac60da96345513964,2006-05-16 16:13:11.793280,


In [111]:
query = ("""
    insert into dimStore (store_key, store_id, address, address2, district, city, country, 
                             postal_code, manager_first_name, manager_last_name, start_date, end_date)
                            
    
    select s.store_id as store_key,
           s.store_id,
           ad.address,
           ad.address2,
           ad.district,
           ci.city,
           co.country,
           ad.postal_code,
           st.first_name,
           st.last_name,
           now() as start_date,
           now() as end_date
           
    from store s
    join staff st on (s.manager_staff_id = st.staff_id)
    join address ad on (s.address_id = ad.address_id)
    join city ci on (ad.city_id = ci.city_id)
    join country co on (ci.country_id = co.country_id);
""")
other_queries(query)

In [112]:
query = ("""
    select * from dimStore limit 5;
""")
select_query(query)

store_key,store_id,address,address2,district,city,country,postal_code,manager_first_name,manager_last_name,start_date,end_date
1,1,47 MySakila Drive,,Alberta,Lethbridge,Canada,,Mike,Hillyer,2024-12-04,2024-12-04
2,2,28 MySQL Boulevard,,QLD,Woodridge,Australia,,Jon,Stephens,2024-12-04,2024-12-04


### 5. factSales table

In [113]:
query = ("""
    select * from payment limit 2;
""")
select_query(query)

payment_id,customer_id,staff_id,rental_id,amount,payment_date
16050,269,2,7,1.99,2007-01-24 21:40:19.996577
16051,269,1,98,0.99,2007-01-25 15:16:50.996577


In [114]:
query = ("""
    select * from rental limit 2;
""")
select_query(query)

rental_id,rental_date,inventory_id,customer_id,return_date,staff_id,last_update
2,2005-05-24 22:54:33,1525,459,2005-05-28 19:40:33,1,2006-02-16 02:30:53
3,2005-05-24 23:03:39,1711,408,2005-06-01 22:12:39,1,2006-02-16 02:30:53


In [115]:
query = ("""
    select * from inventory limit 2;
""")
select_query(query)

inventory_id,film_id,store_id,last_update
1,1,1,2006-02-15 10:09:17
2,1,1,2006-02-15 10:09:17


In [118]:
query = ("""
    insert into factSales (date_key, customer_key, movie_key, store_key, sales_amount)
                            
    
    select to_char(p.payment_date::DATE, 'yyyyMMDD')::integer as date_key,
           p.customer_id,
           i.film_id,
           i.store_id,
           p.amount
           
    from payment p
    join rental r on (p.rental_id = r.rental_id)
    join inventory i on (r.inventory_id = i.inventory_id);
""")
other_queries(query)

In [120]:
query = ("""
    select * from factSales limit 5;
""")
select_query(query)

sales_key,date_key,customer_key,movie_key,store_key,sales_amount
1,20070124,269,870,2,1.99
2,20070125,269,651,1,0.99
3,20070128,269,818,1,6.99
4,20070129,269,249,2,0.99
5,20070129,269,159,2,4.99


# Lets repeat step 3 and do some simple data analsyis and spot how it's esaier

## Insight1: Top Grossing Movies

In [124]:
query = ("""
    select movie_key, sum(sales_amount) as revenue
    from factSales 
    group by movie_key
    order by revenue desc
    limit 10;
""")
select_query(query)

movie_key,revenue
879,231.73
973,223.69
1000,214.69
369,209.69
764,204.72
893,201.71
897,198.72
403,195.7
460,191.74
444,190.78


### If we want to add the title, just join with dimMovie table
and indeed we get the same result as before but mush easier and faster

In [127]:
query = ("""
    select dm.movie_key, dm.title, sum(fs.sales_amount) as revenue
    from factSales fs
    join dimMovie dm on (fs.movie_key = dm.movie_key)
    group by dm.movie_key
    order by revenue desc
    limit 10;
""")
select_query(query)

movie_key,title,revenue
879,TELEGRAPH VOYAGE,231.73
973,WIFE TURN,223.69
1000,ZORRO ARK,214.69
369,GOODFELLAS SALUTE,209.69
764,SATURDAY LAMBS,204.72
893,TITANS JERK,201.71
897,TORQUE BOUND,198.72
403,HARRY IDAHO,195.7
460,INNOCENT USUAL,191.74
444,HUSTLER PARTY,190.78


## Insight2: Top Grossing Cities
from which cities our customers pay most

In [128]:
query = ("""
    select * from factSales limit 5;
""")
select_query(query)

sales_key,date_key,customer_key,movie_key,store_key,sales_amount
1,20070124,269,870,2,1.99
2,20070125,269,651,1,0.99
3,20070128,269,818,1,6.99
4,20070129,269,249,2,0.99
5,20070129,269,159,2,4.99


In [135]:
query = ("""
    select * from dimCustomer limit 2;
""")
select_query(query)

customer_key,customer_id,first_name,last_name,email,address,address2,district,city,country,postal_code,phone,active,create_date,start_date,end_date
1,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,1913 Hanoi Way,,Nagasaki,Sasebo,Japan,35200,28303384290,1,2006-02-14 00:00:00,2024-12-04,2024-12-04
2,2,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,1121 Loja Avenue,,California,San Bernardino,United States,17886,838635286649,1,2006-02-14 00:00:00,2024-12-04,2024-12-04


In [131]:
query = ("""
    select fs.sales_key, fs.sales_amount, dc.customer_key, dc.city 
    from factSales fs
    join dimCustomer dc on (fs.customer_key = dc.customer_key)
    limit 5;
""")
select_query(query)

sales_key,sales_amount,customer_key,city
1,1.99,269,Salinas
2,0.99,269,Salinas
3,6.99,269,Salinas
4,0.99,269,Salinas
5,4.99,269,Salinas


In [132]:
query = ("""
    select dc.city , sum(fs.sales_amount) as revenue
    from factSales fs
    join dimCustomer dc on (fs.customer_key = dc.customer_key)
    group by dc.city
    order by revenue desc
    limit 10;
""")
select_query(query)

city,revenue
Cape Coral,221.55
Saint-Denis,216.54
Aurora,198.5
Molodetno,195.58
Santa Brbara dOeste,194.61
Apeldoorn,194.61
Qomsheh,186.62
London,180.52
Ourense (Orense),177.6
Bijapur,175.61


## Insight3: Revenue of movies by customer, city and by month


In [133]:
query = ("""
    select * from factSales limit 5;
""")
select_query(query)

sales_key,date_key,customer_key,movie_key,store_key,sales_amount
1,20070124,269,870,2,1.99
2,20070125,269,651,1,0.99
3,20070128,269,818,1,6.99
4,20070129,269,249,2,0.99
5,20070129,269,159,2,4.99


In [136]:
query = ("""
    select * from dimCustomer limit 2;
""")
select_query(query)

customer_key,customer_id,first_name,last_name,email,address,address2,district,city,country,postal_code,phone,active,create_date,start_date,end_date
1,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,1913 Hanoi Way,,Nagasaki,Sasebo,Japan,35200,28303384290,1,2006-02-14 00:00:00,2024-12-04,2024-12-04
2,2,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,1121 Loja Avenue,,California,San Bernardino,United States,17886,838635286649,1,2006-02-14 00:00:00,2024-12-04,2024-12-04


In [137]:
query = ("""
    select * from dimDate limit 2;
""")
select_query(query)

date_key,date,year,quarter,month,week,day,is_weekend
20070320,2007-03-20,2007,1,3,12,20,False
20070129,2007-01-29,2007,1,1,5,29,False


In [138]:
query = ("""
    select * from dimMovie limit 2;
""")
select_query(query)

movie_key,film_id,title,description,release_year,language,original_language,rental_duration,length,rating,special_features
1,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies,2006,English,English,6,86,PG,"{""Deleted Scenes"",""Behind the Scenes""}"
2,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrator And a Explorer who must Find a Car in Ancient China,2006,English,English,3,48,G,"{Trailers,""Deleted Scenes""}"


In [141]:
query = ("""
    select dm.title, dc.city, dd.month, fs.sales_amount 
    from factSales fs
    join dimCustomer dc on (fs.customer_key = dc.customer_key) 
    join dimMovie dm on (fs.movie_key = dm.movie_key)
    join dimDate dd on (fs.date_key = dd.date_key)
    limit 5;
""")
select_query(query)

title,city,month,sales_amount
SWARM GOLD,Salinas,1,1.99
PACKER MADIGAN,Salinas,1,0.99
SOMETHING DUCK,Salinas,1,6.99
DRACULA CRYSTAL,Salinas,1,0.99
CLOSER BANG,Salinas,1,4.99


In [143]:
query = ("""
    select dm.title, dc.city, dd.month, sum(fs.sales_amount) as revenue
    from factSales fs
    join dimCustomer dc on (fs.customer_key = dc.customer_key) 
    join dimMovie dm on (fs.movie_key = dm.movie_key)
    join dimDate dd on (fs.date_key = dd.date_key)
    group by (dm.title, dc.city, dd.month)
    order by month, revenue desc
    limit 10;
""")
select_query(query)

title,city,month,revenue
SHOW LORD,Mannheim,1,11.99
KISSING DOLLS,Toulon,1,10.99
TELEGRAPH VOYAGE,Naala-Porto,1,10.99
CASUALTIES ENCINO,Warren,1,10.99
AMERICAN CIRCUS,Callao,1,10.99
MIDSUMMER GROUNDHOG,Vaduz,1,9.99
MILLION ACE,Bergamo,1,9.99
HEAD STRANGER,Xiangtan,1,9.99
DARKO DORADO,Bhilwara,1,9.99
STRANGER STRANGERS,Ipoh,1,9.99
