In [18]:
import psycopg2
from dotenv import load_dotenv
import os
import pandas as pd
from prettytable import PrettyTable

# Step 1 : Connect to db

In [2]:
%load_ext dotenv
%dotenv

In [3]:
connection = psycopg2.connect(
        host=os.getenv('DATABASE_HOST'),
        port=os.getenv('DATABASE_PORT'),
        user=os.getenv('DATABASE_USER'),
        password=os.getenv('DATABASE_PASSWORD'),
        dbname=os.getenv('DATABASE_NAME'),
    )
connection.autocommit = True
cursor = connection.cursor()

In [22]:
def select_query(query):
    cursor.execute(query)
    
    rows = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]

    # format the output nicely
    table = PrettyTable()
    table.field_names = columns
    for row in rows:
        table.add_row(row)
                      
    return table

# Step 2 : Invistigate db

In [23]:
query = ("""
    SELECT table_schema, table_name
    FROM information_schema.tables
    WHERE table_type = 'BASE TABLE' AND table_schema NOT IN ('pg_catalog', 'information_schema')
    ORDER BY table_schema, table_name;
""")
select_query(query)

table_schema,table_name
public,actor
public,address
public,category
public,city
public,country
public,customer
public,film
public,film_actor
public,film_category
public,inventory


![](images/3NF.png)

# What data sizes are we locking at?

In [24]:
nStores = ("""
    select count(*) from store;
""")
nStores = select_query(nStores)

nFilms = ("""
    select count(*) from film;
""")
nFilms = select_query(nFilms)

nCustomers = ("""
    select count(*) from customer;
""")
nCustomers = select_query(nCustomers)

nRentals = ("""
    select count(*) from rental;
""")
nRentals = select_query(nRentals)

nPayments = ("""
    select count(*) from payment;
""")
nPayments = select_query(nPayments)

nStaff = ("""
    select count(*) from staff;
""")
nStaff = select_query(nStaff)

nCity = ("""
    select count(*) from city;
""")
nCity = select_query(nCity)

nCountries = ("""
    select count(*) from country;
""")
nCountries = select_query(nCountries)


print("nFilms = ", nFilms[0][0])
print("nCustomers = ", nCustomers[0][0])
print("nRentals = ", nRentals[0][0])
print("nPayments = ", nPayments[0][0])
print("nStaff = ", nStaff[0][0])
print("nStores = ", nStores[0][0])
print("nCity = ", nCity[0][0])
print("nCountries = ", nCountries[0][0])

nFilms =  +-------+
| count |
+-------+
|  1000 |
+-------+
nCustomers =  +-------+
| count |
+-------+
|  599  |
+-------+
nRentals =  +-------+
| count |
+-------+
| 16044 |
+-------+
nPayments =  +-------+
| count |
+-------+
| 16049 |
+-------+
nStaff =  +-------+
| count |
+-------+
|   2   |
+-------+
nStores =  +-------+
| count |
+-------+
|   2   |
+-------+
nCity =  +-------+
| count |
+-------+
|  600  |
+-------+
nCountries =  +-------+
| count |
+-------+
|  109  |
+-------+


# when? what time period are we talking about?

In [30]:
when = ("""
    select * from payment limit 5;
""")
select_query(when)

payment_id,customer_id,staff_id,rental_id,amount,payment_date
16050,269,2,7,1.99,2007-01-24 21:40:19.996577
16051,269,1,98,0.99,2007-01-25 15:16:50.996577
16052,269,2,678,6.99,2007-01-28 21:44:14.996577
16053,269,2,703,0.99,2007-01-29 00:58:02.996577
16054,269,1,750,4.99,2007-01-29 08:10:06.996577


In [31]:
when = ("""
    select min(payment_date) as start, max(payment_date) as end from payment;
""")
select_query(when)

start,end
2007-01-24 21:21:56.996577,2007-05-14 13:44:29.996577


# Where? where do events in this db occure?
How many events in each district ?

In [34]:
where = ("""
    select * from address limit 5;
""")
select_query(where)

address_id,address,address2,district,city_id,postal_code,phone,last_update
1,47 MySakila Drive,,Alberta,300,,,2006-02-15 09:45:30
2,28 MySQL Boulevard,,QLD,576,,,2006-02-15 09:45:30
3,23 Workhaven Lane,,Alberta,300,,14033335568.0,2006-02-15 09:45:30
4,1411 Lillydale Drive,,QLD,576,,6172235589.0,2006-02-15 09:45:30
5,1913 Hanoi Way,,Nagasaki,463,35200.0,28303384290.0,2006-02-15 09:45:30


In [38]:
where = ("""
    select district, sum(city_id) as n
    from address
    group by district
    order by n desc
    limit 10;
""")
select_query(where)

district,n
Shandong,3237
England,2974
So Paulo,2952
West Bengali,2623
Buenos Aires,2572
Uttar Pradesh,2462
California,2444
Southern Tagalog,1931
Tamil Nadu,1807
Hubei,1790


# Step 3 : Perform some simple data analsyis

## 3.1 Insight 1: Top Grossing Movies
- Payments amounts are in table `payment`
- Movies are in table `film`
- They are not directly linked, `payment` refers to a `rental`, `rental` refers to an `inventory` and `inventory` refers to a `film`
- `payment` -> `rental` -> `inventory` -> `film`

In [42]:
query = ("""
    select film_id, title, release_year, rental_rate, rating from film limit 5;
""")
select_query(query)

film_id,title,release_year,rental_rate,rating
1,ACADEMY DINOSAUR,2006,0.99,PG
2,ACE GOLDFINGER,2006,4.99,G
3,ADAPTATION HOLES,2006,2.99,NC-17
4,AFFAIR PREJUDICE,2006,2.99,G
5,AFRICAN EGG,2006,2.99,G


In [43]:
query = ("""
    select * from inventory limit 5;
""")
select_query(query)

inventory_id,film_id,store_id,last_update
1,1,1,2006-02-15 10:09:17
2,1,1,2006-02-15 10:09:17
3,1,1,2006-02-15 10:09:17
4,1,1,2006-02-15 10:09:17
5,1,2,2006-02-15 10:09:17


In [44]:
query = ("""
    select * from rental limit 5;
""")
select_query(query)

rental_id,rental_date,inventory_id,customer_id,return_date,staff_id,last_update
2,2005-05-24 22:54:33,1525,459,2005-05-28 19:40:33,1,2006-02-16 02:30:53
3,2005-05-24 23:03:39,1711,408,2005-06-01 22:12:39,1,2006-02-16 02:30:53
4,2005-05-24 23:04:41,2452,333,2005-06-03 01:43:41,2,2006-02-16 02:30:53
5,2005-05-24 23:05:21,2079,222,2005-06-02 04:33:21,1,2006-02-16 02:30:53
6,2005-05-24 23:08:07,2792,549,2005-05-27 01:32:07,1,2006-02-16 02:30:53


In [45]:
query = ("""
    select * from payment limit 5;
""")
select_query(query)

payment_id,customer_id,staff_id,rental_id,amount,payment_date
16050,269,2,7,1.99,2007-01-24 21:40:19.996577
16051,269,1,98,0.99,2007-01-25 15:16:50.996577
16052,269,2,678,6.99,2007-01-28 21:44:14.996577
16053,269,2,703,0.99,2007-01-29 00:58:02.996577
16054,269,1,750,4.99,2007-01-29 08:10:06.996577


### Get movies for every payment

In [48]:
query = ("""
    select f.film_id, f.title, p.payment_id, p.rental_id, i.inventory_id, p.customer_id, p.amount, p.payment_date
    from payment p
    join rental r on (p.rental_id = r.rental_id)
    join inventory i on (r.inventory_id = i.inventory_id)
    join film f on (i.film_id = f.film_id)
    limit 10;
""")
select_query(query)

film_id,title,payment_id,rental_id,inventory_id,customer_id,amount,payment_date
870,SWARM GOLD,16050,7,3995,269,1.99,2007-01-24 21:40:19.996577
651,PACKER MADIGAN,16051,98,2970,269,0.99,2007-01-25 15:16:50.996577
818,SOMETHING DUCK,16052,678,3741,269,6.99,2007-01-28 21:44:14.996577
249,DRACULA CRYSTAL,16053,703,1123,269,0.99,2007-01-29 00:58:02.996577
159,CLOSER BANG,16054,750,730,269,4.99,2007-01-29 08:10:06.996577
205,DANCES NONE,16055,1099,924,269,2.99,2007-01-31 12:23:14.996577
851,STRAIGHT HOURS,16056,193,3900,270,1.99,2007-01-26 05:10:14.996577
559,MARRIED GO,16057,1040,2546,270,4.99,2007-01-31 04:03:42.996577
237,DIVORCE SHINING,16058,1096,1066,271,8.99,2007-01-31 11:59:15.996577
367,GOLDMINE TYCOON,16059,33,1681,272,0.99,2007-01-25 02:47:17.996577


### Sum movies revenue

In [52]:
# Top 10 movies
query = ("""
    select f.film_id, f.title, sum(p.amount) as revenue
    from payment p
    join rental r on (p.rental_id = r.rental_id)
    join inventory i on (r.inventory_id = i.inventory_id)
    join film f on (i.film_id = f.film_id)
    group by f.film_id
    order by revenue desc
    limit 10;
""")
select_query(query)

film_id,title,revenue
879,TELEGRAPH VOYAGE,231.73
973,WIFE TURN,223.69
1000,ZORRO ARK,214.69
369,GOODFELLAS SALUTE,209.69
764,SATURDAY LAMBS,204.72
893,TITANS JERK,201.71
897,TORQUE BOUND,198.72
403,HARRY IDAHO,195.7
460,INNOCENT USUAL,191.74
444,HUSTLER PARTY,190.78
