In [18]:
import psycopg2
from dotenv import load_dotenv
import os
import pandas as pd
from prettytable import PrettyTable

# Step 1 : Connect to db

In [2]:
%load_ext dotenv
%dotenv

In [3]:
connection = psycopg2.connect(
        host=os.getenv('DATABASE_HOST'),
        port=os.getenv('DATABASE_PORT'),
        user=os.getenv('DATABASE_USER'),
        password=os.getenv('DATABASE_PASSWORD'),
        dbname=os.getenv('DATABASE_NAME'),
    )
connection.autocommit = True
cursor = connection.cursor()

In [22]:
def select_query(query):
    cursor.execute(query)
    
    rows = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]

    # format the output nicely
    table = PrettyTable()
    table.field_names = columns
    for row in rows:
        table.add_row(row)
                      
    return table

# Step 2 : Invistigate db

In [23]:
query = ("""
    SELECT table_schema, table_name
    FROM information_schema.tables
    WHERE table_type = 'BASE TABLE' AND table_schema NOT IN ('pg_catalog', 'information_schema')
    ORDER BY table_schema, table_name;
""")
select_query(query)

table_schema,table_name
public,actor
public,address
public,category
public,city
public,country
public,customer
public,film
public,film_actor
public,film_category
public,inventory


![](images/3NF.png)

# What data sizes are we locking at?

In [24]:
nStores = ("""
    select count(*) from store;
""")
nStores = select_query(nStores)

nFilms = ("""
    select count(*) from film;
""")
nFilms = select_query(nFilms)

nCustomers = ("""
    select count(*) from customer;
""")
nCustomers = select_query(nCustomers)

nRentals = ("""
    select count(*) from rental;
""")
nRentals = select_query(nRentals)

nPayments = ("""
    select count(*) from payment;
""")
nPayments = select_query(nPayments)

nStaff = ("""
    select count(*) from staff;
""")
nStaff = select_query(nStaff)

nCity = ("""
    select count(*) from city;
""")
nCity = select_query(nCity)

nCountries = ("""
    select count(*) from country;
""")
nCountries = select_query(nCountries)


print("nFilms = ", nFilms[0][0])
print("nCustomers = ", nCustomers[0][0])
print("nRentals = ", nRentals[0][0])
print("nPayments = ", nPayments[0][0])
print("nStaff = ", nStaff[0][0])
print("nStores = ", nStores[0][0])
print("nCity = ", nCity[0][0])
print("nCountries = ", nCountries[0][0])

nFilms =  +-------+
| count |
+-------+
|  1000 |
+-------+
nCustomers =  +-------+
| count |
+-------+
|  599  |
+-------+
nRentals =  +-------+
| count |
+-------+
| 16044 |
+-------+
nPayments =  +-------+
| count |
+-------+
| 16049 |
+-------+
nStaff =  +-------+
| count |
+-------+
|   2   |
+-------+
nStores =  +-------+
| count |
+-------+
|   2   |
+-------+
nCity =  +-------+
| count |
+-------+
|  600  |
+-------+
nCountries =  +-------+
| count |
+-------+
|  109  |
+-------+


# when? what time period are we talking about?

In [30]:
when = ("""
    select * from payment limit 5;
""")
select_query(when)

payment_id,customer_id,staff_id,rental_id,amount,payment_date
16050,269,2,7,1.99,2007-01-24 21:40:19.996577
16051,269,1,98,0.99,2007-01-25 15:16:50.996577
16052,269,2,678,6.99,2007-01-28 21:44:14.996577
16053,269,2,703,0.99,2007-01-29 00:58:02.996577
16054,269,1,750,4.99,2007-01-29 08:10:06.996577


In [31]:
when = ("""
    select min(payment_date) as start, max(payment_date) as end from payment;
""")
select_query(when)

start,end
2007-01-24 21:21:56.996577,2007-05-14 13:44:29.996577


# Where? where do events in this db occure?
How many events in each district ?

In [34]:
where = ("""
    select * from address limit 5;
""")
select_query(where)

address_id,address,address2,district,city_id,postal_code,phone,last_update
1,47 MySakila Drive,,Alberta,300,,,2006-02-15 09:45:30
2,28 MySQL Boulevard,,QLD,576,,,2006-02-15 09:45:30
3,23 Workhaven Lane,,Alberta,300,,14033335568.0,2006-02-15 09:45:30
4,1411 Lillydale Drive,,QLD,576,,6172235589.0,2006-02-15 09:45:30
5,1913 Hanoi Way,,Nagasaki,463,35200.0,28303384290.0,2006-02-15 09:45:30


where = ("""
    select district, sum(city_id) as n
    from address
    group by district
    order by n desc
    limit 10;
""")
select_query(where)