# Python script for SQLite database
This script creates a SQLite database, loads csv files and performs queries.

### 1. Connect to database

In [1]:
# Libraries
import pandas as pd
from sqlalchemy import create_engine

In [2]:
# Connect to database
db = create_engine( 'sqlite:///db_olist_sqlite' )
conn = db.connect()

### 2. Loading dataset

In [4]:
# Dataset customers
### Reading csv
df_customers = pd.read_csv( 'data/olist_customers_dataset.csv' )
### creating schema
schema = '''
    CREATE TABLE IF NOT EXISTS customers(
        customer_id                 TEXT,
        customer_unique_id          TEXT,
        customer_zip_code_prefix    INTEGER,
        customer_city               TEXT,
        customer_state              TEXT
)
'''
conn.execute( schema )
### inserting data
df_customers.to_sql( 'customers', con=conn, if_exists='append', index=False )

In [5]:
# Dataset geolocation
### Reading csv
df_geolocation = pd.read_csv( 'data/olist_geolocation_dataset.csv' )
### Schema
schema = '''
    CREATE TABLE IF NOT EXISTS geolocation(
        geolocation_zip_code_prefix    INTEGER,
        geolocation_lat                REAL,
        geolocation_lng                REAL,
        geolocation_city               TEXT,
        geolocation_state              TEXT
    
)
'''
conn.execute( schema )
### inserting data
df_geolocation.to_sql( 'geolocation', conn, if_exists='append', index=False )

In [6]:
# Dataset order_items
### Reading csv
df_order_items = pd.read_csv( 'data/olist_order_items_dataset.csv' )
### Schema
schema = '''
    CREATE TABLE IF NOT EXISTS order_items(
        order_id               TEXT,
        order_item_id          INTEGER,
        product_id             TEXT,
        seller_id              TEXT,
        shipping_limit_date    TEXT,
        price                  REAL,
        freight_value          REAL
    
)
'''
conn.execute( schema )
### inserting data
df_order_items.to_sql( 'order_items', conn, if_exists='append', index=False )

In [7]:
# Dataset order_payments
### Reading csv
df_order_payments = pd.read_csv( 'data/olist_order_payments_dataset.csv' )
### Schema
schema = '''
    CREATE TABLE IF NOT EXISTS order_payments(
        order_id                TEXT,
        payment_sequential      INTEGER,
        payment_type            TEXT,
        payment_installments    INTEGER,
        payment_value           REAL
            
)
'''
conn.execute( schema )
### inserting data
df_order_payments.to_sql( 'order_payments', conn, if_exists='append', index=False )

In [8]:
# Dataset order_reviews
### Reading csv
df_order_reviews = pd.read_csv( 'data/olist_order_reviews_dataset.csv' )
### Schema
schema = '''
    CREATE TABLE IF NOT EXISTS order_reviews(
        review_id                  TEXT,
        order_id                   TEXT,
        review_score               INTEGER,
        review_comment_title       TEXT,
        review_comment_message     TEXT,
        review_creation_date       TEXT,
        review_answer_timestamp    TEXT
)
'''
conn.execute( schema )
### inserting data
df_order_reviews.to_sql( 'order_reviews', conn, if_exists='append', index=False )

In [9]:
# Dataset orders
### Reading csv
df_orders = pd.read_csv( 'data/olist_orders_dataset.csv' )
### Schema
schema = '''
    CREATE TABLE IF NOT EXISTS orders(
        order_id                         TEXT,
        customer_id                      TEXT,
        order_status                     TEXT,
        order_purchase_timestamp         TEXT,
        order_approved_at                TEXT,
        order_delivered_carrier_date     TEXT,
        order_delivered_customer_date    TEXT,
        order_estimated_delivery_date    TEXT
)
'''
conn.execute( schema )
### inserting data
df_orders.to_sql( 'orders', conn, if_exists='append', index=False )

In [10]:
# Dataset products
### Reading csv
df_products = pd.read_csv( 'data/olist_products_dataset.csv' )
### Schema
schema = '''
    CREATE TABLE IF NOT EXISTS products(
        product_id                    TEXT,
        product_category_name         TEXT,
        product_name_lenght           REAL,
        product_description_lenght    REAL,
        product_photos_qty            REAL,
        product_weight_g              REAL,
        product_length_cm             REAL,
        product_height_cm             REAL,
        product_width_cm              REAL
)
'''
conn.execute( schema )
### inserting data
df_products.to_sql( 'products', conn, if_exists='append', index=False )

In [11]:
# Dataset sellers
### Reading csv
df_sellers = pd.read_csv( 'data/olist_sellers_dataset.csv' )
### Schema
schema = '''
    CREATE TABLE IF NOT EXISTS sellers(
        seller_id                 TEXT,
        seller_zip_code_prefix    INTEGER,
        seller_city               TEXT,
        seller_state              TEXT
)
'''
conn.execute( schema )
### inserting data
df_sellers.to_sql( 'sellers', conn, if_exists='append', index=False )

In [12]:
# Dataset product_category_name_translation
### Reading csv
df_product_category_name_translation = pd.read_csv( 'data/product_category_name_translation.csv' )
### Schema
schema = '''
    CREATE TABLE IF NOT EXISTS product_category_name_translation(
        product_category_name            TEXT,
        product_category_name_english    TEXT
)
'''
conn.execute( schema )
### inserting data
df_product_category_name_translation.to_sql( 'product_category_name_translation', conn, if_exists='append', index=False )

### 3. Check database

In [13]:
# Check database
query = '''
    SELECT *
    FROM sqlite_master
    WHERE type = 'table'
'''
table = pd.read_sql_query( query, conn )
table

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,customers,customers,2,CREATE TABLE customers(\n customer_id ...
1,table,geolocation,geolocation,2237,CREATE TABLE geolocation(\n geolocation...
2,table,order_items,order_items,2236,CREATE TABLE order_items(\n order_id ...
3,table,order_payments,order_payments,16870,CREATE TABLE order_payments(\n order_id...
4,table,order_reviews,order_reviews,18462,CREATE TABLE order_reviews(\n review_id...
5,table,orders,orders,22128,CREATE TABLE orders(\n order_id ...
6,table,products,products,26660,CREATE TABLE products(\n product_id ...
7,table,sellers,sellers,27237,CREATE TABLE sellers(\n seller_id ...
8,table,product_category_name_translation,product_category_name_translation,27282,CREATE TABLE product_category_name_translation...


### 4. Queries

In [14]:
query = '''
    SELECT 
        p.product_id, 
        p.product_category_name,
        p.product_length_cm,
        p.product_height_cm, 
        p.product_width_cm,
        p.product_length_cm * p.product_height_cm * p.product_width_cm AS volume,
        CASE WHEN p.product_category_name = 'perfumaria' THEN 'physical store' ELSE 'virtual store' END AS store_type,
        CASE WHEN p.product_height_cm < 10 THEN 'small'
             WHEN p.product_height_cm >= 10 and p.product_height_cm < 15 THEN 'medium'
             WHEN p.product_height_cm >= 15 and p.product_height_cm < 20 THEN 'large' 
             ELSE 'extra_large' END AS size        
    FROM 
        products p
    WHERE
        ( p.product_category_name = 'perfumaria' or p.product_category_name = 'artes' )
        and size = 'small'
    ORDER BY
        volume DESC
'''
table = pd.read_sql_query( query, conn )
table

Unnamed: 0,product_id,product_category_name,product_length_cm,product_height_cm,product_width_cm,volume,store_type,size
0,058755c380722a806fc5440093c2c5bf,artes,98.0,6.0,44.0,25872.0,virtual store,small
1,058755c380722a806fc5440093c2c5bf,artes,98.0,6.0,44.0,25872.0,virtual store,small
2,b0694770dbbfebe2d48cd288cfce2f85,artes,100.0,4.0,60.0,24000.0,virtual store,small
3,b0694770dbbfebe2d48cd288cfce2f85,artes,100.0,4.0,60.0,24000.0,virtual store,small
4,4fe644d766c7566dbc46fb851363cb3b,artes,65.0,6.0,50.0,19500.0,virtual store,small
...,...,...,...,...,...,...,...,...
525,c6f24451ab94c30ea2e17eb0d9ea5d9c,perfumaria,16.0,2.0,11.0,352.0,physical store,small
526,f9bab7e7dbab7e5883add425280ec006,perfumaria,16.0,2.0,11.0,352.0,physical store,small
527,5a42a6cbafa85310c1b2b0fa7dc6a8f6,perfumaria,16.0,2.0,11.0,352.0,physical store,small
528,a7e3c4e94e2cdb8acd51459762d4c263,perfumaria,16.0,2.0,11.0,352.0,physical store,small


In [15]:
query = '''
    SELECT
        p.product_category_name,
        count(p.product_id) AS num_products,
        CASE WHEN p.product_height_cm < 10 THEN 'small'
             WHEN p.product_height_cm >= 10 and p.product_height_cm < 15 THEN 'medium'
             WHEN p.product_height_cm >= 15 and p.product_height_cm < 20 THEN 'large' 
             ELSE 'extra_large' END AS size
    FROM 
        products p
    WHERE
        p.product_category_name != 'null'
    GROUP BY
        p.product_category_name, size
    ORDER BY
        p.product_category_name 
'''
table = pd.read_sql_query( query, conn )
table

Unnamed: 0,product_category_name,num_products,size
0,agro_industria_e_comercio,92,extra_large
1,agro_industria_e_comercio,14,large
2,agro_industria_e_comercio,16,medium
3,agro_industria_e_comercio,26,small
4,alimentos,28,extra_large
...,...,...,...
269,telefonia_fixa,120,small
270,utilidades_domesticas,2338,extra_large
271,utilidades_domesticas,724,large
272,utilidades_domesticas,908,medium


In [16]:
query = '''
    SELECT 
        o.order_id,
        oi.order_item_id,
        oi.price,
        p.product_id,
        p.product_category_name,
        op.payment_type,
        op.payment_value,
        c.customer_id,
        c.customer_city
    FROM
        orders o INNER JOIN order_items oi    ON ( oi.order_id = o.order_id)
                 INNER JOIN products p        ON ( p.product_id = oi.product_id )
                 LEFT JOIN order_payments op  ON ( op.order_id = o.order_id )
                 LEFT JOIN customers c        ON ( c.customer_id = o.customer_id )
    LIMIT 10
'''
table = pd.read_sql_query( query, conn )
table

Unnamed: 0,order_id,order_item_id,price,product_id,product_category_name,payment_type,payment_value,customer_id,customer_city
0,e481f51cbdc54678b7cc49136f2d6af7,1,29.99,87285b34884572647811a353c7ac498a,utilidades_domesticas,credit_card,18.12,9ef432eb6251297304e76186b10a928d,sao paulo
1,e481f51cbdc54678b7cc49136f2d6af7,1,29.99,87285b34884572647811a353c7ac498a,utilidades_domesticas,credit_card,18.12,9ef432eb6251297304e76186b10a928d,sao paulo
2,e481f51cbdc54678b7cc49136f2d6af7,1,29.99,87285b34884572647811a353c7ac498a,utilidades_domesticas,credit_card,18.12,9ef432eb6251297304e76186b10a928d,sao paulo
3,e481f51cbdc54678b7cc49136f2d6af7,1,29.99,87285b34884572647811a353c7ac498a,utilidades_domesticas,credit_card,18.12,9ef432eb6251297304e76186b10a928d,sao paulo
4,e481f51cbdc54678b7cc49136f2d6af7,1,29.99,87285b34884572647811a353c7ac498a,utilidades_domesticas,voucher,2.0,9ef432eb6251297304e76186b10a928d,sao paulo
5,e481f51cbdc54678b7cc49136f2d6af7,1,29.99,87285b34884572647811a353c7ac498a,utilidades_domesticas,voucher,2.0,9ef432eb6251297304e76186b10a928d,sao paulo
6,e481f51cbdc54678b7cc49136f2d6af7,1,29.99,87285b34884572647811a353c7ac498a,utilidades_domesticas,voucher,2.0,9ef432eb6251297304e76186b10a928d,sao paulo
7,e481f51cbdc54678b7cc49136f2d6af7,1,29.99,87285b34884572647811a353c7ac498a,utilidades_domesticas,voucher,2.0,9ef432eb6251297304e76186b10a928d,sao paulo
8,e481f51cbdc54678b7cc49136f2d6af7,1,29.99,87285b34884572647811a353c7ac498a,utilidades_domesticas,voucher,18.59,9ef432eb6251297304e76186b10a928d,sao paulo
9,e481f51cbdc54678b7cc49136f2d6af7,1,29.99,87285b34884572647811a353c7ac498a,utilidades_domesticas,voucher,18.59,9ef432eb6251297304e76186b10a928d,sao paulo


In [17]:
# Subqueries
query = '''
    WITH product_count AS (
        SELECT 
            oi.order_id,
            count( oi.product_id ) AS num_product
        FROM 
            order_items oi
        GROUP BY
            order_id 
    ), order_customer AS (
        SELECT
            order_id,
            customer_id
        FROM
            orders o
    ), customer_state AS (
        SELECT
            customer_id,
            customer_state
        FROM
            customers c
    )
    
    SELECT
        cs.customer_id,
        cs.customer_state,
        pc.num_product
    FROM
        customer_state cs INNER JOIN order_customer oc ON ( oc.customer_id = cs.customer_id )
                          INNER JOIN product_count pc  ON ( pc.order_id = oc.order_id )
    ORDER BY
        pc.num_product DESC
'''
table = pd.read_sql_query ( query, conn )
table

Unnamed: 0,customer_id,customer_state,num_product
0,fc3d1daec319d62d49bfb5e1f83123e9,SP,42
1,fc3d1daec319d62d49bfb5e1f83123e9,SP,42
2,fc3d1daec319d62d49bfb5e1f83123e9,SP,42
3,fc3d1daec319d62d49bfb5e1f83123e9,SP,42
4,be1b70680b9f9694d8c70f41fa3dc92b,SP,40
...,...,...,...
394659,b5e6afd5a41800fdf401e0272ca74655,SP,2
394660,96d649da0cc4ff33bb408b199d4c7dcf,SP,2
394661,96d649da0cc4ff33bb408b199d4c7dcf,SP,2
394662,96d649da0cc4ff33bb408b199d4c7dcf,SP,2


In [18]:
# Closing connection
conn.close()