In [2]:
%load_ext sql
import os
import urllib.parse

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [11]:
# Connection settings for the local Olist PostgreSQL database.
host = 'localhost'
user = 'postgres'
db_name = 'olist'
# The password is taken from the PGPASSWORD environment variable so that no
# credential is stored in the notebook. Export it before starting Jupyter.
# NOTE(review): an earlier revision of this cell contained the password in
# plain text; that password should be considered leaked and be rotated.
# safe='' percent-encodes every reserved character ('/', '@', ':' ...) so the
# password can never be mistaken for a URL delimiter.
password = urllib.parse.quote(os.environ.get('PGPASSWORD', ''), safe='')
# Leave the password part out of the URL entirely when none was supplied.
credentials = f"{user}:{password}" if password else user
conn_string = f"postgresql://{credentials}@{host}/{db_name}"

In [12]:
# Open the database connection; IPython expands $conn_string to the URL built above.
%sql $conn_string

#  Exploration

In [21]:
%%sql
SELECT *
FROM orders   -- small sample, just enough to see the columns and value formats
LIMIT 4;

 * postgresql://postgres:***@localhost/olist
4 rows affected.


order_id,customer_id,order_status,order_purchase,order_approved,order_delivered_carrier,order_delivered_customer,order_estimated_delivery
e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00
949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00


In [18]:
%%sql
SELECT DISTINCT
    date_part('year', order_delivered_customer)::int AS delivery_year
FROM orders
-- Rows without a delivery timestamp collapse into a single NULL year.
-- ORDER BY makes the listing deterministic (NULL sorts last when ascending).
ORDER BY delivery_year;

 * postgresql://postgres:***@localhost/olist
4 rows affected.


date_part
2016.0
2017.0
2018.0
""


In [24]:
%%sql
SELECT DISTINCT order_status   -- every status value present in the table
FROM orders;

 * postgresql://postgres:***@localhost/olist
8 rows affected.


order_status
shipped
unavailable
invoiced
created
approved
processing
delivered
canceled


#  Frequency of orders delivered every month

In [27]:
%%sql
SELECT
    date_part('year',  order_delivered_customer)::int AS year,
    date_part('month', order_delivered_customer)::int AS month,
    count(*) AS delivered_orders
FROM orders
-- Keep only orders marked delivered that also carry a delivery timestamp.
WHERE order_delivered_customer IS NOT NULL
  AND order_status = 'delivered'
-- Positions 1 and 2 are the year and month expressions of the select list.
GROUP BY 1, 2
ORDER BY 1, 2;

 * postgresql://postgres:***@localhost/olist
25 rows affected.


year,month,delivered_orders
2016,10,205
2016,11,58
2016,12,4
2017,1,283
2017,2,1351
2017,3,2382
2017,4,1849
2017,5,3751
2017,6,3223
2017,7,3455


# Top 5 cities by number of orders

In [32]:
%%sql
SELECT cust.customer_state,
       cust.customer_city,
       count(ord.order_id) AS orders_num
FROM customers AS cust
INNER JOIN orders AS ord
        ON ord.customer_id = cust.customer_id
-- State is part of the grouping key, so same-named cities in different states stay separate.
GROUP BY cust.customer_state, cust.customer_city
ORDER BY orders_num DESC
LIMIT 5;

 * postgresql://postgres:***@localhost/olist
5 rows affected.


customer_state,customer_city,orders_num
SP,sao paulo,15540
RJ,rio de janeiro,6882
MG,belo horizonte,2773
DF,brasilia,2131
PR,curitiba,1521


#  Deliveries by hour


In [34]:
%%sql
SELECT
    date_part('hour', order_delivered_customer)::int AS hour,
    count(*) AS delivered_orders
FROM orders
WHERE order_delivered_customer IS NOT NULL
  AND order_status = 'delivered'
GROUP BY 1   -- bucket by the hour of day of the delivery timestamp
ORDER BY 1;

 * postgresql://postgres:***@localhost/olist
24 rows affected.


hour,delivered_orders
0,2885
1,1515
2,649
3,260
4,187
5,198
6,269
7,396
8,779
9,1196


# Average difference between estimated and actual delivery (days), by month


In [54]:
%%sql
SELECT
    date_part('month', order_delivered_customer)::int AS month,
    -- Whole-day part of the gap between the estimated and the actual delivery.
    -- abs() makes early and late deliveries both count as a positive deviation.
    avg(
        abs(date_part('day', order_estimated_delivery - order_delivered_customer))
    )::real AS avg_diff
FROM orders
WHERE order_status = 'delivered'
  AND order_delivered_customer IS NOT NULL
-- Grouping is on the calendar month only, so the same month of different years is pooled.
GROUP BY 1
ORDER BY avg_diff DESC;

 * postgresql://postgres:***@localhost/olist
12 rows affected.


month,avg_diff
6,15.882909
2,14.489932
1,14.1286335
10,13.635262
7,12.740585
5,12.378843
4,12.180327
9,11.855011
12,11.545568
11,10.69522


# Average difference between order and delivery time (days), by state


In [53]:
%%sql
WITH delivery_days AS (
    SELECT cust.customer_state,
           -- whole-day part of the interval from purchase to customer delivery
           date_part('day', ord.order_delivered_customer - ord.order_purchase) AS days_elapsed
    FROM customers AS cust
    INNER JOIN orders AS ord USING (customer_id)
    WHERE ord.order_status = 'delivered'
      AND ord.order_delivered_customer IS NOT NULL
)
SELECT customer_state,
       avg(days_elapsed)::real AS avg_diff
FROM delivery_days
GROUP BY customer_state
ORDER BY avg_diff DESC;

 * postgresql://postgres:***@localhost/olist
27 rows affected.


customer_state,avg_diff
RR,28.975609
AP,26.731344
AM,25.986206
AL,24.040302
PA,23.316067
MA,21.117155
SE,21.02985
CE,20.817827
AC,20.6375
PB,19.953579
