In [1]:
import pandas as pd
from pathlib import Path

DATA_PROCESSED = Path("../data_processed")

df = pd.read_csv(
    DATA_PROCESSED / "fact_orders_revenue.csv",
    parse_dates=["order_purchase_timestamp"]
)

df.head()


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,purchase_month,is_delivered,is_canceled,is_late,delivery_days,ship_days_to_carrier,items_revenue,items_count,payment_value,payment_types
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,2017-10,True,False,False,8.436574,2.366493,38.71,1.0,38.71,"credit_card,voucher"
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13,2018-07,True,False,False,13.782037,0.462882,141.46,1.0,141.46,boleto
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04,2018-08,True,False,False,9.394213,0.204595,179.12,1.0,179.12,credit_card
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15,2017-11,True,False,False,13.20875,3.745833,72.2,1.0,72.2,credit_card
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26,2018-02,True,False,False,2.873877,0.893113,28.62,1.0,28.62,credit_card


In [2]:
total_orders = df["order_id"].nunique()
delivered_orders = df.loc[df["is_delivered"], "order_id"].nunique()
canceled_orders = df.loc[df["is_canceled"], "order_id"].nunique()

funnel = pd.DataFrame({
    "stage": ["Purchased", "Delivered", "Canceled"],
    "orders": [total_orders, delivered_orders, canceled_orders]
})

funnel


Unnamed: 0,stage,orders
0,Purchased,99441
1,Delivered,96478
2,Canceled,1234


Monthly Order + GMV trends

In [3]:
df["month"] = df["order_purchase_timestamp"].dt.to_period("M").astype(str)

monthly = (
    df.groupby("month")
    .agg(
        orders=("order_id", "nunique"),
        delivered_orders=("is_delivered", "sum"),
        gmv=("items_revenue", "sum"),
        avg_order_value=("items_revenue", "mean")
    )
    .reset_index()
)

monthly.head()


Unnamed: 0,month,orders,delivered_orders,gmv,avg_order_value
0,2016-09,4,1,354.75,118.25
1,2016-10,324,265,56808.84,184.444286
2,2016-12,1,1,19.62,19.62
3,2017-01,800,750,137188.49,173.876413
4,2017-02,1780,1653,286280.62,165.193664


Cancellation and late delivery trends

In [4]:
ops = (
    df.groupby("month")
    .agg(
        cancellation_rate=("is_canceled", "mean"),
        late_delivery_rate=("is_late", "mean"),
        avg_delivery_days=("delivery_days", "mean")
    )
    .reset_index()
)

ops.head()


Unnamed: 0,month,cancellation_rate,late_delivery_rate,avg_delivery_days
0,2016-09,0.5,0.25,54.813194
1,2016-10,0.095679,0.009259,19.578572
2,2016-12,0.0,0.0,4.693021
3,2017-01,0.01625,0.02875,12.647044
4,2017-02,0.034831,0.029775,13.168825


Late delivery vs. Cancellations


In [5]:
df.groupby("is_late")["is_canceled"].mean()


is_late
False    0.013459
True     0.000128
Name: is_canceled, dtype: float64

In [6]:
monthly.to_csv(DATA_PROCESSED / "monthly_kpis.csv", index=False)
ops.to_csv(DATA_PROCESSED / "ops_kpis.csv", index=False)

print("Saved monthly_kpis.csv and ops_kpis.csv")


Saved monthly_kpis.csv and ops_kpis.csv
