In [1]:
import pandas as pd
from pathlib import Path

DATA_RAW = Path("/Users/dishasanthosh/Desktop/data_raw") 

orders = pd.read_csv(
    DATA_RAW / "olist_orders_dataset.csv",
    parse_dates=[
        "order_purchase_timestamp",
        "order_approved_at",
        "order_delivered_carrier_date",
        "order_delivered_customer_date",
        "order_estimated_delivery_date",
    ]
)

orders.head()


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26


In [2]:
o = orders.copy()

o["purchase_date"] = o["order_purchase_timestamp"].dt.date
o["purchase_month"] = o["order_purchase_timestamp"].dt.to_period("M").astype(str)

o["is_delivered"] = o["order_status"].eq("delivered")
o["is_canceled"] = o["order_status"].isin(["canceled", "unavailable"])

# delivery_days: only meaningful if delivered date exists
o["delivery_days"] = (o["order_delivered_customer_date"] - o["order_purchase_timestamp"]).dt.total_seconds() / (60*60*24)

# processing/shipping proxy (varies by availability)
start_ts = o["order_approved_at"].fillna(o["order_purchase_timestamp"])
o["ship_days_to_carrier"] = (o["order_delivered_carrier_date"] - start_ts).dt.total_seconds() / (60*60*24)

# late delivery: delivered after estimated
o["is_late"] = (o["order_delivered_customer_date"] > o["order_estimated_delivery_date"])

o[["order_id","order_status","delivery_days","ship_days_to_carrier","is_late","purchase_month"]].head()


Unnamed: 0,order_id,order_status,delivery_days,ship_days_to_carrier,is_late,purchase_month
0,e481f51cbdc54678b7cc49136f2d6af7,delivered,8.436574,2.366493,False,2017-10
1,53cdb2fc8bc7dce0b6741e2150273451,delivered,13.782037,0.462882,False,2018-07
2,47770eb9100c2d0c44946d9cf07ec65d,delivered,9.394213,0.204595,False,2018-08
3,949d5b44dbf5de918fe9c16f97b45f8a,delivered,13.20875,3.745833,False,2017-11
4,ad21c59c0840e6cb83a9ceb5573f8159,delivered,2.873877,0.893113,False,2018-02


A) How many orders are delivered vs canceled?

In [3]:
o["order_status"].value_counts(dropna=False)

order_status
delivered      96478
shipped         1107
canceled         625
unavailable      609
invoiced         314
processing       301
created            5
approved           2
Name: count, dtype: int64

B) Late delivery rate (only among delivered)

In [4]:
late_rate = o.loc[o["is_delivered"], "is_late"].mean()
late_rate

np.float64(0.08111693857667032)

C) Any weird delivery times? (negative or huge)

In [5]:
o["delivery_days"].describe()


count    96476.000000
mean        12.558702
std          9.546530
min          0.533414
25%          6.766403
50%         10.217755
75%         15.720327
max        209.628611
Name: delivery_days, dtype: float64

In [6]:
fact_orders = o[[
    "order_id",
    "customer_id",
    "order_status",
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
    "purchase_month",
    "is_delivered",
    "is_canceled",
    "is_late",
    "delivery_days",
    "ship_days_to_carrier"
]].copy()

fact_orders.head()


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,purchase_month,is_delivered,is_canceled,is_late,delivery_days,ship_days_to_carrier
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,2017-10,True,False,False,8.436574,2.366493
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13,2018-07,True,False,False,13.782037,0.462882
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04,2018-08,True,False,False,9.394213,0.204595
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15,2017-11,True,False,False,13.20875,3.745833
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26,2018-02,True,False,False,2.873877,0.893113


In [7]:
out = Path("../data_processed")
out.mkdir(parents=True, exist_ok=True)

fact_orders.to_csv(out / "fact_orders.csv", index=False)
print("Saved:", out / "fact_orders.csv")


Saved: ../data_processed/fact_orders.csv
