In [72]:
# --- Cell 1: Imports and data load ---
from pathlib import Path
import pandas as pd

# Project root: the working directory, or its parent when the kernel was
# started from inside a "notebooks" folder.
BASE = Path.cwd()
if BASE.name == "notebooks":
    BASE = BASE.parent

# Processed-data folder and the sub-folder that receives the dashboard extracts.
PROC = BASE.joinpath("data", "processed")
OUT = PROC.joinpath("dash")
OUT.mkdir(parents=True, exist_ok=True)

# Load the order-line dataset, parsing the three timestamp columns the
# later cells rely on as datetimes.
df = pd.read_csv(
    PROC / "olist_final_dataset.csv",
    parse_dates=[
        "order_purchase_timestamp",
        "order_delivered_customer_date",
        "order_estimated_delivery_date",
    ],
)

print(df.shape)
df.head(3)


(112650, 26)


Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,order_status,order_purchase_timestamp,order_approved_at,...,is_late,shipping_days_limit,purchase_year,purchase_month,purchase_day,purchase_weekday,purchase_hour,total_price,order_status_simple,order_line_uid
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29,delivered,2017-09-13 08:59:02,2017-09-13 09:45:35,...,0,6.0,2017,9,13,2,8,72.19,delivered,00010242fe8c5a6d1ba2dd792cb16214-1
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93,delivered,2017-04-26 10:53:06,2017-04-26 11:05:13,...,0,7.0,2017,4,26,2,10,259.83,delivered,00018f77f2f0320c557190d7a144bdd3-1
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87,delivered,2018-01-14 14:33:31,2018-01-14 14:48:30,...,0,4.0,2018,1,14,6,14,216.87,delivered,000229ec398224ef6ca0657da4fc703e-1


In [73]:
# --- Cell 2: Monthly aggregates ---
# Orders, revenue and on-time rate per purchase month, keyed by a datetime
# (first day of each month).

# Build the month key as a standalone Series instead of assigning it to
# df["purchase_month"]: the dataset loaded in the first cell already carries
# a "purchase_month" column holding the month number, and overwriting it
# would silently change its type and meaning for every later cell.
month_start = (df["order_purchase_timestamp"]
                 .dt.to_period("M")
                 .dt.to_timestamp()
                 .rename("purchase_month"))

# orders       -> distinct order ids in the month
# revenue      -> sum of total_price over all rows in the month
# on_time_rate -> 1 - mean(is_late) over all rows in the month
monthly = (df.groupby(month_start)
             .agg(orders=("order_id", "nunique"),
                  revenue=("total_price", "sum"),
                  on_time_rate=("is_late", lambda x: 1 - x.mean()))
             .reset_index())

# Round for presentation.
monthly["on_time_rate"] = monthly["on_time_rate"].round(3)
monthly["revenue"] = monthly["revenue"].round(2)

# "purchase_month" is already datetime64 here (to_timestamp produced it and
# groupby/reset_index preserve the dtype), so no re-conversion is needed.

# Final export.
monthly.to_csv(OUT / "monthly_summary.csv", index=False)
monthly.head()


Unnamed: 0,purchase_month,orders,revenue,on_time_rate
0,2016-09-01,3,354.75,0.5
1,2016-10-01,308,56808.84,0.992
2,2016-12-01,1,19.62,1.0
3,2017-01-01,789,137188.49,0.973
4,2017-02-01,1733,286280.62,0.968


In [74]:
# --- Cell 3: Per-seller aggregates ---
# Distinct orders, summed revenue and on-time rate for each seller, ordered
# from highest to lowest revenue; rounding is applied after the sort.
seller = (
    df.groupby("seller_id")
      .agg(
          orders=("order_id", "nunique"),
          revenue=("total_price", "sum"),
          on_time_rate=("is_late", lambda late: 1 - late.mean()),
      )
      .reset_index()
      .sort_values("revenue", ascending=False)
      .round({"on_time_rate": 3, "revenue": 2})
)

seller.to_csv(OUT / "seller_summary.csv", index=False)
seller.head()


Unnamed: 0,seller_id,orders,revenue,on_time_rate
857,4869f7a5dfa277a7dca6462dcf3b52b2,1132,249640.7,0.885
1535,7c67e1448b00f6e969d365cea6b010ab,982,239536.44,0.905
1013,53243585a1d6dc2643021fd1853d8905,358,235856.68,0.961
881,4a3ca9315b744ce9f8e9374361493884,1806,235539.96,0.892
3024,fa1c13f2614d7b5c4749cbc52fecda94,585,204084.73,0.899


In [75]:
# --- Cell 4: Per-product aggregates ---
# Number of order lines and summed revenue for each product, ordered from
# highest to lowest revenue.
product = (df.groupby("product_id")
             .agg(lines=("order_line_uid", "count"),
                  revenue=("total_price", "sum"))
             .reset_index()
             .sort_values("revenue", ascending=False))

# Keep revenue as a float column (like the other summaries) so it can still
# be sorted, summed or plotted; formatting it with "{:.2f}".format would turn
# every value into a string.
product["revenue"] = product["revenue"].round(2)

# float_format gives the fixed two-decimal text in the file (e.g. 67606.10)
# without changing the in-memory dtype. "lines" is an integer count and is
# not affected by float_format.
product.to_csv(OUT / "product_summary.csv", index=False, float_format="%.2f")
product.head()


Unnamed: 0,product_id,lines,revenue
24086,bb50f2e236e5eea0100680137654686c,195,67606.1
27039,d1c427060a0f73f6b889a5c7c61f2ac4,343,60976.03
14068,6cdd53843498f92890544667809f1595,156,59093.99
19742,99a4788cb24856965c36a24e339b6058,488,51071.6
27613,d6160fb7873f184099d9bc95e30376af,35,50326.18


In [76]:
# --- Cell 5: Aggregates by simplified order status ---
# Distinct orders and summed revenue for each order_status_simple value,
# with revenue rounded to two decimals for presentation.
status = (
    df.groupby("order_status_simple")
      .agg(
          orders=("order_id", "nunique"),
          revenue=("total_price", "sum"),
      )
      .reset_index()
      .round({"revenue": 2})
)

# Final export.
status.to_csv(OUT / "status_summary.csv", index=False)
status


Unnamed: 0,order_status_simple,orders,revenue
0,canceled,467,108026.21
1,delivered,96478,15419773.75
2,in_transit,1418,246118.09
3,processing,303,69635.19


In [77]:
# --- Cell 6: Final check ---
# Report the output folder and show every CSV file currently in it.
print("Datasets preparados en:", OUT)
[csv_path for csv_path in OUT.glob("*.csv")]


Datasets preparados en: c:\Users\EvaMera\OneDrive - Conysel 15 S.L\ecommerce-eda\data\processed\dash


[WindowsPath('c:/Users/EvaMera/OneDrive - Conysel 15 S.L/ecommerce-eda/data/processed/dash/monthly_summary.csv'),
 WindowsPath('c:/Users/EvaMera/OneDrive - Conysel 15 S.L/ecommerce-eda/data/processed/dash/product_summary.csv'),
 WindowsPath('c:/Users/EvaMera/OneDrive - Conysel 15 S.L/ecommerce-eda/data/processed/dash/seller_summary.csv'),
 WindowsPath('c:/Users/EvaMera/OneDrive - Conysel 15 S.L/ecommerce-eda/data/processed/dash/status_summary.csv')]