In [6]:
from pathlib import Path
import pandas as pd

# Project root: step up one level when the kernel was started inside "notebooks".
_cwd = Path.cwd()
BASE_DIR = _cwd.parent if _cwd.name == "notebooks" else _cwd
RAW_DIR = BASE_DIR.joinpath("data", "raw")

# Logical dataset name -> CSV file name inside RAW_DIR.
_CSV_NAMES = {
    "orders": "olist_orders_dataset.csv",
    "items": "olist_order_items_dataset.csv",
    "products": "olist_products_dataset.csv",
    "sellers": "olist_sellers_dataset.csv",
}
FILES = {key: RAW_DIR / csv_name for key, csv_name in _CSV_NAMES.items()}

# Quick sanity check: report whether each expected file is present on disk.
for name, path in FILES.items():
    print(f"{name} exists: {path.exists()} → {path}")


orders exists: True → /Users/emmeravivar/Desktop/ecommerce-eda/data/raw/olist_orders_dataset.csv
items exists: True → /Users/emmeravivar/Desktop/ecommerce-eda/data/raw/olist_order_items_dataset.csv
products exists: True → /Users/emmeravivar/Desktop/ecommerce-eda/data/raw/olist_products_dataset.csv
sellers exists: True → /Users/emmeravivar/Desktop/ecommerce-eda/data/raw/olist_sellers_dataset.csv


In [7]:
# Timestamp columns of the orders CSV, converted to datetime while reading.
parse_dates = [
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
]

# Load each dataset exactly once (the cell body used to be pasted twice, so
# both CSVs were read and parsed two times per run).
orders = pd.read_csv(
    FILES["orders"],
    parse_dates=parse_dates,
    dtype={
        "order_id": "string",
        "customer_id": "string",
        "order_status": "category",
    },
)

# NOTE(review): float32 keeps only ~7 significant digits (58.90 is shown as
# 58.900002 downstream); confirm that is acceptable for monetary columns.
items = pd.read_csv(
    FILES["items"],
    dtype={
        "order_id": "string",
        "order_item_id": "int16",
        "product_id": "string",
        "seller_id": "string",
        "price": "float32",
        "freight_value": "float32",
    },
)

# Converted after the read so that unparseable values become NaT instead of raising.
items["shipping_limit_date"] = pd.to_datetime(items["shipping_limit_date"], errors="coerce")

# Displayed as the cell result: (rows, columns) of both frames.
orders.shape, items.shape


((99441, 8), (112650, 7))

In [8]:
# Primary-key checks: order_id alone for orders, (order_id, order_item_id) for items.
orders_key_unique = orders["order_id"].is_unique
items_key_unique = not items.duplicated(subset=["order_id", "order_item_id"]).any()

# Fraction of missing values per column, highest first (top 10 columns).
nulls_orders, nulls_items = (
    frame.isna().mean().sort_values(ascending=False).head(10)
    for frame in (orders, items)
)

orders_key_unique, items_key_unique, nulls_orders, nulls_items


(True,
 True,
 order_delivered_customer_date    0.029817
 order_delivered_carrier_date     0.017930
 order_approved_at                0.001609
 order_id                         0.000000
 customer_id                      0.000000
 order_status                     0.000000
 order_purchase_timestamp         0.000000
 order_estimated_delivery_date    0.000000
 dtype: float64,
 order_id               0.0
 order_item_id          0.0
 product_id             0.0
 seller_id              0.0
 shipping_limit_date    0.0
 price                  0.0
 freight_value          0.0
 dtype: float64)

In [9]:
# Order-level columns attached to every item row (order_id is the join key).
order_columns = [
    "order_id",
    "order_status",
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
    "customer_id",
]

# Left join keeps every item row; validate raises if order_id repeats in orders.
df = items.merge(
    orders[order_columns],
    how="left",
    on="order_id",
    validate="many_to_one",
)

df.shape, df.head(3)


((112650, 14),
                            order_id  order_item_id  \
 0  00010242fe8c5a6d1ba2dd792cb16214              1   
 1  00018f77f2f0320c557190d7a144bdd3              1   
 2  000229ec398224ef6ca0657da4fc703e              1   
 
                          product_id                         seller_id  \
 0  4244733e06e7ecb4970a6e2683c13e61  48436dade18ac8b2bce089ec2a041202   
 1  e5f2d52b802189ee658865ca93d83a8f  dd7ddc04e1b6c2c614352b383efe2d36   
 2  c777355d18b72b67abbeef9df44fd0fd  5b51032eddd242adc84c38acab88f23d   
 
   shipping_limit_date       price  freight_value order_status  \
 0 2017-09-19 09:45:35   58.900002      13.290000    delivered   
 1 2017-05-03 11:05:13  239.899994      19.930000    delivered   
 2 2018-01-18 14:48:30  199.000000      17.870001    delivered   
 
   order_purchase_timestamp   order_approved_at order_delivered_carrier_date  \
 0      2017-09-13 08:59:02 2017-09-13 09:45:35          2017-09-19 18:34:16   
 1      2017-04-26 10:53:06 2017-04-26 

In [10]:
# Persist the enriched item-level table for later analysis.
INTERIM = BASE_DIR.joinpath("data", "interim")
INTERIM.mkdir(parents=True, exist_ok=True)

# Build the target path once and reuse it for both the write and the message.
parquet_path = INTERIM / "order_items_join_orders.parquet"
df.to_parquet(parquet_path, index=False)
print("✅ Archivo guardado:", parquet_path)


✅ Archivo guardado: /Users/emmeravivar/Desktop/ecommerce-eda/data/interim/order_items_join_orders.parquet
