In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.metrics import mean_absolute_error, mean_squared_error

from sklearn.model_selection import train_test_split

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)


In [21]:
# =========================
DATA_PATH = Path("data/processed/weekly_item_shop.csv")
KEYS = ["shop_id", "item_id"]
TIME = "week_start"
TARGET = "y_units"

BASE_FEATURES = ["shop_id", "item_id", "sales", "avg_price", "active_days"]

LAGS = [1, 2, 4, 8]
ROLLS = [4, 8]  # ventanas en semanas



In [22]:
# 1) Load
# =========================
df = pd.read_csv(DATA_PATH)

# Tipos
df[TIME] = pd.to_datetime(df[TIME], errors="coerce")
df = df.dropna(subset=[TIME])

# Orden y sanity
df = df.sort_values(KEYS + [TIME]).reset_index(drop=True)

# Clipping (concepto: unidades no negativas)
df[TARGET] = df[TARGET].clip(lower=0)

# =========================

In [23]:
# 2) Baseline naive (t-1 por shop-item)
# =========================
df["naive_pred"] = df.groupby(KEYS)[TARGET].shift(1)
df_naive = df.dropna(subset=["naive_pred"]).copy()

mae_naive = mean_absolute_error(df_naive[TARGET], df_naive["naive_pred"])
rmse_naive = np.sqrt(mean_squared_error(df_naive[TARGET], df_naive["naive_pred"]))
print(f"NAIVE -> MAE: {mae_naive:.6f}  RMSE: {rmse_naive:.6f}")

# =========================


NAIVE -> MAE: 0.633836  RMSE: 4.031630


In [None]:
# 3) Feature engineering (Opción B: MultiIndex)
# =========================
df_mi = df.set_index(KEYS + [TIME]).sort_index()

# 3.1 Lags
for lag in LAGS:
    df_mi[f"lag_{lag}"] = df_mi.groupby(level=KEYS)[TARGET].shift(lag)

# 3.2 Rolling mean SOLO pasado (shift(1) antes de rolling)
for w in ROLLS:
    df_mi[f"roll_mean_{w}"] = (
        df_mi.groupby(level=KEYS)[TARGET]
             .shift(1)
             .rolling(window=w, min_periods=w)
             .mean()
    )

# Volvemos a columnas normales
df_feat = df_mi.reset_index()

# =========================
# 4) Dataset final de modelado
# =========================
feature_cols = (
    BASE_FEATURES
    + [f"lag_{l}" for l in LAGS]
    + [f"roll_mean_{w}" for w in ROLLS]
)

need_cols = [TARGET] + feature_cols
df_model = df_feat.dropna(subset=need_cols).copy()

# (opcional) clipping de features que no deberían ser negativas
# df_model["sales"] = df_model["sales"].clip(lower=0)
# df_model["avg_price"] = df_model["avg_price"].clip(lower=0)
# df_model["active_days"] = df_model["active_days"].clip(lower=0)

print("df_model:", df_model.shape)
print("features:", len(feature_cols))



df_model: (773363, 14)
features: 11


In [26]:
cutoff = df_model[TIME].quantile(0.8)

train_df = df_model[df_model[TIME] <= cutoff].copy()
valid_df = df_model[df_model[TIME] > cutoff].copy()

print("train:", train_df.shape, "valid:", valid_df.shape)

# checks
assert train_df[TIME].max() <= cutoff
assert valid_df[TIME].min() > cutoff
print("OK: split temporal limpio")


train: (620370, 14) valid: (152993, 14)
OK: split temporal limpio


In [27]:

X_train = train_df[feature_cols]
y_train = train_df[TARGET]

X_valid = valid_df[feature_cols]
y_valid = valid_df[TARGET]

# =========================
# 6) Train model (LightGBM con fallback)
# =========================
pred = None

try:
    import lightgbm as lgb

    model = lgb.LGBMRegressor(
        n_estimators=5000,
        learning_rate=0.03,
        num_leaves=63,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric="rmse",
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
    )

    pred = model.predict(X_valid)

except Exception as e:
    print("LightGBM no disponible o falló. Fallback -> GradientBoostingRegressor")
    from sklearn.ensemble import GradientBoostingRegressor

    model = GradientBoostingRegressor(random_state=42)
    model.fit(X_train, y_train)
    pred = model.predict(X_valid)

# Clip de predicción (no puedes vender unidades negativas)
pred = np.clip(pred, 0, None)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015861 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1834
[LightGBM] [Info] Number of data points in the train set: 620370, number of used features: 11
[LightGBM] [Info] Start training from score 1.817570


In [28]:
mae = mean_absolute_error(y_valid, pred)
rmse = np.sqrt(mean_squared_error(y_valid, pred))

print(f"MODEL -> MAE: {mae:.6f}  RMSE: {rmse:.6f}")
print(f"IMPROVEMENT vs NAIVE -> ΔMAE: {mae_naive - mae:.6f}  ΔRMSE: {rmse_naive - rmse:.6f}")

MODEL -> MAE: 0.122647  RMSE: 6.700196
IMPROVEMENT vs NAIVE -> ΔMAE: 0.511188  ΔRMSE: -2.668566


In [29]:
pred = np.clip(pred, 0, None)


In [30]:
errors = np.abs(y_valid - pred)
errors.describe(percentiles=[0.9, 0.95, 0.99])


count    152993.000000
mean          0.122647
std           6.699096
min           0.000000
90%           0.117997
95%           0.235220
99%           0.932746
max        2158.200470
Name: y_units, dtype: float64

In [34]:
# 1) transforma target
y_train_log = np.log1p(y_train)
y_valid_log = np.log1p(y_valid)

# 2) entrena con y_log (ojo aquí)
model.fit(X_train, y_train_log)

# 3) predice en log
pred_log = model.predict(X_valid)

# 4) regresa a escala original
pred = np.expm1(pred_log)

# 5) clip a 0 (unidades no negativas)
pred = np.clip(pred, 0, None)

# 6) métricas en escala original
mae  = mean_absolute_error(y_valid, pred)
rmse = np.sqrt(mean_squared_error(y_valid, pred))

print(f"MODEL -> MAE: {mae:.6f}  RMSE: {rmse:.6f}")
print(f"IMPROVEMENT vs NAIVE -> ΔMAE: {mae_naive - mae:.6f}  ΔRMSE: {rmse_naive - rmse:.6f}")



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008637 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1834
[LightGBM] [Info] Number of data points in the train set: 620370, number of used features: 11
[LightGBM] [Info] Start training from score 0.878815
MODEL -> MAE: 0.079733  RMSE: 6.984675
IMPROVEMENT vs NAIVE -> ΔMAE: 0.554103  ΔRMSE: -2.953044


In [35]:
from pathlib import Path
import json
import joblib
import numpy as np

ART_DIR = Path("models")
ART_DIR.mkdir(exist_ok=True)

MODEL_PATH = ART_DIR / "lgbm_weekly_v1.pkl"
INFO_PATH  = ART_DIR / "lgbm_weekly_v1_info.json"

# feature_cols = lista final de features usadas
# model = modelo entrenado
joblib.dump(model, MODEL_PATH)

info = {
    "model_path": str(MODEL_PATH),
    "target": TARGET,
    "log_target": True,              # si estás usando log1p
    "clip_pred_min": 0,
    "features": feature_cols,
    "n_features": len(feature_cols),
    "metrics": {
        "mae": float(mae),
        "rmse": float(rmse),
        "mae_naive": float(mae_naive),
        "rmse_naive": float(rmse_naive),
    }
}
INFO_PATH.write_text(json.dumps(info, indent=2), encoding="utf-8")

print("Saved model:", MODEL_PATH)
print("Saved info :", INFO_PATH)

Saved model: models/lgbm_weekly_v1.pkl
Saved info : models/lgbm_weekly_v1_info.json
