In [8]:
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import mlflow
from mlflow.models import infer_signature


df = pd.read_csv('daily_demand.csv', parse_dates = ['fecha'])
df['dia_festivo'].astype('category')
df['dia_semana'].astype('category')
df.drop(['monto_total','semana_ano', 'mes', 'trimestre', 'ano','semana_inicio'], axis=1, inplace=True)
horizon = 30
test_df = df.groupby('platillo_id').tail(horizon)
train_df = df.drop(test_df.index)
train_df.shape, test_df.shape

stats = []
for pid, g in train_df.groupby('platillo_id'):
    s = g.sort_values('fecha')['cantidad']
    stats.append({
        'platillo_id':    pid,
        'mean_sales':     s.mean(),
        'std_sales':      s.std(),
        'var_sales':      s.var(),
        'cv_sales':       s.std()/s.mean() if s.mean() else 0,
        'spike_count':    (s > s.mean() + 2*s.std()).sum(),
        'zero_days_ratio':(s == 0).mean(),
    })
stats_df = pd.DataFrame(stats)

num_cols = ['mean_sales','std_sales','var_sales','cv_sales','spike_count','zero_days_ratio']
scaler = StandardScaler().fit(stats_df[num_cols].fillna(0))
X_train_stats = scaler.transform(stats_df[num_cols].fillna(0))

pca_dummy = PCA().fit(X_train_stats)
cum_var = np.cumsum(pca_dummy.explained_variance_ratio_)
k = np.searchsorted(cum_var, 0.95) + 1
pca = PCA(n_components=k, random_state=42).fit(X_train_stats)
emb_train = pca.transform(X_train_stats)

emb_df_train = pd.DataFrame(
    emb_train,
    columns=[f'pca_emb_{i}' for i in range(k)]
)
emb_df_train['platillo_id'] = stats_df['platillo_id']

train_df = train_df.merge(emb_df_train, on='platillo_id')
test_df = test_df.merge(emb_df_train, on='platillo_id')

feature_cols = ['lag_1','lag_7','ocupacion','dia_semana','dia_festivo'] + [f'pca_emb_{i}' for i in range(k)]

In [27]:
params = {'objective':'tweedie','metric':'rmse','verbosity':-1, "bagging_fraction": 0.8,
  "feature_fraction": 0.8,
  "lambda_l1": 0.1,
  "lambda_l2": 0,
  "learning_rate": 0.05,
  "max_depth": -1,
  "num_leaves": 31,
  }

# 7) Definir matrices de features y target
X_train = train_df[feature_cols]
y_train = train_df['cantidad']
X_test  = test_df[feature_cols]
y_test  = test_df['cantidad']

model = LGBMRegressor(**params)
model.fit(
    X_train, 
    y_train, 
    eval_set=[(X_test, y_test)], 
    eval_metric='rmse'
)

In [28]:
preds = model.predict(X_test)
mae  = mean_absolute_error(y_test, preds)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f"MAE en últimos {horizon} días: {mae:.2f}")
print(f"RMSE en últimos {horizon} días: {rmse:.2f}")
test_df['preds'] = preds

MAE en últimos 30 días: 3.88
RMSE en últimos 30 días: 7.68


In [29]:
preds_train = model.predict(X_train)
preds_df = pd.DataFrame(preds_train, columns=["cantidad"]) 
signature = infer_signature(X_train, preds_df)



In [30]:
mlflow.set_tracking_uri("http://localhost:5001")
mlflow.set_experiment("Oumaji_Demand")
with mlflow.start_run():
    # parámetros
    mlflow.log_params({f"data.{k}": v for k, v in params.items()})
    # métricas
    mlflow.log_metrics({
        "MAE": mae,
        "RMSE": rmse
    })
    mlflow.lightgbm.log_model(
        lgb_model = model,
        artifact_path="model",
        signature=signature,
        input_example = X_train,
        registered_model_name = "LightGBM_PCA"
    )    

Registered model 'LightGBM_PCA' already exists. Creating a new version of this model...
2025/05/24 13:56:42 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LightGBM_PCA, version 5


🏃 View run nimble-robin-546 at: http://localhost:5001/#/experiments/400716610318719328/runs/1c4990b825014f1088af68cf9e35e143
🧪 View experiment at: http://localhost:5001/#/experiments/400716610318719328


Created version '5' of model 'LightGBM_PCA'.


In [184]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import make_scorer, mean_squared_error

# 1) Prepara tus datos ya con embeddings y lags:
#    X = train_df[feature_cols]
#    y = train_df['cantidad']

# 2) Define el modelo base
model = LGBMRegressor(objective='regression', verbosity=-1)

# 3) Define la malla de parámetros
param_grid = {
    'learning_rate'    : [0.01, 0.05, 0.1],
    'num_leaves'       : [31, 63, 127],
    'max_depth'        : [5, 10, -1],
    'feature_fraction' : [0.6, 0.8, 1.0],
    'bagging_fraction' : [0.6, 0.8, 1.0],
    'lambda_l1'        : [0, 0.1, 1.0],
    'lambda_l2'        : [0, 0.1, 1.0],
}

# 4) Prepara la métrica (RMSE positivo)
rmse_scorer = make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
                          greater_is_better=False)

# 5) Crea el GridSearch
gs = GridSearchCV(
    estimator     = model,
    param_grid    = param_grid,
    scoring       = rmse_scorer,
    cv            = 3,
    n_jobs        = -1,
    verbose       = 1,
    return_train_score = True
)

X = train_df[feature_cols]
y = train_df['cantidad']

# 6) Ejecuta la búsqueda
gs.fit(X, y)

# 7) Extrae los 5 mejores
results_df = pd.DataFrame(gs.cv_results_)
# 'mean_test_score' está en negativo RMSE, así que buscamos los valores más altos (menos negativos)
top5 = results_df.sort_values('mean_test_score', ascending=False).head(5)

# 8) Muestra los parámetros y el RMSE correspondiente
print(top5[['params', 'mean_test_score', 'std_test_score']])


Fitting 3 folds for each of 2187 candidates, totalling 6561 fits
                                                 params  mean_test_score  \
1149  {'bagging_fraction': 0.8, 'feature_fraction': ...        -7.515016   
1878  {'bagging_fraction': 1.0, 'feature_fraction': ...        -7.515016   
420   {'bagging_fraction': 0.6, 'feature_fraction': ...        -7.515016   
339   {'bagging_fraction': 0.6, 'feature_fraction': ...        -7.521732   
1068  {'bagging_fraction': 0.8, 'feature_fraction': ...        -7.521732   

      std_test_score  
1149        1.580750  
1878        1.580750  
420         1.580750  
339         1.577439  
1068        1.577439  


In [191]:
for idx in range(len(top5)):
    row = top5.iloc[idx]
    print(f"\nModelo {idx+1}:")
    for p_name, p_val in row['params'].items():
        print(f"  {p_name}: {p_val}")


Modelo 1:
  bagging_fraction: 0.8
  feature_fraction: 0.8
  lambda_l1: 1.0
  lambda_l2: 0
  learning_rate: 0.05
  max_depth: -1
  num_leaves: 31

Modelo 2:
  bagging_fraction: 1.0
  feature_fraction: 0.8
  lambda_l1: 1.0
  lambda_l2: 0
  learning_rate: 0.05
  max_depth: -1
  num_leaves: 31

Modelo 3:
  bagging_fraction: 0.6
  feature_fraction: 0.8
  lambda_l1: 1.0
  lambda_l2: 0
  learning_rate: 0.05
  max_depth: -1
  num_leaves: 31

Modelo 4:
  bagging_fraction: 0.6
  feature_fraction: 0.8
  lambda_l1: 0.1
  lambda_l2: 0
  learning_rate: 0.05
  max_depth: -1
  num_leaves: 31

Modelo 5:
  bagging_fraction: 0.8
  feature_fraction: 0.8
  lambda_l1: 0.1
  lambda_l2: 0
  learning_rate: 0.05
  max_depth: -1
  num_leaves: 31
