In [None]:
import os

# Move the working directory one level up (presumably from a notebooks folder to
# the project root, so that './datasets/...' and `scripts` resolve - confirm).
# The flag makes this cell idempotent: re-running it in the same kernel no longer
# climbs one more directory each time. A kernel restart resets both the flag and
# the process working directory, so a fresh run behaves exactly as before.
if not globals().get('_cwd_moved_up', False):
    os.chdir('..')
    _cwd_moved_up = True

# Importación de librerías

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import lightgbm as lgb
import numpy as np
import optuna
from scripts import *
from sklearn.metrics import mean_absolute_percentage_error

In [None]:
# Global plotting defaults, applied in a single seaborn call.
# A follow-up `sns.set_context("notebook", rc={"figure.figsize": ...})` call used
# to sit here; it reset the font scale back to 1.0 (undoing font_scale=1.1) and
# ignored "figure.figsize", because set_context only honours rc keys that belong
# to the context definition. Passing rc to sns.set applies it to matplotlib's
# rcParams directly, and "notebook" is already the default context.
sns.set(
    style='whitegrid',
    palette='muted',
    font_scale=1.1,
    rc={'figure.figsize': (12, 6)},
)

In [None]:
# Number of Optuna trials requested from study.optimize further down.
n_trials = 50
# Number of parallel jobs handed to study.optimize.
n_jobs = 1

In [None]:
# Parquet file with the complete dataset, relative to the current working directory.
full_dataset_path = './datasets/full_dataset.parquet'

In [None]:
# Values of the 'periodo' column (they look like YYYYMM strings - confirm) that are
# kept out of training/testing and stored separately as df_future.
future_periods = ['201911', '201912']
# Values of the 'periodo' column held out as the test set.
test_periods = ['201910']

# Carga de datasets

In [None]:
# Load the complete dataset from the parquet file configured above.
df_full = pd.read_parquet(full_dataset_path)

In [None]:
# Convert the five descriptor columns to pandas' 'category' dtype, one by one.
for _category_col in ('cat1', 'cat2', 'cat3', 'brand', 'sku_size'):
    df_full[_category_col] = df_full[_category_col].astype('category')

In [None]:
# Everything except the future periods: the rows available for training and testing.
df_train_test = df_full[~df_full['periodo'].isin(future_periods)]

In [None]:
# Split by period: rows whose 'periodo' is in test_periods form the test set,
# all remaining rows form the training set.
_in_test_period = df_train_test['periodo'].isin(test_periods)
df_train = df_train_test[~_in_test_period]
df_test = df_train_test[_in_test_period]

In [None]:
# Rows belonging to the future periods, kept apart from train/test.
df_future = df_full[df_full['periodo'].isin(future_periods)]

In [None]:
# Drop the names of the intermediate frames; only df_train, df_test and
# df_future are used from here on.
del df_full, df_train_test

In [None]:
# Columns excluded from the model inputs: the target, the weight columns and the
# period / identifier columns listed here.
_non_feature_cols = {
    'target',
    'weight_col',
    'w_volumen',
    'w_rank',
    'w_tn',
    'periodo',
    'periodo_dt',
    'year',
    'customer_id',
    'product_id',
    'customer_id_limited',
    'product_id_limited',
}
features = [col for col in df_train.columns if col not in _non_feature_cols]

# Columns passed to LightGBM as categorical features.
categorical_cols = [
    'cat1',
    'cat2',
    'cat3',
    'brand',
    'sku_size',
    'customer_id_limited_encoded',
    'product_id_limited_encoded',
]

In [None]:
# Model inputs and target for the training rows.
X_train = df_train[features]
y_train = df_train['target']

# Entrenar modelo

In [None]:
def objective(trial):
    """Optuna objective: walk-forward validation of a LightGBM regressor.

    Reads the notebook globals ``df_train``, ``X_train``, ``y_train`` and
    ``categorical_cols``. For each trial it samples LightGBM hyperparameters,
    the number of boosting rounds, the early-stopping patience, the length of
    the rolling training window and two 0/1 switches that decide which weight
    columns (``w_rank``, ``w_tn``) are summed into the sample weights.

    The sorted periods are then traversed with a sliding window: ``train_window``
    consecutive periods are used for fitting and the following 2 periods for
    validation / early stopping, advancing one period per fold.

    Args:
        trial: the ``optuna`` trial used to sample parameters.

    Returns:
        The mean Total Forecast Error (sum|y - y_hat| / sum|y|) over the
        evaluated folds, or ``np.inf`` when an exception is raised or when no
        fold could be evaluated.
    """
    params = {
        "device": "gpu",
        "objective": "regression",
        "boosting_type": "gbdt",
        "metric": "rmse",
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 100),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 100),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        'linear_tree': True,
        # L1/L2 regularisation is tuned once only. `reg_alpha` / `reg_lambda`
        # are LightGBM aliases of `lambda_l1` / `lambda_l2`; sampling both
        # pairs spent two search dimensions on values LightGBM ignores.
        'lambda_l1': trial.suggest_float("lambda_l1", 0.0, 1.0),
        'lambda_l2': trial.suggest_float("lambda_l2", 0.0, 1.0),
        'verbose': -1
    }

    try:
        tfe_scores = []
        mape_scores = []

        sorted_periods = sorted(df_train['periodo'].unique())

        num_boost_round = trial.suggest_int('num_boost_round', 1000, 3000)
        early_stopping_rounds = trial.suggest_int('early_stopping_rounds', 100, 300)
        train_window = trial.suggest_int('train_window', 6, 24)
        val_window = 2

        # Weights: each switch turns one weight column on (1) or off (0).
        w1_alpha = trial.suggest_categorical("w1_alpha", [0, 1])
        w2_alpha = trial.suggest_categorical("w2_alpha", [0, 1])

        weight_all = (
            w1_alpha * df_train["w_rank"] +
            w2_alpha * df_train["w_tn"]
        )

        # Number of sliding-window folds; zero or negative means the window
        # does not fit in the available periods and the loop below is empty.
        n_folds = len(sorted_periods) - train_window - val_window + 1

        print(f"Trial {trial.number}")
        print(f"Entrenando {n_folds} periodos")
        print(f"Num boost round: {num_boost_round}")
        print(f"Early stopping rounds: {early_stopping_rounds}")
        print(f"Train window: {train_window}")
        print(f"Val window: {val_window}")
        print(f"Pesos: w_rank: {w1_alpha} w_tn: {w2_alpha}")

        for i in range(n_folds):
            train_periods = sorted_periods[i : i + train_window]
            val_periods = sorted_periods[i + train_window : i + train_window + val_window]

            train_mask = df_train['periodo'].isin(train_periods)
            val_mask = df_train['periodo'].isin(val_periods)

            X_train_wf = X_train[train_mask]
            y_train_wf = y_train[train_mask]
            X_val_wf = X_train[val_mask]
            y_val_wf = y_train[val_mask]

            if X_train_wf.empty or X_val_wf.empty:
                continue

            train_weights = weight_all.loc[train_mask]
            val_weights = weight_all.loc[val_mask]

            # Weights are attached only when both sides have a non-zero total
            # (e.g. both switches off gives all-zero weights, so fit unweighted).
            # weight=None is the Dataset default, i.e. no sample weights.
            use_weights = train_weights.sum() != 0 and val_weights.sum() != 0

            train_data_wf = lgb.Dataset(
                X_train_wf,
                label=y_train_wf,
                weight=train_weights if use_weights else None,
                categorical_feature=categorical_cols
            )
            val_data_wf = lgb.Dataset(
                X_val_wf,
                label=y_val_wf,
                weight=val_weights if use_weights else None,
                categorical_feature=categorical_cols
            )

            model_wf = lgb.train(
                params,
                train_set=train_data_wf,
                valid_sets=[val_data_wf],
                num_boost_round=num_boost_round,
                callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=True)]
            )

            y_pred_wf = model_wf.predict(X_val_wf, num_iteration=model_wf.best_iteration)

            # Work on plain arrays so no index alignment is involved.
            actual = y_val_wf.to_numpy()
            abs_error = np.abs(actual - y_pred_wf)
            total_actual = np.sum(np.abs(actual))

            # TFE is undefined when the validation targets sum to zero; such a
            # fold would turn the trial score into nan/inf, so it is skipped.
            if total_actual == 0:
                print(f"Trial {trial.number} - Ciclo {i + 1} omitido: la suma de |target| en validación es 0")
                continue

            tfe = np.sum(abs_error) / total_actual
            tfe_scores.append(tfe)

            # MAPE is informational only. It is computed over rows with a
            # non-zero target, because a single zero target makes the plain
            # formula evaluate to inf/nan.
            nonzero = actual != 0
            if nonzero.any():
                mape = np.mean(abs_error[nonzero] / np.abs(actual[nonzero]))
                mape_scores.append(mape)
            else:
                mape = float('nan')

            print(f"Trial {trial.number} - Finalizado ciclo {i + 1} de {n_folds} - MAPE: {mape:.3f} - TFE: {tfe:.3f}")

        # Without any evaluated fold np.mean([]) would be nan, which Optuna
        # rejects; report the same sentinel the error path uses instead.
        if not tfe_scores:
            print(f"Trial {trial.number} - Error: no se pudo evaluar ningún ciclo")
            return np.inf

        mape_avg = np.mean(mape_scores) if mape_scores else float('nan')
        print(f"MAPE avg: {mape_avg:.3f}")
        print(f"TFE avg: {np.mean(tfe_scores):.3f}")

        return np.mean(tfe_scores)
    except Exception as e:
        # Best effort: a failing configuration is reported and scored as the
        # worst possible value so the study keeps running.
        print(f"Trial {trial.number} - Error: {e}")
        return np.inf


In [None]:
study_name = 'lightgbm_forecast_opt'
storage = 'sqlite:///optuna.db'

# The study is persisted in a local SQLite file and reused when it already exists
# (load_if_exists=True), so re-running this cell resumes the earlier search.
# TPESampler is the sampler Optuna uses by default for single-objective studies;
# it is created explicitly only to fix its seed, so that a fresh search run
# sequentially draws the same sequence of configurations.
study = optuna.create_study(
    direction='minimize',
    study_name=study_name,
    storage=storage,
    load_if_exists=True,
    sampler=optuna.samplers.TPESampler(seed=42)
)

In [None]:
# Run the hyperparameter search with the configured trial budget and parallelism.
study.optimize(objective, n_trials=n_trials, n_jobs=n_jobs, show_progress_bar=True)

# Report the parameter set of the best trial found so far.
print("Mejores hiperparámetros encontrados:")
print(study.best_params)

In [None]:
# Objective value per trial over the course of the study.
optuna.visualization.plot_optimization_history(study) 

In [None]:
# Estimated importance of each tuned parameter for the objective value.
optuna.visualization.plot_param_importances(study)

In [None]:
# Model inputs and target for the held-out test rows.
X_test = df_test[features]
y_test = df_test['target']

In [None]:
# Copy of the best trial's sampled parameters, displayed for inspection.
best_params = study.best_trial.params.copy()
best_params

In [None]:
best_params = study.best_trial.params.copy()

# trial.params only holds the values that were *sampled*; the constants hard-coded
# in `objective` must be restored by hand. `linear_tree` used to be missing here,
# so the final model was fitted without linear trees although every tuned trial
# used them.
best_params['device'] = 'gpu'
best_params['objective'] = 'regression'
best_params['boosting_type'] = 'gbdt'
best_params['metric'] = 'rmse'
best_params['linear_tree'] = True

# Remove the tuned values that are not LightGBM parameters so they are not
# forwarded to lgb.train inside best_params.
# NOTE(review): train_window is only popped, never applied - the final fit uses
# all of df_train rather than the tuned rolling window; confirm this is intended.
train_window = best_params.pop("train_window")
val_window = 2
num_boost_round = best_params.pop("num_boost_round")
early_stopping_rounds = best_params.pop("early_stopping_rounds")

w1_alpha = best_params.pop("w1_alpha")
w2_alpha = best_params.pop("w2_alpha")

# Sample weights built with the same 0/1 switches the best trial selected.
train_weights = (
    w1_alpha * df_train["w_rank"] +
    w2_alpha * df_train["w_tn"]
)

test_weights = (
    w1_alpha * df_test["w_rank"] +
    w2_alpha * df_test["w_tn"]
)

In [None]:
# Sample weights are attached only when both the train and the test weights have
# a non-zero total; otherwise both datasets are built unweighted (weight=None is
# the Dataset default).
_weights_usable = not (train_weights.sum() == 0 or test_weights.sum() == 0)

train_data_final = lgb.Dataset(
    X_train,
    label=y_train,
    categorical_feature=categorical_cols,
    weight=train_weights if _weights_usable else None,
)
test_data_final = lgb.Dataset(
    X_test,
    label=y_test,
    categorical_feature=categorical_cols,
    weight=test_weights if _weights_usable else None,
)

In [None]:
# Final fit on the whole training set with the best trial's configuration.
# NOTE(review): the test set is the validation set that drives early stopping,
# so the test error reported afterwards is not fully out-of-sample - confirm
# this is acceptable.
_early_stop_cb = lgb.early_stopping(
    stopping_rounds=early_stopping_rounds,
    verbose=True,
)
model = lgb.train(
    params=best_params,
    train_set=train_data_final,
    num_boost_round=num_boost_round,
    valid_sets=[test_data_final],
    callbacks=[_early_stop_cb],
)

In [None]:
# In-sample predictions, using the iteration selected by early stopping.
y_pred_train = model.predict(X_train, num_iteration=model.best_iteration)

In [None]:
# Test-set predictions, using the iteration selected by early stopping.
y_pred_test = model.predict(X_test, num_iteration=model.best_iteration)

In [None]:
# Total Forecast Error = sum of absolute errors / sum of absolute targets,
# computed separately for the training and the test rows.
_abs_err_train = (y_train - y_pred_train).abs().sum()
_abs_err_test = (y_test - y_pred_test).abs().sum()

tfe_train = _abs_err_train / y_train.abs().sum()
tfe_test = _abs_err_test / y_test.abs().sum()

print(f'Total Forecast Error en entrenamiento: {tfe_train:.4f}')
print(f'Total Forecast Error en prueba: {tfe_test:.4f}')

In [None]:
# Wrap the predictions in single-column frames that share the index of their
# input rows, so they can be aligned back onto df_train / df_test.
y_pred_train = pd.DataFrame(
    data=y_pred_train,
    index=X_train.index,
    columns=['target_predicted'],
)
y_pred_test = pd.DataFrame(
    data=y_pred_test,
    index=X_test.index,
    columns=['target_predicted'],
)

In [None]:
# Display the test predictions.
y_pred_test

In [None]:
# Store the predictions next to the actuals as a 'target_predicted' column,
# aligned on the row index, for both the training and the test frame.
for _frame, _preds in ((df_train, y_pred_train), (df_test, y_pred_test)):
    _frame.loc[_frame.index, 'target_predicted'] = _preds['target_predicted']

In [None]:
# Persist the fitted booster to a file in the current working directory.
model.save_model('modelo_lgb.txt')
print("Modelo guardado exitosamente como 'modelo_lgb.txt'")

# Feature importance

In [None]:
# Gather both importance measures reported by the booster for every feature.
feature_names = model.feature_name()
importance_gain = model.feature_importance(importance_type='gain')
importance_split = model.feature_importance(importance_type='split')

# One row per feature, ordered from highest to lowest gain.
df_importance = pd.DataFrame(
    dict(feature=feature_names, gain=importance_gain, split=importance_split)
)
df_importance = df_importance.sort_values(by='gain', ascending=False)

df_importance.head(50)

In [None]:
# Features with zero gain in the fitted model are listed as removal candidates.
_has_zero_gain = df_importance['gain'] == 0
features_to_remove = df_importance.loc[_has_zero_gain, 'feature'].tolist()

print("Features a eliminar (gain = 0):")
for feature in features_to_remove:
    print(f"- {feature}")

In [None]:
# Top 25 features by gain (df_importance is already sorted by gain, descending).
df_importance.head(25)