# Modelado TIER 2: LightGBM

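Este notebook entrena un modelo LightGBM de clasificación binaria a partir de los datos preprocesados y lo evalúa sobre el conjunto de validación (AUC-ROC, reporte de clasificación y matriz de confusión).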

## 1. Carga de Librerías y Datos

In [None]:
import pickle
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

# Locations of the preprocessed artifacts, relative to the notebook's working directory.
X_train_path = '../data/processed/X_train.parquet'
y_train_path = '../data/processed/y_train.parquet'
X_val_path = '../data/processed/X_val.parquet'
y_val_path = '../data/processed/y_val.parquet'
X_test_path = '../data/processed/X_test.parquet'
feature_info_path = '../data/processed/feature_info.pkl'

# Load the feature matrices and the targets.
# squeeze("columns") turns a single-column frame into a Series; unlike a bare
# squeeze() it never collapses a one-row frame all the way down to a scalar,
# so y_train / y_val keep a .shape and stay usable as label vectors.
X_train = pd.read_parquet(X_train_path)
y_train = pd.read_parquet(y_train_path).squeeze("columns")
X_val = pd.read_parquet(X_val_path)
y_val = pd.read_parquet(y_val_path).squeeze("columns")
X_test = pd.read_parquet(X_test_path)

# Load the feature metadata; later cells read its 'selected_features' entry.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load feature_info.pkl when it was produced by a trusted step of this project.
with open(feature_info_path, 'rb') as f:
    feature_info = pickle.load(f)

# Quick sanity report of what was loaded.
print("Forma de X_train:", X_train.shape)
print("Forma de y_train:", y_train.shape)
print("Forma de X_val:", X_val.shape)
print("Forma de y_val:", y_val.shape)
print("Forma de X_test:", X_test.shape)
print("Características seleccionadas:", feature_info['selected_features'])

## 2. Definición y Entrenamiento del Modelo LightGBM

In [None]:
# Column subset used for modelling, as recorded in the feature metadata.
selected_cols = feature_info['selected_features']

# Wrap the train / validation splits as LightGBM datasets; the validation
# dataset is built with the training dataset as its reference.
lgb_train = lgb.Dataset(X_train[selected_cols], label=y_train)
lgb_eval = lgb.Dataset(X_val[selected_cols], label=y_val, reference=lgb_train)

# LightGBM parameters (tune them by hand or with Optuna / grid search).
params = dict(
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    n_jobs=-1,
    seed=42,
    is_unbalance=True,  # similar to class_weight='balanced'
)

# Stop once the monitored metric has not improved for 50 consecutive rounds.
early_stop = lgb.early_stopping(stopping_rounds=50, verbose=True)

print("Entrenando el modelo LightGBM...")
lgbm_model = lgb.train(
    params,
    train_set=lgb_train,
    num_boost_round=1000,  # upper bound on the number of boosting iterations
    valid_sets=[lgb_train, lgb_eval],
    callbacks=[early_stop],
)
print("Entrenamiento completado.")

## 3. Evaluación del Modelo

In [None]:
# Score the validation split, using the iteration stored in lgbm_model.best_iteration.
val_features = X_val[feature_info['selected_features']]
y_pred_proba_val = lgbm_model.predict(val_features, num_iteration=lgbm_model.best_iteration)

# Hard labels: 1 when the predicted score is strictly above the 0.5 threshold.
above_threshold = y_pred_proba_val > 0.5
y_pred_val = above_threshold.astype(int)

# Ranking quality is computed on the raw scores, not on the thresholded labels.
auc_roc = roc_auc_score(y_val, y_pred_proba_val)
print(f"AUC-ROC en el conjunto de validación: {auc_roc:.4f}")

# Per-class precision / recall / F1 at the 0.5 threshold.
print("\nReporte de Clasificación en el conjunto de validación:")
val_report = classification_report(y_val, y_pred_val)
print(val_report)

# Confusion matrix, drawn as an annotated heatmap.
print("\nMatriz de Confusión en el conjunto de validación:")
cm = confusion_matrix(y_val, y_pred_val)
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
ax.set(xlabel='Predicción', ylabel='Real', title='Matriz de Confusión - LightGBM')
plt.show()

## 4. Importancia de las Características

In [None]:
# Use one importance measure for both the chart and the table. Left at its
# default (importance_type='auto'), plot_importance draws split counts for a
# Booster, which would not match the gain-based table below.
importance_type = 'gain'  # 'split' or 'gain'

# Bar chart of the 20 most important features.
lgb.plot_importance(lgbm_model, importance_type=importance_type, max_num_features=20, figsize=(10,8))
plt.title('Importancia de las Características - LightGBM')
plt.show()

# Same measure as a DataFrame, sorted from most to least important.
# The names come from the same column selection the training Dataset was built
# from, so they line up positionally with the importance array.
feature_importances = lgbm_model.feature_importance(importance_type=importance_type)
feature_names = X_train[feature_info['selected_features']].columns
feature_importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
    .sort_values(by='importance', ascending=False)
)
print(feature_importance_df)

## 5. Guardar el Modelo (Opcional)

In [None]:
# Persist the trained booster with pickle.
model_path = '../models/lightgbm_model.pkl'
# Create the target directory first: open(..., 'wb') raises FileNotFoundError
# when '../models' does not exist yet (e.g. on a fresh checkout).
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
with open(model_path, 'wb') as f:
    pickle.dump(lgbm_model, f)
print(f"Modelo LightGBM guardado en: {model_path}")

# Optional: the test-set predictions could also be written out for a submission.
# y_pred_test_proba = lgbm_model.predict(X_test[feature_info['selected_features']], num_iteration=lgbm_model.best_iteration)
# submission_df = pd.read_csv('../data/sample_submission.csv')
# submission_df['Exited'] = y_pred_test_proba
# submission_df.to_csv('../submissions/submission_lgbm.csv', index=False)
# print("Archivo de submission generado: ../submissions/submission_lgbm.csv")

In [None]:
# Persist the trained booster with pickle.
# NOTE(review): this cell repeats the save performed by the preceding cell
# (same path, same object); it only rewrites the same file and is a candidate
# for removal.
model_path = '../models/lightgbm_model.pkl'
# Create the target directory first: open(..., 'wb') raises FileNotFoundError
# when '../models' does not exist yet (e.g. on a fresh checkout).
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
with open(model_path, 'wb') as f:
    pickle.dump(lgbm_model, f)
print(f"Modelo LightGBM guardado en: {model_path}")

# Optional: the test-set predictions could also be written out for a submission.
# y_pred_test_proba = lgbm_model.predict(X_test[feature_info['selected_features']], num_iteration=lgbm_model.best_iteration)
# submission_df = pd.read_csv('../data/sample_submission.csv')
# submission_df['Exited'] = y_pred_test_proba
# submission_df.to_csv('../submissions/submission_lgbm.csv', index=False)
# print("Archivo de submission generado: ../submissions/submission_lgbm.csv")