# Modelado

Este script realiza la optimizaci√≥n de hiperpar√°metros usando Optuna para m√∫ltiples modelos y m√©tricas, selecciona el mejor modelo y lo guarda para producci√≥n.

In [None]:
# -*- coding: utf-8 -*-

import os
import warnings
import joblib
import optuna
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, recall_score, matthews_corrcoef
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE

# Importar configuraci√≥n y utilidades compartidas
from config import (
    CSV_ENTRENAR, CSV_VALIDAR, DIRECTORIO_SALIDA,
    RANDOM_STATE, THRESHOLDS, N_TRIALS_OPTUNA, MODELOS_DISPONIBLES,
    UMBRAL_CICLOS_DEFAULT, PESO_FALSOS_POSITIVOS, METRICAS, CONFIGURACION_MEJORES_MODELOS, RESULTADOS_MEJORES_MODELOS,
    configurar_logging
)
from utils import crear_derivadas, calcular_score_balanceado, evaluar_con_umbral

warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Configurar logger
logger = configurar_logging(__name__)

## Carga de Variables

In [None]:
# ================================================================================
# CARGA DEL VARIABLES
# ================================================================================

# Mejor umbral de ciclos obtenido en preliminar.ipynb
UMBRAL_OPTIMO = UMBRAL_CICLOS_DEFAULT
    
# Modelos base a optimizar
MODELOS_BASE = MODELOS_DISPONIBLES

METRICA_PERSONALIZADA = METRICAS['score_propio']     
METRICAS_ESTANDAR = {k: v for k, v in METRICAS.items() if k != 'score_propio'}

2026-02-18 19:38:08 - __main__ - INFO - Umbral cargado desde modelo preliminar: 9
INFO:__main__:Umbral cargado desde modelo preliminar: 9


## Funciones 

In [None]:
# ================================================================================
# FUNCIONES DE OPTIMIZACI√ìN CON OPTUNA
# ================================================================================

def obtener_parametros_modelo(trial, model_name):
    """
    Obtiene los par√°metros a optimizar para cada modelo.

    Args:
        trial: Objeto trial de Optuna
        model_name: Nombre del modelo

    Returns:
        Tupla (params, model_class)
    """
    if model_name == 'XGBoost':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 500),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
            'scale_pos_weight': trial.suggest_int('scale_pos_weight', 5, 30),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
            'random_state': RANDOM_STATE,
            'eval_metric': 'aucpr',
            'n_jobs': -1
        }
        return params, XGBClassifier

    elif model_name == 'LightGBM':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 500),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
            'min_child_samples': trial.suggest_int('min_child_samples', 3, 20),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
            'class_weight': 'balanced',
            'random_state': RANDOM_STATE,
            'verbose': -1,
            'n_jobs': -1
        }
        return params, LGBMClassifier

    elif model_name == 'RandomForest':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 500),
            'max_depth': trial.suggest_int('max_depth', 5, 15),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 15),
            'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
            'class_weight': 'balanced_subsample',
            'random_state': RANDOM_STATE,
            'n_jobs': -1
        }
        return params, RandomForestClassifier

    elif model_name == 'ExtraTrees':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 500),
            'max_depth': trial.suggest_int('max_depth', 5, 20),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 15),
            'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
            'class_weight': 'balanced_subsample',
            'random_state': RANDOM_STATE,
            'n_jobs': -1
        }
        return params, ExtraTreesClassifier

    elif model_name == 'HistGradientBoosting':
        params = {
            'max_iter': trial.suggest_int('max_iter', 100, 500),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
            'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 15, 63),
            'l2_regularization': trial.suggest_float('l2_regularization', 1e-6, 1.0, log=True),
            'class_weight': 'balanced',
            'random_state': RANDOM_STATE
        }
        return params, HistGradientBoostingClassifier

    elif model_name == 'CatBoost':
        params = {
            'iterations': trial.suggest_int('iterations', 100, 500),
            'depth': trial.suggest_int('depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
            'allow_writing_files': False,
            'auto_class_weights': 'Balanced',
            'random_state': RANDOM_STATE,
            'verbose': 0,
            'thread_count': -1
        }
        return params, CatBoostClassifier

def optimizar_modelo_balanceado(model_name, X_train_full, y_train_full, y_ciclos_full, umbral, cv_splits=5):
    """
    Crea funci√≥n objetivo para optimizar usando el score personalizado.
    
    Args:
        model_name: Nombre del modelo
        X_train_full: Features de entrenamiento
        y_train_full: Labels de entrenamiento
        y_ciclos_full: Ciclos de entrenamiento
        umbral: Umbral de ciclos
        cv_splits: N√∫mero de folds para CV
        
    Returns:
        Funci√≥n objetivo para Optuna
    """
    def objetivo(trial):
        params, model_class = obtener_parametros_modelo(trial, model_name)
        
        scores_cv = []
        cv = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=RANDOM_STATE)
        X_val_np = X_train_full.to_numpy() if isinstance(X_train_full, pd.DataFrame) else X_train_full
        y_val_np = y_train_full
        columnas = X_train_full.columns if isinstance(X_train_full, pd.DataFrame) else None

        for train_idx, val_idx in cv.split(X_train_full, y_train_full):
            X_fold_train = X_val_np[train_idx]
            y_fold_train = y_val_np[train_idx]
            X_fold_val = X_val_np[val_idx]
            y_fold_val = y_val_np[val_idx]
            y_fold_ciclos = y_ciclos_full[val_idx]

            # Aplicar SMOTE
            k_neighbors = min(3, y_fold_train.sum() - 1)
            smote = SMOTE(random_state=RANDOM_STATE, k_neighbors=k_neighbors)

            try:
                X_fold_train_bal, y_fold_train_bal = smote.fit_resample(X_fold_train, y_fold_train)
            except ValueError:
                X_fold_train_bal, y_fold_train_bal = X_fold_train, y_fold_train

            model = model_class(**params)

            if columnas is not None:
                X_fold_train_bal = pd.DataFrame(X_fold_train_bal, columns=columnas)
                X_fold_val_df = pd.DataFrame(X_fold_val, columns=columnas)
            else:
                X_fold_val_df = X_fold_val

            model.fit(X_fold_train_bal, y_fold_train_bal)
            probas_val = model.predict_proba(X_fold_val_df)[:, 1]
            
            # Buscar mejor threshold
            mejor_score_fold = float('-inf')
            thresholds_opt = np.arange(0.1, 0.91, 0.1)

            for th in thresholds_opt:
                res = evaluar_con_umbral(y_fold_val, probas_val, y_fold_ciclos, umbral, th)
                if res:
                    sc = calcular_score_balanceado(res['pct_membranas'], res['FP'], res['membranas_total'], PESO_FALSOS_POSITIVOS)
                    if sc > mejor_score_fold:
                        mejor_score_fold = sc

            scores_cv.append(mejor_score_fold if mejor_score_fold > float('-inf') else -100)

        return np.mean(scores_cv)

    return objetivo


def optimizar_modelo_metricas(model_name, X_train_full, y_train_full, metric, cv_splits=5):
    """
    Crea funci√≥n objetivo para optimizar usando m√©tricas est√°ndar.
    
    Args:
        model_name: Nombre del modelo
        X_train_full: Features de entrenamiento
        y_train_full: Labels de entrenamiento
        metric: M√©trica a optimizar ('f1', 'recall', 'mcc')
        cv_splits: N√∫mero de folds para CV
        
    Returns:
        Funci√≥n objetivo para Optuna
    """
    def objetivo(trial):
        params, model_class = obtener_parametros_modelo(trial, model_name)
        
        scores_cv = []
        cv = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=RANDOM_STATE)
        X_val_np = X_train_full.to_numpy() if isinstance(X_train_full, pd.DataFrame) else X_train_full
        y_val_np = y_train_full
        columnas_f = X_train_full.columns if isinstance(X_train_full, pd.DataFrame) else None

        for train_idx, val_idx in cv.split(X_train_full, y_train_full):
            X_fold_train, y_fold_train = X_val_np[train_idx], y_val_np[train_idx]
            X_fold_val, y_fold_val = X_val_np[val_idx], y_val_np[val_idx]
            
            # Aplicar SMOTE
            k_n = min(3, y_fold_train.sum() - 1)
            smote = SMOTE(random_state=RANDOM_STATE, k_neighbors=k_n)

            try:
                X_fold_train_bal, y_fold_train_bal = smote.fit_resample(X_fold_train, y_fold_train)
            except ValueError:
                X_fold_train_bal, y_fold_train_bal = X_fold_train, y_fold_train

            model = model_class(**params)

            if columnas_f is not None:
                X_fold_train_bal = pd.DataFrame(X_fold_train_bal, columns=columnas_f)
                X_fold_val_df = pd.DataFrame(X_fold_val, columns=columnas_f)
            else:
                X_fold_val_df = X_fold_val

            model.fit(X_fold_train_bal, y_fold_train_bal)

            y_pred = model.predict(X_fold_val_df)
            
            if metric == 'f1':
                score = f1_score(y_fold_val, y_pred)
            elif metric == 'recall':
                score = recall_score(y_fold_val, y_pred)
            elif metric == 'mcc':
                score = matthews_corrcoef(y_fold_val, y_pred)
            else:
                score = 0

            scores_cv.append(score)

        return np.mean(scores_cv)

    return objetivo

def crear_modelo(model_name, best_params):
    """
    Crea un modelo con los mejores par√°metros encontrados.

    Args:
        model_name: Nombre del modelo
        best_params: Diccionario con los mejores par√°metros

    Returns:
        Instancia del modelo configurado
    """
    if model_name == 'XGBoost':
        return XGBClassifier(**best_params)
    elif model_name == 'LightGBM':
        return LGBMClassifier(**best_params)
    elif model_name == 'RandomForest':
        return RandomForestClassifier(**best_params)
    elif model_name == 'ExtraTrees':
        return ExtraTreesClassifier(**best_params)
    elif model_name == 'HistGradientBoosting':
        return HistGradientBoostingClassifier(**best_params)
    elif model_name == 'CatBoost':
        return CatBoostClassifier(**best_params)

def correr_optimizacion(tipo, metrica_nombre):
    """
    Orquesta la optimizaci√≥n para todos los modelos.

    Args:
        tipo: 'balanceado' o 'estandar'
        metrica_nombre: Nombre de la m√©trica

    Returns:
        Diccionario con resultados por modelo
    """
    res_modelos = {}
    print(f"\nOptimizando para m√©trica: {metrica_nombre}")

    for nombre in MODELOS_BASE:
        print(f"  {nombre}...")

        if tipo == 'balanceado':
            objetivo = optimizar_modelo_balanceado(nombre, X_train_esc, y_train_umbral, y_train_ciclos, UMBRAL_OPTIMO, cv_splits=5)
        else:
            objetivo = optimizar_modelo_metricas(nombre, X_train_esc, y_train_umbral, metrica_nombre, cv_splits=5)

        estudio = optuna.create_study(
            direction='maximize',
            sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE)
        )
        estudio.optimize(objetivo, n_trials=N_TRIALS_OPTUNA, show_progress_bar=True)

        mejores_parametros = estudio.best_params
        mejor_score_cv = estudio.best_value
        mejor_modelo = crear_modelo(nombre, mejores_parametros)
        mejor_modelo.fit(X_train_bal_final, y_train_bal_final)

        res_modelos[nombre] = {
            'modelo': mejor_modelo,
            'parametros': mejores_parametros,
            'cv_score': mejor_score_cv
        }

        print(f"  Score: {mejor_score_cv:.4f}")

    return res_modelos

## Carga y Preparaci√≥n de Datos

In [9]:
# ================================================================================
# CARGA Y PREPARACI√ìN DE DATOS
# ================================================================================

logger.info("Cargando datos...")

df_train = pd.read_csv(CSV_ENTRENAR)
df_test = pd.read_csv(CSV_VALIDAR)

# Eliminar columna de √≠ndice
for df in [df_train, df_test]:
    if 'Indice' in df.columns:
        df.drop(columns=['Indice'], inplace=True)

# Aplicar ingenier√≠a de caracter√≠sticas
df_train_der = crear_derivadas(df_train)
df_test_der = crear_derivadas(df_test)

# Separar variables
columnas = [c for c in df_train_der.columns if c != 'Ciclos']

X_train = df_train_der[columnas]
X_test = df_test_der[columnas]
y_train_ciclos = df_train['Ciclos'].values
y_test_ciclos = df_test['Ciclos'].values

# Escalar variables (scaler solo de entrenamiento)
scaler = StandardScaler()
X_train_esc = pd.DataFrame(scaler.fit_transform(X_train), columns=columnas)
X_test_esc = pd.DataFrame(scaler.transform(X_test), columns=columnas)

# Aplicar umbral y SMOTE
y_train_umbral = (y_train_ciclos < UMBRAL_OPTIMO).astype(int)
y_test_umbral = (y_test_ciclos < UMBRAL_OPTIMO).astype(int)

k_neighbors = min(3, y_train_umbral.sum() - 1)
smote_final = SMOTE(random_state=RANDOM_STATE, k_neighbors=k_neighbors)

try:
    X_train_res_final, y_train_bal_final = smote_final.fit_resample(X_train_esc, y_train_umbral)
    X_train_bal_final = pd.DataFrame(X_train_res_final, columns=columnas)
except ValueError as e:
    logger.warning(f"SMOTE fall√≥: {e}")
    X_train_bal_final, y_train_bal_final = X_train_esc, y_train_umbral

logger.info(f"Datos cargados: {len(df_train)} train, {len(df_test)} test")

2026-02-18 19:38:36 - __main__ - INFO - Cargando datos...
INFO:__main__:Cargando datos...
2026-02-18 19:38:37 - __main__ - INFO - Datos cargados: 2055 train, 673 test
INFO:__main__:Datos cargados: 2055 train, 673 test


## Optimizaci√≥n de Hiperpar√°metros

In [None]:
# ================================================================================
# EJECUTAR OPTIMIZACI√ìN
# ================================================================================

logger.info("Iniciando exploraci√≥n de hiperpar√°metros...")
logger.info("="*50)

mejores_modelos = {}

# Optimizaci√≥n con m√©tricas est√°ndar (f1, recall, mcc)
for _, metrica in METRICAS_ESTANDAR.items():
    modelos = correr_optimizacion('estandar', metrica)
    mejores_modelos.update({f"{k}_{metrica}": v for k, v in modelos.items()})

# Optimizaci√≥n con score personalizado
modelos_pers = correr_optimizacion('balanceado', METRICA_PERSONALIZADA)
mejores_modelos.update({f"{k}_{METRICA_PERSONALIZADA}": v for k, v in modelos_pers.items()})


logger.info("="*50)
logger.info("\nExploraci√≥n terminada.")

2026-02-18 19:38:45 - __main__ - INFO - Iniciando exploraci√≥n de hiperpar√°metros...
INFO:__main__:Iniciando exploraci√≥n de hiperpar√°metros...



Optimizando para m√©trica: f1
  XGBoost... 

  0%|          | 0/50 [00:00<?, ?it/s]

Score: 0.4725
  LightGBM... 

  0%|          | 0/50 [00:00<?, ?it/s]

Score: 0.4530
  RandomForest... 

  0%|          | 0/50 [00:00<?, ?it/s]

Score: 0.4255
  HistGradientBoosting... 

  0%|          | 0/50 [00:00<?, ?it/s]

Score: 0.4519
  CatBoost... 

  0%|          | 0/50 [00:00<?, ?it/s]

Score: 0.4107
  ExtraTrees... 

  0%|          | 0/50 [00:00<?, ?it/s]

Score: 0.4501

Optimizando para m√©trica: recall
  XGBoost... 

  0%|          | 0/50 [00:00<?, ?it/s]

Score: 0.9785
  LightGBM... 

  0%|          | 0/50 [00:00<?, ?it/s]

Score: 0.4653
  RandomForest... 

  0%|          | 0/50 [00:00<?, ?it/s]

Score: 0.4863
  HistGradientBoosting... 

  0%|          | 0/50 [00:00<?, ?it/s]

Score: 0.4757
  CatBoost... 

  0%|          | 0/50 [00:00<?, ?it/s]

Score: 0.4970
  ExtraTrees... 

  0%|          | 0/50 [00:00<?, ?it/s]

Score: 0.5448

Optimizando para m√©trica: mcc
  XGBoost... 

  0%|          | 0/50 [00:00<?, ?it/s]

Score: 0.4254
  LightGBM... 

  0%|          | 0/50 [00:00<?, ?it/s]

Score: 0.3999
  RandomForest... 

  0%|          | 0/50 [00:00<?, ?it/s]

Score: 0.3655
  HistGradientBoosting... 

  0%|          | 0/50 [00:00<?, ?it/s]

Score: 0.4047
  CatBoost... 

  0%|          | 0/50 [00:00<?, ?it/s]

Score: 0.3824
  ExtraTrees... 

  0%|          | 0/50 [00:00<?, ?it/s]

Score: 0.3974

Optimizando para m√©trica: Score Propio
  XGBoost... 

  0%|          | 0/50 [00:00<?, ?it/s]

Score: 47.2105
  LightGBM... 

  0%|          | 0/50 [00:00<?, ?it/s]

Score: 50.0650
  RandomForest... 

  0%|          | 0/50 [00:00<?, ?it/s]

Score: 37.3167
  HistGradientBoosting... 

  0%|          | 0/50 [00:00<?, ?it/s]

Score: 49.5552
  CatBoost... 

  0%|          | 0/50 [00:00<?, ?it/s]

Score: 46.4912
  ExtraTrees... 

  0%|          | 0/50 [00:00<?, ?it/s]

Score: 40.7062

Exploraci√≥n terminada.


## Evaluaci√≥n de Resultados

In [None]:
# ================================================================================
# EVALUACI√ìN EN TEST Y REGISTRO DE RESULTADOS 
# ================================================================================
logger.info("Evaluando en dataset de test...")
logger.info("="*80)

print("Resultados en dataset de test:")

# Inicializaci√≥n de registros
registro_mejores_configuraciones = []
mejor_config_global = None
mejor_score_global = float('-inf')
mejor_nombre_global = None

# Bucle de evaluaci√≥n por modelo/m√©trica
for nombre, info in mejores_modelos.items():
    clf = info['modelo']
    y_proba = clf.predict_proba(X_test_esc)[:, 1]

    mejor_config_del_modelo = None
    mejor_score_del_modelo = float('-inf')

    # B√∫squeda del mejor threshold para este modelo espec√≠fico
    for th in THRESHOLDS:
        res = evaluar_con_umbral(y_test_umbral, y_proba, y_test_ciclos, UMBRAL_OPTIMO, th)

        if res:
            # C√°lculo del score balanceado usando la utilidad del script
            score = calcular_score_balanceado(
                res['pct_membranas'],
                res['FP'],
                res['membranas_total'],
                PESO_FALSOS_POSITIVOS
            )

            if score > mejor_score_del_modelo:
                mejor_score_del_modelo = score
                mejor_config_del_modelo = res.copy()
                mejor_config_del_modelo['score_balanceado'] = score
                mejor_config_del_modelo['modelo_nombre'] = nombre
                # Guardado de los par√°metros de Optuna para este modelo
                mejor_config_del_modelo['hiperparametros'] = str(info['parametros'])

    # Si el modelo produjo resultados v√°lidos, registrar
    if mejor_config_del_modelo:
        registro_mejores_configuraciones.append(mejor_config_del_modelo)

        print(f"{nombre:30} | th={mejor_config_del_modelo['threshold']:.2f} | "
              f"Membranas={mejor_config_del_modelo['membranas_detectadas']} | "
              f"FP={mejor_config_del_modelo['FP']} | Score={mejor_score_del_modelo:.2f}")

        # Seguimiento del mejor absoluto para entrenamiento final
        if mejor_score_del_modelo > mejor_score_global:
            mejor_score_global = mejor_score_del_modelo
            mejor_config_global = mejor_config_del_modelo
            mejor_nombre_global = nombre

# Exportaci√≥n de datos 
if registro_mejores_configuraciones:
    # Crear DataFrame y ordenar por desempe√±o
    df_historico = pd.DataFrame(registro_mejores_configuraciones)
    df_historico = df_historico.sort_values(by='score_balanceado', ascending=False)

    df_historico.to_csv(RESULTADOS_MEJORES_MODELOS, index=False)
    joblib.dump(registro_mejores_configuraciones, CONFIGURACION_MEJORES_MODELOS)

    print(f"Mejor modelo: {mejor_nombre_global} con Score de {mejor_score_global:.2f}")
    
    logger.info("="*80)
    logger.info("Evaluaci√≥n terminada.")
else:
    logger.info("No se generaron configuraciones v√°lidas.")

Resultados en conjunto de validaci√≥n:
XGBoost_f1                     | th=0.95 | Membranas=3 | FP=20 | Score=-20.00
LightGBM_f1                    | th=0.85 | Membranas=4 | FP=7 | Score=52.00
RandomForest_f1                | th=0.60 | Membranas=5 | FP=12 | Score=52.00
HistGradientBoosting_f1        | th=0.80 | Membranas=4 | FP=9 | Score=44.00
CatBoost_f1                    | th=0.60 | Membranas=5 | FP=12 | Score=52.00
ExtraTrees_f1                  | th=0.60 | Membranas=5 | FP=8 | Score=68.00
XGBoost_recall                 | th=0.95 | Membranas=4 | FP=144 | Score=-496.00
LightGBM_recall                | th=0.70 | Membranas=3 | FP=8 | Score=28.00
RandomForest_recall            | th=0.60 | Membranas=5 | FP=13 | Score=48.00
HistGradientBoosting_recall    | th=0.75 | Membranas=2 | FP=9 | Score=4.00
CatBoost_recall                | th=0.75 | Membranas=2 | FP=2 | Score=32.00
ExtraTrees_recall              | th=0.65 | Membranas=3 | FP=5 | Score=40.00
XGBoost_mcc                    | th=0.95 

## Entrenamiento del Modelo Final

In [None]:
# ================================================================================
# ENTRENAMIENTO DE UN MODELO ESPEC√çFICO CON TODOS LOS DATOS 
# ================================================================================

NOMBRE_MODELO_ELEGIDO = "RandomForest_mcc"

if NOMBRE_MODELO_ELEGIDO in mejores_modelos:
    info_modelo = mejores_modelos[NOMBRE_MODELO_ELEGIDO]
    params_elegidos = info_modelo['parametros']
    nombre_base = NOMBRE_MODELO_ELEGIDO.split('_')[0]

    logger.info(f"Entrenando modelo seleccionado: {NOMBRE_MODELO_ELEGIDO}")

    # Preparaci√≥n de datos completos
    X_full = pd.concat([X_train, X_test], axis=0).reset_index(drop=True)
    y_full_ciclos = np.concatenate([y_train_ciclos, y_test_ciclos])
    y_full_umbral = (y_full_ciclos < UMBRAL_OPTIMO).astype(int)
    X_full_esc = pd.DataFrame(scaler.transform(X_full), columns=columnas)

    # Balanceo con SMOTE
    k_neighbors = min(3, y_full_umbral.sum() - 1)
    smote_full = SMOTE(random_state=RANDOM_STATE, k_neighbors=k_neighbors)
    try:
        X_full_bal, y_full_bal = smote_full.fit_resample(X_full_esc, y_full_umbral)
        X_full_bal = pd.DataFrame(X_full_bal, columns=columnas)
    except ValueError as e:
        X_full_bal, y_full_bal = X_full_esc, y_full_umbral

    # Crear y entrenar el modelo espec√≠fico
    modelo_seleccionado = crear_modelo(nombre_base, params_elegidos)
    modelo_seleccionado.fit(X_full_bal, y_full_bal)

    logger.info("Modelo entrenado.")

else:
    logger.warning(f"Error: El modelo '{NOMBRE_MODELO_ELEGIDO}' no existe en mejores_modelos.")
    print(f"Opciones disponibles: {list(mejores_modelos.keys())}")

Entrenando modelo seleccionado: RandomForest_mcc

‚úÖ ¬°Listo! Modelo 'RandomForest_mcc' entrenado y guardado.
üìç Ruta: /content/drive/MyDrive/Colab Notebooks/output/modelo_RandomForest_mcc.pkl
üéØ Threshold configurado: 0.55


## Guardado del Modelo

In [None]:
# Buscar el threshold √≥ptimo para este modelo en el registro
config_aux = df_historico[df_historico['modelo_nombre'] == NOMBRE_MODELO_ELEGIDO].iloc[0]
threshold_especifico = config_aux['threshold']

# Guardar modelo 
RUTA_SALIDA_ESPECIFICA = os.path.join(DIRECTORIO_SALIDA, f"modelo_{NOMBRE_MODELO_ELEGIDO}.pkl")

config_produccion_especifica = {
    'umbral_ciclos': UMBRAL_OPTIMO,
    'threshold': threshold_especifico,
    'scaler': scaler,
    'modelo': modelo_seleccionado,
    'feature_cols': columnas,
    'nombre_modelo': nombre_base  
}

joblib.dump(config_produccion_especifica, RUTA_SALIDA_ESPECIFICA)

logger.info(f"Modelo guardado en: {RUTA_SALIDA_ESPECIFICA}")
logger.info("\n")
logger.info("MODELADO COMPLETADO")