# Entrenamiento Preliminar de Modelos

Este script realiza la búsqueda del mejor umbral de ciclos y modelo base para la predicción de roturas de membrana.

In [None]:
# -*- coding: utf-8 -*-

import os
import warnings
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE

# Importar configuración y utilidades compartidas
from config import (
    CSV_ENTRENAR, CSV_VALIDAR, MODELO_PRELIMINAR,
    RANDOM_STATE, UMBRALES_CICLOS, THRESHOLDS, MODELOS_DISPONIBLES,
    PESO_FALSOS_POSITIVOS, configurar_logging
)
from utils import crear_derivadas, analizar_membranas, calcular_score_balanceado, evaluar_con_umbral

# Suppress every warning category for the rest of the run.
warnings.filterwarnings('ignore')

# Module-level logger obtained from the project's configurar_logging helper
logger = configurar_logging(__name__)

## Configuración Local

In [None]:
# ================================================================================
# LOCAL CONFIGURATION (overrides config.py when needed)
# ================================================================================

# Models to evaluate (may be narrowed to a subset of MODELOS_DISPONIBLES)
MODELOS_ACTIVOS = MODELOS_DISPONIBLES  # Use every available model

## Funciones de Modelos

In [None]:
# ================================================================================
# FUNCIONES DE MODELOS
# ================================================================================

def obtener_modelos():
    """
    Build the candidate classifiers and keep only the active ones.

    Each call instantiates fresh, unfitted estimators. XGBoost compensates
    class imbalance with a fixed ``scale_pos_weight`` of 15; the remaining
    models use their respective balanced class-weight options. All of them
    share RANDOM_STATE as seed.

    Returns:
        Dictionary {name: model_instance} containing only the models whose
        name appears in MODELOS_ACTIVOS, in the order they are declared here.
    """
    xgboost_clf = XGBClassifier(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.05,
        scale_pos_weight=15,
        random_state=RANDOM_STATE,
        eval_metric='aucpr',
        n_jobs=-1,
    )
    lightgbm_clf = LGBMClassifier(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.05,
        class_weight='balanced',
        random_state=RANDOM_STATE,
        verbose=-1,
        n_jobs=-1,
    )
    random_forest_clf = RandomForestClassifier(
        n_estimators=200,
        max_depth=8,
        class_weight='balanced_subsample',
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )
    extra_trees_clf = ExtraTreesClassifier(
        n_estimators=200,
        max_depth=10,
        class_weight='balanced_subsample',
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )
    hist_gb_clf = HistGradientBoostingClassifier(
        max_iter=200,
        max_depth=5,
        learning_rate=0.05,
        class_weight='balanced',
        random_state=RANDOM_STATE,
    )
    catboost_clf = CatBoostClassifier(
        iterations=200,
        depth=5,
        learning_rate=0.05,
        auto_class_weights='Balanced',
        random_state=RANDOM_STATE,
        verbose=0,
        allow_writing_files=False,
    )

    # Declaration order is preserved in the returned dictionary.
    candidatos = (
        ('XGBoost', xgboost_clf),
        ('LightGBM', lightgbm_clf),
        ('RandomForest', random_forest_clf),
        ('ExtraTrees', extra_trees_clf),
        ('HistGradientBoosting', hist_gb_clf),
        ('CatBoost', catboost_clf),
    )

    # Keep only the models enabled in the local configuration.
    seleccionados = {}
    for nombre, estimador in candidatos:
        if nombre in MODELOS_ACTIVOS:
            seleccionados[nombre] = estimador
    return seleccionados

## Carga de Datos

In [None]:
# ================================================================================
# DATA LOADING
# ================================================================================

logger.info("Cargando datos...")

# Training and validation sets come from the CSV paths declared in config.
df_train, df_test = pd.read_csv(CSV_ENTRENAR), pd.read_csv(CSV_VALIDAR)

# Drop the 'Indice' column in place; frames without it are left untouched.
for df in (df_train, df_test):
    df.drop(columns=['Indice'], errors='ignore', inplace=True)

logger.info(f"Datos cargados: {len(df_train)} train, {len(df_test)} test")

## Ingeniería de Características y Escalado

In [None]:
# ================================================================================
# FEATURE ENGINEERING
# ================================================================================

logger.info("Aplicando ingenier칤a de caracter칤sticas...")

# Derived features are computed independently for each split.
df_train_der, df_test_der = crear_derivadas(df_train), crear_derivadas(df_test)

# Every derived column except the target 'Ciclos' is used as a feature.
columnas = []
for nombre_col in df_train_der.columns:
    if nombre_col != 'Ciclos':
        columnas.append(nombre_col)

# Targets: raw cycle counts, binarized later for each candidate threshold.
# NOTE(review): targets are read from the raw frames while features come from
# the derived frames - assumes crear_derivadas keeps row count and order; confirm.
y_train_ciclos = df_train['Ciclos'].values
y_test_ciclos = df_test['Ciclos'].values

X_train = df_train_der[columnas]
X_test = df_test_der[columnas]

# Scaling: statistics are fitted on the training split only and then reused
# for the validation split.
logger.info("Escalando variables...")

scaler = StandardScaler()
train_escalado = scaler.fit_transform(X_train)
test_escalado = scaler.transform(X_test)

# Rebuild DataFrames so the feature names travel with the scaled values.
X_train_esc = pd.DataFrame(train_escalado, columns=columnas)
X_test_esc = pd.DataFrame(test_escalado, columns=columnas)

logger.info("Preprocesamiento completado.")

## Búsqueda del Mejor Umbral de Ciclos

In [None]:
# ================================================================================
# SEARCH FOR THE BEST CYCLE THRESHOLD
# ================================================================================
#
# For each candidate cycle threshold the cycle counts are binarized as
# (Ciclos < threshold), the training set is rebalanced with SMOTE, every
# active model is fitted, and every decision threshold in THRESHOLDS is scored
# on the validation split. The best (model, decision threshold) combination
# found for each cycle threshold is stored in `mejores_por_umbral`.
# NOTE(review): the winner is selected on the same validation data that is
# used to report its score, so the scores are presumably optimistic - confirm
# this is intended.

logger.info("Buscando mejor umbral de ciclos...")
print("="*50)

mejores_por_umbral = {}
modelos = obtener_modelos()

for umbral in UMBRALES_CICLOS:
    # Positive class (1) = fewer cycles than the candidate threshold.
    y_train_umbral = (y_train_ciclos < umbral).astype(int)
    y_test_umbral = (y_test_ciclos < umbral).astype(int)

    # A threshold that puts every training row in the same class cannot be
    # learned: the classifier would either refuse to fit or expose a single
    # probability column, making `[:, 1]` below fail and aborting the whole
    # search. Skip such thresholds instead.
    if np.unique(y_train_umbral).size < 2:
        logger.warning(f"Umbral {umbral} omitido: y_train contiene una sola clase.")
        continue

    # Default to the unbalanced data; replaced below if SMOTE succeeds.
    X_train_bal, y_train_bal = X_train_esc, y_train_umbral

    # SMOTE needs at least one neighbour besides the sample itself, so the
    # neighbour count is capped by the number of positive samples minus one.
    n_positivos = int(y_train_umbral.sum())
    k_neighbors = min(3, n_positivos - 1)

    if k_neighbors < 1:
        # A single positive sample: SMOTE has nothing to interpolate with.
        logger.warning(
            f"SMOTE omitido para umbral {umbral}: solo hay {n_positivos} muestra(s) positiva(s)."
        )
    else:
        smote = SMOTE(random_state=RANDOM_STATE, k_neighbors=k_neighbors)
        try:
            X_train_res, y_train_bal = smote.fit_resample(X_train_esc, y_train_umbral)
            X_train_bal = pd.DataFrame(X_train_res, columns=columnas)
        except ValueError as e:
            # Best effort: fall back to the unbalanced training data.
            logger.warning(f"SMOTE fall칩 para umbral {umbral}: {e}")
            X_train_bal = X_train_esc
            y_train_bal = y_train_umbral

    # Best configuration seen so far for this cycle threshold.
    mejor_umbral = {'score': float('-inf')}

    for nombre, clf in modelos.items():
        clf.fit(X_train_bal, y_train_bal)
        # Probability of the positive class on the validation split.
        y_proba = clf.predict_proba(X_test_esc)[:, 1]

        for th in THRESHOLDS:
            result = evaluar_con_umbral(y_test_umbral, y_proba, y_test_ciclos, umbral, th)
            # A falsy result means there is nothing to score for this threshold.
            if not result:
                continue

            score = calcular_score_balanceado(
                result['pct_membranas'],
                result['FP'],
                result['membranas_total'],
                PESO_FALSOS_POSITIVOS
            )
            # Strict comparison: on ties the first configuration found wins.
            if score > mejor_umbral['score']:
                mejor_umbral = {
                    'umbral': umbral,
                    'modelo': nombre,
                    'threshold': th,
                    'score': score,
                    'pct_membranas': result['pct_membranas'],
                    'fp': result['FP'],
                    'resultado': result
                }

    # Record the threshold only if at least one configuration was scored.
    if mejor_umbral['score'] > float('-inf'):
        mejores_por_umbral[umbral] = mejor_umbral
        print(f"Umbral {umbral}: {mejor_umbral['modelo']} (th={mejor_umbral['threshold']:.2f}) | "
              f"Membranas={mejor_umbral['resultado']['membranas_detectadas']}/{mejor_umbral['resultado']['membranas_total']} | "
              f"FP={mejor_umbral['fp']} | Score={mejor_umbral['score']:.2f}")

## Selección y Guardado del Mejor Modelo

In [None]:
# ================================================================================
# SELECTION AND RETRAINING OF THE BEST MODEL
# ================================================================================

# Without any scored threshold there is nothing to select from.
if len(mejores_por_umbral) == 0:
    raise ValueError("No se encontr칩 ning칰n umbral v치lido durante la exploraci칩n inicial.")

# Winner = entry with the highest score; on ties the first one inserted wins.
UMBRAL_OPTIMO, ganador = max(
    mejores_por_umbral.items(),
    key=lambda par: par[1]['score'],
)
MEJOR_MODELO_NOMBRE = ganador['modelo']

separador = "="*50
print(separador)
print(f"游끥 Mejor umbral: {UMBRAL_OPTIMO} ciclos (Score={ganador['score']:.2f})")
print(f"Re-entrenando y guardando ganador: {MEJOR_MODELO_NOMBRE}...")
print(separador)

# Binarize the training target with the winning cycle threshold.
y_train_best = (y_train_ciclos < UMBRAL_OPTIMO).astype(int)

# Rebalance the final training data with SMOTE, capping the neighbour count
# by the number of positive samples minus one.
k_v = min(3, y_train_best.sum() - 1)
smote_g = SMOTE(random_state=RANDOM_STATE, k_neighbors=k_v)

try:
    X_tr_g, y_tr_g = smote_g.fit_resample(X_train_esc, y_train_best)
    X_tr_g = pd.DataFrame(X_tr_g, columns=columnas)
except ValueError as e:
    # Best effort: retrain on the unbalanced data if SMOTE cannot run.
    logger.warning(f"SMOTE fall칩 en reentrenamiento: {e}")
    X_tr_g = X_train_esc
    y_tr_g = y_train_best

# Retrain a fresh instance of the winning model on the final training data.
modelos_nuevos = obtener_modelos()
modelo_final = modelos_nuevos[MEJOR_MODELO_NOMBRE]
modelo_final.fit(X_tr_g, y_tr_g)

In [None]:
# ================================================================================
# SAVING THE ARTIFACT
# ================================================================================

# Everything needed to reuse the model: the cycle threshold that defines the
# target, the decision threshold, the fitted scaler, the fitted model, the
# feature column order and the model name.
artifact = {
    'umbral_ciclos': UMBRAL_OPTIMO,
    'threshold': ganador['threshold'],  # Consistent (singular) key name
    'scaler': scaler,
    'modelo': modelo_final,
    'feature_cols': list(X_tr_g.columns),
    'nombre_modelo': MEJOR_MODELO_NOMBRE
}

# Create the output directory if it does not exist. A bare filename has an
# empty dirname, and os.makedirs('') raises FileNotFoundError, so the call is
# only made when there is a directory component.
directorio_salida = os.path.dirname(MODELO_PRELIMINAR)
if directorio_salida:
    os.makedirs(directorio_salida, exist_ok=True)

# Persist the artifact
joblib.dump(artifact, MODELO_PRELIMINAR)

logger.info(f"Modelo guardado exitosamente en: {MODELO_PRELIMINAR}")
print("\n" + "="*50)
print("ENTRENAMIENTO COMPLETADO")
print("="*50)