In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.impute import SimpleImputer
from imblearn.under_sampling import RandomUnderSampler
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import seaborn as sns
import warnings
import os
import psutil
from datetime import datetime

# Silence all warnings for the rest of the run.
warnings.filterwarnings(action='ignore')

# Capture the moment the process starts; the formatted stamp tags every output file.
start_time = datetime.now()
timestamp = format(start_time, "%Y%m%d_%H%M%S")

# Output files carrying the run timestamp: memory log and PDF report.
memory_log_file, pdf_file = (
    f"memory_log_{timestamp}.txt",
    f"reporte_modelos_{timestamp}.pdf",
)
# Memory logging: handle to the current process, queried on every log call.
process = psutil.Process(os.getpid())


def log_memory_usage(stage):
    """Append the current resident memory of this process to ``memory_log_file``.

    Each call writes one line of the form ``<now> - <stage>: <MB> MB``.

    Args:
        stage: Free-text label identifying the pipeline step being logged.
    """
    memory_info = process.memory_info().rss / (1024 ** 2)  # bytes -> MB
    # Explicit UTF-8: stage labels contain accented characters and the
    # platform default encoding differs between systems.
    with open(memory_log_file, "a", encoding="utf-8") as log_file:
        log_file.write(f"{datetime.now()} - {stage}: {memory_info:.2f} MB\n")


log_memory_usage("Inicio del Proceso")

# Load the encoded dataset.
parquet_file = r"C:\Users\Gonzalo\Downloads\df_triage_encoded.parquet"
df = pd.read_parquet(parquet_file)
target_column = 'nivel_triage'

# Preprocessing: separate features from the target.
X = df.drop(columns=[target_column])
y = df[target_column] - 1  # shift labels down by one so they start at 0 for XGBoost

# Split BEFORE fitting any transformer so that no statistic computed from the
# test rows (imputation medians, scaling quantiles) leaks into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, train_size=0.75, random_state=42, stratify=y
)

# Median imputation learned on the training split only, then applied to both splits.
imputer = SimpleImputer(strategy='median')
X_train = pd.DataFrame(imputer.fit_transform(X_train_raw), columns=X_train_raw.columns)
X_test = pd.DataFrame(imputer.transform(X_test_raw), columns=X_test_raw.columns)

# Robust scaling, likewise fitted on the training split only.
scaler = RobustScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Balance the classes by undersampling the training split; the test split is left untouched.
undersampler = RandomUnderSampler(random_state=42)
X_train_balanced, y_train_balanced = undersampler.fit_resample(X_train_scaled, y_train)

log_memory_usage("Después del Preprocesamiento")

# Candidate estimators for GridSearchCV, each paired with the hyperparameter
# grid to explore. Every estimator is seeded with random_state=42.
models = dict(
    RandomForest=dict(
        model=RandomForestClassifier(random_state=42),
        params=dict(n_estimators=[50, 100], max_depth=[10, 20, None]),
    ),
    XGBoost=dict(
        model=XGBClassifier(
            use_label_encoder=False,
            eval_metric='mlogloss',
            random_state=42,
        ),
        params=dict(n_estimators=[50, 100], max_depth=[3, 6]),
    ),
    LightGBM=dict(
        model=LGBMClassifier(random_state=42),
        params=dict(n_estimators=[50, 100], num_leaves=[31, 50]),
    ),
)

# Per-model evaluation results, keyed by model name.
results = {}

# Train every candidate with a grid search, then evaluate its best estimator
# on the held-out test split and save the diagnostic plots to disk.
for name, config in models.items():
    print(f"Entrenando {name}...")
    grid = GridSearchCV(config['model'], config['params'], cv=3, scoring='accuracy', n_jobs=-1)
    grid.fit(X_train_balanced, y_train_balanced)

    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test_scaled)
    y_pred_proba = best_model.predict_proba(X_test_scaled)

    # predict_proba columns follow best_model.classes_, so that attribute is
    # the single source of truth for label order in every metric below.
    class_labels = list(best_model.classes_)
    # Display labels are 1-based ("Clase 1", "Clase 2", ...), one above the
    # 0-based labels the models are trained on.
    display_names = [str(int(label) + 1) for label in class_labels]

    report = classification_report(
        y_test,
        y_pred,
        labels=class_labels,
        target_names=display_names,
        output_dict=True,
    )
    conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

    # ROC curve: one-vs-rest curve and AUC per class.
    auc_scores = {}
    plt.figure(figsize=(8, 6))
    for column, (label, shown) in enumerate(zip(class_labels, display_names)):
        fpr, tpr, _ = roc_curve((y_test == label).astype(int), y_pred_proba[:, column])
        auc_value = auc(fpr, tpr)
        auc_scores[f"Clase {shown}"] = auc_value
        plt.plot(fpr, tpr, label=f'Clase {shown} (AUC = {auc_value:.2f})')

    plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance diagonal
    plt.xlabel('Tasa de Falsos Positivos (FPR)')
    plt.ylabel('Tasa de Verdaderos Positivos (TPR)')
    plt.title(f'Curva ROC - {name}')
    plt.legend()

    roc_plot_path = f"roc_curve_{name}_{timestamp}.png"
    plt.savefig(roc_plot_path)
    plt.close()

    # Confusion matrix heatmap, with axes labelled by the same 1-based names as the ROC legend.
    plt.figure(figsize=(5, 4))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=display_names,
        yticklabels=display_names,
    )
    plt.title(f'Matriz de Confusión - {name}')
    plt.xlabel('Predicho')
    plt.ylabel('Real')

    conf_matrix_path = f"conf_matrix_{name}_{timestamp}.png"
    plt.savefig(conf_matrix_path)
    plt.close()

    results[name] = {
        'best_params': grid.best_params_,
        'classification_report': report,
        'conf_matrix': conf_matrix,
        'auc_scores': auc_scores,
        'roc_curve_path': roc_plot_path,
        'conf_matrix_path': conf_matrix_path
    }

    log_memory_usage(f"Después del Entrenamiento de {name}")

# Pick the winning model: the (name, result) pair with the highest mean per-class AUC.
def _mean_auc(entry):
    """Return the mean of the per-class AUC values of a (name, result) pair."""
    _, model_result = entry
    per_class_auc = list(model_result['auc_scores'].values())
    return np.mean(per_class_auc)


best_model_by_auc = max(results.items(), key=_mean_auc)

# Mark the end of the run and express the elapsed time in minutes.
end_time = datetime.now()
elapsed = end_time - start_time
total_duration = elapsed.total_seconds() / 60

# Generate the PDF report.
PAGE_TOP = 750      # y coordinate at which content starts on a page
PAGE_BOTTOM = 40    # lowest y coordinate any content is allowed to reach
LINE_HEIGHT = 20    # vertical step between consecutive text lines


def _ensure_space(pdf, y, needed):
    """Return a y position from which ``needed`` points fit above PAGE_BOTTOM.

    When the current page cannot hold the content, a new page is started and
    PAGE_TOP is returned; otherwise ``y`` is returned unchanged.
    """
    if y - needed < PAGE_BOTTOM:
        pdf.showPage()
        return PAGE_TOP
    return y


c = canvas.Canvas(pdf_file, pagesize=letter)
c.setFont("Helvetica", 12)
c.drawString(100, 750, "Informe de Comparación de Modelos - Detallado")

# Execution times.
c.drawString(100, 730, f"Hora de Inicio: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")
c.drawString(100, 710, f"Hora de Fin: {end_time.strftime('%Y-%m-%d %H:%M:%S')}")
c.drawString(100, 690, f"Duración Total: {total_duration:.2f} minutos")

# Per-model results.
y_position = 660
for model_name, result in results.items():
    # Only dict-valued report entries carry precision/recall/F1 (scalar
    # entries such as the overall accuracy are skipped).
    metric_rows = [
        (cls, metrics)
        for cls, metrics in result['classification_report'].items()
        if isinstance(metrics, dict)
    ]

    # Check the space BEFORE drawing the text section so it is never written
    # below the page edge. Fonts are set after the check, so a freshly
    # started page always gets them applied.
    y_position = _ensure_space(c, y_position, LINE_HEIGHT * (3 + len(metric_rows)))

    c.setFont("Helvetica-Bold", 11)
    c.drawString(100, y_position, f"Modelo: {model_name}")
    y_position -= LINE_HEIGHT

    c.setFont("Helvetica", 10)
    c.drawString(100, y_position, f"Mejores Parámetros: {result['best_params']}")
    y_position -= LINE_HEIGHT

    avg_auc = np.mean(list(result['auc_scores'].values()))
    c.drawString(100, y_position, f"Promedio AUC: {avg_auc:.2f}")
    y_position -= LINE_HEIGHT

    for cls, metrics in metric_rows:
        # Aggregate rows ("macro avg", "weighted avg") are not classes, so
        # only numeric keys get the "Clase" prefix.
        row_label = f"Clase {cls}" if str(cls).isdigit() else str(cls)
        c.drawString(100, y_position, f"{row_label}: Precisión={metrics['precision']:.2f}, Recall={metrics['recall']:.2f}, F1-Score={metrics['f1-score']:.2f}")
        y_position -= LINE_HEIGHT

    # Confusion matrix image (150 pt tall, followed by a 20 pt gap).
    y_position = _ensure_space(c, y_position, 150)
    c.drawImage(result['conf_matrix_path'], 100, y_position - 150, width=300, height=150)
    y_position -= 170

    # ROC curve image (200 pt tall, followed by a 20 pt gap).
    y_position = _ensure_space(c, y_position, 200)
    c.drawImage(result['roc_curve_path'], 100, y_position - 200, width=300, height=200)
    y_position -= 220

    # Remove the temporary plot images once they have been drawn.
    os.remove(result['conf_matrix_path'])
    os.remove(result['roc_curve_path'])

# Best model summary (three lines), moved to a new page if it would not fit.
y_position = _ensure_space(c, y_position, 3 * LINE_HEIGHT)
c.setFont("Helvetica-Bold", 12)
c.drawString(100, y_position, f"Mejor Modelo por AUC: {best_model_by_auc[0]}")
c.drawString(100, y_position - 20, f"Parámetros del Mejor Modelo: {best_model_by_auc[1]['best_params']}")
c.drawString(100, y_position - 40, f"AUC Promedio: {np.mean(list(best_model_by_auc[1]['auc_scores'].values())):.2f}")

c.save()

log_memory_usage("Fin del Proceso")

# Final output: where the report and the memory log were written.
print(f"Informe guardado como: {pdf_file}")
print(f"Consumo de memoria registrado en: {memory_log_file}")


Entrenando RandomForest...
Entrenando XGBoost...
Entrenando LightGBM...


  File "c:\Users\Gonzalo\Documents\GitHub\00_tesisaustral\tesisaustral\tesisaustral\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.221803 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9839
[LightGBM] [Info] Number of data points in the train set: 277330, number of used features: 550
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
Informe guardado como: reporte_modelos_completo.pdf
Consumo de memoria registrado en: memory_log.txt
