In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import psutil
import pyarrow.parquet as pq
from datetime import datetime
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_curve, auc, log_loss,
    accuracy_score, f1_score, precision_score, recall_score
)
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from collections import defaultdict
from imblearn.over_sampling import SMOTE
import warnings

warnings.filterwarnings('ignore')

def train_and_evaluate_model(X_train_path, y_train_path, X_test_path, y_test_path):
    """Entrena y evalúa el modelo con mejoras para optimizar el aprendizaje"""
    start_time = datetime.now()
    process = psutil.Process(os.getpid())
    mem_before = process.memory_info().rss / (1024 ** 3)
    
    try:
        # 1. Cálculo eficiente de medianas en batches (para imputación de NaN si es necesario)
        print("Calculando medianas...")
        median_values = None
        total_rows = 0
        parquet_file = pq.ParquetFile(X_train_path)
        
        for batch in parquet_file.iter_batches(batch_size=1000):
            chunk = batch.to_pandas().astype('float32')
            if median_values is None:
                median_values = chunk.median() * len(chunk)
                total_rows = len(chunk)
            else:
                median_values += chunk.median() * len(chunk)
                total_rows += len(chunk)
        
        median_values /= total_rows

        # 2. Cálculo mejorado de pesos de clase
        print("Calculando pesos de clase...")
        class_counts = defaultdict(int)
        y_parquet = pq.ParquetFile(y_train_path)
        
        for batch in y_parquet.iter_batches(batch_size=1000):
            chunk = batch.to_pandas().squeeze()
            counts = chunk.value_counts()
            for cls, count in counts.items():
                class_counts[cls] += count
        
        total_samples = sum(class_counts.values())
        class_weights = {cls: total_samples / (len(class_counts) * count * 0.5)  # Ajuste para clases minoritarias
                        for cls, count in class_counts.items()}

        # 3. Cargar y procesar datos de test
        X_test = pq.read_table(X_test_path).to_pandas().astype('float32').fillna(median_values)
        y_test = pq.read_table(y_test_path).to_pandas().squeeze()

        # 4. Verificar que las columnas de entrenamiento y prueba coincidan
        X_sample = pq.read_table(X_train_path).to_pandas().sample(n=5000, random_state=42)
        y_sample = pq.read_table(y_train_path).to_pandas().squeeze().loc[X_sample.index]

        # Asegurarse de que X_test tenga las mismas columnas que X_sample
        missing_columns = set(X_sample.columns) - set(X_test.columns)
        if missing_columns:
            print(f"Advertencia: Columnas faltantes en X_test: {missing_columns}. Rellenando con 0.")
            for col in missing_columns:
                X_test[col] = 0  # Rellenar con 0 o NaN según sea necesario

        # 5. Selección de características
        print("Selección inicial de características...")
        selector = SelectKBest(f_classif, k=min(100, X_sample.shape[1]))
        selector.fit(X_sample.fillna(median_values), y_sample)
        selected_features = selector.get_support(indices=True)

        # 6. Configuración optimizada del modelo
        model = SGDClassifier(
            loss='log_loss',
            penalty='elasticnet',
            alpha=0.0001,          # Reducción de regularización
            l1_ratio=0.3,          # Balance L1/L2
            learning_rate='adaptive', # Tasa de aprendizaje adaptable
            eta0=0.1,              # Tasa de aprendizaje inicial
            power_t=0.2,
            class_weight=class_weights,
            max_iter=1000,
            tol=1e-5,
            random_state=42,
            warm_start=True
        )

        # 7. Entrenamiento con monitoreo mejorado
        mem_tracking = []
        best_loss = float('inf')
        patience = 5
        no_improvement = 0
        batch_size = 1000
        smote = SMOTE(sampling_strategy='auto', k_neighbors=3, random_state=42)
        
        # 8. Ciclo de entrenamiento mejorado
        for epoch in range(50):  # Máximo de épocas reducido
            epoch_loss = 0.0
            batch_count = 0
            X_file = pq.ParquetFile(X_train_path)
            y_file = pq.ParquetFile(y_train_path)
            
            for X_batch, y_batch in zip(X_file.iter_batches(), y_file.iter_batches()):
                # Procesamiento del batch
                X_train = X_batch.to_pandas().astype('float32').fillna(median_values).iloc[:, selected_features]
                y_train = y_batch.to_pandas().squeeze()
                
                # Aplicar SMOTE en cada batch
                try:
                    X_res, y_res = smote.fit_resample(X_train, y_train)
                except ValueError:
                    X_res, y_res = X_train, y_train
                
                # Entrenamiento y cálculo de métricas
                model.partial_fit(X_res, y_res, classes=np.unique(y_sample))
                y_proba_train = model.predict_proba(X_res)
                batch_loss = log_loss(y_res, y_proba_train)
                
                epoch_loss += batch_loss
                batch_count += 1
                mem_tracking.append(process.memory_info().rss / (1024 ** 3))
            
            # Cálculo de métricas de evaluación
            y_pred_test = model.predict(X_test.iloc[:, selected_features])
            test_acc = accuracy_score(y_test, y_pred_test)
            test_f1 = f1_score(y_test, y_pred_test, average='macro')
            test_precision = precision_score(y_test, y_pred_test, average='macro')
            test_recall = recall_score(y_test, y_pred_test, average='macro')
            
            avg_epoch_loss = epoch_loss / batch_count
            print(f"Época {epoch+1} - Loss: {avg_epoch_loss:.4f} | Test Acc: {test_acc:.2f} | Test F1: {test_f1:.2f}")
            
            # Early stopping con mejora en métricas de test
            if test_f1 > best_loss:
                best_loss = test_f1
                no_improvement = 0
            else:
                no_improvement += 1
                
            if no_improvement >= patience:
                print(f"✅ Mejor modelo en época {epoch+1} - F1: {test_f1:.2f}")
                break

        # 9. Evaluación final
        y_pred = model.predict(X_test.iloc[:, selected_features])
        y_proba = model.predict_proba(X_test.iloc[:, selected_features])
        
        # Verificar que y_proba no contenga NaN o infinitos
        if np.isnan(y_proba).any() or np.isinf(y_proba).any():
            raise ValueError("Las probabilidades predichas contienen NaN o infinitos. Revisa el entrenamiento del modelo.")
        
        # Verificar la distribución de clases en el conjunto de prueba
        class_distribution_test = y_test.value_counts()
        print("Distribución de clases en el conjunto de prueba:", class_distribution_test.to_dict())
        
        # Filtrar clases con menos de 2 muestras
        valid_classes = class_distribution_test[class_distribution_test >= 2].index
        
        # Calcular la matriz de confusión
        conf_matrix = confusion_matrix(y_test, y_pred)
        
        # Generar curvas ROC para todas las clases
        plt.figure(figsize=(10, 8))
        auc_scores = {}
        for i in valid_classes:
            try:
                # Ajustar el índice para coincidir con la columna de y_proba
                col_idx = i - 1  # Restar 1 para convertir de índice 1-5 a índice 0-4
                fpr, tpr, _ = roc_curve((y_test == i).astype(int), y_proba[:, col_idx])
                auc_value = auc(fpr, tpr)
                auc_scores[f"Clase {i}"] = auc_value
                plt.plot(fpr, tpr, label=f'Clase {i} (AUC = {auc_value:.2f})')
            except ValueError as e:
                print(f"Error calculando AUC para la clase {i}: {str(e)}")
                auc_scores[f"Clase {i}"] = np.nan

        # Verificar si se pudo calcular al menos un AUC válido
        if not auc_scores or all(np.isnan(value) for value in auc_scores.values()):
            print("No se pudo calcular el AUC para ninguna clase. Generando gráfico vacío.")
            plt.text(0.5, 0.5, 'No se pudo calcular el AUC para ninguna clase', ha='center', va='center')
        
        plt.title('Curvas ROC')
        plt.xlabel('Tasa de Falsos Positivos')
        plt.ylabel('Tasa de Verdaderos Positivos')
        plt.legend()
        roc_plot_path = os.path.join("graficos", f"roc_curve_{os.path.basename(X_train_path)}.png")
        plt.savefig(roc_plot_path)
        plt.close()

        # Generar matriz de confusión detallada
        plt.figure(figsize=(10, 8))
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=np.unique(y_test), yticklabels=np.unique(y_test))
        plt.title('Matriz de Confusión')
        conf_matrix_path = os.path.join("graficos", f"conf_matrix_{os.path.basename(X_train_path)}.png")
        plt.savefig(conf_matrix_path)
        plt.close()

        # Resultados finales
        end_time = datetime.now()
        return {
            'model_name': 'SGDClassifier-Optimizado',
            'dataset': os.path.basename(X_train_path),
            'start_time': start_time,
            'end_time': end_time,
            'duration': (end_time - start_time).total_seconds() / 60,
            'avg_memory': np.mean(mem_tracking),
            'test_accuracy': test_acc,
            'test_precision': test_precision,
            'test_recall': test_recall,
            'test_f1': test_f1,
            'classification_report': classification_report(y_test, y_pred, output_dict=True, zero_division=0),
            'conf_matrix_path': conf_matrix_path,
            'roc_curve_path': roc_plot_path,
            'auc_scores': auc_scores,
            'selected_features': len(selected_features)
        }

    except Exception as e:
        print(f"Error procesando {X_train_path}: {str(e)}")
        return None
    
def generate_pdf_report(results, output_path):
    """Genera reporte PDF completo con análisis comparativo"""
    csv_file = os.path.join(output_path, f"metricas_completas_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
    pdf_file = os.path.join(output_path, f"reporte_completo_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pdf")
    
    # Preparar datos para CSV
    csv_data = []
    best_models = {
        'global': {'score': -1, 'model': None},
        'class1_recall': {'score': -1, 'model': None},
        'fastest': {'score': float('inf'), 'model': None},
        'memory': {'score': float('inf'), 'model': None}
    }
    
    for result in results:
        if not result:
            continue
        
        # Datos para CSV
        model_data = {
            'dataset': result['dataset'],
            'inicio': result['start_time'].strftime('%Y-%m-%d %H:%M:%S'),
            'fin': result['end_time'].strftime('%Y-%m-%d %H:%M:%S'),
            'duracion_min': round(result['duration'], 2),
            'memoria_promedio_gb': round(result['avg_memory'], 2),
            'auc_promedio': round(np.mean(list(result['auc_scores'].values())), 3),
            'test_accuracy': round(result['test_accuracy'], 3),
            'test_precision': round(result['test_precision'], 3),
            'test_recall': round(result['test_recall'], 3),
            'test_f1': round(result['test_f1'], 3)
        }
        
        # Métricas por clase
        for cls, metrics in result['classification_report'].items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    if metric in ['precision', 'recall', 'f1-score', 'support']:
                        model_data[f'clase{cls}_{metric}'] = round(value, 3)
        
        csv_data.append(model_data)
        
        # Actualizar mejores modelos
        current_auc = model_data['auc_promedio']
        current_recall = model_data.get('clase1_recall', 0)
        
        # Mejor modelo global
        if current_auc > best_models['global']['score']:
            best_models['global']['score'] = current_auc
            best_models['global']['model'] = result
            
        # Mejor recall para clase 1
        if current_recall > best_models['class1_recall']['score']:
            best_models['class1_recall']['score'] = current_recall
            best_models['class1_recall']['model'] = result
            
        # Modelo más rápido
        if result['duration'] < best_models['fastest']['score']:
            best_models['fastest']['score'] = result['duration']
            best_models['fastest']['model'] = result
            
        # Menor uso de memoria
        if result['avg_memory'] < best_models['memory']['score']:
            best_models['memory']['score'] = result['avg_memory']
            best_models['memory']['model'] = result
    
    # Generar CSV
    pd.DataFrame(csv_data).to_csv(csv_file, index=False)
    
    # Generar PDF
    c = canvas.Canvas(pdf_file, pagesize=letter)
    y_position = 750
    c.setFont("Helvetica-Bold", 16)
    c.drawString(100, y_position, "Reporte Comparativo de Modelos")
    y_position -= 40
    
    # Tabla resumen
    headers = ['Dataset', 'Duración (min)', 'Memoria (GB)', 'AUC Prom', 'Recall Clase 1']
    col_widths = [120, 80, 80, 80, 100]
    
    # Encabezados de tabla
    c.setFont("Helvetica-Bold", 10)
    x_pos = 50
    for header, width in zip(headers, col_widths):
        c.drawString(x_pos, y_position, header)
        x_pos += width
    y_position -= 20
    
    # Filas de datos
    c.setFont("Helvetica", 9)
    for result in results:
        if not result:
            continue
        
        row_data = [
            result['dataset'],
            f"{result['duration']:.2f}",
            f"{result['avg_memory']:.2f}",
            f"{np.mean(list(result['auc_scores'].values())):.2f}",
            f"{result['classification_report'].get('1', {}).get('recall', 0):.2f}"
        ]
        
        x_pos = 50
        for data, width in zip(row_data, col_widths):
            c.drawString(x_pos, y_position, str(data))
            x_pos += width
        y_position -= 15
        
        if y_position < 100:
            c.showPage()
            y_position = 750
    
    # Sección de mejores modelos
    c.setFont("Helvetica-Bold", 14)
    y_position -= 30
    c.drawString(100, y_position, "Mejores Modelos:")
    y_position -= 30
    
    best_categories = [
        ('global', 'Mejor Modelo Global (AUC Promedio)'),
        ('class1_recall', 'Mejor Recall para Clase 1'),
        ('fastest', 'Modelo Más Rápido'),
        ('memory', 'Menor Uso de Memoria')
    ]
    
    for category, title in best_categories:
        model = best_models[category]['model']
        c.setFont("Helvetica-Bold", 12)
        c.drawString(120, y_position, title)
        y_position -= 20
        
        if model:
            c.setFont("Helvetica", 10)
            details = [
                f"Dataset: {model['dataset']}",
                f"Duración: {model['duration']:.2f} min",
                f"Memoria: {model['avg_memory']:.2f} GB",
                f"AUC Prom: {np.mean(list(model['auc_scores'].values())):.2f}",
                f"Recall Clase 1: {model['classification_report'].get('1', {}).get('recall', 0):.2f}"
            ]
            for detail in details:
                c.drawString(140, y_position, detail)
                y_position -= 15
        else:
            c.drawString(140, y_position, "N/A")
            y_position -= 15
        
        y_position -= 10
        
        if y_position < 100:
            c.showPage()
            y_position = 750
    
    # Gráficos comparativos
    c.showPage()
    y_position = 750
    c.setFont("Helvetica-Bold", 14)
    c.drawString(100, y_position, "Comparativa de Métricas")
    y_position -= 30
    
    # Generar y agregar gráficos comparativos
    metrics = ['duration', 'avg_memory', 'auc_promedio']
    for metric in metrics:
        plt.figure(figsize=(10, 6))
        values = [r[metric] for r in csv_data if metric in r]
        labels = [r['dataset'] for r in csv_data if metric in r]
        plt.barh(labels, values)
        plt.title(f'Comparación de {metric.replace("_", " ").title()}')
        plot_path = os.path.join("graficos", f"comparativa_{metric}.png")
        plt.savefig(plot_path, bbox_inches='tight')
        plt.close()
        
        c.drawImage(plot_path, 50, y_position-250, width=500, height=200)
        y_position -= 270
        os.remove(plot_path)
        
        if y_position < 100:
            c.showPage()
            y_position = 750
    
    c.save()
    
    print(f"\nReportes generados:\nCSV: {csv_file}\nPDF: {pdf_file}")
    return csv_file, pdf_file

def main():
    input_root = r"C:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\outputs\experiments\train_test_splits_20250224_221733"
    output_root = os.path.join(input_root, "reportes_modelos")
    os.makedirs(output_root, exist_ok=True)
    
    train_files = []
    for root, _, files in os.walk(input_root):
        for file in files:
            if file.startswith("X_train") and file.endswith(".parquet"):
                base = file.replace("X_train_", "").replace(".parquet", "")
                required = {
                    'X_train': file,
                    'y_train': f"y_train_{base}.parquet",
                    'X_test': f"X_test_{base}.parquet",
                    'y_test': f"y_test_{base}.parquet"
                }
                
                paths = {}
                for k, v in required.items():
                    full_path = os.path.join(root, v)
                    if os.path.exists(full_path):
                        paths[k] = full_path
                    else:
                        break
                
                if len(paths) == 4:
                    train_files.append(paths)
    
    results = []
    for dataset in train_files:
        print(f"\nProcesando: {dataset['X_train']}")
        result = train_and_evaluate_model(
            dataset['X_train'],
            dataset['y_train'],
            dataset['X_test'],
            dataset['y_test']
        )
        if result:
            results.append(result)
            print(f"✅ Procesado correctamente")
    
    if results:
        generate_pdf_report(results, output_root)
    else:
        print("\n⚠️ No se encontraron resultados válidos")

if __name__ == "__main__":
    main()


Procesando: C:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\outputs\experiments\train_test_splits_20250224_221733\MaxAbs_full_full\X_train_MaxAbs_full_full.parquet
Calculando medianas...
Calculando pesos de clase...
Selección inicial de características...
Época 1 - Loss: 0.8992 | Test Acc: 0.74 | Test F1: 0.68
Época 2 - Loss: 0.8934 | Test Acc: 0.74 | Test F1: 0.69
Época 3 - Loss: 0.8925 | Test Acc: 0.74 | Test F1: 0.69
Época 4 - Loss: 0.8921 | Test Acc: 0.74 | Test F1: 0.69
Época 5 - Loss: 0.8920 | Test Acc: 0.74 | Test F1: 0.69
✅ Mejor modelo en época 5 - F1: 0.69
Distribución de clases en el conjunto de prueba: {4: 31091, 1: 31091, 5: 31090, 3: 31090, 2: 31090}
✅ Procesado correctamente

Procesando: C:\Users\Administrador\Documents\PythonScripts\Tesis\tesisaustral\outputs\experiments\train_test_splits_20250224_221733\MaxAbs_Linear_selected\X_train_MaxAbs_Linear_selected.parquet
Calculando medianas...
Calculando pesos de clase...
Selección inicial de característica