In [30]:
from datetime import datetime
import json
import gc
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve
from flaml import AutoML
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import pickle
import re

In [31]:
# Current date formatted as YYYYMMDD; interpolated into the output file names below
fecha_actual = datetime.now().strftime("%Y%m%d")

In [32]:
# Load a parquet file into a DataFrame, returning None if memory runs out
def load_data_in_chunks(file_path, chunk_size=100000):
    """Read a parquet file into a single DataFrame.

    ``pd.read_parquet`` does not accept a ``chunksize`` argument (unlike
    ``pd.read_csv``), so the previous chunked loop could never succeed.
    The file is now read in one call.

    Parameters
    ----------
    file_path : str or path-like
        Location of the parquet file.
    chunk_size : int, default 100000
        Unused. Kept only so existing callers that pass it keep working.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` when a ``MemoryError`` is raised
        while reading.
    """
    try:
        return pd.read_parquet(file_path)
    except MemoryError:
        print(f"Memory error occurred while reading {file_path}.")
        return None

# Downcast column dtypes to reduce the DataFrame's memory footprint
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place and return it.

    Conversion rules, per column:

    * object / pandas string columns -> ``category``
    * signed integers   -> smallest of int8/int16/int32/int64 that holds
      the column's min and max (bounds are inclusive)
    * unsigned integers -> smallest of uint8/uint16/uint32/uint64
    * floats wider than 32 bits -> ``float32`` (this loses precision)
    * anything else (bool, datetime, categorical, nullable extension
      dtypes, ...) is left unchanged, because casting those to float32
      either fails or wastes memory

    Memory usage before and after is printed in MB.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_mem:.2f} MB")

    signed_candidates = (np.int8, np.int16, np.int32, np.int64)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if pd.api.types.is_object_dtype(col_type) or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (categorical, nullable ints, tz-aware datetimes, ...)
        # are not plain numpy dtypes; leave them untouched.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in ('i', 'u'):
            if df[col].empty:
                # min()/max() of an empty column is NaN; nothing to measure.
                continue
            c_min = int(df[col].min())
            c_max = int(df[col].max())
            candidates = signed_candidates if col_type.kind == 'i' else unsigned_candidates
            for candidate in candidates:
                bounds = np.iinfo(candidate)
                # Inclusive comparison: a value equal to the bound still fits.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == 'f' and col_type.itemsize > 4:
            df[col] = df[col].astype(np.float32)
        # bool ('b'), datetime ('M'), timedelta ('m'), complex, etc. are kept as-is.

    end_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory usage after optimization is: {end_mem:.2f} MB")
    print(f"Decreased by {(start_mem - end_mem) / start_mem * 100:.1f}%")
    return df

In [33]:
# Read the control file. The encoding is given explicitly because JSON is
# UTF-8 while open() otherwise falls back to the platform locale encoding.
with open('control_config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)


In [34]:
# Strip every character other than ASCII letters, digits and underscore from column names
def clean_column_names(df):
    """Sanitize the column labels of ``df`` in place and return it.

    Every character outside ``[A-Za-z0-9_]`` is removed from each label.
    Labels are converted with ``str()`` first so that non-string labels
    (for example integer column names) no longer raise ``TypeError`` in
    ``re.sub``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` attribute is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with cleaned column names. Distinct labels
        that differ only in removed characters end up identical; no
        de-duplication is attempted.
    """
    df.columns = [re.sub(r'[^A-Za-z0-9_]', '', str(col)) for col in df.columns]
    return df

In [35]:
# Normalization variants to evaluate; each name is used as a suffix when
# building the data-path keys and the output file names
variantes = ['No_Norm', 'MinMax', 'Std', 'Maxabs', 'Robust']  # List of variants to consider

# Results collected per variant, keyed by variant name
resultados = {}

In [36]:
for variante in variantes:
    # Load the variant-specific datasets. pd.read_parquet never returns None,
    # so memory exhaustion is detected by catching MemoryError here.
    try:
        X_train = pd.read_parquet(config['data_paths'][f'X_train_{variante}'])
        X_test = pd.read_parquet(config['data_paths'][f'X_test_{variante}'])
        y_train = pd.read_parquet(config['data_paths'][f'y_train_{variante}']).values.ravel()
        y_test = pd.read_parquet(config['data_paths'][f'y_test_{variante}']).values.ravel()
    except MemoryError:
        print(f"Skipping variant {variante} due to memory issues.")
        continue

    # Reduce memory usage
    X_train = reduce_mem_usage(X_train)
    X_test = reduce_mem_usage(X_test)

    # Mean imputation followed by PCA keeping 95% of the variance
    reducer = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('pca', PCA(n_components=0.95)),
    ])

    # Fit the preprocessing on the training data and apply it to both splits
    try:
        X_train_pca = reducer.fit_transform(X_train)
        # PCA may keep fewer than 100 components; cap k so SelectKBest never
        # asks for more features than exist (older scikit-learn raises,
        # newer versions only warn).
        selector = SelectKBest(f_classif, k=min(100, X_train_pca.shape[1]))
        X_train_reduced = selector.fit_transform(X_train_pca, y_train)
        X_test_reduced = selector.transform(reducer.transform(X_test))
    except MemoryError:
        print(f"Memory error during feature selection for variant {variante}. Skipping.")
        continue

    # Single fitted pipeline (imputer -> PCA -> selector) for later reuse
    pipeline = Pipeline(reducer.steps + [('selector', selector)])

    # Convert to DataFrames with generated feature names
    X_train_reduced = pd.DataFrame(X_train_reduced, columns=[f"feature_{i}" for i in range(X_train_reduced.shape[1])])
    X_test_reduced = pd.DataFrame(X_test_reduced, columns=[f"feature_{i}" for i in range(X_test_reduced.shape[1])])

    # Clean the column names
    X_train_reduced = clean_column_names(X_train_reduced)
    X_test_reduced = clean_column_names(X_test_reduced)

    # Memory optimization
    X_train_reduced = X_train_reduced.astype('float32')
    X_test_reduced = X_test_reduced.astype('float32')

    # FLAML AutoML configuration
    automl = AutoML()
    automl_settings = {
        "time_budget": 1800,  # maximum search time in seconds
        "metric": 'accuracy',  # evaluation metric
        "task": 'classification',  # classification task
        # One log per variant; a shared name was overwritten on every iteration
        "log_file_name": f"flaml_log_{variante}.log",
        "n_jobs": 1,  # number of parallel jobs
        "mem_thres": 4 * 1024**3,  # memory threshold
        "ensemble": False,  # disable ensembling
        "estimator_list": ["rf", "lgbm", "catboost", "extra_tree", "histgb"],  # models to consider
        "fit_kwargs_by_estimator": {
            "rf": {},
            "lgbm": {},
            "catboost": {},
            "extra_tree": {},
            "histgb": {}
        }
    }

    # Train the model with FLAML
    try:
        automl.fit(X_train=X_train_reduced, y_train=y_train, **automl_settings)
    except MemoryError:
        print(f"Memory error during model training for variant {variante}. Skipping.")
        continue

    # Make predictions
    predictions = automl.predict(X_test_reduced)
    accuracy = accuracy_score(y_test, predictions)
    classification_rep = classification_report(y_test, predictions)
    print(f'FLAML Accuracy for {variante}: {accuracy}')
    print("Classification Report:")
    print(classification_rep)

    # Save the best model with the date
    model_filename = f'best_flaml_model_{variante}_{fecha_actual}.pkl'
    with open(model_filename, 'wb') as f:
        pickle.dump(automl, f)

    # Also save the fitted preprocessing pipeline: the model was trained on
    # its output, so it cannot be applied to new raw data without it.
    pipeline_filename = f'preprocessing_pipeline_{variante}_{fecha_actual}.pkl'
    with open(pipeline_filename, 'wb') as f:
        pickle.dump(pipeline, f)

    # Collect and save the configuration of the best model
    best_model_config = automl.best_config
    ensemble_status = automl_settings.get("ensemble", False)
    best_model_info = {
        "best_estimator": automl.best_estimator,  # best_config alone does not name the learner
        "best_model_config": best_model_config,
        "ensemble": ensemble_status,
        "accuracy": accuracy,
        "classification_report": classification_rep,
        "date": fecha_actual
    }
    config_filename = f'best_flaml_model_config_{variante}_{fecha_actual}.json'
    with open(config_filename, 'w') as f:
        # Fall back to native Python values for numpy scalars json cannot encode
        json.dump(
            best_model_info,
            f,
            default=lambda obj: obj.item() if isinstance(obj, np.generic) else str(obj),
        )

    # Store results
    resultados[variante] = {
        "accuracy": accuracy,
        "classification_report": classification_rep,
        "model_filename": model_filename
    }

    # Compute ROC AUC per class. The one-hot columns are built from
    # automl.classes_ so they line up with the predict_proba columns.
    class_labels = list(automl.classes_)
    y_test_binarized = pd.DataFrame(
        {label: (y_test == label).astype(int) for label in class_labels}
    )
    y_score = automl.predict_proba(X_test_reduced)
    roc_auc = roc_auc_score(y_test_binarized, y_score, average=None)
    print(f'ROC AUC Score for each class ({variante}): {roc_auc}')

    # Create a PDF with the metrics and the ROC curves
    with PdfPages(f'flaml_model_analysis_report_{variante}_{fecha_actual}.pdf') as pdf:
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.text(0.01, 0.05, str(classification_rep), {'fontsize': 10}, fontproperties='monospace')
        ax.axis('off')
        ax.set_title(f'Classification Report ({variante})')
        pdf.savefig(fig)
        plt.close(fig)

        fig, ax = plt.subplots(figsize=(8, 6))
        for i, label in enumerate(class_labels):
            fpr, tpr, _ = roc_curve(y_test_binarized.iloc[:, i], y_score[:, i])
            # Show the real class label, not its positional index
            ax.plot(fpr, tpr, lw=2, label=f'Class {label} (area = {roc_auc[i]:0.2f})')
        ax.plot([0, 1], [0, 1], 'k--', lw=2)
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title(f'Receiver Operating Characteristic ({variante})')
        ax.legend(loc="lower right")
        pdf.savefig(fig)
        plt.close(fig)

Memory usage of dataframe is 1936.25 MB
Memory usage after optimization is: 357.91 MB
Decreased by 81.5%
Memory usage of dataframe is 484.07 MB
Memory usage after optimization is: 89.48 MB
Decreased by 81.5%




[flaml.automl.logger: 08-04 17:06:50] {1680} INFO - task = classification
[flaml.automl.logger: 08-04 17:06:50] {1691} INFO - Evaluation method: holdout
[flaml.automl.logger: 08-04 17:06:51] {1789} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 08-04 17:06:51] {1901} INFO - List of ML learners in AutoML Run: ['rf', 'lgbm', 'catboost', 'extra_tree', 'histgb']
[flaml.automl.logger: 08-04 17:06:51] {2219} INFO - iteration 0, current learner rf
[flaml.automl.logger: 08-04 17:06:51] {2345} INFO - Estimated sufficient time budget=32282s. Estimated necessary time budget=34s.
[flaml.automl.logger: 08-04 17:06:51] {2392} INFO -  at 0.4s,	estimator rf's best error=0.7232,	best estimator rf's best error=0.7232
[flaml.automl.logger: 08-04 17:06:51] {2219} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 08-04 17:06:51] {2392} INFO -  at 0.5s,	estimator lgbm's best error=0.7232,	best estimator rf's best error=0.7232
[flaml.automl.logger: 08-04 17:06:51] {2219} INFO -



[flaml.automl.logger: 08-04 17:41:50] {1680} INFO - task = classification
[flaml.automl.logger: 08-04 17:41:50] {1691} INFO - Evaluation method: holdout
[flaml.automl.logger: 08-04 17:41:51] {1789} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 08-04 17:41:51] {1901} INFO - List of ML learners in AutoML Run: ['rf', 'lgbm', 'catboost', 'extra_tree', 'histgb']
[flaml.automl.logger: 08-04 17:41:51] {2219} INFO - iteration 0, current learner rf
[flaml.automl.logger: 08-04 17:41:51] {2345} INFO - Estimated sufficient time budget=67184s. Estimated necessary time budget=70s.
[flaml.automl.logger: 08-04 17:41:51] {2392} INFO -  at 1.0s,	estimator rf's best error=0.7232,	best estimator rf's best error=0.7232
[flaml.automl.logger: 08-04 17:41:51] {2219} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 08-04 17:41:51] {2392} INFO -  at 1.1s,	estimator lgbm's best error=0.7232,	best estimator rf's best error=0.7232
[flaml.automl.logger: 08-04 17:41:51] {2219} INFO -



[flaml.automl.logger: 08-04 18:13:05] {1680} INFO - task = classification
[flaml.automl.logger: 08-04 18:13:05] {1691} INFO - Evaluation method: holdout
[flaml.automl.logger: 08-04 18:13:06] {1789} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 08-04 18:13:06] {1901} INFO - List of ML learners in AutoML Run: ['rf', 'lgbm', 'catboost', 'extra_tree', 'histgb']
[flaml.automl.logger: 08-04 18:13:06] {2219} INFO - iteration 0, current learner rf
[flaml.automl.logger: 08-04 18:13:06] {2345} INFO - Estimated sufficient time budget=35679s. Estimated necessary time budget=37s.
[flaml.automl.logger: 08-04 18:13:06] {2392} INFO -  at 0.5s,	estimator rf's best error=0.7232,	best estimator rf's best error=0.7232
[flaml.automl.logger: 08-04 18:13:06] {2219} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 08-04 18:13:06] {2392} INFO -  at 0.6s,	estimator lgbm's best error=0.7232,	best estimator rf's best error=0.7232
[flaml.automl.logger: 08-04 18:13:06] {2219} INFO -



[flaml.automl.logger: 08-04 18:44:52] {1680} INFO - task = classification
[flaml.automl.logger: 08-04 18:44:52] {1691} INFO - Evaluation method: holdout
[flaml.automl.logger: 08-04 18:44:52] {1789} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 08-04 18:44:52] {1901} INFO - List of ML learners in AutoML Run: ['rf', 'lgbm', 'catboost', 'extra_tree', 'histgb']
[flaml.automl.logger: 08-04 18:44:52] {2219} INFO - iteration 0, current learner rf
[flaml.automl.logger: 08-04 18:44:53] {2345} INFO - Estimated sufficient time budget=35619s. Estimated necessary time budget=37s.
[flaml.automl.logger: 08-04 18:44:53] {2392} INFO -  at 0.6s,	estimator rf's best error=0.7232,	best estimator rf's best error=0.7232
[flaml.automl.logger: 08-04 18:44:53] {2219} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 08-04 18:44:53] {2392} INFO -  at 0.7s,	estimator lgbm's best error=0.7232,	best estimator rf's best error=0.7232
[flaml.automl.logger: 08-04 18:44:53] {2219} INFO -



[flaml.automl.logger: 08-04 19:16:41] {1680} INFO - task = classification
[flaml.automl.logger: 08-04 19:16:41] {1691} INFO - Evaluation method: holdout
[flaml.automl.logger: 08-04 19:16:42] {1789} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 08-04 19:16:42] {1901} INFO - List of ML learners in AutoML Run: ['rf', 'lgbm', 'catboost', 'extra_tree', 'histgb']
[flaml.automl.logger: 08-04 19:16:42] {2219} INFO - iteration 0, current learner rf
[flaml.automl.logger: 08-04 19:16:42] {2345} INFO - Estimated sufficient time budget=35728s. Estimated necessary time budget=37s.
[flaml.automl.logger: 08-04 19:16:42] {2392} INFO -  at 0.5s,	estimator rf's best error=0.7232,	best estimator rf's best error=0.7232
[flaml.automl.logger: 08-04 19:16:42] {2219} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 08-04 19:16:42] {2392} INFO -  at 0.6s,	estimator lgbm's best error=0.7232,	best estimator rf's best error=0.7232
[flaml.automl.logger: 08-04 19:16:42] {2219} INFO -

In [37]:
# Compare the results across variants: one four-line summary per variant
print("Resultados de todas las variantes:")
for variante, resultado in resultados.items():
    print(
        f"Variante: {variante}",
        f"Accuracy: {resultado['accuracy']}",
        f"Model File: {resultado['model_filename']}",
        f"Classification Report:\n{resultado['classification_report']}",
        sep="\n",
    )

Resultados de todas las variantes:
Variante: No_Norm
Accuracy: 0.31739192492283536
Model File: best_flaml_model_No_Norm_20240804.pkl
Classification Report:
              precision    recall  f1-score   support

           1       0.23      0.12      0.15     14641
           2       0.30      0.23      0.26     19033
           3       0.24      0.20      0.22     22581
           4       0.26      0.27      0.26     24493
           5       0.40      0.59      0.48     31350

    accuracy                           0.32    112098
   macro avg       0.29      0.28      0.27    112098
weighted avg       0.30      0.32      0.30    112098

Variante: MinMax
Accuracy: 0.31501008046530715
Model File: best_flaml_model_MinMax_20240804.pkl
Classification Report:
              precision    recall  f1-score   support

           1       0.22      0.13      0.16     14641
           2       0.29      0.24      0.26     19033
           3       0.24      0.21      0.23     22581
           4       

In [39]:
# Name of the comparison PDF, stamped with the current date
pdf_filename = f'resultados_comparacion_{fecha_actual}.pdf'

# Vertical spacing, in the data coordinates of the default 0-1 axes
HEADER_STEP = 0.03      # after the variant name, accuracy, and model-file lines
REPORT_LINE_STEP = 0.02  # between classification-report lines
REPORT_GAP = 0.05       # extra space after the report
SEPARATOR_STEP = 0.03   # after the separator line
BOTTOM_MARGIN = 0.05    # lowest y position text may reach
PAGE_TOP = 0.95         # starting y position on continuation pages

# Create the PDF and write the results
with PdfPages(pdf_filename) as pdf:
    fig = plt.figure(figsize=(8.5, 11))  # 8.5 x 11 in (US Letter)
    plt.axis('off')

    # Document title
    plt.text(0.5, 0.95, "Comparación de resultados entre variantes", fontsize=16, ha='center')

    y_position = 0.85
    page_has_content = False
    for variante, resultado in resultados.items():
        # Drop the trailing newline so no empty line is drawn after the report
        lines = resultado['classification_report'].rstrip('\n').split('\n')

        # Check whether the whole block fits BEFORE drawing it. Checking only
        # afterwards let a block that started near the bottom run off the page.
        block_height = (
            3 * HEADER_STEP
            + len(lines) * REPORT_LINE_STEP
            + REPORT_GAP
            + SEPARATOR_STEP
        )
        if page_has_content and y_position - block_height < BOTTOM_MARGIN:
            pdf.savefig(fig)  # save the current page
            plt.close(fig)
            fig = plt.figure(figsize=(8.5, 11))  # new figure for the next page
            plt.axis('off')
            y_position = PAGE_TOP
            page_has_content = False

        plt.text(0.5, y_position, f"Variante: {variante}", fontsize=12, ha='center')
        y_position -= HEADER_STEP
        plt.text(0.5, y_position, f"Accuracy: {resultado['accuracy']}", fontsize=10, ha='center')
        y_position -= HEADER_STEP
        plt.text(0.5, y_position, f"Model File: {resultado['model_filename']}", fontsize=10, ha='center')
        y_position -= HEADER_STEP

        # One text call per report line; monospace keeps the columns aligned
        for line in lines:
            plt.text(0.5, y_position, line, fontsize=8, ha='center', family='monospace')
            y_position -= REPORT_LINE_STEP

        y_position -= REPORT_GAP

        # Separator between variants
        plt.text(0.5, y_position, '-'*80, fontsize=10, ha='center')
        y_position -= SEPARATOR_STEP
        page_has_content = True

    pdf.savefig(fig)  # save the last page
    plt.close(fig)