In [17]:
import json
import pickle
import re
import traceback
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve
from tpot import TPOTClassifier

In [18]:
# Read the control file.
# Opened in binary mode so that json.load detects the encoding itself
# (UTF-8/16/32, with or without a BOM) instead of depending on the platform's
# locale encoding, which is not UTF-8 on a default Windows setup.
with open('control_config.json', 'rb') as f:
    config = json.load(f)

In [19]:
# Helper to sanitize column names
def clean_column_names(df):
    """Strip every character other than ASCII letters, digits and '_' from the column names.

    The frame is modified in place and also returned, so both
    ``clean_column_names(df)`` and ``df = clean_column_names(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are rewritten.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with sanitized column names.

    Notes
    -----
    * Labels are converted with ``str()`` first, so non-string labels
      (e.g. integers) no longer raise ``TypeError`` inside ``re.sub``.
    * If stripping makes two labels identical (``'a b'`` and ``'ab'``), later
      occurrences get a numeric suffix (``ab``, ``ab_1``, ...) so the result
      never contains duplicate column names. The suffixing depends only on
      column order, so frames with the same original columns in the same
      order end up with the same cleaned names.
    """
    used = set()
    cleaned = []
    for col in df.columns:
        base = re.sub(r'[^A-Za-z0-9_]', '', str(col))
        name = base
        suffix = 1
        # Keep bumping the suffix until the name is not already taken.
        while name in used:
            name = f'{base}_{suffix}'
            suffix += 1
        used.add(name)
        cleaned.append(name)
    df.columns = cleaned
    return df

In [20]:
# Helper to reduce memory usage
def reduce_mem_usage(df):
    """Downcast numeric columns to narrower dtypes and report the memory saved.

    Signed integer columns (int8/16/32/64) are cast to the narrowest signed
    integer type whose inclusive range contains the column's min and max.
    float64 columns whose min and max lie strictly inside +/-3.4e38 are cast to
    float32 (this is lossy: precision drops to about 7 significant digits).
    Every other dtype (object/string, bool, unsigned integers, float16/float32,
    datetimes, categoricals, nullable extension types, ...) is left untouched.

    The frame is modified in place and also returned. Two lines are printed:
    the memory footprint after optimization in MB and the percentage saved.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # (target dtype, inclusive lower bound, inclusive upper bound), narrowest first.
    signed_int_ranges = (
        ('int8', -128, 127),
        ('int16', -32768, 32767),
        ('int32', -2147483648, 2147483647),
    )
    signed_int_names = {'int8', 'int16', 'int32', 'int64'}
    float32_limit = 3.4e38

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        dtype_name = str(df[col].dtype)
        if dtype_name in signed_int_names:
            c_min = df[col].min()
            c_max = df[col].max()
            # An empty column yields NaN for min/max; every comparison is then
            # False and the column keeps its dtype.
            for target, low, high in signed_int_ranges:
                if c_min >= low and c_max <= high:
                    df[col] = df[col].astype(target)
                    break
        elif dtype_name == 'float64':
            c_min = df[col].min()
            c_max = df[col].max()
            # All-NaN columns and columns containing +/-inf fail this test and
            # therefore stay float64.
            if c_min > -float32_limit and c_max < float32_limit:
                df[col] = df[col].astype('float32')

    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage after optimization is: {end_mem:.2f} MB')
    # Guard against a zero-byte starting footprint (nothing to compare against).
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print(f'Decreased by {decrease:.1f}%')
    return df

In [21]:
# Normalization variants; each name is used as a suffix in the data-path keys
# and in the output file names.
variantes = [
    "No_Norm",
    "MinMax",
    "Std",
    "Maxabs",
    "Robust",
]

In [22]:
# Current date as YYYYMMDD, used as a suffix in output file names
fecha_actual = format(datetime.now(), "%Y%m%d")

In [23]:
# Train one TPOT classifier per normalization variant and write its artifacts.
#
# Reads `config` (data paths and TPOT settings), `variantes` and `fecha_actual`
# from earlier cells. For each variant it:
#   1. loads the train/test splits from parquet,
#   2. fits TPOT and prints the test accuracy,
#   3. exports the best pipeline as a .py file and writes a JSON summary,
#   4. records the outcome in `resultados`,
#   5. builds a PDF with the classification report and per-class ROC curves.
# A failure in one variant is reported (with traceback) and the loop moves on.
resultados = {}
for variante in variantes:
    try:
        # Load the variant-specific datasets. read_parquet either returns a
        # DataFrame or raises, so no None check is needed afterwards.
        data_paths = config['data_paths']
        X_train = pd.read_parquet(data_paths[f'X_train_{variante}'])
        X_test = pd.read_parquet(data_paths[f'X_test_{variante}'])
        y_train = pd.read_parquet(data_paths[f'y_train_{variante}']).values.ravel()
        y_test = pd.read_parquet(data_paths[f'y_test_{variante}']).values.ravel()

        # Reduce memory usage
        X_train = reduce_mem_usage(X_train)
        X_test = reduce_mem_usage(X_test)

        # Clean the column names
        X_train = clean_column_names(X_train)
        X_test = clean_column_names(X_test)

        # TPOT configuration. `random_state` is optional in the control file;
        # when absent it is None, which is TPOT's own default (unseeded run).
        tpot_config = config['tpot_config']
        tpot = TPOTClassifier(
            generations=tpot_config['generations'],
            population_size=tpot_config['population_size'],
            verbosity=tpot_config['verbosity'],
            random_state=tpot_config.get('random_state'),
        )
        tpot.fit(X_train, y_train)
        tpot_predictions = tpot.predict(X_test)
        tpot_accuracy = accuracy_score(y_test, tpot_predictions)
        print(f'TPOT Accuracy for {variante}: {tpot_accuracy}')

        # Save the best TPOT model, tagged with the date
        model_filename = f'best_tpot_model_{variante}_{fecha_actual}.py'
        tpot.export(model_filename)

        # Save the configuration of the best model
        best_pipeline = str(tpot.fitted_pipeline_)
        best_model_info = {
            "best_pipeline": best_pipeline,
            "accuracy": tpot_accuracy,
            "date": fecha_actual
        }
        config_filename = f'best_tpot_model_config_{variante}_{fecha_actual}.json'
        with open(config_filename, 'w', encoding='utf-8') as summary_file:
            json.dump(best_model_info, summary_file)

        # Store results
        resultados[variante] = {
            "accuracy": tpot_accuracy,
            "model_filename": model_filename,
            "config_filename": config_filename
        }

        # Compute the per-class ROC AUC. The one-hot matrix is built from the
        # fitted pipeline's class order so that column i of y_test_binarized
        # and column i of y_score always describe the same class, even when
        # y_test does not contain every class seen during training.
        y_score = tpot.predict_proba(X_test)
        model_classes = tpot.fitted_pipeline_.classes_
        y_test_binarized = pd.get_dummies(pd.Categorical(y_test, categories=model_classes))
        roc_auc = roc_auc_score(y_test_binarized, y_score, average=None)
        print(f'ROC AUC Score for each class ({variante}): {roc_auc}')

        # Create a PDF with the metrics and the ROC curves
        with PdfPages(f'tpot_model_analysis_report_{variante}_{fecha_actual}.pdf') as pdf:
            # Page 1: classification report rendered as monospace text
            plt.figure(figsize=(8, 6))
            plt.text(0.01, 0.05, str(classification_report(y_test, tpot_predictions)), {'fontsize': 10}, fontproperties='monospace')
            plt.axis('off')
            plt.title(f'Classification Report ({variante})')
            pdf.savefig()
            plt.close()

            # Page 2: one-vs-rest ROC curve per class plus the chance diagonal
            plt.figure(figsize=(8, 6))
            for i in range(y_test_binarized.shape[1]):
                fpr, tpr, _ = roc_curve(y_test_binarized.iloc[:, i], y_score[:, i])
                plt.plot(fpr, tpr, lw=2, label=f'Class {i} (area = {roc_auc[i]:0.2f})')
            plt.plot([0, 1], [0, 1], 'k--', lw=2)
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title(f'Receiver Operating Characteristic ({variante})')
            plt.legend(loc="lower right")
            pdf.savefig()
            plt.close()
    except Exception as e:
        # Best effort: report the failure and carry on with the next variant.
        # The traceback is printed so the failing step can be identified, and
        # any figure left open by a failure mid-report is closed.
        print(f"An error occurred for variant {variante}: {e}")
        traceback.print_exc()
        plt.close('all')

Memory usage after optimization is: 357.91 MB
Decreased by 81.5%
Memory usage after optimization is: 89.48 MB
Decreased by 81.5%
Imputing missing values in feature set


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]

  File "c:\Users\Gonzalo\Documents\GitHub\00_tesisaustral\tesisaustral\tesisaustral\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")



Generation 1 - Current best internal CV score: 0.41639161163325317

Generation 2 - Current best internal CV score: 0.41639161163325317

Generation 3 - Current best internal CV score: 0.41639161163325317

Generation 4 - Current best internal CV score: 0.49604584072970975

Generation 5 - Current best internal CV score: 0.49604584072970975

Best pipeline: LogisticRegression(StandardScaler(input_matrix), C=15.0, dual=False, penalty=l2)
Imputing missing values in feature set
TPOT Accuracy for No_Norm: 0.499045478063837
Imputing missing values in feature set
ROC AUC Score for each class (No_Norm): [0.81481766 0.82657166 0.72844401 0.73584721 0.93602141]
Memory usage after optimization is: 417.78 MB
Decreased by 78.4%
Memory usage after optimization is: 104.45 MB
Decreased by 78.4%
Imputing missing values in feature set


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.42652792249851

Generation 2 - Current best internal CV score: 0.426666190682132

Generation 3 - Current best internal CV score: 0.426666190682132

Generation 4 - Current best internal CV score: 0.42666842088351925

Generation 5 - Current best internal CV score: 0.4269315863880559

Best pipeline: DecisionTreeClassifier(input_matrix, criterion=entropy, max_depth=10, min_samples_leaf=10, min_samples_split=17)
Imputing missing values in feature set
TPOT Accuracy for MinMax: 0.4311941337044372
Imputing missing values in feature set
ROC AUC Score for each class (MinMax): [0.74392204 0.75796809 0.64603862 0.65424188 0.85119567]
Memory usage after optimization is: 417.78 MB
Decreased by 78.4%
Memory usage after optimization is: 104.45 MB
Decreased by 78.4%
Imputing missing values in feature set


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.4225269202699756

Generation 2 - Current best internal CV score: 0.4225269202699756

Generation 3 - Current best internal CV score: 0.4270207945927589

Generation 4 - Current best internal CV score: 0.42702079479171307

Generation 5 - Current best internal CV score: 0.4270475572829671

Best pipeline: DecisionTreeClassifier(RobustScaler(input_matrix), criterion=entropy, max_depth=10, min_samples_leaf=7, min_samples_split=17)
Imputing missing values in feature set
TPOT Accuracy for Std: 0.43113168834412746
Imputing missing values in feature set
ROC AUC Score for each class (Std): [0.74388409 0.75761667 0.64613983 0.65426399 0.85099667]
Memory usage after optimization is: 417.78 MB
Decreased by 78.4%
Memory usage after optimization is: 104.45 MB
Decreased by 78.4%
Imputing missing values in feature set


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.40281631535713414

Generation 2 - Current best internal CV score: 0.4960681427435816

Generation 3 - Current best internal CV score: 0.4960681427435816

Generation 4 - Current best internal CV score: 0.4960681427435816

Generation 5 - Current best internal CV score: 0.4960681427435816

Best pipeline: LogisticRegression(StandardScaler(input_matrix), C=10.0, dual=False, penalty=l2)
Imputing missing values in feature set
TPOT Accuracy for Maxabs: 0.49903655729807844
Imputing missing values in feature set
ROC AUC Score for each class (Maxabs): [0.81481792 0.82657161 0.72844401 0.73584731 0.93602145]
Memory usage after optimization is: 417.78 MB
Decreased by 78.4%
Memory usage after optimization is: 104.45 MB
Decreased by 78.4%
Imputing missing values in feature set


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.496094905359182

Generation 2 - Current best internal CV score: 0.496094905359182

Generation 3 - Current best internal CV score: 0.496094905359182

Generation 4 - Current best internal CV score: 0.496094905359182

Generation 5 - Current best internal CV score: 0.496094905359182

Best pipeline: LogisticRegression(StandardScaler(input_matrix), C=1.0, dual=False, penalty=l2)
Imputing missing values in feature set
TPOT Accuracy for Robust: 0.49903655729807844
Imputing missing values in feature set
ROC AUC Score for each class (Robust): [0.81482215 0.8265695  0.72844534 0.73585092 0.93602271]
