In [1]:
import dask.dataframe as dd
import gc
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import numpy as np

# Cargar el DataFrame desde el archivo Parquet utilizando Dask
data_dd = dd.read_parquet(r'C:\Users\HOME\OneDrive - Universidad Nacional de Colombia\maestria_big_data\clases\TFM\codigo_TFM\data.parquet', engine='pyarrow')

# Convertir los datos de Dask a pandas para el preprocesamiento
data_pd = data_dd.compute()
X = data_pd['susceptibilidad'].values.reshape(-1, 1)
y = data_pd['inventario'].values

# Liberar memoria
del data_dd, data_pd
gc.collect()

# Dividir los datos en conjuntos de entrenamiento, validación y prueba
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Función para crear, entrenar y evaluar el modelo con diferentes pesos de clases
def train_and_evaluate(class_weight):
    # Construir el modelo secuencial
    model = Sequential([
        Dense(50, activation='relu', input_shape=(1,)),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    
    # Compilar el modelo
    model.compile(optimizer=Adam(learning_rate=0.001), 
                  loss='binary_crossentropy', 
                  metrics=['accuracy'])
    
    # Configurar EarlyStopping
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    
    # Entrenar el modelo con los datos de entrenamiento y validación
    model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=100, 
        batch_size=32, 
        verbose=0,
        callbacks=[early_stopping],
        class_weight=class_weight
    )
    
    # Evaluar el modelo en el conjunto de validación
    val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=0)
    y_val_pred = model.predict(X_val).round()
    roc_auc = roc_auc_score(y_val, y_val_pred)
    
    return val_accuracy, roc_auc, model

# Paso 2: Probar diferentes pesos de clase
weights = [{0: 1, 1: w} for w in range(1, 51)]  # Probar pesos de 1 a 50 para la clase minoritaria
results = []

for weight in weights:
    val_accuracy, roc_auc, model = train_and_evaluate(weight)
    results.append((weight, val_accuracy, roc_auc))
    print(f'Pesos: {weight}, Validación Accuracy: {val_accuracy}, ROC AUC: {roc_auc}')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 610us/step
Pesos: {0: 1, 1: 1}, Validación Accuracy: 0.7455099821090698, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 663us/step
Pesos: {0: 1, 1: 2}, Validación Accuracy: 0.7455099821090698, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 595us/step
Pesos: {0: 1, 1: 3}, Validación Accuracy: 0.7455099821090698, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 611us/step
Pesos: {0: 1, 1: 4}, Validación Accuracy: 0.3568281829357147, ROC AUC: 0.4980368599443167


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 621us/step
Pesos: {0: 1, 1: 5}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 661us/step
Pesos: {0: 1, 1: 6}, Validación Accuracy: 0.26499491930007935, ROC AUC: 0.500467861033773


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 675us/step
Pesos: {0: 1, 1: 7}, Validación Accuracy: 0.275838702917099, ROC AUC: 0.5002859823265949


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 561us/step
Pesos: {0: 1, 1: 8}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 610us/step
Pesos: {0: 1, 1: 9}, Validación Accuracy: 0.256862074136734, ROC AUC: 0.5002753903885728


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 600us/step
Pesos: {0: 1, 1: 10}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 599us/step
Pesos: {0: 1, 1: 11}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 602us/step
Pesos: {0: 1, 1: 12}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 599us/step
Pesos: {0: 1, 1: 13}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 608us/step
Pesos: {0: 1, 1: 14}, Validación Accuracy: 0.2605896294116974, ROC AUC: 0.49926734051567606


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 600us/step
Pesos: {0: 1, 1: 15}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 601us/step
Pesos: {0: 1, 1: 16}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 516us/step
Pesos: {0: 1, 1: 17}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 607us/step
Pesos: {0: 1, 1: 18}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 577us/step
Pesos: {0: 1, 1: 19}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 603us/step
Pesos: {0: 1, 1: 20}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 479us/step
Pesos: {0: 1, 1: 21}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 457us/step
Pesos: {0: 1, 1: 22}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 606us/step
Pesos: {0: 1, 1: 23}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 602us/step
Pesos: {0: 1, 1: 24}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 709us/step
Pesos: {0: 1, 1: 25}, Validación Accuracy: 0.2575398087501526, ROC AUC: 0.5007299358431183


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 603us/step
Pesos: {0: 1, 1: 26}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 602us/step
Pesos: {0: 1, 1: 27}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 580us/step
Pesos: {0: 1, 1: 28}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 578us/step
Pesos: {0: 1, 1: 29}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 463us/step
Pesos: {0: 1, 1: 30}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 587us/step
Pesos: {0: 1, 1: 31}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 576us/step
Pesos: {0: 1, 1: 32}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 608us/step
Pesos: {0: 1, 1: 33}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 606us/step
Pesos: {0: 1, 1: 34}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 597us/step
Pesos: {0: 1, 1: 35}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 502us/step
Pesos: {0: 1, 1: 36}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 592us/step
Pesos: {0: 1, 1: 37}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 609us/step
Pesos: {0: 1, 1: 38}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 609us/step
Pesos: {0: 1, 1: 39}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 686us/step
Pesos: {0: 1, 1: 40}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 721us/step
Pesos: {0: 1, 1: 41}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 562us/step
Pesos: {0: 1, 1: 42}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 566us/step
Pesos: {0: 1, 1: 43}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 573us/step
Pesos: {0: 1, 1: 44}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 584us/step
Pesos: {0: 1, 1: 45}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 701us/step
Pesos: {0: 1, 1: 46}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 633us/step
Pesos: {0: 1, 1: 47}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 610us/step
Pesos: {0: 1, 1: 48}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 600us/step
Pesos: {0: 1, 1: 49}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 466us/step
Pesos: {0: 1, 1: 50}, Validación Accuracy: 0.2544900178909302, ROC AUC: 0.5


In [2]:
# Paso 3: Seleccionar el mejor modelo basado en ROC AUC
best_weight = max(results, key=lambda x: x[2])  # Seleccionar por mejor ROC AUC
print(f'Mejor configuración de pesos: {best_weight[0]}, Accuracy: {best_weight[1]}, ROC AUC: {best_weight[2]}')

# Reentrenar el mejor modelo completo con los datos de entrenamiento y validación
best_model = Sequential([
    Dense(50, activation='relu', input_shape=(1,)),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

best_model.compile(optimizer=Adam(learning_rate=0.001), 
                   loss='binary_crossentropy', 
                   metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

best_model.fit(
    np.concatenate((X_train, X_val)), 
    np.concatenate((y_train, y_val)),
    epochs=100, 
    batch_size=32, 
    verbose=1,
    callbacks=[early_stopping],
    class_weight=best_weight[0]
)




Mejor configuración de pesos: {0: 1, 1: 25}, Accuracy: 0.2575398087501526, ROC AUC: 0.5007299358431183
Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m523/523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 492us/step - accuracy: 0.3216 - loss: 85.6259
Epoch 2/100
[1m303/523[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m0s[0m 510us/step - accuracy: 0.3024 - loss: 47.8522

  current = self.get_monitor_value(logs)


[1m523/523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 508us/step - accuracy: 0.3051 - loss: 44.8261
Epoch 3/100
[1m523/523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 495us/step - accuracy: 0.3115 - loss: 26.3444
Epoch 4/100
[1m523/523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 482us/step - accuracy: 0.3049 - loss: 11.2205
Epoch 5/100
[1m523/523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 485us/step - accuracy: 0.2798 - loss: 4.6968
Epoch 6/100
[1m523/523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 511us/step - accuracy: 0.2522 - loss: 3.4234
Epoch 7/100
[1m523/523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 507us/step - accuracy: 0.2525 - loss: 2.7466
Epoch 8/100
[1m523/523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 497us/step - accuracy: 0.2495 - loss: 2.6461
Epoch 9/100
[1m523/523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 481us/step - accuracy: 0.2593 - loss: 2.8888
Epoch 10/100
[1m523/523

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [3]:
# Evaluar el modelo en el conjunto de prueba
test_loss, test_accuracy = best_model.evaluate(X_test, y_test, verbose=0)
print(f'Loss en el conjunto de prueba: {test_loss}')
print(f'Accuracy en el conjunto de prueba: {test_accuracy}')



Loss en el conjunto de prueba: 1.714834451675415
Accuracy en el conjunto de prueba: 0.24695122241973877
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 387us/step
[[   0 2223]
 [   0  729]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      2223
         1.0       0.25      1.00      0.40       729

    accuracy                           0.25      2952
   macro avg       0.12      0.50      0.20      2952
weighted avg       0.06      0.25      0.10      2952



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
# Predecir y mostrar métricas adicionales en el conjunto de prueba
y_test_pred = best_model.predict(X_test).round()
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 397us/step
[[   0 2223]
 [   0  729]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      2223
         1.0       0.25      1.00      0.40       729

    accuracy                           0.25      2952
   macro avg       0.12      0.50      0.20      2952
weighted avg       0.06      0.25      0.10      2952



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
