In [1]:
import dask.dataframe as dd
import gc
from sklearn.model_selection import train_test_split
from keras.models import Sequential, Model
from keras.layers import Dense, Lambda, Input
from keras.optimizers import Adam
from keras import backend as K
from keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from keras.losses import BinaryCrossentropy
from keras.regularizers import l2
from keras.layers import Dropout

# Cargar el DataFrame desde el archivo Parquet utilizando Dask
data_dd = dd.read_parquet(r'C:\Users\HOME\OneDrive - Universidad Nacional de Colombia\maestria_big_data\clases\TFM\codigo_TFM\data.parquet', engine='pyarrow')

# Convertir los datos de Dask a pandas para el preprocesamiento
data_pd = data_dd.compute()
X = data_pd['susceptibilidad'].values.reshape(-1, 1)
y = data_pd['inventario'].values

# Liberar memoria
del data_dd, data_pd
gc.collect()

# Dividir los datos en conjuntos de entrenamiento, validación y prueba
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [2]:
# Aplicar SMOTE para balancear el conjunto de entrenamiento
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [3]:
# Estandarizar los datos
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)


In [4]:
from keras.layers import Activation

# Definir la función de activación Kolmogorov
def kolmogorov_activation(x):
    # Aplicar softplus para manejar adecuadamente los valores
    return K.softplus(x)

In [5]:
# Ajustar la función de pérdida con pesos de clase
class_weights = {0: 1, 1: 10}  # Aumentar el peso para la clase minoritaria

In [6]:
# Construir y compilar el modelo con regularización y dropout para mayor estabilidad
model = Sequential([
    Dense(50, activation='relu', input_shape=(1,), kernel_regularizer=l2(0.01)),  # Añadir regularización L2
    Dropout(0.5),  # Añadir Dropout para reducir sobreajuste
    Dense(50, activation='softplus', kernel_regularizer=l2(0.01)),  # Cambiar activación para mayor estabilidad
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compilar el modelo
model.compile(optimizer=Adam(learning_rate=0.001), 
              loss=BinaryCrossentropy(from_logits=False), 
              metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [7]:
# Configurar EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Entrenar el modelo
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100, 
    batch_size=32,  # Incrementar el tamaño del batch
    verbose=1,
    callbacks=[early_stopping], 
    class_weight=class_weights
)

Epoch 1/100
[1m431/431[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 937us/step - accuracy: 0.4217 - loss: 44.1353 - val_accuracy: 0.7384 - val_loss: 0.8796
Epoch 2/100
[1m431/431[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 685us/step - accuracy: 0.3568 - loss: 13.2339 - val_accuracy: 0.7306 - val_loss: 0.9043
Epoch 3/100
[1m431/431[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 686us/step - accuracy: 0.3162 - loss: 4.1413 - val_accuracy: 0.2545 - val_loss: 0.9592
Epoch 4/100
[1m431/431[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 717us/step - accuracy: 0.2766 - loss: 3.1163 - val_accuracy: 0.2545 - val_loss: 1.0342
Epoch 5/100
[1m431/431[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 715us/step - accuracy: 0.2648 - loss: 2.7063 - val_accuracy: 0.2545 - val_loss: 1.0549
Epoch 6/100
[1m431/431[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 681us/step - accuracy: 0.2580 - loss: 2.2281 - val_accuracy: 0.2545 - val_loss: 1.1156
Epoch 7/

In [8]:
# Evaluar el modelo en el conjunto de prueba
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)

print(f'Loss en el conjunto de prueba: {test_loss}')
print(f'Accuracy en el conjunto de prueba: {test_accuracy}')

# Predecir y mostrar métricas adicionales
y_test_pred = model.predict(X_test).round()
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))


Loss en el conjunto de prueba: 0.8788866400718689
Accuracy en el conjunto de prueba: 0.7432249188423157
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 498us/step
[[2179   44]
 [ 714   15]]
              precision    recall  f1-score   support

         0.0       0.75      0.98      0.85      2223
         1.0       0.25      0.02      0.04       729

    accuracy                           0.74      2952
   macro avg       0.50      0.50      0.44      2952
weighted avg       0.63      0.74      0.65      2952

