In [1]:
import dask.dataframe as dd
import gc
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Cargar el DataFrame desde el archivo Parquet utilizando Dask
data_dd = dd.read_parquet(r'C:\Users\HOME\OneDrive - Universidad Nacional de Colombia\maestria_big_data\clases\TFM\codigo_TFM\data.parquet', engine='pyarrow')

# Separar las características y la etiqueta
X = data_dd['susceptibilidad'].compute().values.reshape(-1, 1)
y = data_dd['inventario'].compute()

# Dividir los datos en conjuntos de entrenamiento, validación y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# Aplicar SMOTE para balancear el conjunto de entrenamiento
smote = SMOTE(random_state=42, n_jobs=-1)  # Usar todos los núcleos disponibles
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Liberar memoria
gc.collect()



76

In [4]:
# Definir los parámetros a explorar en la búsqueda de hiperparámetros
param_grid = {
    'boosting_type': ['gbdt', 'dart'],
    'num_leaves': [31, 50, 100],
    'max_depth': [-1, 10, 20],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'min_child_samples': [20, 50, 100],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'scale_pos_weight': [1, len(y_train_resampled) / sum(y_train_resampled)],  # Ajustar para el desbalance de clases
}

# Crear el modelo LightGBM
lgb_model = lgb.LGBMClassifier(random_state=42)

# Configurar la búsqueda de hiperparámetros usando RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=param_grid,
    n_iter=50,  # Aumentar el número de combinaciones a probar
    scoring='roc_auc',  # Optimizar para el AUC-ROC
    cv=3,  # Validación cruzada con 3 divisiones
    verbose=2,
    random_state=42,
    n_jobs=-1  # Usar todos los núcleos disponibles
)

# Ejecutar la búsqueda de hiperparámetros
random_search.fit(X_train_resampled, y_train_resampled)

# Mostrar los mejores hiperparámetros encontrados
best_params = random_search.best_params_
print("Best parameters found: ", best_params)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[LightGBM] [Info] Number of positive: 8858, number of negative: 8858
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000041 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 17716, number of used features: 1
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Best parameters found:  {'subsample': 1.0, 'scale_pos_weight': 1, 'num_leaves': 100, 'n_estimators': 300, 'min_child_samples': 20, 'max_depth': 20, 'learning_rate': 0.1, 'colsample_bytree': 0.6, 'boosting_type': 'gbdt'}


In [6]:
# Ajustar los mejores parámetros obtenidos
best_params = random_search.best_params_
best_params['objective'] = 'binary'  # Ajustar la función de pérdida a un problema binario
best_params['metric'] = 'auc'  # Usar AUC como métrica


In [7]:
# Crear los datasets de entrenamiento y validación
dtrain = lgb.Dataset(X_train, label=y_train)
dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

# Entrenar el modelo utilizando el conjunto de validación para early stopping
model = lgb.train(
    params=best_params,  # Parámetros del modelo
    train_set=dtrain,    # Conjunto de entrenamiento
    valid_sets=[dtrain, dval],   # Conjunto de validación para el early stopping
    num_boost_round=1000,  # Número máximo de iteraciones
    valid_names=['train', 'valid'],  # Nombres de los conjuntos para visualizar
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),  # Detener si no mejora en 50 iteraciones
        lgb.log_evaluation(period=10)  # Mostrar resultados cada 10 iteraciones
    ]
)



[LightGBM] [Info] Number of positive: 2947, number of negative: 8858
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000062 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 11805, number of used features: 1
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.249640 -> initscore=-1.100533
[LightGBM] [Info] Start training from score -1.100533
Training until validation scores don't improve for 50 rounds
[10]	train's auc: 0.591055	valid's auc: 0.486066
[20]	train's auc: 0.59438	valid's auc: 0.488194
[30]	train's auc: 0.595304	valid's auc: 0.488946
[40]	train's auc: 0.595743	valid's auc: 0.489071
[50]	train's auc: 0.595863	valid's auc: 0.489524
[60]	train's auc: 0.595945	valid's auc: 0.489917
[70]	train's auc: 0.595995	valid's auc: 0.490427
[80]	train's auc: 0.596027	valid's auc: 0.49039
[90]	train's auc: 0.596032	valid's auc: 0.490341
[100]	trai

In [8]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Predecir probabilidades en el conjunto de validación
y_val_pred_proba = model.predict(X_val)
y_val_pred = (y_val_pred_proba > 0.5).astype(int)

# Calcular métricas
val_accuracy = accuracy_score(y_val, y_val_pred)
val_roc_auc = roc_auc_score(y_val, y_val_pred_proba)

print(f'Validation Accuracy: {val_accuracy}')
print(f'Validation ROC AUC: {val_roc_auc}')
print(classification_report(y_val, y_val_pred))


Validation Accuracy: 0.7440914866581957
Validation ROC AUC: 0.49053551369918763
              precision    recall  f1-score   support

         0.0       0.74      1.00      0.85      2928
         1.0       0.00      0.00      0.00      1007

    accuracy                           0.74      3935
   macro avg       0.37      0.50      0.43      3935
weighted avg       0.55      0.74      0.63      3935



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
# Predecir probabilidades en el conjunto de prueba
y_test_pred_proba = model.predict(X_test)
y_test_pred = (y_test_pred_proba > 0.5).astype(int)

# Calcular métricas
test_accuracy = accuracy_score(y_test, y_test_pred)
test_roc_auc = roc_auc_score(y_test, y_test_pred_proba)

print(f'Test Accuracy: {test_accuracy}')
print(f'Test ROC AUC: {test_roc_auc}')
print(classification_report(y_test, y_test_pred))


Test Accuracy: 0.7507621951219512
Test ROC AUC: 0.5000976247518416
              precision    recall  f1-score   support

         0.0       0.75      1.00      0.86      2955
         1.0       0.00      0.00      0.00       981

    accuracy                           0.75      3936
   macro avg       0.38      0.50      0.43      3936
weighted avg       0.56      0.75      0.64      3936



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
