In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, recall_score, accuracy_score, f1_score, precision_score


In [2]:
# Definiciones globales

%run ../Lib/pandas_options.py
%run ../Lib/define_constantes.py

In [3]:
# Cargar datos ya preprocesados y dividirlos en variables independientes (X) y variable dependiente objetivo (Y)

archivo_datos_codificados = f'{DIRECTORIO_DATOS_PREPROCESADOS}/DATOSCODIFICADOS.csv'
df_datos_codificados = pd.read_csv(archivo_datos_codificados, low_memory=False)

X = df_datos_codificados.drop('DETERIORADO', axis='columns')
Y = df_datos_codificados['DETERIORADO']

# print('\nVariables de entrada (X):')
# print(X.info())
# print('-' * 92)
# print('\nVariable dependiente (Y):')
# print(Y.info())
# print(Y.value_counts().sort_index())

In [4]:
x_train, x_test, y_train, y_test = train_test_split(X, Y)

In [5]:
model = xgb.XGBClassifier(eval_metric='logloss')
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}


In [6]:
# Crear el scorer para optimizar por recall
scorer = make_scorer(recall_score)

In [7]:
# Configurar GridSearchCV para optimizar el recall
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scorer, cv=10, verbose=5)

# Entrenar con la búsqueda de hiperparámetros
grid_search.fit(x_train, y_train)

# Mostrar los mejores hiperparámetros encontrados
print(f'Mejores hiperparámetros: {grid_search.best_params_}')

# Predecir usando el mejor modelo
best_model = grid_search.best_estimator_

Fitting 10 folds for each of 108 candidates, totalling 1080 fits
[CV 1/10] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8;, score=0.512 total time=   0.0s
[CV 2/10] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8;, score=0.506 total time=   0.0s
[CV 3/10] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8;, score=0.522 total time=   0.0s
[CV 4/10] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8;, score=0.526 total time=   0.0s
[CV 5/10] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8;, score=0.536 total time=   0.0s
[CV 6/10] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8;, score=0.524 total time=   0.0s
[CV 7/10] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8;, score=0.507 total time=   0.0s
[CV 

KeyboardInterrupt: 

In [8]:
# Hacer predicciones
y_pred = best_model.predict(x_test)

In [9]:
# Evaluar el modelo con varias métricas
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

# Mostrar los resultados
print(f'Accuracy: {accuracy:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print(f'Precision: {precision:.4f}')

Accuracy: 0.7588
Recall: 0.7755
F1 Score: 0.7419
Precision: 0.7111
