# Entrenamiento ML sin balancear

En este cuaderno se exploran los modelos de machine learning a implementar para definir el de mejor ajuste. Se analiza el escenario con el conjunto de datos sin balancear.

In [1]:
import os
import sys
import json

import numpy as np

from typing import Optional, Any

from sklearn.metrics import (
    mean_squared_error, 
    accuracy_score, 
    recall_score
)

from sklearn.model_selection import(
    StratifiedKFold,
    GridSearchCV,
    RandomizedSearchCV
)

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import(
    RandomForestClassifier, HistGradientBoostingClassifier, AdaBoostClassifier
)
from xgboost import XGBClassifier


home_path = os.path.dirname(os.getcwd())
sys.path.append(home_path)

source_path = os.path.join(os.path.dirname(os.getcwd()),'src')
sys.path.append(source_path)

from src.model_management import ClasificadorMora

path_etiquetas = os.path.join(home_path, 'data', 'etiquetas.csv')
path_info_clientes = os.path.join(home_path, 'data', 'informacion_clientes.csv')
path_hist_transacciones = os.path.join(home_path, 'data', 'historial_transacciones.csv')



In [2]:
def obtener_threshold(
        y_true:np.ndarray, 
        y_proba:np.ndarray,
        metric:Optional[Any]=accuracy_score,
        greater_is_better:Optional[bool]=False,
        **kwargs
    ):
    
    """
    Función que permite hallar el threshold óptimo para maximizar una métrica de clasificación.

    Args:
    ----------
    y_true:np.ndarray
        Arreglo unidimensional con los valores reales de las etiquetas
    y_proba:np.ndarray
        Arreglo unidimensional con los valores predichos para la probabilidad de la etiqueta 1.
    metric:Optional[Any]=accuracy_score
        Métrica a optimizar con la selección del threshold
    greater_is_better:Optional[bool]=False
        Indicador de si la metrica debe maximizarse o minimizarse
    """

    grid = np.arange(0.0,1.01,0.01)

    if greater_is_better:
        t_index = np.argmax(list(map(
            lambda threshold: metric(y_true,np.where(y_proba >= threshold, 1.0, 0.0), **kwargs), grid
        )))
    else:
        t_index = np.argmin(list(map(
            lambda threshold: metric(y_true,np.where(y_proba >= threshold, 1.0, 0.0), **kwargs), grid
        )))

    threshold = grid[t_index]

    return threshold

In [3]:
def evaluar_clasificador(y_true:np.ndarray,y_pred:np.ndarray,verbose:Optional[bool]=True):
    """
    Función que arroja métricas y gráficas de evaluación de la clasificación

    Args:
    ----------
    y_true:np.ndarray
        Arreglo unidimensional con los valores reales de las etiquetas
    y_pred:np.ndarray
        Arreglo unidimensional con los valores predichos para la probabilidad de la etiqueta 1.
    verbose:bool
        Define si deben imprimirse las metricas en pantalla

    Results:
        (float,float,float)
        Valores de las tres métricas
    """
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    accuracy = accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)

    if verbose:
        print(f"Se obtienen las metricas:\n   - RMSE: {rmse}\n   - Accuracy: {accuracy}\n   - recall: {recall}")
    return rmse,accuracy,recall

# Datos desbalanceados

In [4]:
clasificador = ClasificadorMora(
    path_etiquetas,
    path_info_clientes,
    path_hist_transacciones    
)

X_train, X_test, y_train, y_test = clasificador.preprocess()

eval_metrics = {
    "rmse":"neg_root_mean_squared_error",
    "accuracy":"accuracy",
    "recall":"recall"
}


## Regresión Logística



In [53]:

rl_base_model = LogisticRegression()
rl_params = {
    "penalty":['none', "l1", "l2", "elacsticnet"],
    "C":[0.01,0.1,1.,10.,100.],
    "solver":["lbfgs","saga"]
}
rl_skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

rl_tunning = GridSearchCV(
    estimator=rl_base_model,
    param_grid=rl_params,
    scoring=eval_metrics,
    n_jobs=8,
    cv=rl_skf.split(X_train,y_train),
    refit="rmse",
    verbose=1
)

rl_tunning.fit(X_train,y_train)

Fitting 10 folds for each of 40 candidates, totalling 400 fits


150 fits failed out of a total of 400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\User\Python\Environments\tensorflow_3.9\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\User\Python\Environments\tensorflow_3.9\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\User\Python\Environments\tensorflow_3.9\lib\site-packages\sklearn\linear_model\_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports onl

In [92]:
rl_parameters = rl_tunning.best_params_

rl_model = LogisticRegression(**rl_parameters)
rl_model.fit(X_train, y_train)

y_proba = rl_model.predict_proba(X_test)[:,1].reshape(-1,)
rl_threshold = obtener_threshold(y_test, y_proba,mean_squared_error, greater_is_better=False,**{"squared":False})

y_pred = np.where(y_proba > rl_threshold, 1., 0.)
evaluar_clasificador(y_test,y_pred)

Se obtienen las metricas:
   - RMSE: 0.4200694387042853
   - Accuracy: 0.8235416666666666
   - recall: 0.36416184971098264




(0.4200694387042853, 0.8235416666666666, 0.36416184971098264)

## Discriminante cuadrático

In [93]:
dc_model = QuadraticDiscriminantAnalysis()
dc_model.fit(X_train, y_train)

y_proba = dc_model.predict_proba(X_test)[:,1].reshape(-1,)
dc_threshold = obtener_threshold(y_test, y_proba,mean_squared_error, greater_is_better=False,**{"squared":False})

y_pred = np.where(y_proba > dc_threshold, 1., 0.)
evaluar_clasificador(y_test,y_pred)

Se obtienen las metricas:
   - RMSE: 0.46502688094345684
   - Accuracy: 0.78375
   - recall: 0.0




(0.46502688094345684, 0.78375, 0.0)

## Support Vector Machine

In [27]:

svm_base_model = SVC()
svm_params = {
    "C":[0.01,0.1,1.,10.],    
    "kernel":["poly", "rbf", "sigmoid"]
}
svm_skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

svm_tunning = GridSearchCV(
    estimator=svm_base_model,
    param_grid=svm_params,
    scoring=eval_metrics,
    n_jobs=8,
    cv=svm_skf.split(X_train,y_train),
    refit="rmse",
    verbose=1
)

svm_tunning.fit(X_train,y_train)

Fitting 10 folds for each of 12 candidates, totalling 120 fits


In [28]:

svm_parameters = svm_tunning.best_params_
svm_parameters_path = os.path.join(source_path,'parameters','svm_parameters.json')
with open(svm_parameters_path, 'w') as out_file:
    json.dump(svm_parameters, out_file)

In [30]:
svm_model = SVC(random_state=1, probability=True,**svm_parameters)
svm_model.fit(X_train, y_train)

y_proba = svm_model.predict_proba(X_test)[:,1].reshape(-1,)
svm_threshold = obtener_threshold(y_test, y_proba,mean_squared_error, greater_is_better=False,**{"squared":False})

y_pred = np.where(y_proba > svm_threshold, 1., 0.)
evaluar_clasificador(y_test,y_pred)

Se obtienen las metricas:
   - RMSE: 0.4170831252080733
   - Accuracy: 0.8260416666666667
   - recall: 0.3631984585741811


(0.4170831252080733, 0.8260416666666667, 0.3631984585741811)

## K Neighbors Classifier

In [20]:

knn_base_model = KNeighborsClassifier()
knn_params = {
    "n_neighbors":[5,7,9,11,13,15],    
    "weights":["uniform", "distance"],
    "leaf_size":[10,20,30,50]
}
knn_skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

knn_tunning = GridSearchCV(
    estimator=knn_base_model,
    param_grid=knn_params,
    scoring=eval_metrics,
    n_jobs=8,
    cv=knn_skf.split(X_train,y_train),
    refit="rmse",
    verbose=1
)

knn_tunning.fit(X_train,y_train)

knn_parameters = knn_tunning.best_params_

knn_parameters = knn_tunning.best_params_
knn_parameters_path = os.path.join(source_path,'parameters','knn_parameters.json')
with open(knn_parameters_path, 'w') as out_file:
    json.dump(knn_parameters, out_file)

Fitting 10 folds for each of 48 candidates, totalling 480 fits


In [22]:

knn_model = KNeighborsClassifier(**knn_parameters)
knn_model.fit(X_train, y_train)

y_proba = knn_model.predict_proba(X_test)[:,1].reshape(-1,)
knn_threshold = obtener_threshold(y_test, y_proba,mean_squared_error, greater_is_better=False,**{"squared":False})

y_pred = np.where(y_proba > knn_threshold, 1., 0.)
evaluar_clasificador(y_test,y_pred)

Se obtienen las metricas:
   - RMSE: 0.42866070498705616
   - Accuracy: 0.81625
   - recall: 0.2610789980732177


(0.42866070498705616, 0.81625, 0.2610789980732177)

## Random forest

In [25]:
rf_base_model = RandomForestClassifier()
rf_params = {
    "criterion":["gini", "entropy", "log_loss"],    
    "max_features":["sqrt", "log2"]
}
rf_skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

rf_tunning = GridSearchCV(
    estimator=rf_base_model,
    param_grid=rf_params,
    scoring=eval_metrics,
    n_jobs=8,
    cv=rf_skf.split(X_train,y_train),
    refit="rmse",
    verbose=1
)

rf_tunning.fit(X_train,y_train)

rf_parameters = rf_tunning.best_params_

rf_parameters = rf_tunning.best_params_
rf_parameters_path = os.path.join(source_path,'parameters','rf_parameters.json')
with open(rf_parameters_path, 'w') as out_file:
    json.dump(rf_parameters, out_file)

Fitting 10 folds for each of 6 candidates, totalling 60 fits


In [33]:
rf_model = RandomForestClassifier(random_state=101,**rf_parameters)
rf_model.fit(X_train, y_train)

y_proba = rf_model.predict_proba(X_test)[:,1].reshape(-1,)
rf_threshold = obtener_threshold(y_test, y_proba,mean_squared_error, greater_is_better=False,**{"squared":False})

y_pred = np.where(y_proba > rf_threshold, 1., 0.)
evaluar_clasificador(y_test,y_pred)

Se obtienen las metricas:
   - RMSE: 0.41907636535600523
   - Accuracy: 0.824375
   - recall: 0.3786127167630058


(0.41907636535600523, 0.824375, 0.3786127167630058)

## Adaptative boosting

In [12]:
ab_base_model = AdaBoostClassifier()
ab_params = {
    "learning_rate":[0.01,0.1,0.2,0.3],    
    "n_estimators":[10,50,100,250,500,1000]
}
ab_skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

ab_tunning = GridSearchCV(
    estimator=ab_base_model,
    param_grid=ab_params,
    scoring=eval_metrics,
    n_jobs=8,
    cv=ab_skf.split(X_train,y_train),
    refit="rmse",
    verbose=1
)

ab_tunning.fit(X_train,y_train)

ab_parameters = ab_tunning.best_params_

ab_parameters = ab_tunning.best_params_
ab_parameters_path = os.path.join(source_path,'parameters','ab_parameters.json')
with open(ab_parameters_path, 'w') as out_file:
    json.dump(ab_parameters, out_file)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


In [13]:
ab_model = AdaBoostClassifier(random_state=1,**ab_parameters)
ab_model.fit(X_train, y_train)

y_proba = ab_model.predict_proba(X_test)[:,1].reshape(-1,)
ab_threshold = obtener_threshold(y_test, y_proba,mean_squared_error, greater_is_better=False,**{"squared":False})

y_pred = np.where(y_proba > ab_threshold, 1., 0.)
evaluar_clasificador(y_test,y_pred)

Se obtienen las metricas:
   - RMSE: 0.4183300132670378
   - Accuracy: 0.825
   - recall: 0.3092485549132948


(0.4183300132670378, 0.825, 0.3092485549132948)

## Histogram based Gradient Boosting

In [10]:
hb_base_model = HistGradientBoostingClassifier()
hb_params = {
    "learning_rate":[0.01,0.1,0.2,0.3],    
    "max_iter":[10,100,200,500,1000],
    "max_leaf_nodes":[10,20,30,50]
}
hb_skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

hb_tunning = GridSearchCV(
    estimator=hb_base_model,
    param_grid=hb_params,
    scoring=eval_metrics,
    n_jobs=8,
    cv=hb_skf.split(X_train,y_train),
    refit="rmse",
    verbose=1
)

hb_tunning.fit(X_train,y_train)

hb_parameters = hb_tunning.best_params_

hb_parameters = hb_tunning.best_params_
hb_parameters_path = os.path.join(source_path,'parameters','hb_parameters.json')
with open(hb_parameters_path, 'w') as out_file:
    json.dump(hb_parameters, out_file)

Fitting 10 folds for each of 80 candidates, totalling 800 fits


In [11]:
hb_model = HistGradientBoostingClassifier(random_state=1,**hb_parameters)
hb_model.fit(X_train, y_train)

y_proba = hb_model.predict_proba(X_test)[:,1].reshape(-1,)
hb_threshold = obtener_threshold(y_test, y_proba,mean_squared_error, greater_is_better=False,**{"squared":False})

y_pred = np.where(y_proba > hb_threshold, 1., 0.)
evaluar_clasificador(y_test,y_pred)

Se obtienen las metricas:
   - RMSE: 0.41482928215512144
   - Accuracy: 0.8279166666666666
   - recall: 0.3872832369942196


(0.41482928215512144, 0.8279166666666666, 0.3872832369942196)

## Extreeme Gradient Boosting

In [14]:
xgb_base_model = XGBClassifier(objective="binary:logistic")
xgb_params = {
    "max_depth":[3,5,10,15,20],    
    "learning_rate":[0.01,0.1,0.2],
    "colsample_bytree":np.arange(0.4,1.0,0.1),
    "colsample_bylevel":np.arange(0.4,1.0,0.1),
    "subsample":np.arange(0.5,1.0,0.1),
    "n_estimators":[100,500]
}
xgb_skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

xgb_tunning = RandomizedSearchCV(
    estimator=xgb_base_model,
    param_distributions=xgb_params,
    scoring=eval_metrics,
    n_iter=25,
    n_jobs=8,
    cv=xgb_skf.split(X_train,y_train),
    refit="rmse",
    verbose=1
)

xgb_tunning.fit(X_train,y_train)

xgb_parameters = xgb_tunning.best_params_

xgb_parameters = xgb_tunning.best_params_
xgb_parameters_path = os.path.join(source_path,'parameters','xgb_parameters.json')
with open(xgb_parameters_path, 'w') as out_file:
    json.dump(xgb_parameters, out_file)

Fitting 10 folds for each of 25 candidates, totalling 250 fits


In [35]:
xgb_model = XGBClassifier(objective="binary:logistic",random_state=1,**xgb_parameters)
xgb_model.fit(X_train, y_train)

y_proba = xgb_model.predict_proba(X_test)[:,1].reshape(-1,)
xgb_threshold = obtener_threshold(y_test, y_proba,mean_squared_error, greater_is_better=False,**{"squared":False})

y_pred = np.where(y_proba > xgb_threshold, 1., 0.)
evaluar_clasificador(y_test,y_pred)

Se obtienen las metricas:
   - RMSE: 0.4128155358833612
   - Accuracy: 0.8295833333333333
   - recall: 0.3930635838150289


(0.4128155358833612, 0.8295833333333333, 0.3930635838150289)