# Entrenamiento ML balanceado

En este cuaderno se exploran los modelos de machine learning a implementar para definir el de mejor ajuste. Se analizan dos escenarios: con el conjunto de datos balanceado mediante over sampling y con el conjunto de datos balanceado mediante under sampling.

In [1]:
import os
import sys
import json

import numpy as np

from sklearn.metrics import mean_squared_error

from sklearn.model_selection import(
    StratifiedKFold,
    GridSearchCV,
    RandomizedSearchCV
)

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier


# Project root is taken as the parent of the current working directory
# (assumes the kernel is started from a direct sub-folder of the repository —
# TODO confirm).
home_path = os.path.dirname(os.getcwd())
source_path = os.path.join(home_path, 'src')

# Make the project root and its `src` folder importable. The membership check
# keeps the cell idempotent: re-running it no longer appends duplicate entries
# to sys.path.
for _import_root in (home_path, source_path):
    if _import_root not in sys.path:
        sys.path.append(_import_root)

from src.model_management import ClasificadorMora, obtener_threshold, evaluar_clasificador


# Input CSV files consumed by ClasificadorMora.
path_etiquetas = os.path.join(home_path, 'data', 'etiquetas.csv')
path_info_clientes = os.path.join(home_path, 'data', 'informacion_clientes.csv')
path_hist_transacciones = os.path.join(home_path, 'data', 'historial_transacciones.csv')

## Over sample

In [2]:
# Instantiate the project classifier helper on the three input CSVs, asking
# for the "over_sample" balance strategy.
clasificador = ClasificadorMora(
    path_etiquetas,
    path_info_clientes,
    path_hist_transacciones,
    balance_strategy="over_sample",
)

# preprocess() is unpacked into train/test features and labels
# (presumably the balancing is applied inside it — verify in src.model_management).
X_train, X_test, y_train, y_test = clasificador.preprocess()

# Scorers shared by every hyper-parameter search below; the "rmse" entry is
# the one used for refit.
eval_metrics = dict(
    rmse="neg_root_mean_squared_error",
    accuracy="accuracy",
    recall="recall",
)

### Support Vector Machine

In [12]:

# Grid search over C and kernel for the SVM on the current training split.
svm_os_base_model = SVC()
svm_os_params = {
    "C": [0.01, 0.1, 1., 10.],
    "kernel": ["poly", "rbf", "sigmoid"]
}
# Seeded, shuffled 10-fold stratified splitter.
svm_os_skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

svm_os_tunning = GridSearchCV(
    estimator=svm_os_base_model,
    param_grid=svm_os_params,
    scoring=eval_metrics,
    n_jobs=8,
    # Pass the splitter itself instead of the one-shot generator returned by
    # .split(): the folds are the same (fixed random_state), and the search
    # object can be fitted again without hitting an exhausted generator.
    cv=svm_os_skf,
    refit="rmse",  # best candidate is chosen by the "rmse" scorer
    verbose=1
)

svm_os_tunning.fit(X_train, y_train)

# Persist the winning parameters as JSON; create the target folder if needed
# so a fresh checkout does not fail after the (long) search has finished.
svm_os_parameters = svm_os_tunning.best_params_
svm_os_parameters_path = os.path.join(source_path, 'parameters', 'svm_os_parameters.json')
os.makedirs(os.path.dirname(svm_os_parameters_path), exist_ok=True)
with open(svm_os_parameters_path, 'w', encoding='utf-8') as out_file:
    json.dump(svm_os_parameters, out_file)

Fitting 10 folds for each of 12 candidates, totalling 120 fits


In [13]:
# Refit an SVM with the tuned parameters; probability=True is required for
# predict_proba.
svm_os_model = SVC(random_state=1, probability=True, **svm_os_parameters)
svm_os_model.fit(X_train, y_train)

# Second column of predict_proba (presumably the positive class), flattened to 1-D.
y_proba = svm_os_model.predict_proba(X_test)[:, 1].ravel()

# NOTE(review): the threshold is derived from the same test split that is
# scored below, so the reported metrics are likely optimistic — consider a
# separate validation split.
# NOTE(review): `squared=` is deprecated in recent scikit-learn releases
# (root_mean_squared_error replaces it) — confirm against the installed version.
svm_os_threshold = obtener_threshold(
    y_test,
    y_proba,
    mean_squared_error,
    greater_is_better=False,
    squared=False,
)

# Binarise the probabilities into 1.0 / 0.0 labels using the threshold.
y_pred = (y_proba > svm_os_threshold).astype(float)
evaluar_clasificador(y_test, y_pred)

Se obtienen las metricas:
   - RMSE: 0.42377273783637065
   - Accuracy: 0.8204166666666667
   - recall: 0.3930635838150289


(0.42377273783637065, 0.8204166666666667, 0.3930635838150289)

### Random Forest

In [3]:
# Grid search for the random forest on the current training split.
# The base estimator is seeded (same seed as the final model) so that the
# search result is reproducible across runs.
rf_os_base_model = RandomForestClassifier(random_state=1)
rf_os_params = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_features": ["sqrt", "log2"],
    "n_estimators": [100, 200]
}
# Seeded, shuffled 10-fold stratified splitter.
rf_os_skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

rf_os_tunning = GridSearchCV(
    estimator=rf_os_base_model,
    param_grid=rf_os_params,
    scoring=eval_metrics,
    n_jobs=8,
    # Pass the splitter itself instead of the one-shot generator returned by
    # .split(): the folds are the same (fixed random_state), and the search
    # object can be fitted again without hitting an exhausted generator.
    cv=rf_os_skf,
    refit="rmse",  # best candidate is chosen by the "rmse" scorer
    verbose=1
)

rf_os_tunning.fit(X_train, y_train)

# Persist the winning parameters as JSON; create the target folder if needed
# so a fresh checkout does not fail after the (long) search has finished.
rf_os_parameters = rf_os_tunning.best_params_
rf_os_parameters_path = os.path.join(source_path, 'parameters', 'rf_os_parameters.json')
os.makedirs(os.path.dirname(rf_os_parameters_path), exist_ok=True)
with open(rf_os_parameters_path, 'w', encoding='utf-8') as out_file:
    json.dump(rf_os_parameters, out_file)

Fitting 10 folds for each of 12 candidates, totalling 120 fits


In [4]:
# Refit a seeded random forest with the tuned parameters.
rf_os_model = RandomForestClassifier(random_state=1, **rf_os_parameters)
rf_os_model.fit(X_train, y_train)

# Second column of predict_proba (presumably the positive class), flattened to 1-D.
y_proba = rf_os_model.predict_proba(X_test)[:, 1].ravel()

# NOTE(review): the threshold is derived from the same test split that is
# scored below, so the reported metrics are likely optimistic — consider a
# separate validation split.
# NOTE(review): `squared=` is deprecated in recent scikit-learn releases
# (root_mean_squared_error replaces it) — confirm against the installed version.
rf_os_threshold = obtener_threshold(
    y_test,
    y_proba,
    mean_squared_error,
    greater_is_better=False,
    squared=False,
)

# Binarise the probabilities into 1.0 / 0.0 labels using the threshold.
y_pred = (y_proba > rf_os_threshold).astype(float)
evaluar_clasificador(y_test, y_pred)

Se obtienen las metricas:
   - RMSE: 0.42841763113423176
   - Accuracy: 0.8164583333333333
   - recall: 0.31888246628131023


(0.42841763113423176, 0.8164583333333333, 0.31888246628131023)

### Histogram based boosting

In [5]:
# Grid search for histogram-based gradient boosting on the current training split.
# The base estimator is seeded (same seed as the final model) so that the
# search result is reproducible across runs.
hb_os_base_model = HistGradientBoostingClassifier(random_state=1)
hb_os_params = {
    "learning_rate": [0.01, 0.1, 0.2, 0.3],
    "max_iter": [10, 100, 200, 500, 1000],
    "max_leaf_nodes": [10, 20, 30, 50]
}
# Seeded, shuffled 10-fold stratified splitter.
hb_os_skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

hb_os_tunning = GridSearchCV(
    estimator=hb_os_base_model,
    param_grid=hb_os_params,
    scoring=eval_metrics,
    n_jobs=8,
    # Pass the splitter itself instead of the one-shot generator returned by
    # .split(): the folds are the same (fixed random_state), and the search
    # object can be fitted again without hitting an exhausted generator.
    cv=hb_os_skf,
    refit="rmse",  # best candidate is chosen by the "rmse" scorer
    verbose=1
)

hb_os_tunning.fit(X_train, y_train)

# Persist the winning parameters as JSON; create the target folder if needed
# so a fresh checkout does not fail after the (long) search has finished.
hb_os_parameters = hb_os_tunning.best_params_
hb_os_parameters_path = os.path.join(source_path, 'parameters', 'hb_os_parameters.json')
os.makedirs(os.path.dirname(hb_os_parameters_path), exist_ok=True)
with open(hb_os_parameters_path, 'w', encoding='utf-8') as out_file:
    json.dump(hb_os_parameters, out_file)

Fitting 10 folds for each of 80 candidates, totalling 800 fits


In [6]:
# Refit a seeded histogram-based gradient boosting model with the tuned parameters.
hb_os_model = HistGradientBoostingClassifier(random_state=1, **hb_os_parameters)
hb_os_model.fit(X_train, y_train)

# Second column of predict_proba (presumably the positive class), flattened to 1-D.
y_proba = hb_os_model.predict_proba(X_test)[:, 1].ravel()

# NOTE(review): the threshold is derived from the same test split that is
# scored below, so the reported metrics are likely optimistic — consider a
# separate validation split.
# NOTE(review): `squared=` is deprecated in recent scikit-learn releases
# (root_mean_squared_error replaces it) — confirm against the installed version.
hb_os_threshold = obtener_threshold(
    y_test,
    y_proba,
    mean_squared_error,
    greater_is_better=False,
    squared=False,
)

# Binarise the probabilities into 1.0 / 0.0 labels using the threshold.
y_pred = (y_proba > hb_os_threshold).astype(float)
evaluar_clasificador(y_test, y_pred)

Se obtienen las metricas:
   - RMSE: 0.4235268586524354
   - Accuracy: 0.820625
   - recall: 0.29190751445086704


(0.4235268586524354, 0.820625, 0.29190751445086704)

### Extreme Gradient Boosting

In [7]:
# Randomized search for XGBoost on the current training split.
# Both the base estimator and the sampler are seeded so the 25 sampled
# candidates and their scores are reproducible across runs.
xgb_os_base_model = XGBClassifier(objective="binary:logistic", random_state=1)
xgb_os_params = {
    "max_depth": [3, 5, 10, 15, 20],
    "learning_rate": [0.01, 0.1, 0.2],
    "colsample_bytree": np.arange(0.4, 1.0, 0.1),
    "colsample_bylevel": np.arange(0.4, 1.0, 0.1),
    "subsample": np.arange(0.5, 1.0, 0.1),
    "n_estimators": [100, 500]
}
# Seeded, shuffled 10-fold stratified splitter.
xgb_os_skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

xgb_os_tunning = RandomizedSearchCV(
    estimator=xgb_os_base_model,
    param_distributions=xgb_os_params,
    scoring=eval_metrics,
    n_iter=25,
    n_jobs=8,
    # Pass the splitter itself instead of the one-shot generator returned by
    # .split(): the folds are the same (fixed random_state), and the search
    # object can be fitted again without hitting an exhausted generator.
    cv=xgb_os_skf,
    refit="rmse",  # best candidate is chosen by the "rmse" scorer
    random_state=1,  # fixes which parameter combinations get sampled
    verbose=1
)

xgb_os_tunning.fit(X_train, y_train)

# Persist the winning parameters as JSON; create the target folder if needed
# so a fresh checkout does not fail after the (long) search has finished.
xgb_os_parameters = xgb_os_tunning.best_params_
xgb_os_parameters_path = os.path.join(source_path, 'parameters', 'xgb_os_parameters.json')
os.makedirs(os.path.dirname(xgb_os_parameters_path), exist_ok=True)
with open(xgb_os_parameters_path, 'w', encoding='utf-8') as out_file:
    json.dump(xgb_os_parameters, out_file)

Fitting 10 folds for each of 25 candidates, totalling 250 fits


In [8]:
# Refit a seeded XGBoost classifier with the tuned parameters.
xgb_os_model = XGBClassifier(
    objective="binary:logistic",
    random_state=1,
    **xgb_os_parameters,
)
xgb_os_model.fit(X_train, y_train)

# Second column of predict_proba (presumably the positive class), flattened to 1-D.
y_proba = xgb_os_model.predict_proba(X_test)[:, 1].ravel()

# NOTE(review): the threshold is derived from the same test split that is
# scored below, so the reported metrics are likely optimistic — consider a
# separate validation split.
# NOTE(review): `squared=` is deprecated in recent scikit-learn releases
# (root_mean_squared_error replaces it) — confirm against the installed version.
xgb_os_threshold = obtener_threshold(
    y_test,
    y_proba,
    mean_squared_error,
    greater_is_better=False,
    squared=False,
)

# Binarise the probabilities into 1.0 / 0.0 labels using the threshold.
y_pred = (y_proba > xgb_os_threshold).astype(float)
evaluar_clasificador(y_test, y_pred)

Se obtienen las metricas:
   - RMSE: 0.4306003560921271
   - Accuracy: 0.8145833333333333
   - recall: 0.2861271676300578


(0.4306003560921271, 0.8145833333333333, 0.2861271676300578)

## Under sample

In [9]:
# Re-instantiate the project classifier helper on the same CSVs, this time
# asking for the "under_sample" balance strategy. This rebinds the
# train/test variables used by every cell below.
clasificador = ClasificadorMora(
    path_etiquetas,
    path_info_clientes,
    path_hist_transacciones,
    balance_strategy="under_sample",
)

# preprocess() is unpacked into train/test features and labels
# (presumably the balancing is applied inside it — verify in src.model_management).
X_train, X_test, y_train, y_test = clasificador.preprocess()

# Scorers shared by every hyper-parameter search below; the "rmse" entry is
# the one used for refit.
eval_metrics = dict(
    rmse="neg_root_mean_squared_error",
    accuracy="accuracy",
    recall="recall",
)

### Random Forest

In [14]:
# Grid search for the random forest on the current training split.
# The base estimator is seeded (same seed as the final model) so that the
# search result is reproducible across runs.
rf_us_base_model = RandomForestClassifier(random_state=1)
rf_us_params = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_features": ["sqrt", "log2"],
    "n_estimators": [100, 200, 1000]
}
# Seeded, shuffled 10-fold stratified splitter.
rf_us_skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

rf_us_tunning = GridSearchCV(
    estimator=rf_us_base_model,
    param_grid=rf_us_params,
    scoring=eval_metrics,
    n_jobs=8,
    # Pass the splitter itself instead of the one-shot generator returned by
    # .split(): the folds are the same (fixed random_state), and the search
    # object can be fitted again without hitting an exhausted generator.
    cv=rf_us_skf,
    refit="rmse",  # best candidate is chosen by the "rmse" scorer
    verbose=1
)

rf_us_tunning.fit(X_train, y_train)

# Persist the winning parameters as JSON; create the target folder if needed
# so a fresh checkout does not fail after the (long) search has finished.
rf_us_parameters = rf_us_tunning.best_params_
rf_us_parameters_path = os.path.join(source_path, 'parameters', 'rf_us_parameters.json')
os.makedirs(os.path.dirname(rf_us_parameters_path), exist_ok=True)
with open(rf_us_parameters_path, 'w', encoding='utf-8') as out_file:
    json.dump(rf_us_parameters, out_file)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


In [15]:
# Refit a seeded random forest with the tuned parameters.
rf_us_model = RandomForestClassifier(random_state=1, **rf_us_parameters)
rf_us_model.fit(X_train, y_train)

# Second column of predict_proba (presumably the positive class), flattened to 1-D.
y_proba = rf_us_model.predict_proba(X_test)[:, 1].ravel()

# NOTE(review): the threshold is derived from the same test split that is
# scored below, so the reported metrics are likely optimistic — consider a
# separate validation split.
# NOTE(review): `squared=` is deprecated in recent scikit-learn releases
# (root_mean_squared_error replaces it) — confirm against the installed version.
rf_us_threshold = obtener_threshold(
    y_test,
    y_proba,
    mean_squared_error,
    greater_is_better=False,
    squared=False,
)

# Binarise the probabilities into 1.0 / 0.0 labels using the threshold.
y_pred = (y_proba > rf_us_threshold).astype(float)
evaluar_clasificador(y_test, y_pred)

Se obtienen las metricas:
   - RMSE: 0.42180169116145877
   - Accuracy: 0.8220833333333334
   - recall: 0.35452793834296725


(0.42180169116145877, 0.8220833333333334, 0.35452793834296725)

### Histogram based boosting

In [16]:
# Grid search for histogram-based gradient boosting on the current training split.
# The base estimator is seeded (same seed as the final model) so that the
# search result is reproducible across runs.
hb_us_base_model = HistGradientBoostingClassifier(random_state=1)
hb_us_params = {
    "learning_rate": [0.01, 0.1, 0.2, 0.3],
    "max_iter": [10, 100, 200, 500, 1000],
    "max_leaf_nodes": [10, 20, 30, 50]
}
# Seeded, shuffled 10-fold stratified splitter.
hb_us_skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

hb_us_tunning = GridSearchCV(
    estimator=hb_us_base_model,
    param_grid=hb_us_params,
    scoring=eval_metrics,
    n_jobs=8,
    # Pass the splitter itself instead of the one-shot generator returned by
    # .split(): the folds are the same (fixed random_state), and the search
    # object can be fitted again without hitting an exhausted generator.
    cv=hb_us_skf,
    refit="rmse",  # best candidate is chosen by the "rmse" scorer
    verbose=1
)

hb_us_tunning.fit(X_train, y_train)

# Persist the winning parameters as JSON; create the target folder if needed
# so a fresh checkout does not fail after the (long) search has finished.
hb_us_parameters = hb_us_tunning.best_params_
hb_us_parameters_path = os.path.join(source_path, 'parameters', 'hb_us_parameters.json')
os.makedirs(os.path.dirname(hb_us_parameters_path), exist_ok=True)
with open(hb_us_parameters_path, 'w', encoding='utf-8') as out_file:
    json.dump(hb_us_parameters, out_file)

Fitting 10 folds for each of 80 candidates, totalling 800 fits


In [17]:
# Refit a seeded histogram-based gradient boosting model with the tuned parameters.
hb_us_model = HistGradientBoostingClassifier(random_state=1, **hb_us_parameters)
hb_us_model.fit(X_train, y_train)

# Second column of predict_proba (presumably the positive class), flattened to 1-D.
y_proba = hb_us_model.predict_proba(X_test)[:, 1].ravel()

# NOTE(review): the threshold is derived from the same test split that is
# scored below, so the reported metrics are likely optimistic — consider a
# separate validation split.
# NOTE(review): `squared=` is deprecated in recent scikit-learn releases
# (root_mean_squared_error replaces it) — confirm against the installed version.
hb_us_threshold = obtener_threshold(
    y_test,
    y_proba,
    mean_squared_error,
    greater_is_better=False,
    squared=False,
)

# Binarise the probabilities into 1.0 / 0.0 labels using the threshold.
y_pred = (y_proba > hb_us_threshold).astype(float)
evaluar_clasificador(y_test, y_pred)

Se obtienen las metricas:
   - RMSE: 0.4173328008516305
   - Accuracy: 0.8258333333333333
   - recall: 0.37957610789980734


(0.4173328008516305, 0.8258333333333333, 0.37957610789980734)

### Extreme Gradient Boosting

In [10]:
# Randomized search for XGBoost on the current training split.
# Both the base estimator and the sampler are seeded so the 25 sampled
# candidates and their scores are reproducible across runs.
xgb_us_base_model = XGBClassifier(objective="binary:logistic", random_state=1)
xgb_us_params = {
    "max_depth": [3, 5, 10, 15, 20],
    "learning_rate": [0.01, 0.1, 0.2],
    "colsample_bytree": np.arange(0.4, 1.0, 0.1),
    "colsample_bylevel": np.arange(0.4, 1.0, 0.1),
    "subsample": np.arange(0.5, 1.0, 0.1),
    "n_estimators": [100, 500]
}
# Seeded, shuffled 10-fold stratified splitter.
xgb_us_skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

xgb_us_tunning = RandomizedSearchCV(
    estimator=xgb_us_base_model,
    param_distributions=xgb_us_params,
    scoring=eval_metrics,
    n_iter=25,
    n_jobs=8,
    # Pass the splitter itself instead of the one-shot generator returned by
    # .split(): the folds are the same (fixed random_state), and the search
    # object can be fitted again without hitting an exhausted generator.
    cv=xgb_us_skf,
    refit="rmse",  # best candidate is chosen by the "rmse" scorer
    random_state=1,  # fixes which parameter combinations get sampled
    verbose=1
)

xgb_us_tunning.fit(X_train, y_train)

# Persist the winning parameters as JSON; create the target folder if needed
# so a fresh checkout does not fail after the (long) search has finished.
xgb_us_parameters = xgb_us_tunning.best_params_
xgb_us_parameters_path = os.path.join(source_path, 'parameters', 'xgb_us_parameters.json')
os.makedirs(os.path.dirname(xgb_us_parameters_path), exist_ok=True)
with open(xgb_us_parameters_path, 'w', encoding='utf-8') as out_file:
    json.dump(xgb_us_parameters, out_file)

Fitting 10 folds for each of 25 candidates, totalling 250 fits


In [11]:
# Refit a seeded XGBoost classifier with the tuned parameters.
xgb_us_model = XGBClassifier(
    objective="binary:logistic",
    random_state=1,
    **xgb_us_parameters,
)
xgb_us_model.fit(X_train, y_train)

# Second column of predict_proba (presumably the positive class), flattened to 1-D.
y_proba = xgb_us_model.predict_proba(X_test)[:, 1].ravel()

# NOTE(review): the threshold is derived from the same test split that is
# scored below, so the reported metrics are likely optimistic — consider a
# separate validation split.
# NOTE(review): `squared=` is deprecated in recent scikit-learn releases
# (root_mean_squared_error replaces it) — confirm against the installed version.
xgb_us_threshold = obtener_threshold(
    y_test,
    y_proba,
    mean_squared_error,
    greater_is_better=False,
    squared=False,
)

# Binarise the probabilities into 1.0 / 0.0 labels using the threshold.
y_pred = (y_proba > xgb_us_threshold).astype(float)
evaluar_clasificador(y_test, y_pred)

Se obtienen las metricas:
   - RMSE: 0.4143267631552018
   - Accuracy: 0.8283333333333334
   - recall: 0.3805394990366089


(0.4143267631552018, 0.8283333333333334, 0.3805394990366089)