# Set root

In [1]:
import os
import re

# Move to the project root so that relative paths used later in the notebook
# resolve the same way regardless of the folder the kernel was started in.
# The greedy pattern keeps everything up to the last "predict_bad_rate" found
# in the current working directory.
_root_match = re.search('.*predict_bad_rate', os.getcwd())
if _root_match is None:
    # Fail with an actionable message instead of a bare IndexError.
    raise RuntimeError(
        "Could not locate the 'predict_bad_rate' project folder in the "
        f"current working directory: {os.getcwd()!r}"
    )
root = _root_match.group(0)
os.chdir(root)

# Leer YAML

In [2]:
import yaml

# Load the project parameters from params.yaml (path is relative to the
# current working directory). The encoding is pinned to UTF-8 so the file is
# decoded the same way on every platform instead of using the locale default.
with open('params.yaml', 'r', encoding='utf-8') as file:
    yaml_config = yaml.safe_load(file)

## Logistic Regression Optimization

Notebook to optimize a Logistic Regression model with a Weights & Biases hyperparameter sweep

In [7]:
%env WANDB_SILENT=True
import wandb

# Metric the Bayesian search maximizes; the name must match a key logged by
# the training function.
sweep_metric = dict(name='mean_average_precision_val', goal='maximize')

# Search space. Entries with min/max are ranges, entries with `values` are
# categorical choices.
# NOTE(review): 'lbfgs' does not support the 'l1' penalty in scikit-learn, so
# some solver/penalty combinations sampled here are invalid -- confirm how the
# training function should treat them.
# NOTE(review): no distribution is given for 'C', so it is presumably sampled
# uniformly over [1e-4, 1e+2]; consider a log-uniform distribution -- confirm.
sweep_parameters = dict(
    max_iter=dict(min=100, max=1000),
    C=dict(min=1e-4, max=1e+2),
    penalty=dict(values=['l1', 'l2']),
    solver=dict(values=['lbfgs', 'liblinear', 'saga']),
    scaler=dict(values=['standard', 'robust']),
    imputer=dict(values=['simple', 'knn']),
    imputer_strategy=dict(values=['mean', 'median']),
    knn_imputer_k=dict(min=3, max=15),
    pca_components=dict(min=2, max=15),
    variance_threshold=dict(min=0.0, max=0.1),
)

sweep_config = dict(
    name='RL_centralizado',
    method='bayes',
    metric=sweep_metric,
    parameters=sweep_parameters,
)

# Register the sweep in W&B; the returned id is what the agent consumes later.
sweep_id = wandb.sweep(
    sweep_config,
    entity='danipipe777',
    project='project_ICESI_I',
)

env: WANDB_SILENT=True
Create sweep with ID: kr114moy
Sweep URL: https://wandb.ai/danipipe777/project_ICESI_I/sweeps/kr114moy


In [24]:
import sys

# Make the directory two levels above the current working directory importable
# (presumably where the `package` module lives -- confirm).
# The path is resolved to an absolute one so later imports do not depend on the
# working directory at import time, and it is only added once so re-running the
# cell does not pile up duplicate entries.
_extra_path = os.path.abspath('../../')
if _extra_path not in sys.path:
    sys.path.append(_extra_path)

In [31]:
import package.bna.metrics.ks

In [32]:
import os
import pickle as pickle
from copy import deepcopy
from importlib import reload
from time import time

import joblib as jl
import numpy as np
import pandas as pd
import sklearn.metrics as skm
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

import package.bna.metrics.ks as ks_metrics


## Carga de Datos

In [10]:
# Folder holding the processed parquet files, taken from the YAML parameters.
processed_dir = yaml_config['centralizado']['folder_processed']

# Feature matrix, ordered by the `fecha` column.
X = (
    pd.read_parquet(
        processed_dir +
        'buro_centralizado_from_data_preparation_with_external_variables.parquet'
    )
    .sort_values('fecha')
)

# Raw target, reduced to a Series when the file holds a single column.
y = pd.read_parquet(
    processed_dir +
    'maxima_mora_alcanzada_buro_centralizado_with_recent_bureau.parquet'
).squeeze()

# Align the target with the (sorted) features and binarize it:
# 1 when the value is >= 60, 0 otherwise.
y = y.loc[X.index].ge(60).astype(int)

## Definición de tipos de Variables

Variables que sean más del 50% nulas no serán consideradas inicialmente. También se definen qué variables son categóricas, ordinales y qué variables son numéricas.

La única variable que no entra dentro de estas categorías es `fecha`, la fecha de consulta del buró de crédito.

In [16]:
# Fraction of missing values per column of X.
# Computed with isnull().mean() instead of value_counts() + positional column
# renaming: the latter breaks when X has no nulls at all (only one column is
# produced) and can mislabel the columns depending on the order in which
# value_counts() returns True/False.
_fraccion_nula = X.isnull().mean()

porcetaje_de_nulidad = pd.DataFrame({
    'not_null': 1.0 - _fraccion_nula,
    'null': _fraccion_nula,
})

# Columns with more than half of their values missing.
variables_muy_nulas = porcetaje_de_nulidad.query('null > 0.5').index

# Separación de Variables según tipo de datos

In [18]:
# Columns that must be treated as categorical/ordinal regardless of the dtype
# they were stored with.
variables_categoricas_y_ordinales = [
    'riesgo_6', 'riesgo_12', 'ip', 'riesgo_12_bis',
    'peor_sit_bcra_ult', 'peor_sit_bureau_ult', 'sit_irregular'
]

X[variables_categoricas_y_ordinales] = \
    X[variables_categoricas_y_ordinales].astype('category')

ordinales = pd.Index(['ip'])

# Categorical columns, leaving out the ordinal ones, the mostly-null ones and
# 'h3_id'.
# Boolean indexing is used instead of Index.delete(mask): delete() is
# documented for integer positions, and how a boolean array is interpreted
# depends on the NumPy version.
categoricas = X.select_dtypes('category').columns
_excluir_categoricas = (
    categoricas.isin(ordinales) |
    categoricas.isin(variables_muy_nulas) |
    categoricas.isin(['h3_id'])
)
categoricas = categoricas[~_excluir_categoricas]

# Numeric columns, leaving out the mostly-null ones.
numericas = X.select_dtypes('number').columns
numericas = numericas[~numericas.isin(variables_muy_nulas)]

## Split de los datos

In [19]:
# Ordered hold-out: with shuffle=False the last 30% of the rows (X is sorted by
# `fecha`) become the test set. random_state is passed as in the rest of the
# project but is not expected to matter when shuffling is disabled.
split = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=False,
    random_state=42,
)
X_train, X_test, y_train, y_test = split

## Definición de configuraciones del pipeline

In [33]:
# Keyword arguments for OrdinalEncoder: categories not seen during fit are
# encoded as NaN.
# NOTE(review): a NaN produced here reaches the classifier unchanged -- confirm
# that unseen categories cannot occur or that this is handled downstream.
config_ordinal = {
    'handle_unknown': 'use_encoded_value',
    'unknown_value': np.nan,
}

# Keyword arguments for OneHotEncoder: unseen categories are ignored.
config_onehot = {'handle_unknown': 'ignore'}

# KS scorer computed on predicted probabilities; higher is better.
# NOTE(review): `needs_proba` is deprecated in recent scikit-learn releases in
# favour of `response_method` -- confirm against the installed version.
ks_scorer = make_scorer(
    ks_metrics.ks_score,
    greater_is_better=True,
    needs_proba=True,
)

# Metrics evaluated during cross-validation.
scoring = dict(
    roc_auc='roc_auc',
    balanced_accuracy='balanced_accuracy',
    average_precision='average_precision',
    ks=ks_scorer,
)

## Barrido de hiperparámetros

In [None]:

def train():
    """Run one trial of the W&B sweep.

    Reads the sampled hyperparameters from the active W&B run, builds the
    preprocessing + LogisticRegression pipeline, evaluates it with 5-fold
    cross-validation on the training split, refits it on the whole training
    split, scores it on the test split and logs every metric to W&B.

    Uses the module-level names X_train, X_test, y_train, y_test, numericas,
    ordinales, categoricas, config_onehot, config_ordinal and scoring.
    Returns nothing; results are only reported through W&B.
    """
    with wandb.init(tags=['RL', 'external', 'pca']) as run:
        cfg = run.config

        # 'lbfgs' only supports the l2 (or no) penalty in scikit-learn. The
        # error would only surface when the classifier is fitted, i.e. after
        # imputation/scaling/PCA have already run for every CV fold, so the
        # invalid combination is skipped up front and flagged in the run.
        if cfg['solver'] == 'lbfgs' and cfg['penalty'] == 'l1':
            run.log({'skipped_invalid_solver_penalty': 1})
            return

        # --- numeric branch: impute -> scale -> variance filter -> PCA ---
        scaler_cls = StandardScaler if cfg['scaler'] == 'standard' else RobustScaler
        if cfg['imputer'] == 'knn':
            numeric_imputer = KNNImputer(n_neighbors=int(cfg['knn_imputer_k']))
        else:
            # The sweep only offers 'mean' and 'median' for this parameter.
            numeric_imputer = SimpleImputer(strategy=cfg['imputer_strategy'])

        numeric_transformer = Pipeline(
            steps=[
                ("imputer", numeric_imputer),
                ("scaler", scaler_cls()),
                ("select_var", VarianceThreshold(cfg['variance_threshold'])),
                ("pca", PCA(n_components=int(cfg['pca_components']))),
            ]
        )

        # --- categorical branch: most-frequent imputation -> one-hot ---
        categorical_transformer = Pipeline(
            steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('dumm', OneHotEncoder(**config_onehot)),
            ]
        )

        # --- ordinal branch: most-frequent imputation -> ordinal codes ---
        ordinal_transformer = Pipeline(
            steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('encod', OrdinalEncoder(**config_ordinal)),
            ]
        )

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numericas),
                ("ord", ordinal_transformer, ordinales),
                ("cat", categorical_transformer, categoricas),
            ]
        )

        pipeline = Pipeline([
            ('preprocesamiento', preprocessor),
            ('classificador', LogisticRegression(
                C=cfg['C'],
                penalty=cfg['penalty'],
                # max_iter must be an integer for scikit-learn.
                max_iter=int(cfg['max_iter']),
                solver=cfg['solver'],
                random_state=10
            ))
        ])

        # Cross-validation on the training split only.
        resultados_cv = cross_validate(
            pipeline, X_train, y_train,
            scoring=scoring, cv=5, verbose=0, n_jobs=-1,
            return_train_score=True
        )

        # Refit on the full training split and score the held-out test split.
        pipeline.fit(X_train, y_train)
        y_pred_proba_train = pipeline.predict_proba(X_train)[:, 1]
        y_pred_proba_test = pipeline.predict_proba(X_test)[:, 1]

        ks_train = ks_metrics.ks_score(y_train, y_pred_proba_train)
        ks_test = ks_metrics.ks_score(y_test, y_pred_proba_test)

        # nanmean: folds whose scoring failed are reported as NaN by
        # cross_validate and are left out of the averages.
        run.log({
            'mean_balanced_accuracy_val': np.nanmean(resultados_cv['test_balanced_accuracy']),
            'mean_roc_auc_val': np.nanmean(resultados_cv['test_roc_auc']),
            'mean_ks_val': np.nanmean(resultados_cv['test_ks']),
            'mean_ks_train': np.nanmean(resultados_cv['train_ks']),
            'mean_average_precision_val': np.nanmean(resultados_cv['test_average_precision']),
            'mean_average_precision_train': np.nanmean(resultados_cv['train_average_precision']),
            'average_precision_test': skm.average_precision_score(y_test, y_pred_proba_test),
            'roc_auc_test': skm.roc_auc_score(y_test, y_pred_proba_test),
            'ks_train': ks_train,
            'ks_test': ks_test,
        })

        # Model persistence is disabled. To enable it, dump `pipeline` with
        # joblib, e.g.:
        #     model_path = f'../../models/RL_centralizado/{run.name}'
        #     os.mkdir(model_path)
        #     jl.dump(pipeline, os.path.join(model_path, f'{run.name}.joblib'))
        
# Launch the sweep agent in this kernel: it calls train() once per trial, up
# to `count` trials, against the sweep registered above.
count = 100
agent_options = dict(
    function=train,
    count=count,
    entity='danipipe777',
    project='project_ICESI_I',
)
wandb.agent(sweep_id, **agent_options)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016916666666899498, max=1.0…



VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016916666666899498, max=1.0…



VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…



VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016916666666899498, max=1.0…



VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016916666666899498, max=1.0…

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.017183333332650364, max=1.0…

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…



VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016933333332417533, max=1.0…

