## Modelado Interbank

In [1]:
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')

> ### Carga de datos

In [2]:
def _read_keyed(path):
    """Read a CSV file (compression is inferred by pandas) indexed by `key_value`."""
    return pd.read_csv(path, index_col = 'key_value')


# Pre-processed feature tables and the training labels, all keyed by `key_value`.
train   = _read_keyed('../input/pre-procesamiento-interbank/train-v20.gz')
y_train = _read_keyed('../input/interbank20/y_train.csv')['target']
test    = _read_keyed('../input/pre-procesamiento-interbank/test-v20.gz')

In [3]:
# Columns that are cast to the pandas categorical dtype in both feature tables.
cat_feats = [
    'sexo',
    'est_cvl',
    'sit_lab',
    'cod_ocu',
    'flg_sin_email',
    'lgr_vot',
    'prv',
    'dto',
    'rgn',
    'tip_lvledu',
    'pred',
    'flag_censo',
]

# Each table is converted on its own, so the category set of a column is
# derived separately from the values present in `train` and in `test`.
for frame in (train, test):
    for col in cat_feats:
        frame[col] = pd.Categorical(frame[col])

In [4]:
# Number of cross-validation folds.
nfolds = 5

# KFold is used with its defaults here (no shuffling requested), and only the
# fitting side of every split is kept: `folds[k]` holds the index labels of the
# rows used to fit fold k. The validation rows are recovered later as
# "everything else".
splitter = KFold(n_splits = nfolds)
folds = [train.index[fit_pos] for fit_pos, _ in splitter.split(train)]

## Entrenamiento de Modelo

In [5]:
# LightGBM settings passed to `lgb.train`, grouped by purpose.
params = dict(
    # Task and evaluation
    objective             = 'binary',
    metric                = 'auc',
    verbose               = -1,

    # Boosting schedule: up to 5000 rounds, stopped early once the validation
    # metric has not improved for 150 rounds.
    n_estimators          = 5000,
    learning_rate         = 0.05,
    early_stopping_rounds = 150,

    # Tree growth constraints
    min_child_samples     = 2430,
    min_split_gain        = 0.750601181,
    min_child_weight      = 0.006158102,

    # Row / column sampling
    # NOTE(review): LightGBM documents that bagging_fraction only takes effect
    # when bagging_freq is set to a non-zero value; no bagging_freq is given
    # here, so row sampling looks inactive - confirm this is intended.
    bagging_fraction      = 0.886441103,
    feature_fraction      = 0.162094841,
)

In [6]:
%%time

# Cross-validated training.
# For every fold a booster is fitted on that fold's rows and used to score
#   * the held-out rows  -> out-of-fold predictions in `train_probs`
#   * the whole test set -> one prediction column per fold in `test_probs`
# Afterwards the out-of-fold predictions are concatenated and scored with AUC,
# and the per-fold test predictions are averaged.
train_probs = []
test_probs  = []

# The boosters are fitted on `train.loc[...].values`, i.e. on the raw column
# values. The test set must be scored in exactly the same representation:
# handing LightGBM the DataFrame instead makes it replace categorical columns
# by their integer category codes, which need not match the raw values the
# model was trained on. Selecting `train.columns` also pins the column order
# to the one used during fitting.
test_matrix = test[train.columns].values

for i, idx in enumerate(folds):
    # Validation rows are the ones whose label is not in this fold's fitting
    # index (boolean mask instead of copying the whole frame with `drop`).
    val_idx = train.index[~train.index.isin(idx)]
    print('-'*53, f'Fold: {i + 1}')

    # Materialise the validation slice once; it feeds both the validation
    # Dataset and the out-of-fold prediction below.
    val_frame  = train.loc[val_idx]
    val_matrix = val_frame.values

    Xt = lgb.Dataset(train.loc[idx].values,
                     label = y_train.loc[idx].values)

    Xv = lgb.Dataset(val_matrix,
                     label = y_train.loc[val_idx].values)

    # NOTE(review): `verbose_eval` is the older lgb.train logging argument;
    # recent LightGBM releases expect a logging callback instead - confirm
    # against the installed version before upgrading.
    learner = lgb.train(params, Xt, valid_sets = [Xv], verbose_eval = 100)

    train_probs.append(pd.Series(learner.predict(val_matrix), index = val_frame.index, name = 'probs'))
    test_probs.append(pd.Series(learner.predict(test_matrix), index = test.index, name = 'fold_' + str(i)))

# Stack the out-of-fold predictions and average the test predictions over folds.
train_probs = pd.concat(train_probs)
test_probs = pd.concat(test_probs, axis = 1).mean(axis = 1)

# Re-align the out-of-fold predictions to the label order before scoring.
score = roc_auc_score(y_train, train_probs.loc[y_train.index])
print(f"\nCV AUC Estimado: {score}")

----------------------------------------------------- Fold: 1
Training until validation scores don't improve for 150 rounds
[100]	valid_0's auc: 0.850003
[200]	valid_0's auc: 0.857281
[300]	valid_0's auc: 0.859894
[400]	valid_0's auc: 0.861249
[500]	valid_0's auc: 0.862022
[600]	valid_0's auc: 0.862452
[700]	valid_0's auc: 0.862629
[800]	valid_0's auc: 0.862827
[900]	valid_0's auc: 0.862832
[1000]	valid_0's auc: 0.862857
Early stopping, best iteration is:
[917]	valid_0's auc: 0.862885
----------------------------------------------------- Fold: 2
Training until validation scores don't improve for 150 rounds
[100]	valid_0's auc: 0.848775
[200]	valid_0's auc: 0.856717
[300]	valid_0's auc: 0.859848
[400]	valid_0's auc: 0.861561
[500]	valid_0's auc: 0.862354
[600]	valid_0's auc: 0.862897
[700]	valid_0's auc: 0.863154
[800]	valid_0's auc: 0.863317
[900]	valid_0's auc: 0.863323
[1000]	valid_0's auc: 0.86343
[1100]	valid_0's auc: 0.863423
[1200]	valid_0's auc: 0.863434
Early stopping, best ite

In [7]:
# Label the averaged test predictions as `target` and write them, together
# with their index, to the submission file.
test_probs = test_probs.rename('target')
test_probs.to_csv('benchmark_t.csv')