In [1]:
import pandas as pd

from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import sys
import os

PROJECT_DIRECTORY = os.path.join(os.getcwd(), '..')
sys.path.append(PROJECT_DIRECTORY)

from src.features_processing import *
from src.model_evaluation import *

# Carregando Artefatos

In [2]:
AUX_VAR_PATH     = PROJECT_DIRECTORY + '/models/artefacts/aux_vars.pickle'
TARGET_VARS_PATH = PROJECT_DIRECTORY + '/models/artefacts/target_vars.pickle'
NUM_VARS_PATH    = PROJECT_DIRECTORY + '/models/artefacts/num_vars.pickle'
CAT_VARS_PATH    = PROJECT_DIRECTORY + '/models/artefacts/cat_vars.pickle'

CAT_IMPUTER_PATH = PROJECT_DIRECTORY + '/models/artefacts/cat_imputer.pickle'
NUM_IMPUTER_PATH = PROJECT_DIRECTORY + '/models/artefacts/num_imputer.pickle'

CAT_ENCONDER_PATH = PROJECT_DIRECTORY + '/models/artefacts/cat_encoder.pickle'
NUM_SCALER_PATH   = PROJECT_DIRECTORY + '/models/artefacts/num_scaler.pickle'

SELECTED_VARS_PATH = PROJECT_DIRECTORY + '/models/artefacts/selected_vars.pickle'


with open(AUX_VAR_PATH,'rb') as f:
    aux_vars = pickle.load(f)

with open(TARGET_VARS_PATH,'rb') as f:
    target_vars = pickle.load(f)

with open(SELECTED_VARS_PATH,'rb') as f:
    selected_vars = pickle.load(f)

# Carregando bases de treino e teste

In [3]:
abt_treino = pd.read_parquet(PROJECT_DIRECTORY + '/data/processed/abt_treino.parquet')
abt_teste = pd.read_parquet(PROJECT_DIRECTORY + '/data/processed/abt_teste.parquet')


abt_treino = tratamento_completo(df = abt_treino,
                                 aux_vars_path    = AUX_VAR_PATH,
                                 target_var_path  = TARGET_VARS_PATH,
                                 num_vars_path    = NUM_VARS_PATH,
                                 cat_vars_path    = CAT_VARS_PATH,
                                 cat_imputer_path = CAT_IMPUTER_PATH,
                                 num_imputer_path = NUM_IMPUTER_PATH,
                                 cat_encoder_path = CAT_ENCONDER_PATH,
                                 num_scaler_path  = NUM_SCALER_PATH)


abt_teste = tratamento_completo(df = abt_teste,
                                 aux_vars_path    = AUX_VAR_PATH,
                                 target_var_path  = TARGET_VARS_PATH,
                                 num_vars_path    = NUM_VARS_PATH,
                                 cat_vars_path    = CAT_VARS_PATH,
                                 cat_imputer_path = CAT_IMPUTER_PATH,
                                 num_imputer_path = NUM_IMPUTER_PATH,
                                 cat_encoder_path = CAT_ENCONDER_PATH,
                                 num_scaler_path  = NUM_SCALER_PATH)

# Preparando variaveis para treino e teste

In [4]:
X_train  = abt_treino[selected_vars]
y_train  = abt_treino[target_vars]

X_test = abt_teste[selected_vars]
y_test = abt_teste[target_vars]

# Definindo Modelos

In [5]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest':       RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42),
    'XGBoost':             XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    'LightGBM':            LGBMClassifier(n_estimators=100, n_jobs=-1, random_state=42)
}

# Cross Validation

In [6]:
# Configura o Stratified K-Fold sobre o treino
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Cross-validation dados de treino
cv_results = {name: {'ks':[], 'auc':[], 'gini':[]} for name in models}

for fold, (idx_tr, idx_val) in enumerate(skf.split(X_train, y_train), 1):
    X_tr, X_val = X_train.iloc[idx_tr], X_train.iloc[idx_val]
    y_tr, y_val = y_train.iloc[idx_tr], y_train.iloc[idx_val]
    
    for name, model in models.items():
        model.fit(X_tr, y_tr)
        y_prob = model.predict_proba(X_val)[:,1]
        ks, auc, gini = performance_metrics(y_val, y_prob)
        cv_results[name]['ks'].append(ks)
        cv_results[name]['auc'].append(auc)
        cv_results[name]['gini'].append(gini)

  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 1852, number of negative: 4716
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000597 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7866
[LightGBM] [Info] Number of data points in the train set: 6568, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.281973 -> initscore=-0.934695
[LightGBM] [Info] Start training from score -0.934695


  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 1853, number of negative: 4716
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000591 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7874
[LightGBM] [Info] Number of data points in the train set: 6569, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.282083 -> initscore=-0.934155
[LightGBM] [Info] Start training from score -0.934155


  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 1853, number of negative: 4716
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000601 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7861
[LightGBM] [Info] Number of data points in the train set: 6569, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.282083 -> initscore=-0.934155
[LightGBM] [Info] Start training from score -0.934155


  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 1853, number of negative: 4716
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001142 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7869
[LightGBM] [Info] Number of data points in the train set: 6569, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.282083 -> initscore=-0.934155
[LightGBM] [Info] Start training from score -0.934155


  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 1853, number of negative: 4716
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000586 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7867
[LightGBM] [Info] Number of data points in the train set: 6569, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.282083 -> initscore=-0.934155
[LightGBM] [Info] Start training from score -0.934155


In [7]:
# Agregando as métricas de CV
cv_summary = []
for name, m in cv_results.items():
    cv_summary.append({
        'Modelo':    name,
        'KS Médio':  np.mean(m['ks']),  'KS Desvio':   np.std(m['ks']),
        'AUC Médio': np.mean(m['auc']), 'AUC Desvio':  np.std(m['auc']),
        'Gini Médio':np.mean(m['gini']),'Gini Desvio': np.std(m['gini']),
    })
    
df_cv = pd.DataFrame(cv_summary).sort_values('KS Médio', ascending=False)

In [None]:
# Avaliação final no Teste
test_summary = []
for name, model in models.items():
    model.fit(X_train, y_train) 
    y_prob_test = model.predict_proba(X_test)[:,1]
    ks_t, auc_t, gini_t = performance_metrics(y_test, y_prob_test)
    test_summary.append({
        'Modelo':    name,
        'KS':       ks_t,
        'AUC':      auc_t,
        'Gini':     gini_t
    })
    
df_test = pd.DataFrame(test_summary).sort_values('AUC', ascending=False)

  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 2316, number of negative: 5895
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001536 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7909
[LightGBM] [Info] Number of data points in the train set: 8211, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.282061 -> initscore=-0.934263
[LightGBM] [Info] Start training from score -0.934263


In [9]:
print("=== Resultados Cross-Validation - Treino ===")
display(df_cv)

print("\n=== Resultados Finais - Teste ===")
display(df_test)

=== Resultados Cross-Validation - Treino ===


Unnamed: 0,Modelo,KS Médio,KS Desvio,AUC Médio,AUC Desvio,Gini Médio,Gini Desvio
3,LightGBM,45.561987,2.370354,0.799063,0.012757,59.812548,2.551342
2,XGBoost,43.578859,2.575198,0.783847,0.012883,56.769418,2.576624
0,Logistic Regression,42.756065,1.405775,0.78218,0.010572,56.436069,2.114394
1,Random Forest,42.402667,0.492965,0.779617,0.006277,55.923376,1.255375



=== Resultados Finais - Teste ===


Unnamed: 0,Modelo,KS,AUC,Gini
3,LightGBM,30.950035,0.702397,40.479398
1,Random Forest,29.082638,0.698118,39.623505
2,XGBoost,28.540263,0.691322,38.264373
0,Logistic Regression,28.707148,0.689662,37.932327
