---
# Notebook para selecionarmos o melhor modelo, para que no próximo passo façamos o tunning do melhor modelo.

Vamos testar 3 abordagens de modelos:

- __Logistic Regression__

- __Random Forest__

- __Boosting__

---

# Imports

In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import pickle
import warnings
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score,
    recall_score,
    precision_score
)
from sklearn.metrics import make_scorer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from src.utils.features_manager import get_features_by_property

# Configs Pandas
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 200

warnings.filterwarnings("ignore")

In [2]:
project_root = Path().resolve().parents[0]

In [3]:
df_train = pd.read_parquet(os.path.join(project_root, "data", "processed", "train.parquet"))

In [4]:
features_config_path = os.path.join(project_root, "src", "data", "config", "features.yaml")

# Load Encoders

In [5]:
def load_encoder(encoder):
    with open(os.path.join(project_root, "models", "encoders", f"{encoder}.pkl"), "rb") as f:
        encoder = pickle.load(f)
    return encoder

In [6]:
# feature_type = load_encoder("feature_type")
fill_numeric = load_encoder("fill_numeric")
string_encoder = load_encoder("string_encoder")
selector = load_encoder("selector")

## Gerando selector_2 com features selecionadas pelo Boruta + RFECV

In [7]:
selection_boruta = get_features_by_property(features_config_path, "selected_by_boruta")
selection_rfecv = get_features_by_property(features_config_path, "selected_by_rfecv")

## Aplicando encoders que não dependem da amostragem

In [8]:
df_train = fill_numeric.transform(df_train)

# Separando conjunto de validação

In [9]:
df_train, df_val = train_test_split(df_train, test_size=0.1, random_state=911, stratify=df_train["fraude"])
print(f"Shape train: {df_train.shape} | # {df_train['fraude'].sum()} fraudes | Bad rate: {df_train['fraude'].mean():.2%}")
print(f"Shape val: {df_val.shape} | # {df_val['fraude'].sum()} fraudes | Bad rate: {df_val['fraude'].mean():.2%}")

Shape train: (103055, 38) | # 5336 fraudes | Bad rate: 5.18%
Shape val: (11451, 38) | # 593 fraudes | Bad rate: 5.18%


# Baseline

In [10]:
def generate_model_metrics(y_test, y_pred_proba, max_fpr=0.05):
    """Model performance metrics"""
    
    # Calculate ROC AUC score with a partial area under the curve
    roc_auc_partial = float(roc_auc_score(y_test, y_pred_proba, max_fpr=max_fpr))
    
    # Calculate recall and precision with a threshold based on max_fpr
    threshold = np.percentile(y_pred_proba, 100 * (1 - max_fpr))
    y_pred = (y_pred_proba >= threshold).astype(int)
    
    recall_partial = recall_score(y_test, y_pred)
    precision_partial = precision_score(y_test, y_pred)
    
    metrics = {        
        f'ROC_AUC@{max_fpr}': roc_auc_partial,
        f'Recall@{max_fpr}': recall_partial,
        f'Precision@{max_fpr}': precision_partial
    }
    
    return metrics

In [11]:
df_raw = pd.read_parquet(os.path.join(project_root, "data", "raw", "dados.parquet"))
df_baseline = df_train[['index', 'fraude', 'week_of_the_year']].merge(df_raw[['index', 'score']], left_on="index", right_on="index")

In [12]:
baseline_metrics = generate_model_metrics(df_baseline['fraude'], df_baseline['score'], max_fpr=0.05)
print(baseline_metrics)

{'ROC_AUC@0.05': 0.5658434505779224, 'Recall@0.05': 0.24756371814092953, 'Precision@0.05': 0.23699318263365626}


# Criando Modelos

In [13]:
# Definindo métrica de avaliação
def partial_roc_auc(y_true, y_score, max_fpr=0.05):
    return roc_auc_score(y_true, y_score, max_fpr=max_fpr)

partial_roc_auc_scorer = make_scorer(partial_roc_auc, needs_proba=True)

In [14]:
def tune_model(model, param_distributions, X_train, y_train, scaler=False, n_iter=40, cv=3, scoring=partial_roc_auc_scorer, random_state=911, verbose=1):
    
    if scaler:
        pipeline = Pipeline(steps=[
            ('string_encoder', string_encoder),
            ('selector', selector),
            ('scaler', StandardScaler()),
            ('imputer', SimpleImputer(strategy='mean')), # FIX: scaler estava gerando NaN
            ('classifier', model)
        ])
    else:
        pipeline = Pipeline(steps=[
            ('string_encoder', string_encoder),
            ('selector', selector),
            ('classifier', model)
        ])
        
    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_distributions,
        n_iter=n_iter,
        cv=cv,
        scoring=scoring,
        random_state=random_state,
        n_jobs=1,
        verbose=verbose
    )
    
    random_search.fit(X_train, y_train)
    
    best_estimator = random_search.best_estimator_
    best_params = random_search.best_params_
    
    return best_estimator, best_params

## Logistic Regression

In [15]:
param_distributions = {
    'classifier__C': np.logspace(-4, 4, 20),
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear'],
    'selector__features': [selection_boruta, selection_rfecv]
}

best_estimator, best_params = tune_model(
    LogisticRegression(n_jobs=-1, class_weight='balanced'),
    param_distributions,
    df_train.drop(columns=['fraude']),
    df_train['fraude'],
    scaler=True
)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


In [16]:
print('Best params: ', best_params)

print('Train metrics')
y_pred_train = best_estimator.predict_proba(df_train.drop(columns=['fraude']))[:, 1] * 100
print(generate_model_metrics(df_train['fraude'], y_pred_train, max_fpr=0.05))

print('Val metrics')
y_pred_val = best_estimator.predict_proba(df_val.drop(columns=['fraude']))[:, 1] * 100
print(generate_model_metrics(df_val['fraude'], y_pred_val, max_fpr=0.05))

Best params:  {'selector__features': ['monto', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'l', 'm', 'n', 'o', 'p', 'hour', 'dawn_operation', 'monto_div_a', 'monto_div_b', 'monto_div_c', 'monto_div_d', 'monto_div_e', 'monto_div_f', 'monto_div_h', 'monto_div_k', 'monto_div_l', 'monto_div_m', 'monto_div_hour', 'monto_div_weekday', 'f_lower', 'l_lower', 'm_lower', 'n_lower'], 'classifier__solver': 'liblinear', 'classifier__penalty': 'l1', 'classifier__C': np.float64(0.615848211066026)}
Train metrics
{'ROC_AUC@0.05': 0.6300248049859843, 'Recall@0.05': 0.3495127436281859, 'Precision@0.05': 0.361925092179313}
Val metrics
{'ROC_AUC@0.05': 0.6293680828890372, 'Recall@0.05': 0.3524451939291737, 'Precision@0.05': 0.3647469458987784}


## Random Forest

In [17]:
param_distributions = {
    'classifier__n_estimators': [50, 100, 200, 300],
    'classifier__max_depth': [3, 4, 5, 6, 7, 8],
    'classifier__max_features': ['sqrt', 'log2'],
    'classifier__min_samples_leaf': [4, 8, 16, 32],
    'selector__features': [selection_boruta, selection_rfecv]
}

best_estimator, best_params = tune_model(
    RandomForestClassifier(n_jobs=-1, class_weight='balanced'),
    param_distributions,
    df_train.drop(columns=['fraude']),
    df_train['fraude'],
)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


In [18]:
print('Best params: ', best_params)

print('Train metrics')
y_pred_train = best_estimator.predict_proba(df_train.drop(columns=['fraude']))[:, 1] * 100
print(generate_model_metrics(df_train['fraude'], y_pred_train, max_fpr=0.05))

print('Val metrics')
y_pred_val = best_estimator.predict_proba(df_val.drop(columns=['fraude']))[:, 1] * 100
print(generate_model_metrics(df_val['fraude'], y_pred_val, max_fpr=0.05))

Best params:  {'selector__features': ['monto', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'l', 'm', 'n', 'o', 'p', 'hour', 'dawn_operation', 'monto_div_a', 'monto_div_b', 'monto_div_c', 'monto_div_d', 'monto_div_e', 'monto_div_f', 'monto_div_h', 'monto_div_k', 'monto_div_l', 'monto_div_m', 'monto_div_hour', 'monto_div_weekday', 'f_lower', 'l_lower', 'm_lower', 'n_lower'], 'classifier__n_estimators': 300, 'classifier__min_samples_leaf': 16, 'classifier__max_features': 'sqrt', 'classifier__max_depth': 4}
Train metrics
{'ROC_AUC@0.05': 0.6212135966279939, 'Recall@0.05': 0.3268365817091454, 'Precision@0.05': 0.33844362507277315}
Val metrics
{'ROC_AUC@0.05': 0.6093464962697307, 'Recall@0.05': 0.31197301854974707, 'Precision@0.05': 0.3228621291448517}


## LGBM

In [19]:
param_distributions = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'classifier__num_leaves': [7, 15, 31, 51],
    'classifier__learning_rate': [0.01, 0.03, 0.05, 0.1, 0.2],
    'classifier__subsample': [0.6, 0.8, 1.0],
    'selector__features': [selection_boruta, selection_rfecv]
}

best_estimator, best_params = tune_model(
    LGBMClassifier(n_jobs=-1, class_weight='balanced', verbose=-1),
    param_distributions,
    df_train.drop(columns=['fraude']),
    df_train['fraude'],
)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


In [20]:
print('Best params: ', best_params)

print('Train metrics')
y_pred_train = best_estimator.predict_proba(df_train.drop(columns=['fraude']))[:, 1] * 100
print(generate_model_metrics(df_train['fraude'], y_pred_train, max_fpr=0.05))

print('Val metrics')
y_pred_val = best_estimator.predict_proba(df_val.drop(columns=['fraude']))[:, 1] * 100
print(generate_model_metrics(df_val['fraude'], y_pred_val, max_fpr=0.05))

Best params:  {'selector__features': ['a', 'b', 'f', 'l', 'm', 'n', 'o', 'p', 'monto_div_c', 'monto_div_d', 'monto_div_f', 'monto_div_h', 'monto_div_l', 'monto_div_m', 'monto_div_hour', 'f_lower', 'l_lower', 'm_lower', 'n_lower'], 'classifier__subsample': 0.8, 'classifier__num_leaves': 51, 'classifier__n_estimators': 100, 'classifier__max_depth': 5, 'classifier__learning_rate': 0.03}
Train metrics
{'ROC_AUC@0.05': 0.6487485559922371, 'Recall@0.05': 0.37724887556221887, 'Precision@0.05': 0.3906462254997089}
Val metrics
{'ROC_AUC@0.05': 0.6353317733501662, 'Recall@0.05': 0.35413153456998314, 'Precision@0.05': 0.36649214659685864}


## XGBoost

In [21]:
num_neg = (df_train['fraude'] == 0).sum()
num_pos = (df_train['fraude'] == 1).sum()
scale_pos_weight = num_neg / num_pos

param_distributions = {
    'classifier__n_estimators': [100, 200, 300, 500],
    'classifier__max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'classifier__learning_rate': [0.01, 0.025, 0.05, 0.1, 0.2],
    'classifier__subsample': [0.6, 0.8, 1.0],
    'classifier__colsample_bytree': [0.6, 0.8, 1.0],
    'classifier__scale_pos_weight': [scale_pos_weight],
    'selector__features': [selection_boruta, selection_rfecv]
}

best_estimator, best_params = tune_model(
    XGBClassifier(n_jobs=-1, use_label_encoder=False, eval_metric='logloss', verbosity=0),
    param_distributions,
    df_train.drop(columns=['fraude']),
    df_train['fraude'],
)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


In [22]:
print('Best params: ', best_params)

print('Train metrics')
y_pred_train = best_estimator.predict_proba(df_train.drop(columns=['fraude']))[:, 1] * 100
print(generate_model_metrics(df_train['fraude'], y_pred_train, max_fpr=0.05))

print('Val metrics')
y_pred_val = best_estimator.predict_proba(df_val.drop(columns=['fraude']))[:, 1] * 100
print(generate_model_metrics(df_val['fraude'], y_pred_val, max_fpr=0.05))

Best params:  {'selector__features': ['a', 'b', 'f', 'l', 'm', 'n', 'o', 'p', 'monto_div_c', 'monto_div_d', 'monto_div_f', 'monto_div_h', 'monto_div_l', 'monto_div_m', 'monto_div_hour', 'f_lower', 'l_lower', 'm_lower', 'n_lower'], 'classifier__subsample': 0.8, 'classifier__scale_pos_weight': np.float64(18.31315592203898), 'classifier__n_estimators': 500, 'classifier__max_depth': 9, 'classifier__learning_rate': 0.1, 'classifier__colsample_bytree': 0.6}
Train metrics
{'ROC_AUC@0.05': 0.9999898110293992, 'Recall@0.05': 0.9653298350824587, 'Precision@0.05': 0.9996118765767514}
Val metrics
{'ROC_AUC@0.05': 0.6329545165740267, 'Recall@0.05': 0.3524451939291737, 'Precision@0.05': 0.3647469458987784}


In [24]:
len(['a', 'b', 'f', 'l', 'm', 'n', 'o', 'p', 'monto_div_c', 'monto_div_d', 'monto_div_f', 'monto_div_h', 'monto_div_l', 'monto_div_m', 'monto_div_hour', 'f_lower', 'l_lower', 'm_lower', 'n_lower'])

19

--- 
# Resumo

O modelo que obteve maior resultado no conjunto de validação foi o `LGBMClassifier` com features `selected_by_rfecv`, ainda sim com um pouco de overfitting seguiremos com ele e corrigiremos este ponto no próximo tunning mais detalhado com `Optuna`, onde vamos otimizar duas métricas:
- maximizar ROC_AUC@0.05
- minimizar a diferença de ROC_AUC@0.05 no treino e validação