# Seleção de Modelos: Avaliação e Validação

In [186]:
import os
import joblib
import pandas as pd
from dotenv import load_dotenv
from sklearn.model_selection import StratifiedKFold
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier 
from sklearn.metrics import classification_report

In [187]:
# Load project settings via python-dotenv so they can be read from os.environ
# below (presumably from a local .env file — confirm where it lives).
load_dotenv()

# Seed shared by every estimator that accepts a random_state.
random_state = int(os.environ['SKLEARN_RANDOM_STATE'])

# Echo the settings this notebook relies on, one per line.
for env_key in ('NOTEBOOKS_PROCESSED_DATA_PATH', 'SKLEARN_RANDOM_STATE'):
    print(os.environ[env_key])

../data/processed/
42


## Carregando os dados

In [6]:
# Read the processed training split.
# The path is assembled with os.path.join so it resolves whether or not
# NOTEBOOKS_PROCESSED_DATA_PATH ends with a path separator (plain string
# concatenation only works when the trailing separator is present).
train_path = os.path.join(
    os.environ['NOTEBOOKS_PROCESSED_DATA_PATH'],
    'creditcard_train.parquet'
)
df = pd.read_parquet(train_path)

df.shape

(227845, 31)

In [8]:
# Split the frame into a feature matrix and a label vector (NumPy arrays).
target_col = 'Fraude'

y = df[target_col].values
X = df.drop(columns=target_col).values

(X.shape, y.shape)

((227845, 30), (227845,))

## Balanceamento de classes

## Normalização

## Selecionando os modelos com validação cruzada

In [129]:
# Candidate classifiers keyed by a short name.
# Insertion order matters: later cells iterate this dict to build and label
# the per-model reports.
models = dict(
    qda=QuadraticDiscriminantAnalysis(),
    gnb=GaussianNB(),
    # Only these two estimators are given the shared seed.
    xgb=XGBClassifier(random_state=random_state),
    rf=RandomForestClassifier(random_state=random_state),
)

In [130]:
# Stratified splitter shared by every model. shuffle=False is the default,
# spelled out here to make explicit that rows are not shuffled before splitting.
N_SPLITS = 5
kfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=False)

In [55]:
%%time

reports = []

for model_name, model in models.items():
    report = []
    for i, (train_index, val_index) in enumerate(kfold.split(X, y)):
        model.fit(X[train_index], y[train_index])
        y_pred = model.predict(X[val_index])

        partial_report = classification_report(
            y[val_index],
            y_pred,
            output_dict=True
        )

        partial_report = pd.DataFrame(partial_report)
        report.append(partial_report)

    report = sum(report) / len(report)
    reports.append(report)
    reports = pd.concat(reports)

Wall time: 14min 47s


In [128]:
# Label every row of `reports` with (model, metric) and rank the models by
# accuracy.
#
# The metric level is taken from the labels the report rows already carry
# rather than from a hard-coded precision/recall/f1-score/support sequence:
# relabelling by position silently mislabels rows whenever the row order of
# the report tables differs from the assumed one.
#
# The nlevels guard keeps the cell safe to re-run: once the rows carry the
# two-level index (and have been sorted) they must not be relabelled by
# position again.
if reports.index.nlevels == 1:
    rows_per_model, leftover = divmod(len(reports), len(models))
    assert leftover == 0, 'reports must hold the same number of rows per model'

    # Model names repeated once per metric row, in the order the per-model
    # tables were stacked (the iteration order of `models`).
    model_labels = [
        model_name
        for model_name in models
        for _ in range(rows_per_model)
    ]

    index = pd.MultiIndex.from_arrays(
        [model_labels, reports.index],
        names=['model', 'metric']
    )
    reports = reports.set_index(index)

# All rows of a model share the same accuracy value; a stable sort keeps
# their relative order inside each model group.
reports = reports.sort_values(
    by=['accuracy'],
    ascending=False,
    kind='mergesort'
)
reports

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,accuracy,macro avg,weighted avg
model,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
rf,precision,0.999925,0.778673,0.999535,0.889299,0.999535
rf,recall,0.999767,0.854535,0.999535,0.927151,0.999511
rf,f1-score,0.999609,0.947904,0.999535,0.973756,0.999518
rf,support,45488.6,80.4,0.999535,45569.0,45569.0
xgb,precision,0.999921,0.778704,0.99953,0.889312,0.99953
xgb,recall,0.999765,0.853402,0.99953,0.926584,0.999506
xgb,f1-score,0.999609,0.945437,0.99953,0.972523,0.999513
xgb,support,45488.6,80.4,0.99953,45569.0,45569.0
gnb,precision,0.999363,0.146781,0.992719,0.573072,0.997859
gnb,recall,0.996342,0.23864,0.992719,0.617491,0.995005


## Salvando o melhor modelo

In [190]:
# Refit on the full training set before persisting. After cross-validation
# the estimator in `models` was last fit on the training part of the final
# fold only, so dumping it as-is would save a model that never saw the last
# validation fold.
best_model = models['xgb']
best_model.fit(X, y)

# os.path.join resolves the target path whether or not NOTEBOOKS_MODELS_PATH
# ends with a path separator.
joblib.dump(
    best_model,
    os.path.join(
        os.environ['NOTEBOOKS_MODELS_PATH'],
        'xgb_fraud_detection_train.sav'
    )
)

['../models/xgb_fraud_detection_train.sav']