# Detecção de Fraudes: *Baseline* e *Benchmark*

In [8]:
import json
import joblib
import requests

import numpy as np
import pandas as pd

from flask import Flask, jsonify, request

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.dummy import DummyClassifier

from lightgbm import LGBMClassifier

In [10]:
# Load the processed credit-card transactions and show (rows, columns)
data_path = '../data/processed/creditcard.parquet'
df = pd.read_parquet(data_path)
df.shape

(284807, 31)

In [12]:
# Class balance: absolute counts first, then each class's share of all rows
class_counts = df['class'].value_counts()
display(class_counts)
class_counts.values / len(df['class'])

0    284315
1       492
Name: class, dtype: int64

array([0.99827251, 0.00172749])

In [13]:
# Missing values: fraction of all cells in the frame that are null.
# Dividing by df.size (rows * columns) keeps the ratio within [0, 1];
# dividing by len(df) would measure null cells against the row count only.
df.isnull().sum().sum() / df.size

0.0

In [14]:
# Feature matrix: every column except the label; target: the 'class' column
features = df.drop(columns='class')
X = features.to_numpy()
y = df['class'].to_numpy()

X.shape, y.shape

((284807, 30), (284807,))

In [15]:
# Hold out the last 30% of the rows as the test set; shuffle=False keeps the
# original row order, so the split is a contiguous head/tail cut.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, shuffle=False
)

X_train.shape, y_train.shape

((199364, 30), (199364,))

In [16]:
# Baseline: a DummyClassifier with the 'stratified' strategy, which draws its
# predictions at random. random_state pins that sampling so the report below
# is the same on every run.
dummy = DummyClassifier(strategy='stratified', random_state=42)
dummy.fit(X_train, y_train)
y_pred = dummy.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85335
           1       0.01      0.01      0.01       108

    accuracy                           1.00     85443
   macro avg       0.50      0.50      0.50     85443
weighted avg       1.00      1.00      1.00     85443



In [17]:
# LightGBM with default hyper-parameters, scored on the held-out split
model = LGBMClassifier().fit(X_train, y_train)
y_pred = model.predict(X_test)

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85335
           1       0.09      0.03      0.04       108

    accuracy                           1.00     85443
   macro avg       0.55      0.51      0.52     85443
weighted avg       1.00      1.00      1.00     85443



In [19]:
# Serialize the fitted model to disk

model_path = '../model/lgbm_fraud_detection.sav'
joblib.dump(value=model, filename=model_path)

['../model/lgbm_fraud_detection.sav']

In [21]:
# Run a prediction for a single sample (the last row of the frame)

# Remove the label by name, as is done when building X, so the sample does
# not depend on 'class' being the last column of the frame.
sample = df.iloc[-1].drop('class')
sample_json = sample.to_json()

model.predict([sample])

array([0], dtype=int64)

In [None]:
# Send the sample to the model's HTTP endpoint and print the JSON reply

# NOTE(review): sample_json is already a JSON string, so passing it through
# json= encodes it a second time (the body becomes a JSON string literal).
# Left unchanged because the server side is not visible here - confirm that
# the endpoint expects this.
response = requests.post(
    'http://localhost:5025/predict',
    json=sample_json,
    timeout=10  # without a timeout the cell can block forever if the server is down
)

# Surface HTTP errors as such instead of failing later while decoding the body
response.raise_for_status()

print(response.json())

## **CRIAÇÃO DE MODELOS BASELINES**

In [None]:
# Test-set predictions of each baseline, filled in by the cells below.
# The dummy keys follow the f'dummy_{strategy}' pattern those cells write to,
# so the 'uniform' strategy fills a pre-declared slot and no entry stays None.
baseline_preds = {
    'dummy_stratified': None,
    'dummy_uniform': None,
    'lgbm_prototype': None
}

In [None]:
%%time

strategies = ['stratified', 'uniform']

for strategy in strategies:
    dummy_stratified = DummyClassifier(strategy=strategy)
    dummy_stratified.fit(X_train, y_train)
    baseline_preds[f'dummy_{strategy}'] = dummy_stratified.predict(X_test)

    print('strategy =', strategy)
    
    print(classification_report(
        y_test,
        baseline_preds[f'dummy_{strategy}']
    ))

strategy = stratified
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.00      0.00      0.00       148

    accuracy                           1.00     85443
   macro avg       0.50      0.50      0.50     85443
weighted avg       1.00      1.00      1.00     85443

strategy = uniform
              precision    recall  f1-score   support

           0       1.00      0.50      0.67     85295
           1       0.00      0.54      0.00       148

    accuracy                           0.50     85443
   macro avg       0.50      0.52      0.34     85443
weighted avg       1.00      0.50      0.67     85443

Wall time: 217 ms


In [None]:
%%time

model_name = 'lgbm_prototype'
lgbm_prototype = LGBMClassifier()
lgbm_prototype.fit(X_train, y_train)
baseline_preds['lgbm_prototype'] = lgbm_prototype.predict(X_test)

print(model_name + '\n')

print(classification_report(
        y_test,
        baseline_preds[model_name]
))

lgbm_prototype

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.36      0.59      0.45       148

    accuracy                           1.00     85443
   macro avg       0.68      0.80      0.72     85443
weighted avg       1.00      1.00      1.00     85443

Wall time: 1.79 s
