# Modelo de Identificação de Fraudes

- Informações da base:
    - 'Time' (tempo): contém os segundos decorridos entre cada transação e a primeira transação no conjunto de dados. 
    - 'Amount' (valor): é o valor da transação 
    - 'Class' (classe): é a variável de resposta e assume valor 1 em caso de fraude e 0 caso contrário.
    - "Infelizmente, devido a questões de confidencialidade, não podemos fornecer os recursos originais e mais informações básicas sobre os dados. Características V1, V2, … V28 são os principais componentes obtidos com PCA"

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime as dt
import xgboost as xgb

from scipy.stats import kstest, normaltest, anderson

from sklearn.preprocessing import PowerTransformer, MinMaxScaler

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree, metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, NearMiss 
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.combine import SMOTEENN

from sklearn.metrics import (
    confusion_matrix, 
    accuracy_score, 
    precision_score, 
    recall_score,
    precision_recall_curve,
)

import warnings
from src.config import DADOS_CREDICARD, DADOS_CREDICARD_TRATADO
from src.auxiliares_ml import downcast_dataframe, testar_modelos_com_undersampling

warnings.filterwarnings('ignore')
sns.set_theme(palette="bright")

In [3]:
# Importando a base

transacoes = pd.read_parquet(DADOS_CREDICARD_TRATADO)

In [4]:
# Visualizando estatísticas descritivas da base 

transacoes.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.867188,1.339397e-08,0.0,-4.8004e-08,6.643411e-09,2.657364e-08,-1.500125e-09,-1.071518e-09,-1.071518e-10,-3.214554e-10,...,-1.928732e-09,3.643161e-09,3.214554e-10,1.259033e-09,2.143036e-09,1.034684e-09,1.138488e-10,-1.272427e-10,88.349619,0.001727
std,47487.195312,1.958611,1.651183,1.516203,1.415832,1.38018,1.332214,1.23702,1.194202,1.098575,...,0.7344701,0.7256873,0.62441,0.6056016,0.521262,0.4822155,0.4035722,0.3300574,250.120109,0.041527
min,0.0,-56.40751,-72.715729,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.59855,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.065486,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097605,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.803724,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273458,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382559,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519588,3.517346,31.6122,33.84781,25691.16,1.0


### Pré-processamento: 

- Utilizaremos o **PowerTransformer** devido ao grande número de outliers, **xceto em 'Time'**. <br>
- Em **'Time'** utilizaremos o **MinMaxScaler**. <br>
- Vamos utilizar o **RandomUnderSampler** para balancear nossa base.

In [6]:
# Criando os escaladores

power_transformer = PowerTransformer(method='yeo-johnson')  # Funciona para valores positivos e negativos
minmax_scaler = MinMaxScaler()

# Aplicando a normalização nas colunas específicas

transacoes['Amount'] = power_transformer.fit_transform(transacoes[['Amount']])
transacoes['Time'] = minmax_scaler.fit_transform(transacoes[['Time']])


transacoes.head(2)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,1.119013,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-1.128464,0


In [7]:
# Separando X e y

X = transacoes.drop(columns='Class')
y = transacoes['Class']

### Fazendo Cross-validation com StratifiedKFold e utilizando o RandomUnderSampler

In [9]:
# Configuração do StratifiedKFold

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

# Criando o RandomUnderSampler

rus = RandomUnderSampler(random_state=0)

# Lista para armazenar os conjuntos de treino e teste de cada dobra

folds_data = []

for train_idx, test_idx in cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Aplicando o balanceamento com RandomUnderSampler nos dados de treino
    
    X_train_bal, y_train_bal = rus.fit_resample(X_train, y_train)

    # Armazenando os dados da dobra
    
    folds_data.append((X_train_bal, y_train_bal, X_test, y_test))

print("Cross-validation com balanceamento concluído. Agora podemos testar os modelos.")

Cross-validation com balanceamento concluído. Agora podemos testar os modelos.


### XGBoost 

In [11]:
# Criando o modelo XGBoost

modelo_xgb = xgb.XGBClassifier(
    max_depth=9,
    n_estimators=200,
    learning_rate=0.3,
    random_state=0,
    scale_pos_weight=(y.value_counts()[0] / y.value_counts()[1])  # Lida com desbalanceamento
)

# Loop pelas dobras do cross-validation

for i, (X_train_bal, y_train_bal, X_test, y_test) in enumerate(folds_data):
    modelo_xgb.fit(X_train_bal, y_train_bal)  # Treina no conjunto balanceado
    y_pred_xgb = modelo_xgb.predict(X_test)  # Testa no conjunto original
    
    # Calculando métricas
    
    acc = accuracy_score(y_test, y_pred_xgb)
    prec = precision_score(y_test, y_pred_xgb, zero_division=0)  # zero_division evita erros se não houver positivos
    rec = recall_score(y_test, y_pred_xgb)

    # Exibindo os resultados
    
    print(f"Dobra {i+1}:")
    print(f"  Acurácia:  {acc:.4f}")
    print(f"  Precisão:  {prec:.4f}")
    print(f"  Recall:    {rec:.4f}\n")

Dobra 1:
  Acurácia:  0.9108
  Precisão:  0.0178
  Recall:    0.9329

Dobra 2:
  Acurácia:  0.9028
  Precisão:  0.0164
  Recall:    0.9390

Dobra 3:
  Acurácia:  0.9064
  Precisão:  0.0171
  Recall:    0.9390



### Random Forest

In [13]:
# Utilizando o Random Forest

clf_RF = RandomForestClassifier(
    max_depth=7,  
    random_state=0,
    criterion='gini',
    n_estimators=100,  
    class_weight='balanced_subsample' 
).fit(X_train, y_train)

# Loop pelas dobras do cross-validation

for i, (X_train_bal, y_train_bal, X_test, y_test) in enumerate(folds_data):
    clf_RF.fit(X_train_bal, y_train_bal)  # Treina no conjunto balanceado
    y_pred_RF = clf_RF.predict(X_test)  # Testa no conjunto original
    
    # Calculando métricas
    
    acc = accuracy_score(y_test, y_pred_RF)
    prec = precision_score(y_test, y_pred_RF, zero_division=0)  # zero_division evita erros se não houver positivos
    rec = recall_score(y_test, y_pred_RF)

    # Exibindo os resultados
    
    print(f"Dobra {i+1}:")
    print(f"  Acurácia:  {acc:.4f}")
    print(f"  Precisão:  {prec:.4f}")
    print(f"  Recall:    {rec:.4f}\n")

Dobra 1:
  Acurácia:  0.9828
  Precisão:  0.0821
  Recall:    0.8780

Dobra 2:
  Acurácia:  0.9857
  Precisão:  0.0995
  Recall:    0.9024

Dobra 3:
  Acurácia:  0.9667
  Precisão:  0.0449
  Recall:    0.9024



### OBSERVAÇÃO:

**Utilizamos apenas os dois algoritmos que se saíram melhor até aqui:**.
- XGBoost
- Random Forest

### NOTA:

Observamos que sempre que executamos o balanceamento da base, os resultados não são satisfatórios.