# Modelo de Identificação de Fraudes

- Informações da base:
    - 'Time' (tempo): contém os segundos decorridos entre cada transação e a primeira transação no conjunto de dados. 
    - 'Amount' (valor): é o valor da transação 
    - 'Class' (classe): é a variável de resposta e assume valor 1 em caso de fraude e 0 caso contrário.
    - "Infelizmente, devido a questões de confidencialidade, não podemos fornecer os recursos originais e mais informações básicas sobre os dados. Características V1, V2, … V28 são os principais componentes obtidos com PCA"

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime as dt
import xgboost as xgb

from scipy.stats import kstest, normaltest, anderson

from sklearn.preprocessing import PowerTransformer, MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree, metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, NearMiss 
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.combine import SMOTEENN

from sklearn.metrics import (
    confusion_matrix, 
    accuracy_score, 
    precision_score, 
    recall_score,
    precision_recall_curve,
)

import warnings
from src.config import DADOS_CREDICARD, DADOS_CREDICARD_TRATADO
from src.auxiliares_ml import downcast_dataframe, testar_modelos_com_undersampling

warnings.filterwarnings('ignore')
sns.set_theme(palette="bright")

In [3]:
# Importando a base

transacoes = pd.read_parquet(DADOS_CREDICARD_TRATADO)

In [4]:
# Visualizando estatísticas descritivas da base 

transacoes.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.867188,1.339397e-08,0.0,-4.8004e-08,6.643411e-09,2.657364e-08,-1.500125e-09,-1.071518e-09,-1.071518e-10,-3.214554e-10,...,-1.928732e-09,3.643161e-09,3.214554e-10,1.259033e-09,2.143036e-09,1.034684e-09,1.138488e-10,-1.272427e-10,88.349619,0.001727
std,47487.195312,1.958611,1.651183,1.516203,1.415832,1.38018,1.332214,1.23702,1.194202,1.098575,...,0.7344701,0.7256873,0.62441,0.6056016,0.521262,0.4822155,0.4035722,0.3300574,250.120109,0.041527
min,0.0,-56.40751,-72.715729,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.59855,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.065486,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097605,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.803724,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273458,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382559,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519588,3.517346,31.6122,33.84781,25691.16,1.0


### NOTA: Vamos melhorar a escala das features

In [6]:
# Lista de colunas a serem normalizadas (excluindo 'Class')

colunas_para_normalizar = ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
                           'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
                           'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']

# Normalizando as colunas (min-max scaling)

transacoes[colunas_para_normalizar] = transacoes[colunas_para_normalizar] / transacoes[colunas_para_normalizar].max()

In [7]:
# Visualizando estatísticas descritivas da base normalizada

transacoes.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,0.548717,-9.115664e-11,2.55649e-12,6.094481e-12,-3.268473e-12,7.593905e-13,-2.706109e-12,-9.158887e-13,2.672787e-12,1.551733e-12,...,-4.199505e-13,3.833444e-12,-4.280388e-13,1.039828e-11,-9.60241e-13,-1.266194e-11,7.359425e-13,-2.957288e-13,0.003439,0.001727
std,0.274828,0.7978622,0.07486303,0.1616036,0.08390162,0.03966036,0.01817519,0.01025872,0.05969613,0.07044774,...,0.02700174,0.0690941,0.02771879,0.1321061,0.06932269,0.1370997,0.01276825,0.009751983,0.009736,0.041527
min,0.0,-22.97724,-3.29661,-5.150577,-0.3367737,-3.268329,-0.3568885,-0.3612026,-3.659517,-0.8614345,...,-1.280395,-1.040945,-1.988943,-0.6187363,-1.369144,-0.7404875,-0.7138282,-0.4558666,0.0,0.0
25%,0.313681,-0.3749082,-0.02713561,-0.09489574,-0.05028876,-0.01987253,-0.01048129,-0.004594728,-0.01042773,-0.04123744,...,-0.008395997,-0.05163722,-0.007184099,-0.07734374,-0.04217585,-0.09296326,-0.002240892,-0.001564645,0.000218,0.0
50%,0.490138,0.007376503,0.002968826,0.01916816,-0.001176067,-0.0015613,-0.003740532,0.0003325587,0.001117499,-0.003297772,...,-0.001082614,0.0006457093,-0.0004968362,0.00893786,0.002206703,-0.01482342,4.245658e-05,0.0003321879,0.000856,0.0
75%,0.80629,0.5359182,0.03643729,0.1094793,0.04404896,0.01758325,0.005437327,0.004730396,0.0163614,0.03829043,...,0.006851388,0.05032363,0.006553594,0.09587129,0.04664026,0.06850398,0.002880063,0.002312704,0.003004,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Separando X e y

X = transacoes.drop(columns='Class')
y = transacoes['Class']

In [9]:
# Separando em treino e teste

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0,stratify=y)

### XGBoost 

In [11]:
# Criando o modelo XGBoost

modelo_xgb = xgb.XGBClassifier(
    max_depth=9,
    n_estimators=200,
    learning_rate=0.3,
    random_state=0,
    scale_pos_weight=(y.value_counts()[0] / y.value_counts()[1])  # Lida com desbalanceamento
)

# Treinando e avaliando o modelo

modelo_xgb.fit(X_train, y_train)

y_pred_xgb = modelo_xgb.predict(X_test)

# Calculando as métricas

acc_xgb = accuracy_score(y_test, y_pred_xgb)
prec_xgb = precision_score(y_test, y_pred_xgb)
rec_xgb = recall_score(y_test, y_pred_xgb)

# Vusualizando a matriz confusão

confusion_matrix(y_test, y_pred_xgb)

array([[93817,     8],
       [   34,   128]], dtype=int64)

In [12]:
# Utilizando as métricas de avaliação

print('Acurácia:', accuracy_score(y_test, y_pred_xgb))
print('Precisão:', precision_score(y_test, y_pred_xgb))
print('Recall:', recall_score(y_test, y_pred_xgb, pos_label=1))

Acurácia: 0.9995531296881484
Precisão: 0.9411764705882353
Recall: 0.7901234567901234


### Random Forest

In [14]:
# Utilizando o Random Forest

clf_RF = RandomForestClassifier(
    max_depth=7,  
    random_state=0,
    criterion='gini',
    n_estimators=100,  
    class_weight='balanced_subsample' 
).fit(X_train, y_train)

y_pred_RF = clf_RF.predict(X_test)

# Vusualizando a matriz confusão

confusion_matrix(y_test, y_pred_RF)

array([[93788,    37],
       [   31,   131]], dtype=int64)

In [15]:
# Utilizando as métricas de avaliação

print('Acurácia:', accuracy_score(y_test, y_pred_RF))
print('Precisão:', precision_score(y_test, y_pred_RF))
print('Recall:', recall_score(y_test, y_pred_RF))

Acurácia: 0.9992764956855735
Precisão: 0.7797619047619048
Recall: 0.808641975308642


### Regressão Logística

In [17]:
# Utilizando a Regressão Logística

clfLog = LogisticRegression(random_state=0).fit(X_train, y_train)

# Calculando as métricas

y_predLog = clfLog.predict(X_test)
y_pred_probaLog = clfLog.predict_proba(X_test)[:,1]

# Vusualizando a matriz confusão

confusion_matrix(y_test, y_predLog)

array([[93811,    14],
       [   70,    92]], dtype=int64)

In [18]:
# Utilizando as métricas de avaliação

print('Acurácia:', accuracy_score(y_test, y_predLog))
print('Precisão:', precision_score(y_test, y_predLog))
print('Recall:', recall_score(y_test, y_predLog))

Acurácia: 0.9991062593762967
Precisão: 0.8679245283018868
Recall: 0.5679012345679012


### OBSERVAÇÃO:

**Utilizamos a base desbalanceada e fizemos uma espécie de 'normalização' nos dados**.

- Utilizando os três melhores algoritmos até aqui, temos como desempenho:
  
  - **1º Lugar**: XGBoost
  - **2º Lugar**: Random Forest
  - **3º Lugar**: Regressão Logística