# Modelo de Identificação de Fraudes

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime as dt

from scipy.stats import kstest, normaltest, anderson

from sklearn.preprocessing import PowerTransformer, MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree, metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, NearMiss 
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.combine import SMOTEENN

from sklearn.metrics import (
    confusion_matrix, 
    accuracy_score, 
    precision_score, 
    recall_score,
    precision_recall_curve,
)

import warnings
from src.config import DADOS_MASTERCARD_TRATADO

# Silence only deprecation-style noise from the imported libraries.
# A blanket filterwarnings('ignore') would also hide scikit-learn's
# ConvergenceWarning, so a LogisticRegression solver that stops at max_iter
# without converging would go unnoticed in the fits further down.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Seaborn theme used by every plot in the notebook.
sns.set_theme(palette="bright")

In [3]:
# Load the pre-processed transactions table from the parquet file whose
# location is configured in src.config.
transacoes = pd.read_parquet(path=DADOS_MASTERCARD_TRATADO)

In [4]:
# Convert the 'Time' column to an integer dtype, but only when doing so loses
# no information (every value is finite and has no fractional part).
time_values = transacoes['Time']

# NaN and +/-inf cannot be cast with astype(int) (pandas raises), so they are
# ruled out here; that way such data reaches the message below instead of
# crashing the cell inside the check itself.
is_integral = np.isfinite(time_values).all() and (time_values % 1 == 0).all()

if is_integral:
    # Safe, lossless conversion.
    transacoes['Time'] = time_values.astype(int)
else:
    print("Nem todos os valores na coluna 'Time' podem ser convertidos para inteiro.")

### Vamos ajustar a escala de duas colunas para testar o resultado
- As colunas Time e Amount têm valores máximos muito altos, que destoam do restante do dataset.

In [6]:
# Rescale 'Time' and 'Amount' by dividing each column by its own maximum.
# NOTE(review): the maxima are computed on the full table, before the
# train/test split performed further down, so test rows influence the scaling.

# First the Time column
transacoes['Time'] = transacoes['Time'].div(transacoes['Time'].max())

# And then the Amount column
transacoes['Amount'] = transacoes['Amount'].div(transacoes['Amount'].max())

In [7]:
# Split the table into the feature matrix (every column except 'Class')
# and the target vector (the 'Class' column).
X = transacoes.drop(columns='Class')
y = transacoes['Class']

**Separando em treino e teste**

In [9]:
# Hold out 33% of the rows for testing; stratify on y so both partitions keep
# the same class proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
    stratify=y,
)

### Realizando o random undersampling

In [11]:
# Create the random under-sampler; the fixed random_state makes the resampled
# training set reproducible across runs.

rus = RandomUnderSampler(random_state=42)

In [12]:
# Resample the training partition only; the test partition is left untouched.
X_resRUS, y_resRUS = rus.fit_resample(X=X_train, y=y_train)

**<font color='blue'>Com Regressão Logística</font>**

In [14]:
# Baseline logistic regression (default hyper-parameters, fixed seed),
# trained on the under-sampled training set.
clfLog_RUS = LogisticRegression(random_state=42)
clfLog_RUS = clfLog_RUS.fit(X_resRUS, y_resRUS)

In [15]:
# Hard class predictions for the test set.
y_predLog_RUS = clfLog_RUS.predict(X=X_test)

# Score used for the precision-recall curve: column 1 of predict_proba,
# i.e. the probability of the second entry of clfLog_RUS.classes_.
y_pred_probaLog_RUS = clfLog_RUS.predict_proba(X=X_test)[:, 1]

In [16]:
# Confusion matrix on the test set (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_predLog_RUS)

array([[91301,  2524],
       [   15,   147]], dtype=int64)

In [17]:
# Precision-recall curve on the test set, followed by the area under it
# (recall on the x axis, precision on the y axis).
precisionLog_RUS, recallLog_RUS, thresholdsLog_RUS = precision_recall_curve(
    y_test, y_pred_probaLog_RUS
)
print(metrics.auc(x=recallLog_RUS, y=precisionLog_RUS))

0.695159942917231


In [18]:
# Recall of the baseline model on the test set.
recall_score(y_true=y_test, y_pred=y_predLog_RUS)

0.9074074074074074

## Vamos criar um novo classificador mudando parâmetros

In [20]:
# Second logistic regression on the same under-sampled data, this time with
# the newton-cg solver and C=100.
clfLog_RUS2 = LogisticRegression(
    random_state=42,
    solver='newton-cg',
    C=100,
)
clfLog_RUS2 = clfLog_RUS2.fit(X_resRUS, y_resRUS)

In [21]:
# Hard class predictions of the second model for the test set.
y_predLog_RUS2 = clfLog_RUS2.predict(X=X_test)

# Column 1 of predict_proba: probability of the second entry of
# clfLog_RUS2.classes_, used as the score for the precision-recall curve.
y_pred_probaLog_RUS2 = clfLog_RUS2.predict_proba(X=X_test)[:, 1]

In [22]:
# Confusion matrix of the second model on the test set
# (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_predLog_RUS2)

array([[90204,  3621],
       [   14,   148]], dtype=int64)

In [23]:
# Area under the precision-recall curve for the second model
# (recall on the x axis, precision on the y axis).
precisionLog_RUS2, recallLog_RUS2, thresholdsLog_RUS2 = precision_recall_curve(
    y_test, y_pred_probaLog_RUS2
)
print(metrics.auc(x=recallLog_RUS2, y=precisionLog_RUS2))

0.7480857800849602


In [24]:
# Recall of the second model on the test set.
recall_score(y_true=y_test, y_pred=y_predLog_RUS2)

0.9135802469135802

**Podemos então começar a ajustar alguns parâmetros**

**Vamos utilizar o GridSearchCV para automatizar a busca por melhores parâmetros**

In [27]:
# Hyper-parameter grid for the search: inverse regularisation strength C
# crossed with every solver to be compared.
parametros = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
)

In [28]:
# Unfitted logistic regression used as the base estimator of the grid search;
# only the seed is fixed here, C and solver are supplied by the parameter grid.

LogReg = LogisticRegression(random_state=42)

In [29]:
# Grid search over the parameter grid defined earlier, ranking candidates by
# recall (cross-validation settings are left at the library defaults).
clf_GS = GridSearchCV(
    estimator=LogReg,
    param_grid=parametros,
    scoring='recall',
)

In [30]:
# Run the search on the under-sampled training data.
clf_GS = clf_GS.fit(X=X_resRUS, y=y_resRUS)

In [31]:
# Show the parameter combination that GridSearchCV ranked first
# (highest mean cross-validated recall, the scoring chosen above).

clf_GS.best_params_

{'C': 0.001, 'solver': 'liblinear'}

In [32]:
# Predict the test set with the fitted grid-search object.
y_pred_GS = clf_GS.predict(X=X_test)

In [33]:
# Confusion matrix of the grid-search model on the test set
# (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred_GS)

array([[85217,  8608],
       [   10,   152]], dtype=int64)

In [34]:
# Recall of the grid-search model on the test set.
recall_score(y_true=y_test, y_pred=y_pred_GS)

0.9382716049382716

In [35]:
# Precision of the grid-search model on the test set; read it together with
# the recall, since the search optimised recall only.
precision_score(y_true=y_test, y_pred=y_pred_GS)

0.017351598173515982

In [36]:
# Full cross-validation log of the search as a table: one row per parameter
# combination, with timings, per-split scores, mean/std score and rank.
pd.DataFrame(data=clf_GS.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.035407,0.010659,0.011556,0.008056,0.001,newton-cg,"{'C': 0.001, 'solver': 'newton-cg'}",0.757576,0.848485,0.863636,0.80303,0.863636,0.827273,0.041328,33
1,0.0154,0.006373,0.012673,0.005758,0.001,lbfgs,"{'C': 0.001, 'solver': 'lbfgs'}",0.757576,0.848485,0.863636,0.80303,0.863636,0.827273,0.041328,33
2,0.010588,0.003059,0.004717,0.00281,0.001,liblinear,"{'C': 0.001, 'solver': 'liblinear'}",0.909091,0.969697,0.954545,0.924242,0.954545,0.942424,0.022268,1
3,0.042427,0.008749,0.007196,0.0072,0.001,sag,"{'C': 0.001, 'solver': 'sag'}",0.757576,0.848485,0.863636,0.80303,0.863636,0.827273,0.041328,33
4,0.048459,0.005429,0.011973,0.009424,0.001,saga,"{'C': 0.001, 'solver': 'saga'}",0.772727,0.863636,0.863636,0.80303,0.893939,0.839394,0.044536,32
5,0.046684,0.009138,0.012426,0.00589,0.01,newton-cg,"{'C': 0.01, 'solver': 'newton-cg'}",0.833333,0.939394,0.878788,0.848485,0.893939,0.878788,0.037113,30
6,0.02401,0.005701,0.00743,0.006978,0.01,lbfgs,"{'C': 0.01, 'solver': 'lbfgs'}",0.833333,0.939394,0.878788,0.848485,0.893939,0.878788,0.037113,30
7,0.010807,0.008596,0.00917,0.004483,0.01,liblinear,"{'C': 0.01, 'solver': 'liblinear'}",0.863636,0.969697,0.909091,0.878788,0.909091,0.906061,0.036364,9
8,0.048257,0.004511,0.001396,0.002792,0.01,sag,"{'C': 0.01, 'solver': 'sag'}",0.833333,0.954545,0.893939,0.848485,0.893939,0.884848,0.042424,28
9,0.055446,0.007979,0.004852,0.005213,0.01,saga,"{'C': 0.01, 'solver': 'saga'}",0.833333,0.954545,0.893939,0.848485,0.893939,0.884848,0.042424,28
