# Modelo de Identificação de Fraudes

In [1]:
# Importando o pandas

import pandas as pd

In [2]:
# Load the credit-card transactions dataset from the CSV in the working directory

transacoes = pd.read_csv(filepath_or_buffer='creditcard.csv')

### Criando um modelo de aprendizado de máquina para identificar fraude
- Vamos criar um modelo para identificar fraude sem fazer nenhum tratamento nessa base

**Podemos ajustar as colunas Time e Amount para que elas fiquem entre 0 e 1**

In [3]:
# First the Time column: divide by its maximum so the largest value becomes 1
# (values land in [0, 1] as long as the column holds no negatives)

transacoes['Time'] = transacoes['Time'].div(transacoes['Time'].max())

In [4]:
# Then the Amount column: same rescaling, dividing by the column maximum

transacoes['Amount'] = transacoes['Amount'].div(transacoes['Amount'].max())

In [5]:
# Split the frame into the target vector y (the 'Class' column)
# and the feature matrix X (every other column)

y = transacoes['Class']
X = transacoes.drop(columns='Class')

**Separando em treino e teste**

In [6]:
# Hold out 33% of the rows for testing. Stratifying on y keeps the class
# proportions equal in both partitions; the fixed seed makes the split repeatable.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=0,
)

### Realizando o random undersampling

In [7]:
# Importando o RandomUnderSampler do imblearn

from imblearn.under_sampling import RandomUnderSampler 

In [8]:
# Build the random under-sampler; the fixed seed makes the drawn sample repeatable

rus = RandomUnderSampler(
    random_state=42,
)

In [9]:
# Draw the under-sampled set from the training split only;
# X_test / y_test are not passed to the sampler

(X_resRUS, y_resRUS) = rus.fit_resample(X_train, y_train)

**<font color='blue'>Com Regressão Logística</font>**

In [10]:
# Fit a logistic regression (default hyper-parameters, fixed seed)
# on the under-sampled training data

from sklearn.linear_model import LogisticRegression

clfLog_RUS = LogisticRegression(random_state=42)
clfLog_RUS = clfLog_RUS.fit(X_resRUS, y_resRUS)

In [11]:
# Predict on the held-out test split: hard class labels first, then the
# probability column at index 1 (the second entry of clfLog_RUS.classes_,
# presumably the fraud label -- confirm against the values in 'Class')

y_predLog_RUS = clfLog_RUS.predict(X=X_test)

y_pred_probaLog_RUS = clfLog_RUS.predict_proba(X=X_test)[:, 1]

In [12]:
# Show the confusion matrix for the test split
# (rows are the true classes, columns the predicted ones)

from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=y_test, y_pred=y_predLog_RUS)

array([[91311,  2514],
       [   15,   147]], dtype=int64)

In [13]:
# Compute the area under the precision x recall curve from the
# predicted probabilities on the test split

from sklearn import metrics

(
    precisionLog_RUS,
    recallLog_RUS,
    thresholdsLog_RUS,
) = metrics.precision_recall_curve(y_test, y_pred_probaLog_RUS)
print(metrics.auc(x=recallLog_RUS, y=precisionLog_RUS))

0.6949903066783492


In [14]:
# Recall of the hard predictions on the test split

from sklearn.metrics import recall_score

recall_score(y_true=y_test, y_pred=y_predLog_RUS)

0.9074074074074074

## Vamos criar um novo classificador mudando parâmetros

In [15]:
# Fit a second logistic regression on the same under-sampled data,
# this time with the 'newton-cg' solver and C=100

from sklearn.linear_model import LogisticRegression

clfLog_RUS2 = LogisticRegression(C=100, solver='newton-cg', random_state=42)
clfLog_RUS2 = clfLog_RUS2.fit(X_resRUS, y_resRUS)

In [16]:
# Predict on the test split with the second classifier: hard labels,
# then the probability column at index 1 (second entry of
# clfLog_RUS2.classes_, presumably the fraud label -- confirm)

y_predLog_RUS2 = clfLog_RUS2.predict(X=X_test)

y_pred_probaLog_RUS2 = clfLog_RUS2.predict_proba(X=X_test)[:, 1]

In [17]:
# Show the confusion matrix of the second classifier on the test split

from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_test, y_pred=y_predLog_RUS2)

array([[90223,  3602],
       [   14,   148]], dtype=int64)

In [18]:
# Compute the area under the precision x recall curve for the second
# classifier, from its predicted probabilities on the test split

from sklearn import metrics

(
    precisionLog_RUS2,
    recallLog_RUS2,
    thresholdsLog_RUS2,
) = metrics.precision_recall_curve(y_test, y_pred_probaLog_RUS2)
print(metrics.auc(x=recallLog_RUS2, y=precisionLog_RUS2))

0.7414892735162674


In [19]:
# Recall of the second classifier's hard predictions on the test split

from sklearn.metrics import recall_score

recall_score(y_true=y_test, y_pred=y_predLog_RUS2)

0.9135802469135802

**Podemos então [voltar](#regressao) e começar a ajustar alguns parâmetros**

In [21]:
# Retirando temporariamente os warnings do nosso código

import warnings

warnings.filterwarnings('ignore')

**Vamos utilizar o GridSearchCV para automatizar a busca por melhores parâmetros**

In [48]:
# Importando o GridSearchCV

from sklearn.model_selection import GridSearchCV

In [59]:
# Search grid for the logistic regression: every value of C is
# combined with every solver

parametros = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
)

In [70]:
# Base estimator handed to the grid search: an unfitted logistic
# regression with a fixed seed

LogReg = LogisticRegression(
    random_state=42,
)

In [71]:
# Wrap the base estimator in a grid search over `parametros`,
# ranking each parameter combination by recall

clf_GS = GridSearchCV(
    estimator=LogReg,
    param_grid=parametros,
    scoring='recall',
)

In [72]:
# Run the grid search on the under-sampled training data

clf_GS = clf_GS.fit(X=X_resRUS, y=y_resRUS)

In [73]:
# Show the best parameter combination found by the grid search.
# The fitted search object is `clf_GS`; the previous `c.best_params_`
# referenced an undefined name and raised NameError on a fresh kernel.

clf_GS.best_params_

{'C': 0.001, 'solver': 'liblinear'}

In [74]:
# Predict on the test split with the fitted grid search
# (GridSearchCV delegates predict to the best estimator it refit)

y_pred_GS = clf_GS.predict(X=X_test)

In [75]:
# Confusion matrix of the grid-search model on the test split

confusion_matrix(y_true=y_test, y_pred=y_pred_GS)

array([[85217,  8608],
       [   10,   152]], dtype=int64)

In [76]:
# Recall of the grid-search model on the test split

recall_score(y_true=y_test, y_pred=y_pred_GS)

0.9382716049382716

In [78]:
# And the precision of the grid-search model on the test split,
# to see what the recall-driven selection costs in false positives

from sklearn.metrics import precision_score

precision_score(y_true=y_test, y_pred=y_pred_GS)

0.017351598173515982

In [80]:
# Tabulate the full cross-validation log of the grid search:
# one row per parameter combination, with per-split and mean scores

pd.DataFrame(data=clf_GS.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.022985,0.006028694,0.005596,0.0004899603,0.001,newton-cg,"{'C': 0.001, 'solver': 'newton-cg'}",0.757576,0.848485,0.863636,0.80303,0.863636,0.827273,0.041328,33
1,0.011393,0.001018758,0.005996,0.0006325608,0.001,lbfgs,"{'C': 0.001, 'solver': 'lbfgs'}",0.757576,0.848485,0.863636,0.80303,0.863636,0.827273,0.041328,33
2,0.006995,3.989506e-07,0.004997,3.234067e-07,0.001,liblinear,"{'C': 0.001, 'solver': 'liblinear'}",0.909091,0.969697,0.954545,0.924242,0.954545,0.942424,0.022268,1
3,0.041574,0.002057296,0.005397,0.0007996083,0.001,sag,"{'C': 0.001, 'solver': 'sag'}",0.757576,0.848485,0.863636,0.80303,0.863636,0.827273,0.041328,33
4,0.054366,0.002496139,0.004797,0.0009799978,0.001,saga,"{'C': 0.001, 'solver': 'saga'}",0.772727,0.863636,0.863636,0.80303,0.893939,0.839394,0.044536,32
5,0.021387,0.002058245,0.005596,0.0004903496,0.01,newton-cg,"{'C': 0.01, 'solver': 'newton-cg'}",0.833333,0.939394,0.878788,0.848485,0.893939,0.878788,0.037113,30
6,0.01719,0.00478845,0.007595,0.001623728,0.01,lbfgs,"{'C': 0.01, 'solver': 'lbfgs'}",0.833333,0.939394,0.878788,0.848485,0.893939,0.878788,0.037113,30
7,0.008194,0.001468679,0.005597,0.0004897848,0.01,liblinear,"{'C': 0.01, 'solver': 'liblinear'}",0.863636,0.969697,0.909091,0.878788,0.909091,0.906061,0.036364,7
8,0.040774,0.005596229,0.005797,0.0003998288,0.01,sag,"{'C': 0.01, 'solver': 'sag'}",0.833333,0.954545,0.893939,0.848485,0.893939,0.884848,0.042424,28
9,0.050368,0.00402767,0.004798,0.0007486728,0.01,saga,"{'C': 0.01, 'solver': 'saga'}",0.833333,0.954545,0.893939,0.848485,0.893939,0.884848,0.042424,28
