# Content
The dataset contains credit card transactions made by Brazilian cardholders. This dataset presents transactions over a two-day period, including 492 fraudulent cases out of a total of 284,807 transactions. The dataset is highly imbalanced, with the positive class (fraud) representing only 0.172% of all transactions.

The dataset includes only numerical variables, many of which were transformed using Principal Component Analysis (PCA). Due to confidentiality issues, we cannot provide details about the original features or further background information. The 'Target' feature is the response variable, indicating fraud with a value of 1 and non-fraudulent transactions with a value of 0.

In [None]:
# Core libraries: data handling, numerics and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries below.
warnings.filterwarnings('ignore')

# Train/validation splitting
from sklearn.model_selection import train_test_split

# Preprocessing helpers and model classes
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb

# Model evaluation utilities
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score

# Oversampling for the imbalanced target
from imblearn.over_sampling import SMOTE

In [None]:
# Mount Google Drive at /content/drive so the CSV read in the next cell is reachable.
# This import only resolves inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the transactions CSV from the mounted Drive and preview the first rows.
csv_path = '/content/drive/MyDrive/Colab Notebooks/csv/transferencias.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Timestamp,pais,cidade,bairro,cep,ip,dia,hora,minuto,so,...,android,ios,compras,hist_navegacao,relacionamento,ind_seguranca,tempo_transacao,lim_crt,hist_saldo,Target
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [None]:
# Dataset overview: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Timestamp        284807 non-null  float64
 1   pais             284807 non-null  float64
 2   cidade           284807 non-null  float64
 3   bairro           284807 non-null  float64
 4   cep              284807 non-null  float64
 5   ip               284807 non-null  float64
 6   dia              284807 non-null  float64
 7   hora             284807 non-null  float64
 8   minuto           284807 non-null  float64
 9   so               284807 non-null  float64
 10  valor            284807 non-null  float64
 11  antecedentes     284807 non-null  float64
 12  reclamacoes      284807 non-null  float64
 13  qtd_trans        284807 non-null  float64
 14  crt              284807 non-null  float64
 15  limite_global    284807 non-null  float64
 16  tipo_crt         284807 non-null  floa

In [None]:
# Count missing values per column (isna is the same check as isnull)
df.isna().sum()

Unnamed: 0,0
Timestamp,0
pais,0
cidade,0
bairro,0
cep,0
ip,0
dia,0
hora,0
minuto,0
so,0


In [None]:
# Inspect the class distribution of the Target column (0 = legitimate, 1 = fraud)
df['Target'].value_counts()
# Optional bar chart of the same counts (left disabled):
#sns.countplot(df['Target'])

Unnamed: 0_level_0,count
Target,Unnamed: 1_level_1
0,284315
1,492


In [None]:
# Correlation of every column with Target (pandas default method),
# listed from the most positive to the most negative.
target_corr = df.corr()['Target']
target_corr.sort_values(ascending=False)

Unnamed: 0,Target
Target,1.0
antecedentes,0.154876
cep,0.133447
cidade,0.091289
android,0.040413
emprestimos,0.034783
navegador,0.02009
minuto,0.019875
tempo_transacao,0.01758
lim_crt,0.009536


In [None]:
# Full pairwise Spearman (rank) correlation matrix
df.corr(method='spearman')

Unnamed: 0,Timestamp,pais,cidade,bairro,cep,ip,dia,hora,minuto,so,...,android,ios,compras,hist_navegacao,relacionamento,ind_seguranca,tempo_transacao,lim_crt,hist_saldo,Target
Timestamp,1.0,0.190647,-0.006419,-0.473348,-0.142156,0.226004,-0.111408,0.116722,-0.103882,0.017986,...,0.096564,0.132651,0.148505,-0.01598,-0.25922,0.000132,-0.035063,-0.1432,-0.040007,-0.011692
pais,0.190647,1.0,-0.384328,-0.412095,0.101011,-0.106632,-0.086561,-0.22765,-0.23614,0.138614,...,-0.061958,-0.028827,0.14544,-0.000315,0.049291,0.00638,-0.104815,-0.207967,-0.086663,-0.042424
cidade,-0.006419,-0.384328,1.0,0.073967,0.155042,0.342701,-0.110393,0.42632,0.00698,-0.07788,...,-0.098255,-0.056478,-0.094982,0.012617,-0.085682,0.033865,0.129234,0.137976,-0.502052,0.051062
bairro,-0.473348,-0.412095,0.073967,1.0,0.060812,-0.223849,0.171263,-0.074582,0.164374,-0.030876,...,-0.040234,0.004404,-0.140076,-0.000559,0.031854,-0.016689,0.140724,0.180169,-0.000236,-0.059278
cep,-0.142156,0.101011,0.155042,0.060812,1.0,0.021367,0.075185,0.05992,-0.009608,0.105248,...,0.03896,-0.004931,0.009615,-0.008782,0.060647,-0.064357,-0.033634,0.004757,-0.022117,0.063045
ip,0.226004,-0.106632,0.342701,-0.223849,0.021367,1.0,-0.007577,0.431573,-0.057022,-0.05658,...,-0.040364,-0.01092,-0.1083,-0.023076,-0.015618,0.029513,0.012883,-0.027806,-0.314061,-0.030147
dia,-0.111408,-0.086561,-0.110393,0.171263,0.075185,-0.007577,1.0,-0.29268,0.444053,0.028864,...,0.034595,0.04037,-0.050293,-0.135166,-0.011131,-0.023932,0.073772,0.020459,0.205452,-0.03841
hora,0.116722,-0.22765,0.42632,-0.074582,0.05992,0.431573,-0.29268,1.0,-0.389213,-0.100252,...,-0.02225,-0.02375,-0.186343,-0.012879,-0.046468,0.016724,-0.094494,0.056405,-0.030003,-0.048308
minuto,-0.103882,-0.23614,0.00698,0.164374,-0.009608,-0.057022,0.444053,-0.389213,1.0,-0.002511,...,0.076569,0.019775,0.075212,-0.024536,-0.074303,-0.007648,0.189916,0.0897,0.004371,0.022706
so,0.017986,0.138614,-0.07788,-0.030876,0.105248,-0.05658,0.028864,-0.100252,-0.002511,1.0,...,-0.065773,0.001958,0.074164,-0.001572,-0.022575,0.026677,-0.001804,-0.103743,-0.077437,-0.049499


In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for every column
df.describe()

Unnamed: 0,Timestamp,pais,cidade,bairro,cep,ip,dia,hora,minuto,so,...,android,ios,compras,hist_navegacao,relacionamento,ind_seguranca,tempo_transacao,lim_crt,hist_saldo,Target
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,1.168375e-15,3.416908e-16,-1.379537e-15,2.074095e-15,9.604066e-16,1.487313e-15,-5.556467e-16,1.213481e-16,-2.406331e-15,...,1.654067e-16,-3.568593e-16,2.578648e-16,4.473266e-15,5.340915e-16,1.683437e-15,-3.660091e-16,-1.22739e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


# Data Pre-Processing

* Is there class (target) imbalance?
* It can also be tricky to deal with outliers if we don't understand the problem domain and context well enough.



In [None]:
# Separate the predictors (every column except Target) from the response.
y = df['Target']
X = df.drop(columns='Target')

In [None]:
# Balance the classes with SMOTE, which synthesises new records for the
# positive (fraud = 1) class; X and y are overwritten by the resampled data.
# NOTE(review): the resampling runs on the full dataset, before the
# train/validation split further down. The validation set drawn from this
# resampled data therefore contains synthetic fraud rows, so metrics computed
# on it are optimistic. Splitting first and resampling only the training part
# would avoid this leakage.
smt = SMOTE(random_state=42)
X, y = smt.fit_resample(X, y)

In [None]:
# Class counts after balancing: bincount returns [count of 0s, count of 1s]
print(np.bincount(y))


[284315 284315]


The stratify=y argument in scikit-learn's train_test_split function is used to ensure that the proportion of the target class (dependent variable) is maintained in the training set and test set. This is especially important when you are dealing with imbalanced data, that is, when the classes of the target variable are not represented equally in the data set.

In [None]:
# Hold out 30% of the rows for validation; stratify keeps the class ratio
# identical in both parts and the seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)


# Creating the Predictive Fraud Detection Machine

# XGBoost - Accuracy: 0.9998124146340034 - F1-score: 1.0
* We create the Predictive Machine and fit the model to the data we prepared for training.

* Then we proceed to classify with the data we have prepared for evaluation.

* Lastly, we evaluate model performance with key classification metrics.

In [None]:
# Build an XGBoost classifier with a fixed seed and fit it on the training
# split in one step (fit returns the estimator itself), then display it.
xgb_model = xgb.XGBClassifier(random_state=42).fit(X_train, y_train)
xgb_model

In [None]:
# Predict class labels for the validation features with the fitted XGBoost model
y_pred_xgb = xgb_model.predict(X_val)
y_pred_xgb

array([1, 0, 0, ..., 1, 0, 0])

## EVALUATING XGBOOST

In [None]:
# Side-by-side table of the true validation labels and the XGBoost predictions
validação = pd.DataFrame(
    dict(
        y_val=y_val,
        y_pred_xgb=y_pred_xgb,
    )
)
validação

Unnamed: 0,y_val,y_pred_xgb
363369,1,1
158118,0,0
202455,0,0
115521,0,0
193355,0,0
...,...,...
185330,0,0
84822,0,0
447075,1,1
123340,0,0


In [None]:
# Evaluate XGBoost on the validation split: per-class precision/recall/F1,
# overall accuracy, and the confusion matrix (rows = true class, columns = predicted).
xgb_report = classification_report(y_val, y_pred_xgb)
xgb_accuracy = accuracy_score(y_val, y_pred_xgb)
xgb_confusion = confusion_matrix(y_val, y_pred_xgb)

print(xgb_report)
print('Acurácia: ', xgb_accuracy)
print(xgb_confusion)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       1.00      1.00      1.00     85294

    accuracy                           1.00    170589
   macro avg       1.00      1.00      1.00    170589
weighted avg       1.00      1.00      1.00    170589

Acurácia:  0.9998124146340034
[[85264    31]
 [    1 85293]]


# LightGBM - Accuracy: 0.9849521364214574 - F1-score: 0.99


In [None]:
# Wrap the training features and labels in LightGBM's Dataset container
lgb_train = lgb.Dataset(data=X_train, label=y_train)

In [None]:
# LightGBM training parameters for a binary classifier.
# NOTE(review): with max_depth=7 a tree can have at most 2**7 = 128 leaves,
# so num_leaves=1000 is presumably never reached — confirm the intent.
params = {
    'num_leaves': 1000,
    'objective': 'binary',
    'max_depth': 7,
    'learning_rate': .01,
    'max_bin': 200,
    'metric': ['auc', 'binary_logloss'],  # metrics reported during training
}

In [None]:
# Train the LightGBM booster for 50 boosting rounds on the prepared Dataset
lgb_model = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=50,
)


[LightGBM] [Info] Number of positive: 199021, number of negative: 199020
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.117686 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6000
[LightGBM] [Info] Number of data points in the train set: 398041, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500001 -> initscore=0.000005
[LightGBM] [Info] Start training from score 0.000005


In [None]:
# Score the validation features with the trained LightGBM booster.
# The result holds probabilities rather than class labels; they are
# converted to 0/1 in a later cell.
y_pred_lgb = lgb_model.predict(X_val)

## EVALUATING LGBM

In [None]:
# Side-by-side table of the true validation labels and the LightGBM scores
validação = pd.DataFrame(
    dict(
        y_val=y_val,
        y_pred_lgb=y_pred_lgb,
    )
)
validação

Unnamed: 0,y_val,y_pred_lgb
363369,1,0.696905
158118,0,0.304971
202455,0,0.337166
115521,0,0.329696
193355,0,0.313699
...,...,...
185330,0,0.304971
84822,0,0.304971
447075,1,0.696905
123340,0,0.304971


In [None]:
# Number of validation predictions returned by LightGBM
len(y_pred_lgb)

170589

In [None]:
# Convert the predicted probabilities into hard class labels (0 or 1).
# The comparison is vectorised over the whole array, so it adapts to any
# validation-set size instead of relying on a hard-coded element count.
# Labels stay floats (1.0 / 0.0), exactly what the element-wise loop produced
# when it wrote into the float probability array.
y_pred_lgb = np.where(y_pred_lgb >= 0.5, 1.0, 0.0)  # decision threshold: 0.5

In [None]:
# Side-by-side table of the true validation labels and the thresholded
# LightGBM predictions
validação_probabilidade = pd.DataFrame(
    dict(
        y_val=y_val,
        y_pred_lgb=y_pred_lgb,
    )
)
validação_probabilidade

Unnamed: 0,y_val,y_pred_lgb
363369,1,1.0
158118,0,0.0
202455,0,0.0
115521,0,0.0
193355,0,0.0
...,...,...
185330,0,0.0
84822,0,0.0
447075,1,1.0
123340,0,0.0


In [None]:
# Evaluate LightGBM on the validation split: per-class precision/recall/F1,
# overall accuracy, and the confusion matrix (rows = true class, columns = predicted).
lgb_report = classification_report(y_val, y_pred_lgb)
lgb_accuracy = accuracy_score(y_val, y_pred_lgb)
lgb_confusion = confusion_matrix(y_val, y_pred_lgb)

print(lgb_report)
print('Acurácia: ', lgb_accuracy)
print(lgb_confusion)

              precision    recall  f1-score   support

           0       0.98      0.99      0.99     85295
           1       0.99      0.98      0.98     85294

    accuracy                           0.98    170589
   macro avg       0.99      0.98      0.98    170589
weighted avg       0.99      0.98      0.98    170589

Acurácia:  0.9849521364214574
[[84591   704]
 [ 1863 83431]]


# Random Forest - Accuracy: 0.9998300007620655 - F1-score: 1.0

In [None]:
# Build a random forest with a fixed seed, fit it in place on the training
# split, then display the fitted estimator.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_model

In [None]:
# Predict class labels for the validation features with the fitted random forest
y_pred_rf = rf_model.predict(X_val)

## EVALUATING RANDOM FOREST

In [None]:
# Side-by-side table of the true validation labels and the random forest predictions
validação = pd.DataFrame(
    dict(
        y_val=y_val,
        y_pred_rf=y_pred_rf,
    )
)
validação

Unnamed: 0,y_val,y_pred_rf
363369,1,1
158118,0,0
202455,0,0
115521,0,0
193355,0,0
...,...,...
185330,0,0
84822,0,0
447075,1,1
123340,0,0


In [None]:
# Evaluate the random forest on the validation split: per-class
# precision/recall/F1, overall accuracy, and the confusion matrix
# (rows = true class, columns = predicted).
rf_report = classification_report(y_val, y_pred_rf)
rf_accuracy = accuracy_score(y_val, y_pred_rf)
rf_confusion = confusion_matrix(y_val, y_pred_rf)

print(rf_report)
print('Acurácia: ', rf_accuracy)
print(rf_confusion)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       1.00      1.00      1.00     85294

    accuracy                           1.00    170589
   macro avg       1.00      1.00      1.00    170589
weighted avg       1.00      1.00      1.00    170589

Acurácia:  0.9998300007620655
[[85268    27]
 [    2 85292]]


# Saving the model

In [None]:
# Persist the fitted random forest to 'modelo_rf.pkl' in the current working directory.
# joblib.dump returns the list of file names it wrote.
import joblib
joblib.dump(rf_model, 'modelo_rf.pkl')

['modelo_rf.pkl']

In [None]:
# Reload the saved model from disk to confirm the pickle round-trips.
# Only load pickle files from a trusted source (this one was written by the cell above).
modelo_carregado = joblib.load('modelo_rf.pkl')

In [None]:
# Sanity-check the reloaded model by predicting on the validation features.
# X_val is the hold-out split produced by train_test_split; the previously
# referenced X_test is never defined in the visible notebook, so the cell
# raised NameError on a fresh kernel.
modelo_carregado.predict(X_val)

array([1, 0, 0, ..., 1, 0, 0])