In [20]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split ,StratifiedKFold, cross_val_score 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

In [4]:
# Upload do dataset e leitura inicial dos dados 
df = pd.read_csv('fraud_dataset.csv')

In [5]:
df.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0


In [6]:
df.isnull().sum()

distance_from_home                0
distance_from_last_transaction    0
ratio_to_median_purchase_price    0
repeat_retailer                   0
used_chip                         0
used_pin_number                   0
online_order                      0
fraud                             0
dtype: int64

In [7]:
df.shape

(1000000, 8)

In [8]:
# Verificando se o dataset está desbalanceado
round(df['fraud'].value_counts(normalize=True)*100,2)

fraud
0.0    91.26
1.0     8.74
Name: proportion, dtype: float64

In [9]:
# Divisão da base de treino e teste antes da padronização

X = df.drop('fraud', axis=1)
y = df['fraud'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [10]:
print(X_train.shape, X_test.shape)

(800000, 7) (200000, 7)


In [11]:
# Padronização

scaler = StandardScaler()

scaler.fit(X_train)

X_train_escalonado = scaler.transform(X_train)
X_teste_escalonado = scaler.transform(X_test)

In [13]:
# Aplicação do Oversampling com SMOTE

smote = SMOTE(sampling_strategy='minority', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_escalonado, y_train)

In [14]:
print(pd.Series(y_train_resampled).value_counts(normalize=True)*100)

fraud
0.0    50.0
1.0    50.0
Name: proportion, dtype: float64


In [17]:
# Modelo de Regressão Logística e validação cruzada

# Modelo
model = LogisticRegression(penalty='l2')

# Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv = cv, scoring='roc_auc')

In [18]:
print(scores.mean())

0.9793964360941153


In [None]:
# Treinamento do modelo
model.fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_teste_escalonado)

In [None]:
# Métricas de avaliação
auc_score = roc_auc_score(y_test, y_pred)

print(classification_report(y_test, y_pred))

print (auc_score.mean())

              precision    recall  f1-score   support

         0.0       0.99      0.93      0.96    182519
         1.0       0.58      0.95      0.72     17481

    accuracy                           0.93    200000
   macro avg       0.79      0.94      0.84    200000
weighted avg       0.96      0.93      0.94    200000

0.9408199126613486


In [None]:
# Precisão está com overfitting na classe majoritária e a classe minoritária está muito baixo, considerar acima de 80% para um modelo mais eficiente para deploy

# Revisar hiperparametros e testar outros modelos como Random Forest ou XGBoost pelo desequilibrio das classes