In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import (train_test_split, StratifiedShuffleSplit, 
                                     RepeatedStratifiedKFold, GridSearchCV)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score

from imblearn.over_sampling import SMOTE


DATA_PATH = '../../data/creditcard.csv'
RANDOM_STATE = 42
TEST_SIZE = 0.25

### Table of Content
insert content in here!!

In [2]:
df = pd.read_csv(DATA_PATH)

## Info about the dataset

- **Time:** Feature “Time” denotes the seconds elapsed between each transaction and the first transaction in the dataset

- **Features:** The meaning of most variables is not revealed and the features have been transformed by means of principal components. The cardholder identifier is also not available so each transaction can be considered independent from the others.

- **Amount:** The feature “Amount” is the transaction amount, which can be used for example-dependent cost-sensitive learning.

- **Class:** Feature “Class” is the response variable and it takes value 1 in case of fraud and 0 otherwise

In [3]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
print(f'''Dataset Shape
    rows:    {df.shape[0]:,}
    columns: {df.shape[1]}
Classes unbalance in number
    Non Fraud (0): {df['Class'].value_counts()[0]:,}
    Fraud (1):     {df['Class'].value_counts()[1]:,}
Classes unbalance in percentage (%)
    Non Fraud (0): {(df['Class'].value_counts() / df.shape[0] * 100)[0]:.4f}%
    Fraud (1):     {(df['Class'].value_counts() / df.shape[0] * 100)[1]:.4f}%
''')

Dataset Shape
    rows:    284,807
    columns: 31
Classes unbalance in number
    Non Fraud (0): 284,315
    Fraud (1):     492
Classes unbalance in percentage (%)
    Non Fraud (0): 99.8273%
    Fraud (1):     0.1727%



In [5]:
X, y = df.drop(columns=['Class']), df['Class']

## Resampling

In [6]:
# Define the resampling method
method = SMOTE(
    sampling_strategy='auto',
    random_state=RANDOM_STATE,
    k_neighbors=5,
    n_jobs=-1,
)
X_resampled, y_resampled = method.fit_resample(X, y)

In [7]:
print(f'''X Shape
    rows:    {X_resampled.shape[0]:,}
    columns: {X_resampled.shape[1]}
''')
print(f'''y Shape
    rows:    {y_resampled.shape[0]:,}
''')
print(f'''Classes unbalance in number
    Non Fraud (0): {y_resampled.value_counts()[0]:,}
    Fraud (1):     {y_resampled.value_counts()[1]:,}
''')
print(f'''Classes unbalance in percentage (%)
    Non Fraud (0): {(y_resampled.value_counts() / df.shape[0] * 100)[0]:.4f}%
    Fraud (1):     {(y_resampled.value_counts() / df.shape[0] * 100)[1]:.4f}%
''')

X Shape
    rows:    568,630
    columns: 30

y Shape
    rows:    568,630

Classes unbalance in number
    Non Fraud (0): 284,315
    Fraud (1):     284,315

Classes unbalance in percentage (%)
    Non Fraud (0): 99.8273%
    Fraud (1):     99.8273%



## Split training and test data

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)
X_train_resampled, X_test_resampled, y_train_resampled, y_test_resampled = train_test_split(
    X_resampled, y_resampled, test_size=TEST_SIZE, random_state=RANDOM_STATE)

In [9]:
## non-resampled data split
print(f'''X_train / X_test Shape
    rows:    {X_train.shape[0]:,} / {X_test.shape[0]:,}
    columns: {X_train.shape[1]} / {X_test.shape[1]}
''')
print(f'''y_train / y_test Shape
    rows:    {y_train.shape[0]:,} / {y_test.shape[0]:,}
''')
print(f'''Classes unbalance in number for y_train / y_test
    Non Fraud (0): {y_train.value_counts()[0]:,} / {y_test.value_counts()[0]:,}
    Fraud (1):     {y_train.value_counts()[1]:,} / {y_test.value_counts()[1]:,}
''')
print(f'''Classes unbalance in percentage (%) for y_train / y_test
    Non Fraud (0): {(y_train.value_counts() / df.shape[0] * 100)[0]:.4f}% / {(y_test.value_counts() / df.shape[0] * 100)[0]:.4f}%
    Fraud (1):     {(y_train.value_counts() / df.shape[0] * 100)[1]:.4f}% / {(y_test.value_counts() / df.shape[0] * 100)[1]:.4f}%
''')

X_train / X_test Shape
    rows:    213,605 / 71,202
    columns: 30 / 30

y_train / y_test Shape
    rows:    213,605 / 71,202

Classes unbalance in number for y_train / y_test
    Non Fraud (0): 213,226 / 71,089
    Fraud (1):     379 / 113

Classes unbalance in percentage (%) for y_train / y_test
    Non Fraud (0): 74.8668% / 24.9604%
    Fraud (1):     0.1331% / 0.0397%



In [10]:
## resampled data split
print(f'''X_train_resampled / X_test_resampled Shape
    rows:    {X_train_resampled.shape[0]:,} / {X_test_resampled.shape[0]:,}
    columns: {X_train_resampled.shape[1]} / {X_test_resampled.shape[1]}
''')
print(f'''y_train_resampled / y_test_resampled Shape
    rows:    {y_train_resampled.shape[0]:,} / {y_test_resampled.shape[0]:,}
''')
print(f'''Classes unbalance in number for y_train_resampled / y_test_resampled
    Non Fraud (0): {y_train_resampled.value_counts()[0]:,} / {y_test_resampled.value_counts()[0]:,}
    Fraud (1):     {y_train_resampled.value_counts()[1]:,} / {y_test_resampled.value_counts()[1]:,}
''')
print(f'''Classes unbalance in percentage (%) for y_train_resampled / y_test_resampled
    Non Fraud (0): {(y_train_resampled.value_counts() / df.shape[0] * 100)[0]:.4f}% / {(y_test_resampled.value_counts() / df.shape[0] * 100)[0]:.4f}%
    Fraud (1):     {(y_train_resampled.value_counts() / df.shape[0] * 100)[1]:.4f}% / {(y_test_resampled.value_counts() / df.shape[0] * 100)[1]:.4f}%
''')

X_train_resampled / X_test_resampled Shape
    rows:    426,472 / 142,158
    columns: 30 / 30

y_train_resampled / y_test_resampled Shape
    rows:    426,472 / 142,158

Classes unbalance in number for y_train_resampled / y_test_resampled
    Non Fraud (0): 213,292 / 71,023
    Fraud (1):     213,180 / 71,135

Classes unbalance in percentage (%) for y_train_resampled / y_test_resampled
    Non Fraud (0): 74.8900% / 24.9372%
    Fraud (1):     74.8507% / 24.9766%



## ML models

In [11]:
lr = LogisticRegression(random_state=RANDOM_STATE,max_iter=1000, n_jobs=-1)\
    .fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [12]:
metrics = {}
metrics['f_score'] = f1_score(y_test, y_pred)
metrics['recall'] = recall_score(y_test, y_pred)
metrics['precision'] = precision_score(y_test, y_pred)
metrics['g_mean'] = np.sqrt(metrics['recall'] * metrics['precision'])
metrics['roc_auc_score'] = roc_auc_score(y_test, y_pred)

metrics

{'f_score': 0.6421052631578946,
 'recall': 0.5398230088495575,
 'precision': 0.7922077922077922,
 'g_mean': 0.6539510639364963,
 'roc_auc_score': 0.7697989694334297}

In [13]:
lr_resampled = LogisticRegression(random_state=RANDOM_STATE,max_iter=1000, n_jobs=-1)\
    .fit(X_train_resampled, y_train_resampled)
y_pred_resampled = lr_resampled.predict(X_test_resampled)

In [14]:
metrics_resampled = {}
metrics_resampled['f_score'] = f1_score(y_test_resampled, y_pred_resampled)
metrics_resampled['recall'] = recall_score(y_test_resampled, y_pred_resampled)
metrics_resampled['precision'] = precision_score(y_test_resampled, y_pred_resampled)
metrics_resampled['g_mean'] = np.sqrt(metrics_resampled['recall'] * metrics_resampled['precision'])
metrics_resampled['roc_auc_score'] = roc_auc_score(y_test_resampled, y_pred_resampled)

metrics_resampled

{'f_score': 0.9735067881322889,
 'recall': 0.9651929429957123,
 'precision': 0.9819651029748284,
 'g_mean': 0.9735429048888202,
 'roc_auc_score': 0.9737190655870948}