In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import (train_test_split, RepeatedStratifiedKFold, 
                                     GridSearchCV)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (f1_score, precision_score, recall_score, 
                             roc_auc_score)

from imblearn.over_sampling import RandomOverSampler

from utils import get_data_metrics, get_model_metrics, sort_model_metrics, get_model_metric_comparison

In [2]:
## Globals
RANDOM_STATE = 42
TEST_SIZE = 0.33

if 'google.colab' in str(get_ipython()):
  from google.colab import drive
  drive.mount('/content/drive')
  data_path = 'drive/MyDrive/BigDataMaster/TFM/data/creditcard.csv'
else:
  data_path = '../../data/creditcard.csv'

In [3]:
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
X, y = df.drop(columns=['Class']), df['Class']

In [5]:
## before resampling
print(f'''Dataset Shape
    rows:    {df.shape[0]:,}
    columns: {df.shape[1]}
Classes unbalance in number
    Non Fraud (0): {df['Class'].value_counts()[0]:,}
    Fraud (1):     {df['Class'].value_counts()[1]:,}
Classes unbalance in percentage (%)
    Non Fraud (0): {(df['Class'].value_counts() / df.shape[0] * 100)[0]:.4f}%
    Fraud (1):     {(df['Class'].value_counts() / df.shape[0] * 100)[1]:.4f}%
''')

Dataset Shape
    rows:    284,807
    columns: 31
Classes unbalance in number
    Non Fraud (0): 284,315
    Fraud (1):     492
Classes unbalance in percentage (%)
    Non Fraud (0): 99.8273%
    Fraud (1):     0.1727%



In [6]:
ros = RandomOverSampler(random_state=RANDOM_STATE)
X_resampled, y_resampled = ros.fit_resample(X, y)

In [7]:
## after resampling
get_data_metrics(y_resampled, 'main')

[{'class': 0,
  'count': 284315,
  'percent': 0.5,
  'subset': 'main',
  'is_resampled': False,
  'res_method': ''},
 {'class': 1,
  'count': 284315,
  'percent': 0.5,
  'subset': 'main',
  'is_resampled': False,
  'res_method': ''}]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

In [9]:
get_data_metrics(y_train, 'train'), get_data_metrics(y_test, 'test')

([{'class': 0,
   'count': 190477,
   'percent': 0.9982024944974321,
   'subset': 'train',
   'is_resampled': False,
   'res_method': ''},
  {'class': 1,
   'count': 343,
   'percent': 0.001797505502567865,
   'subset': 'train',
   'is_resampled': False,
   'res_method': ''}],
 [{'class': 1,
   'count': 149,
   'percent': 0.0015853256301403386,
   'subset': 'test',
   'is_resampled': False,
   'res_method': ''},
  {'class': 0,
   'count': 93838,
   'percent': 0.9984146743698596,
   'subset': 'test',
   'is_resampled': False,
   'res_method': ''}])

In [10]:
lr = LogisticRegression(random_state=RANDOM_STATE,max_iter=1000, n_jobs=-1)\
    .fit(X_train, y_train)
y_pred = lr.predict(X_test)

metrics = {}
metrics['f_score'] = f1_score(y_test, y_pred)
metrics['recall'] = recall_score(y_test, y_pred)
metrics['precision'] = precision_score(y_test, y_pred)
metrics['g_mean'] = np.sqrt(metrics['recall'] * metrics['precision'])
metrics['roc_auc_score'] = roc_auc_score(y_test, y_pred)

sort_model_metrics(metrics)

[('precision', 0.8411214953271028),
 ('roc_auc_score', 0.801922841178092),
 ('g_mean', 0.7127832515010419),
 ('f_score', 0.7031249999999999),
 ('recall', 0.6040268456375839)]

In [11]:
X_train_resampled, X_test_resampled, y_train_resampled, y_test_resampled = train_test_split(
    X_resampled, y_resampled, test_size=TEST_SIZE, random_state=RANDOM_STATE)

In [12]:
get_data_metrics(y_train_resampled, 'train'), get_data_metrics(y_test_resampled, 'test')

([{'class': 0,
   'count': 190540,
   'percent': 0.5001286150001837,
   'subset': 'train',
   'is_resampled': False,
   'res_method': ''},
  {'class': 1,
   'count': 190442,
   'percent': 0.49987138499981626,
   'subset': 'train',
   'is_resampled': False,
   'res_method': ''}],
 [{'class': 0,
   'count': 93873,
   'percent': 0.5002611272169167,
   'subset': 'test',
   'is_resampled': False,
   'res_method': ''},
  {'class': 1,
   'count': 93775,
   'percent': 0.4997388727830832,
   'subset': 'test',
   'is_resampled': False,
   'res_method': ''}])

In [13]:
lr_resampled = LogisticRegression(random_state=RANDOM_STATE, max_iter=1000, n_jobs=-1)\
    .fit(X_train_resampled, y_train_resampled)
y_pred_resampled = lr_resampled.predict(X_test_resampled)

metrics_resampled = {}
metrics_resampled['f_score'] = f1_score(y_test_resampled, y_pred_resampled)
metrics_resampled['recall'] = recall_score(y_test_resampled, y_pred_resampled)
metrics_resampled['precision'] = precision_score(y_test_resampled, y_pred_resampled)
metrics_resampled['g_mean'] = np.sqrt(metrics_resampled['recall'] * metrics_resampled['precision'])
metrics_resampled['roc_auc_score'] = roc_auc_score(y_test_resampled, y_pred_resampled)

sort_model_metrics(metrics_resampled)

[('precision', 0.9629558778849292),
 ('roc_auc_score', 0.9382493351660505),
 ('g_mean', 0.9369281757914609),
 ('f_score', 0.9365765568567364),
 ('recall', 0.9116039755840337)]

In [14]:
get_model_metric_comparison(metrics, metrics_resampled)

{'precision': {'basic_points': 0.12183438255782642,
  'percentage': 0.14484754370763808},
 'roc_auc_score': {'basic_points': 0.13632649398795849,
  'percentage': 0.16999951490056503},
 'g_mean': {'basic_points': 0.224144924290419,
  'percentage': 0.3144643533898907},
 'f_score': {'basic_points': 0.23345155685673646,
  'percentage': 0.33201999197402526},
 'recall': {'basic_points': 0.3075771299464498,
  'percentage': 0.509211026244678}}