In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix, accuracy_score
    )
from sklearn.ensemble import ExtraTreesClassifier
from category_encoders.binary import BinaryEncoder
import xgboost as xgb
import lightgbm as lgbm
%matplotlib inline

In [2]:
%load_ext autoreload

%autoreload 2

In [3]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

# Synthetic Fraud data

### Load data

In [4]:
fraud_data = pd.read_csv('data/PS_20174392719_1491204439457_log.csv')

### Encode categorical variables in numeric or binary format depending on cardinality

XGBoost does not accept categorical values in integer form, so we have to encode them differently. For comparison, the key values for the ExtraTrees classifier (out-of-box, with balanced weights) were Precision 1.0, Recall 0.6939. It will be interesting to see how the binary encoding changes these values.  


In [5]:
fraud_data['type_enc'] = LabelEncoder().fit_transform(fraud_data['type'])

In [6]:
benc = BinaryEncoder(cols=['nameOrig', 'nameDest'])

In [7]:
fraud_data.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud', 'type_enc'],
      dtype='object')

In [8]:
fraud_data = benc.fit_transform(fraud_data)

In [9]:
fraud_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 57 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig_0      int64  
 4   nameOrig_1      int64  
 5   nameOrig_2      int64  
 6   nameOrig_3      int64  
 7   nameOrig_4      int64  
 8   nameOrig_5      int64  
 9   nameOrig_6      int64  
 10  nameOrig_7      int64  
 11  nameOrig_8      int64  
 12  nameOrig_9      int64  
 13  nameOrig_10     int64  
 14  nameOrig_11     int64  
 15  nameOrig_12     int64  
 16  nameOrig_13     int64  
 17  nameOrig_14     int64  
 18  nameOrig_15     int64  
 19  nameOrig_16     int64  
 20  nameOrig_17     int64  
 21  nameOrig_18     int64  
 22  nameOrig_19     int64  
 23  nameOrig_20     int64  
 24  nameOrig_21     int64  
 25  nameOrig_22     int64  
 26  nameOrig_23     int64  
 27  oldbalanceOrg   float64
 28  newbalanceOr

In [10]:
train_cols = ['step', 'amount', 'nameOrig_0', 'nameOrig_1', 'nameOrig_2',
       'nameOrig_3', 'nameOrig_4', 'nameOrig_5', 'nameOrig_6', 'nameOrig_7',
       'nameOrig_8', 'nameOrig_9', 'nameOrig_10', 'nameOrig_11', 'nameOrig_12',
       'nameOrig_13', 'nameOrig_14', 'nameOrig_15', 'nameOrig_16',
       'nameOrig_17', 'nameOrig_18', 'nameOrig_19', 'nameOrig_20',
       'nameOrig_21', 'nameOrig_22', 'nameOrig_23', 'oldbalanceOrg',
       'newbalanceOrig', 'nameDest_0', 'nameDest_1', 'nameDest_2',
       'nameDest_3', 'nameDest_4', 'nameDest_5', 'nameDest_6', 'nameDest_7',
       'nameDest_8', 'nameDest_9', 'nameDest_10', 'nameDest_11', 'nameDest_12',
       'nameDest_13', 'nameDest_14', 'nameDest_15', 'nameDest_16',
       'nameDest_17', 'nameDest_18', 'nameDest_19', 'nameDest_20',
       'nameDest_21', 'nameDest_22', 'oldbalanceDest', 'newbalanceDest',
       'type_enc']
label_col = ['isFraud']

### Divide up training and validation data

In [11]:
train_length = np.round(len(fraud_data.index) * 0.9, 0)
train_X = fraud_data.loc[:train_length, train_cols]
train_y = fraud_data.loc[:train_length, label_col]
valid_X = fraud_data.loc[train_length:, train_cols]
valid_y = fraud_data.loc[train_length:, label_col]

### Test new encoding through previously tried and new models

In [27]:
clf = lgbm.LGBMClassifier(n_estimators=1000, random_state=42, class_weight='balanced', objective='binary')

clf.fit(train_X, np.ravel(train_y), categorical_feature=['type_enc'])

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

New categorical_feature is ['type_enc']


Precision:  0.24986512524084778
Recall:  0.861317747077577
ROC score:  0.9229647780633877
F1 score:  0.3873588625365912
Accuracy score:  0.9838824257931481


In [28]:
clf = xgb.XGBClassifier(random_state=42)

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.9974838245866283
Recall:  0.7372476089266737
ROC score:  0.8686182708490013
F1 score:  0.8478460128322639
Accuracy score:  0.9984346071272526


In [29]:
clf = ExtraTreesClassifier(n_estimators=250, random_state=42, class_weight='balanced')

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

  _warn_prf(average, modifier, msg_start, len(result))


Precision:  0.0
Recall:  0.0
ROC score:  0.5
F1 score:  0.0
Accuracy score:  0.994084198019055


In [30]:
clf = xgb.XGBClassifier(random_state=42, tree_method='exact')

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.9974901398350663
Recall:  0.7391073326248672
ROC score:  0.869548132698098
F1 score:  0.8490767587364565
Accuracy score:  0.9984456088843904


Let's try out the TomekLinks, RandomOverSampler and SMOTETomek. They have a good balance of precision and recall in the imbalanced library. If that doesn't do it, we might have to run a gridsearch on xgboost hyperparameters or try and concoct a new feature. Although I wonder what that is going to be.

In [None]:
from imblearn.under_sampling import TomekLinks

toli = TomekLinks()

train_X_resampled, train_y_resampled = toli.fit_resample(train_X, train_y)

In [14]:
clf = xgb.XGBClassifier(random_state=42)

clf.fit(train_X_resampled, np.ravel(train_y_resampled))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.997482919813017
Recall:  0.7369819341126461
ROC score:  0.8684854334419875
F1 score:  0.8476699770817417
Accuracy score:  0.9984330354476615


In [15]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)

train_X_resampled, train_y_resampled = ros.fit_resample(train_X, train_y)

clf = xgb.XGBClassifier(random_state=42)

clf.fit(train_X_resampled, np.ravel(train_y_resampled))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.9084310441094361
Recall:  0.8645058448459086
ROC score:  0.9319936330655176
F1 score:  0.8859243125510482
Accuracy score:  0.9986829325026483


In [16]:
from imblearn.combine import SMOTETomek

smotetomek = SMOTETomek(random_state=42)

train_X_resampled, train_y_resampled = smotetomek.fit_resample(train_X, train_y)

clf = xgb.XGBClassifier(random_state=42)

clf.fit(train_X_resampled, np.ravel(train_y_resampled))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.8836702954898912
Recall:  0.7547821466524973
ROC score:  0.8770954202174641
F1 score:  0.8141567559822324
Accuracy score:  0.997961531570328
