In [3]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules (e.g. src.utils) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import os, sys
sys.path.insert(0, '..')
from src.utils import performance_rank_df, performance_rank_n, performance_rank_f1_opt
from src.utils import plot_rank_precision_recall, plot_precision_recall

In [1]:
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix, f1_score
from sklearn import metrics

# Read Data

In [5]:
# Load the raw credit-card transactions; the `Class` column is used as the
# label in the split cell below.
df = pd.read_csv('../data/raw/creditcard.csv')
df.head()  # preview the first rows as the cell output

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


# Data split

In [6]:
# Separate the label (`Class`) from the feature columns.
y = df['Class']
# Use the `columns=` keyword: passing the axis positionally (`df.drop('Class', 1)`)
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
X = df.drop(columns='Class')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model

## Model hyperparameter optimization

In [14]:
%%time
max_samples_param = [pow(2,x) for x in range(8,18,1)]

max_samples_score = []
for param in max_samples_param:
    clf = IsolationForest(random_state=42, max_samples = param, n_estimators = 500, n_jobs = -1)
    clf.fit(X_train[y_train == 0]) # train with inliers

    # predictions (soft)
    y_score_train = clf.score_samples(X_train) 

    # compute performance
    df_pf_train = performance_rank_df(y_train, y_score_train, if_score = True)

    print(param, max_samples_score, performance_rank_f1_opt(df_pf_train).F1_score)
    max_samples_score.append(performance_rank_f1_opt(df_pf_train).F1_score)


In [13]:
# Pair every candidate `max_samples` value with the F1 score recorded for it.
list(zip(max_samples_param, max_samples_score))

[(256, 0.2222222222222222),
 (512, 0.25763612217795484),
 (1024, 0.27578475336322866),
 (2048, 0.2751322751322751),
 (4096, 0.3004587155963303),
 (8192, 0.33405639913232105),
 (16384, 0.3738872403560831),
 (32768, 0.41218274111675124),
 (65536, 0.4225053078556263),
 (131072, 0.4508928571428571)]

## Final model

In [15]:
# Fit the final model with the best-scoring `max_samples` from the sweep.
# NOTE(review): the sweep used n_estimators=500, while this fit relies on the
# IsolationForest default for n_estimators — confirm the mismatch is intended.
best_max_samples = max_samples_param[np.argmax(max_samples_score)]
clf_final = IsolationForest(max_samples=best_max_samples, random_state=42)

inlier_rows = y_train == 0
clf_final.fit(X_train[inlier_rows])  # train with inliers only

IsolationForest(max_samples=131072, random_state=42)

In [16]:
# Soft predictions: continuous anomaly scores from the final model for both splits.
y_score_train, y_score_test = (
    clf_final.score_samples(features) for features in (X_train, X_test)
)

In [17]:
# Build the rank-based performance table for the test split and display the
# summary returned by `performance_rank_n` as the cell output.
# NOTE(review): the meaning of `if_score=True` is defined in src.utils, not visible here — confirm.
df_pf_test = performance_rank_df(y_test, y_score_test, if_score = True)
performance_rank_n(df_pf_test)

Unnamed: 0_level_0,Precision,Recall,F1_score
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100,0.42,0.428571,0.424242
500,0.142,0.72449,0.237458
1000,0.079,0.806122,0.143898
10000,0.0091,0.928571,0.018023


In [20]:
# ROC AUC on the test split. The anomaly score is negated before being passed
# in, so that larger values count towards the positive class (Class == 1).
fpr, tpr, thresholds = metrics.roc_curve(
    np.asarray(y_test),
    -y_score_test,
    pos_label=1,
)
metrics.auc(fpr, tpr)

0.954115009819347

In [None]:
# Compare the anomaly-score distributions of legitimate and fraudulent test
# transactions. Each histogram is normalized separately (density=True) so the
# much smaller fraud class remains visible next to the clean class.
is_fraud = (y_test == 1).to_numpy()  # plain boolean mask for indexing the score array
clean = y_score_test[~is_fraud]
fraud = y_score_test[is_fraud]

fig, ax = plt.subplots()
ax.hist(clean, bins=50, density=True, label="clean", alpha=.6, color="green")
ax.hist(fraud, bins=50, density=True, label="fraud", alpha=.6, color="red")

# The plotted quantity is the Isolation Forest `score_samples` output, not a
# reconstruction error, so the title and axis labels say so.
ax.set_title("(Normalized) Distribution of the Isolation Forest Anomaly Score")
ax.set_xlabel("Anomaly score (lower = more anomalous)")
ax.set_ylabel("Density")
ax.legend()
plt.show()