# iForest

In [1]:
import warnings
warnings.filterwarnings('ignore')

## General libraries

In [2]:
import os
import sys

import pandas as pd
import numpy as np

from os.path import join
import json
import datetime

import shap
from shap_selection import feature_selection

from sklearn.preprocessing import OneHotEncoder
import time

### Load environment variables

In [3]:
from dotenv import load_dotenv

# Read the .env file located one directory above the working directory.
load_dotenv('../.env')

# All three settings are mandatory: a missing variable raises KeyError here.
code_root, cfg_path, data_root = (
    os.environ[var_name] for var_name in ('CODE_ROOT', 'CFG_PATH', 'DATA_ROOT')
)

# Give the project code root top priority on the import path
# (presumably so the `src.*` imports used by this notebook resolve).
sys.path.insert(0, code_root)

### Specific libraries

In [4]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams
from src.model.functions import train_and_evaluate_iforest

## General parameters

In [5]:
# Experiment grid handed to train_and_evaluate_iforest further down.

# Repetition counts (forwarded as `n_iter_fs` and `n_iter`).
n_iter_fs = 1
n_iter = 1

# Contamination settings to sweep (forwarded as `contamination_percentage`).
# NOTE(review): the recorded run log and results table show a contamination
# of 0.04, not 0.8 - confirm whether the helper rescales this value or the
# saved outputs come from an earlier configuration.
contamination_percentage = [0.8]

# Numbers of trees to sweep (forwarded as `n_tree_estimators`).
trees = [1, 5, 25, 50, 75, 100]

## Credit Card

**Dataset source**: https://www.kaggle.com/mlg-ulb/creditcardfraud

**Additional sources:**

Andrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015

Dal Pozzolo, Andrea; Caelen, Olivier; Le Borgne, Yann-Ael; Waterschoot, Serge; Bontempi, Gianluca. Learned lessons in credit card fraud detection from a practitioner perspective, Expert systems with applications,41,10,4915-4928,2014, Pergamon

Dal Pozzolo, Andrea; Boracchi, Giacomo; Caelen, Olivier; Alippi, Cesare; Bontempi, Gianluca. Credit card fraud detection: a realistic modeling and a novel learning strategy, IEEE transactions on neural networks and learning systems,29,8,3784-3797,2018,IEEE

Dal Pozzolo, Andrea. Adaptive Machine learning for credit card fraud detection, ULB MLG PhD thesis (supervised by G. Bontempi)

Carcillo, Fabrizio; Dal Pozzolo, Andrea; Le Borgne, Yann-Aël; Caelen, Olivier; Mazzer, Yannis; Bontempi, Gianluca. Scarff: a scalable framework for streaming credit card fraud detection with Spark, Information fusion,41, 182-194,2018,Elsevier

Carcillo, Fabrizio; Le Borgne, Yann-Aël; Caelen, Olivier; Bontempi, Gianluca. Streaming active learning strategies for real-life credit card fraud detection: assessment and visualization, International Journal of Data Science and Analytics, 5,4,285-300,2018,Springer International Publishing

Bertrand Lebichot, Yann-Aël Le Borgne, Liyun He, Frederic Oblé, Gianluca Bontempi. Deep-Learning Domain Adaptation Techniques for Credit Cards Fraud Detection, INNSBDDL 2019: Recent Advances in Big Data and Deep Learning, pp 78-88, 2019

Fabrizio Carcillo, Yann-Aël Le Borgne, Olivier Caelen, Frederic Oblé, Gianluca Bontempi. Combining Unsupervised and Supervised Learning in Credit Card Fraud Detection, Information Sciences, 2019

Yann-Aël Le Borgne, Gianluca Bontempi. Machine Learning for Credit Card Fraud Detection - Practical Handbook

In [6]:
# Identifier of the dataset used throughout the notebook; it is also embedded
# in the output file names built later on.
dataset_id = 'creditcard'
# Load the dataset through the project helper, reading from `data_root`.
data = get_fs_dataset(dataset_id, data_root)
# Dataset-specific hyperparameters, later passed to the training helper as `hyper`.
hyper = fs_datasets_hyperparams(dataset_id)
# Display (rows, columns) as a quick sanity check.
data.shape

(284807, 29)

In [7]:
# Class balance: count the non-null `V1` entries for each value of the
# label column `y`.
data.pivot_table(values='V1', index='y', aggfunc='count')

Unnamed: 0_level_0,V1
y,Unnamed: 1_level_1
0,284315
1,492


In [8]:
# No columns are excluded for this dataset; the empty list is forwarded to
# the training helper as `excluded_cols`.
excluded_cols = list()

### iForest

## Parameters

In [9]:
# Both artefacts live under <data_root>/outputs.
outputs_dir = os.path.join(data_root, "outputs")

# Input: feature-importance table read just below.
# NOTE(review): this path has no ".parquet" extension, unlike `path_shap` -
# presumably a parquet dataset directory or an extensionless file; confirm.
path_fi_shap = os.path.join(outputs_dir, f"{dataset_id}_fi_shap")

# Output: destination of the results table written at the end of the notebook.
path_shap = os.path.join(outputs_dir, f"{dataset_id}_shap.parquet")

fi_shap_all = pd.read_parquet(path_fi_shap)

## Model training with iterations HPO and FS

In [10]:
# Capture the start time
start_time = datetime.datetime.now()

df = train_and_evaluate_iforest(data, dataset_id=dataset_id, hyper=hyper, fi_df=fi_shap_all, n_tree_estimators=trees, contamination_percentage=contamination_percentage, excluded_cols=excluded_cols, n_iter_fs=n_iter_fs, n_iter=n_iter)
df['shap_q1'] = df['shap_stab'].apply(lambda x: np.percentile(x, 75))
df['shap_q2'] = df['shap_stab'].apply(lambda x: np.percentile(x, 50))
df['shap_q3'] = df['shap_stab'].apply(lambda x: np.percentile(x, 25))

df['shap_ad_q1'] = df['shap_stab_ad'].apply(lambda x: np.percentile(x, 75))
df['shap_ad_q2'] = df['shap_stab_ad'].apply(lambda x: np.percentile(x, 50))
df['shap_ad_q3'] = df['shap_stab_ad'].apply(lambda x: np.percentile(x, 25))

# Capture the finish time
finish_time = datetime.datetime.now()

# Calculate the duration
duration = finish_time - start_time

print(f"Duration: {duration}")

Iteration by tree number: 1
  Iteration by contamination: 0.04
    Number of featured: 2
     Iteration by feat number: 2
    Number of featured: 4
     Iteration by feat number: 4
    Number of featured: 6
     Iteration by feat number: 6
    Number of featured: 9
     Iteration by feat number: 9
    Number of featured: 11
     Iteration by feat number: 11
    Number of featured: 14
     Iteration by feat number: 14
    Number of featured: 17
     Iteration by feat number: 17
    Number of featured: 20
     Iteration by feat number: 20
    Number of featured: 24
     Iteration by feat number: 24
    Number of featured: 28
     Iteration by feat number: 28
    Number of featured: 28
     Iteration by feat number: 28
Iteration by tree number: 5
  Iteration by contamination: 0.04
    Number of featured: 2
     Iteration by feat number: 2
    Number of featured: 4
     Iteration by feat number: 4
    Number of featured: 6
     Iteration by feat number: 6
    Number of featured: 9
     Ite

In [11]:
# Display the results table: one row per evaluated configuration, including
# the quartile summary columns added in the training cell.
df

Unnamed: 0,n_estimators,max_feats,contamination,n_feats,n_iter,n_iter_fs,roc_auc,model_stab,shap_stab,shap_stab_ad,f1_median,recall,precision,confusion_matrix,shap_q1,shap_q2,shap_q3,shap_ad_q1,shap_ad_q2,shap_ad_q3
0,1,2,0.04,2,1,1,0.816016,0.836016,"[0.0002472115422974541, 0.020446319741590835, ...","[0.0002472115422974541, 1.0, 0.083711434129881...",0.000000,0.000000,0.000000,1,0.400148,0.083711,0.020446,1.000000,0.083711,0.083711
1,1,4,0.04,4,1,1,0.743089,0.820371,"[0.19958265551253485, 0.36089172884314824, 0.3...","[0.40486635410133487, 0.42905306338985416, 0.7...",0.000000,0.000000,0.000000,1,0.488673,0.354720,0.168672,0.955761,0.704608,0.422802
2,1,6,0.04,6,1,1,0.798729,0.804685,"[0.26717737657240626, 0.4012410550970008, 0.26...","[0.5257387900349739, 0.637959085994019, 0.8878...",0.000000,0.000000,0.000000,1,0.567913,0.482596,0.426155,0.884748,0.772252,0.562783
3,1,9,0.04,9,1,1,0.857441,0.812805,"[0.3734526390136692, 0.5779261044149511, 0.465...","[0.46412069117180466, 0.5638627801569716, 0.80...",0.000000,0.000000,0.000000,1,0.536198,0.483637,0.429061,0.836444,0.774592,0.563863
4,1,11,0.04,11,1,1,0.484236,0.805148,"[0.6066086849740002, 0.528559499174339, 0.5887...","[0.6252968950670175, 0.41537309276735845, 0.78...",0.000000,0.000000,0.000000,1,0.577875,0.530540,0.486200,0.729200,0.694773,0.594055
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,100,17,0.04,17,1,1,0.948050,0.983021,"[0.9193203657092407, 0.9149666461066488, 0.936...","[0.8909063987749885, 0.9484568095627717, 0.917...",0.069782,0.837563,0.036408,1,0.937007,0.905889,0.865898,0.947700,0.924099,0.906893
62,100,20,0.04,20,1,1,0.939448,0.981450,"[0.915875088599827, 0.8730598050974154, 0.9455...","[0.9607910231186958, 0.8831566577907408, 0.839...",0.066958,0.817259,0.034909,1,0.922714,0.890693,0.846839,0.925209,0.898485,0.859043
63,100,24,0.04,24,1,1,0.933684,0.983064,"[0.9252315018671834, 0.89084275639985, 0.89691...","[0.928591396785613, 0.8596013983839405, 0.9364...",0.066542,0.812183,0.034692,1,0.911987,0.880061,0.838350,0.936449,0.915396,0.898025
64,100,28,0.04,28,1,1,0.942350,0.980799,"[0.889118465505014, 0.8588835813037821, 0.9098...","[0.9308200099219291, 0.9224715165131191, 0.877...",0.067336,0.817259,0.035115,1,0.899227,0.865962,0.824446,0.919032,0.890392,0.858145


In [12]:
# Persist the full results table (including the quartile summary columns)
# to the parquet file at `path_shap`.
df.to_parquet(path=path_shap)