# iForest

In [1]:
import warnings
warnings.filterwarnings('ignore')

## General libraries

In [2]:
import os
import sys

import pandas as pd
import numpy as np

from os.path import join
import json
import datetime

import shap
from shap_selection import feature_selection

from sklearn.preprocessing import OneHotEncoder
import time

### Load environment variables

In [3]:
from dotenv import load_dotenv

# Read the .env file one directory above the notebook's working directory.
load_dotenv('../.env')

# Required settings, looked up in order; a missing key raises KeyError.
code_root, cfg_path, data_root = (
    os.environ[key] for key in ('CODE_ROOT', 'CFG_PATH', 'DATA_ROOT')
)

# Put the project code root first on the import path so `src.*` resolves.
sys.path.insert(0, code_root)

### Specific libraries

In [4]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams
from src.model.functions import train_and_evaluate_iforest

## General parameters

In [5]:
# Repetition counts forwarded to train_and_evaluate_iforest as n_iter_fs / n_iter.
n_iter_fs, n_iter = 1, 1

# NOTE(review): the training log reports a contamination of 0.117, which equals
# 0.8 x the outlier ratio (66 / 452) -- presumably this value scales that ratio;
# confirm against train_and_evaluate_iforest.
contamination_percentage = [0.8]

# Forest sizes to sweep: every size from 1 to 25, then 50 to 1000 in steps of 25.
trees = [*range(1, 26), *range(50, 1001, 25)]

# For a quick smoke run, swap in the reduced grid: trees = list(range(1, 11))

## Arrhythmia

**Dataset source**: http://odds.cs.stonybrook.edu/arrhythmia-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources**:

Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation forest.” 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

K. M. Ting, J. T. S. Chuan, and F. T. Liu. “Mass: A New Ranking Measure for Anomaly Detection.” IEEE Transactions on Knowledge and Data Engineering, 2009.

F. Keller, E. Muller, K. Bohm. “HiCS: High-contrast subspaces for density-based outlier ranking.” ICDE, 2012.

In [6]:
# Key identifying the dataset; passed to both project loaders below.
dataset_id = 'arrhythmia'

# Load the dataset and (presumably) its per-dataset hyperparameters through the
# project helpers -- their exact return types are not visible from this notebook.
data = get_fs_dataset(dataset_id, data_root)
hyper = fs_datasets_hyperparams(dataset_id)
# Last expression: display the dimensions of `data` as the cell output.
data.shape

(452, 275)

In [7]:
# Class balance: number of non-null 'Col1' entries for each value of the label column 'y'.
data.pivot_table(values='Col1', index='y', aggfunc='count')

Unnamed: 0_level_0,Col1
y,Unnamed: 1_level_1
0,386
1,66


In [8]:
# Column names handed to train_and_evaluate_iforest through its `excluded_cols` argument.
excluded_cols = [
    'Col15', 'Col63', 'Col65', 'Col79',
    'Col127', 'Col128', 'Col135', 'Col137', 'Col139',
    'Col141', 'Col147', 'Col152', 'Col153', 'Col160',
    'Col200', 'Col260', 'Col270',
]

### iForest

## Parameters

In [9]:
# Both artefacts live under <data_root>/outputs and are keyed by dataset_id.
# NOTE(review): path_fi_shap carries no ".parquet" extension, unlike path_shap --
# confirm this matches how the upstream step writes the feature-importance data.
path_fi_shap = join(data_root, "outputs", f"{dataset_id}_fi_shap")  # read below
path_shap = join(data_root, "outputs", f"{dataset_id}_shap.parquet")

# Feature-importance table that is passed to the training routine as `fi_df`.
fi_shap_all = pd.read_parquet(path_fi_shap)

## Model training: iterating over hyperparameter optimization (HPO) and feature selection (FS)

In [10]:
# Wall-clock timestamp taken immediately before the sweep.
start_time = datetime.datetime.now()

# Run the sweep over the tree counts, contamination values and feature subsets
# configured above; the returned table is kept in `df`.
df = train_and_evaluate_iforest(
    data,
    dataset_id=dataset_id,
    hyper=hyper,
    fi_df=fi_shap_all,
    n_tree_estimators=trees,
    contamination_percentage=contamination_percentage,
    excluded_cols=excluded_cols,
    n_iter_fs=n_iter_fs,
    n_iter=n_iter,
)

# Wall-clock timestamp taken right after the sweep, and the elapsed timedelta.
finish_time = datetime.datetime.now()
duration = finish_time - start_time

print(f"Duration: {duration}")

Iteration by tree number: 1
  Iteration by contamination: 0.117
    Number of featured: 8
     Iteration by feat number: 2
     Iteration by feat number: 3
     Iteration by feat number: 4
     Iteration by feat number: 6
     Iteration by feat number: 8
    Number of featured: 18
     Iteration by feat number: 3
     Iteration by feat number: 7
     Iteration by feat number: 10
     Iteration by feat number: 14
     Iteration by feat number: 18
    Number of featured: 29
     Iteration by feat number: 5
     Iteration by feat number: 11
     Iteration by feat number: 17
     Iteration by feat number: 23
     Iteration by feat number: 29
    Number of featured: 42
     Iteration by feat number: 8
     Iteration by feat number: 16
     Iteration by feat number: 25
     Iteration by feat number: 33
     Iteration by feat number: 42
    Number of featured: 57
     Iteration by feat number: 11
     Iteration by feat number: 22
     Iteration by feat number: 34
     Iteration by feat number

In [11]:
# Display the results table returned by train_and_evaluate_iforest as the cell output.
df

Unnamed: 0,n_estimators,max_feats,contamination,n_feats,n_iter,n_iter_fs,roc_auc,model_stab,shap_stab,shap_stab_ad,f1_median,recall,precision,confusion_matrix
0,1,2,0.117,8,1,1,0.586725,0.998827,"[0.9999999998920465, 1.0, 0.8531782109764015, ...","[0.9999999998920465, 0.9999999998920465, 0.999...",0.181818,0.192308,0.172414,1
1,1,3,0.117,8,1,1,0.622581,0.678238,"[0.9999999998920465, 0.9097983834391725, 0.806...","[0.9999999998920465, 0.8531780906355251, 0.848...",0.117647,0.076923,0.250000,1
2,1,4,0.117,8,1,1,0.691811,0.794181,"[0.8909178517028281, 0.9160773893154357, 0.766...","[0.8909178517028281, 0.999966507839134, 0.8482...",0.266667,0.230769,0.315789,1
3,1,6,0.117,8,1,1,0.522208,0.750685,"[0.694003555362308, 0.7769336979945826, 0.7545...","[0.694003555362308, 0.7550428170300026, 0.5423...",0.090909,0.076923,0.111111,1
4,1,8,0.117,8,1,1,0.492184,0.809821,"[0.7689181718953443, 0.6066207613525259, 0.473...","[0.7689181718953443, 0.5628603693971861, 0.490...",0.181818,0.153846,0.222222,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3515,1000,51,0.117,257,1,1,0.811166,0.990005,"[0.9674379954268204, 0.9001130489778402, 0.928...","[0.9674379954268204, 0.9478204701730192, 0.892...",0.500000,0.461538,0.545455,1
3516,1000,102,0.117,257,1,1,0.814144,0.992583,"[0.9698810099949823, 0.9148147724349438, 0.928...","[0.9698810099949823, 0.9575641947375921, 0.879...",0.480000,0.461538,0.500000,1
3517,1000,154,0.117,257,1,1,0.805707,0.991499,"[0.9718391232985354, 0.9028582348659189, 0.931...","[0.9718391232985354, 0.9522631090451318, 0.882...",0.480000,0.461538,0.500000,1
3518,1000,205,0.117,257,1,1,0.812903,0.988855,"[0.9655035864750009, 0.9034772835058134, 0.928...","[0.9655035864750009, 0.9510904261459586, 0.877...",0.489796,0.461538,0.521739,1


In [12]:
# Persist the results table to the parquet file at path_shap.
df.to_parquet(path_shap)