# iForest

In [1]:
import warnings
warnings.filterwarnings('ignore')

## General libraries

In [2]:
import os
import sys

import pandas as pd
import numpy as np

from os.path import join
import json
import datetime

import shap
from shap_selection import feature_selection

from sklearn.preprocessing import OneHotEncoder
import time

### Load enviroment variables

In [3]:
from dotenv import load_dotenv
load_dotenv('../.env')

code_root = os.environ['CODE_ROOT']
cfg_path = os.environ['CFG_PATH']
data_root = os.environ['DATA_ROOT']

sys.path.insert(0, code_root)

### Specific libraries

In [4]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams
from src.model.functions import train_and_evaluate_iforest

## General parameters

In [5]:
n_iter_fs = 1
n_iter = 1
contamination_percentage = [0.8] 
trees = [25, 50, 75, 100, 125, 150, 175, 200, 225, 250, 275, 300]

## Arrhythmia

**Dataset source**: http://odds.cs.stonybrook.edu/arrhythmia-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources**:

Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation forest.” 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

K. M. Ting, J. T. S. Chuan, and F. T. Liu. “Mass: A New Ranking Measure for Anomaly Detection.“, IEEE Transactions on Knowledge and Data Engineering, 2009.

F. Keller, E. Muller, K. Bohm.“HiCS: High-contrast subspaces for density-based outlier ranking.” ICDE, 2012.

In [6]:
dataset_id = 'arrhythmia'

data = get_fs_dataset(dataset_id, data_root)
hyper = fs_datasets_hyperparams(dataset_id)
data.shape

(452, 275)

In [7]:
pd.pivot_table(data,
             values = 'Col1',
               index = 'y', 
              aggfunc = 'count')

Unnamed: 0_level_0,Col1
y,Unnamed: 1_level_1
0,386
1,66


In [8]:
excluded_cols = ['Col15', 'Col63', 'Col65', 'Col79', 'Col127', 'Col128','Col135', 'Col137', 'Col139','Col141','Col147', 'Col152', 'Col153', 'Col160', 'Col200', 'Col260', 'Col270']

### iForest

## Parameters

In [9]:
path_fi_shap = os.path.join(data_root, "outputs", f"{dataset_id}_fi_shap")
path_shap = os.path.join(data_root, "outputs", f"{dataset_id}_shap.parquet")

fi_shap_all = pd.read_parquet(path_fi_shap)

## Model training with iterations HPO and FS

In [10]:
# Capture the start time
start_time = datetime.datetime.now()

df = train_and_evaluate_iforest(data, dataset_id=dataset_id, hyper=hyper, fi_df=fi_shap_all, n_tree_estimators=trees, contamination_percentage=contamination_percentage, excluded_cols=excluded_cols, n_iter_fs=n_iter_fs, n_iter=n_iter)
df['shap_q1'] = df['shap_iforest_stab_unif_median'].apply(lambda x: np.percentile(x, 75))
df['shap_q2'] = df['shap_iforest_stab_unif_median'].apply(lambda x: np.percentile(x, 50))
df['shap_q3'] = df['shap_iforest_stab_unif_median'].apply(lambda x: np.percentile(x, 25))

# Capture the finish time
finish_time = datetime.datetime.now()

# Calculate the duration
duration = finish_time - start_time

print(f"Duration: {duration}")

Iteration by tree number: 25
  Iteration by contamination: 0.117
    Number of featured: 8
    Number of featured: 18
    Number of featured: 29
    Number of featured: 42
    Number of featured: 57
    Number of featured: 73
    Number of featured: 94
    Number of featured: 119
    Number of featured: 152
    Number of featured: 235
    Number of featured: 257
Iteration by tree number: 50
  Iteration by contamination: 0.117
    Number of featured: 8
    Number of featured: 18
    Number of featured: 29
    Number of featured: 42
    Number of featured: 57
    Number of featured: 73
    Number of featured: 94
    Number of featured: 119
    Number of featured: 152
    Number of featured: 235
    Number of featured: 257
Iteration by tree number: 75
  Iteration by contamination: 0.117
    Number of featured: 8
    Number of featured: 18
    Number of featured: 29
    Number of featured: 42
    Number of featured: 57
    Number of featured: 73
    Number of featured: 94
    Number of fea

In [13]:
df

Unnamed: 0,n_estimators,contamination,n_feats,n_iter,n_iter_fs,roc_auc,iforest_stab_unif_median,shap_iforest_stab_unif_median,f1_median,recall_median,precision_median,confusion_matrix,shap_q1,shap_q2,shap_q3
0,25,0.117,8,1,1,0.621251,0.917350,"[0.9999999999999805, 0.9999999999999848, 0.999...",0.151261,0.136364,0.169811,1,1.000000,1.000000,1.000000
1,25,0.117,18,1,1,0.717028,0.916932,"[0.9999999986817016, 0.999999999296087, 0.9999...",0.369748,0.333333,0.415094,1,1.000000,1.000000,1.000000
2,25,0.117,29,1,1,0.713103,0.911153,"[0.9999996226681006, 0.9999993368368496, 0.999...",0.319328,0.287879,0.358491,1,1.000000,1.000000,1.000000
3,25,0.117,42,1,1,0.715693,0.872751,"[0.9999150228441998, 0.9998825200050224, 0.999...",0.436975,0.393939,0.490566,1,0.999955,0.999943,0.999924
4,25,0.117,57,1,1,0.713338,0.863626,"[0.9958795401265791, 0.9952391376168573, 0.997...",0.352941,0.318182,0.396226,1,0.997155,0.996312,0.995609
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,300,0.117,94,1,1,0.816101,0.978145,"[0.7964006010204515, 0.7386295946236574, 0.914...",0.487395,0.439394,0.547170,1,0.913806,0.877995,0.818507
128,300,0.117,119,1,1,0.802402,0.976890,"[0.46139150380791116, 0.4244945152678682, 0.67...",0.470588,0.424242,0.528302,1,0.709504,0.631110,0.554896
129,300,0.117,152,1,1,0.796083,0.972388,"[0.3185334138851963, 0.30466471531105077, 0.45...",0.453782,0.409091,0.509434,1,0.500242,0.432970,0.366807
130,300,0.117,235,1,1,0.810999,0.971362,"[0.41035692773767674, 0.3559083916753244, 0.44...",0.487395,0.439394,0.547170,1,0.444501,0.413415,0.391239


In [14]:
df.to_parquet(path_shap)