# iForest

In [1]:
import warnings
# Install a global "ignore" filter: from here on, every warning raised in this
# kernel session is suppressed (presumably to keep cell outputs uncluttered).
# NOTE(review): this also hides deprecation and numerical warnings that can
# matter when debugging — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

## General libraries

In [6]:
import os
import sys

import pandas as pd
import numpy as np

from os.path import join
import json
import datetime

import time

### Load environment variables

In [7]:
from dotenv import load_dotenv

# Read project settings from the .env file one level above the working
# directory and export them into os.environ.
load_dotenv('../.env')

# Fail early and name every missing variable at once, instead of a bare
# KeyError on whichever lookup happens to run first. KeyError is kept as the
# exception type so existing handling keeps working.
_required_env = ('CODE_ROOT', 'CFG_PATH', 'DATA_ROOT')
_missing_env = [name for name in _required_env if name not in os.environ]
if _missing_env:
    raise KeyError(
        "Missing environment variable(s): "
        + ", ".join(_missing_env)
        + " (expected in '../.env' or the process environment)"
    )

code_root = os.environ['CODE_ROOT']
cfg_path = os.environ['CFG_PATH']
data_root = os.environ['DATA_ROOT']

# Put the project root first on the import path so the `src` package resolves.
# The guard keeps the cell idempotent: re-running it no longer stacks a
# duplicate entry on sys.path each time.
if not sys.path or sys.path[0] != code_root:
    sys.path.insert(0, code_root)

### Specific libraries

In [8]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams

from src.model.functions import train_and_evaluate_iforest

## General parameters

In [10]:
# Experiment settings, forwarded to train_and_evaluate_iforest further down.
# Repetition counts (passed as `n_iter_fs` and `n_iter`).
n_iter_fs, n_iter = 1, 1
# Contamination settings to iterate over (passed as `contamination_percentage`).
contamination_percentage = [1]
# Numbers of trees to iterate over (passed as `n_tree_estimators`).
trees = [2, 3, 4]

## Bank

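**Dataset source**: http://odds.cs.stonybrook.edu/arrhythmia-dataset/ (the data was converted from `.mat` to `.csv` format)

**Note**: this link points to the ODDS *arrhythmia* dataset, and the saved results path below also contains `arrhythmia`, whereas the section title and `dataset_id` say `bank`. Confirm which dataset this section is meant to cover before re-running.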

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources**:

Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation forest.” 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

K. M. Ting, J. T. S. Chuan, and F. T. Liu. “Mass: A New Ranking Measure for Anomaly Detection.” IEEE Transactions on Knowledge and Data Engineering, 2009.

F. Keller, E. Muller, K. Bohm. “HiCS: High-contrast subspaces for density-based outlier ranking.” ICDE, 2012.

In [11]:
# Identifier used by the dataset loader, the hyperparameter lookup and the
# results file name in this notebook.
# NOTE(review): the saved outputs in this notebook reference 'arrhythmia',
# not 'bank' — confirm the intended dataset and re-run from a fresh kernel.
dataset_id = 'bank'

# Load the dataset and its hyperparameters through the project helpers.
data = get_fs_dataset(dataset_id, data_root)
hyper = fs_datasets_hyperparams(dataset_id)
# Last expression: show the shape of the loaded table as the cell output.
data.shape

In [15]:
# Count the non-null `Col1` entries for each value of `y`; this is a per-label
# row count as long as `Col1` has no missing values.
data.pivot_table(values='Col1', index='y', aggfunc='count')

Unnamed: 0_level_0,Col1
y,Unnamed: 1_level_1
0,386
1,66


In [1]:
# Passed as `excluded_cols` to train_and_evaluate_iforest (presumably the
# columns to leave out of the experiment — confirm in the helper); empty here.
excluded_cols = []

### iForest

## Parameters

In [17]:
# Location of the iForest results file for the current dataset:
# <data_root>/outputs/<dataset_id>_results_if.parquet
results_filename = f"{dataset_id}_results_if.parquet"
path = join(data_root, "outputs", results_filename)
path

'/Users/allianz/workspace_github_pers/ad_shap_stability/test/data/outputs/arrhythmia_results_if.parquet'

In [18]:
# Look up the hyperparameters for the current dataset and display them.
# NOTE(review): the dataset-loading cell already assigns `hyper` with this
# exact call; this cell repeats it, so one of the two is likely redundant.
hyper = fs_datasets_hyperparams(dataset_id)
hyper

{'contamination': 0.146, 'max_samples': 256, 'n_estimators': 100}

### iForest full features

In [75]:
# Restrict the frame to `y` plus four `Col*` columns before training.
# NOTE(review): the section heading says "full features", yet only four
# feature columns are kept here — confirm this subset is intentional.
selected_columns = ['y', 'Col1', 'Col2', 'Col3', 'Col4']
data = data.loc[:, selected_columns]

In [76]:
# Wall-clock timestamp taken right before the run.
start_time = datetime.datetime.now()

# Run the project's iForest training/evaluation helper with the settings
# defined in the parameter cells above; the returned table is kept in `df`.
df = train_and_evaluate_iforest(
    data,
    dataset_id=dataset_id,
    hyper=hyper,
    n_tree_estimators=trees,
    contamination_percentage=contamination_percentage,
    excluded_cols=excluded_cols,
    n_iter_fs=n_iter_fs,
    n_iter=n_iter,
)

# Wall-clock timestamp taken right after the run; the difference is the
# elapsed time as a timedelta.
finish_time = datetime.datetime.now()
duration = finish_time - start_time

print(f"Duration: {duration}")


Iteration by tree number: 2
  Iteration by contamination: 0.146
    Number of featured: 4
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 3
     Iteration by feat number: 4
Iteration by tree number: 3
  Iteration by contamination: 0.146
    Number of featured: 4
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 3
     Iteration by feat number: 4
Iteration by tree number: 4
  Iteration by contamination: 0.146
    Number of featured: 4
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 3
     Iteration by feat number: 4
Duration: 0:00:03.344778


In [77]:
# Preview the first rows of the results returned by train_and_evaluate_iforest.
df.head()

Unnamed: 0,n_estimators,max_feats,contamination,n_feats,n_iter,n_iter_fs,roc_auc,model_stab,model_stab_list,shap_stab,shap_stab_list,shap_stab_ad,f1_median,recall,precision,confusion_matrix
0,2,2,0.146,4,1,1,0.519355,0.638759,"[[0.28648485013984515, 0.5287971752997236, 0.0...","[0.6744328805206595, 0.1942595026404511, 0.555...","[[0.00010832122541367065, 0.3255219308190927, ...","[0.3378821187171721, 0.7592386879573534, 0.419...",0.188679,0.192308,0.185185,1
1,2,2,0.146,4,1,1,0.608065,0.638759,"[[0.28648485013984515, 0.5287971752997236, 0.0...","[0.6744328805206595, 0.1942595026404511, 0.555...","[[0.00010832122541367065, 0.3255219308190927, ...","[0.3378821187171721, 0.7592386879573534, 0.419...",0.148148,0.153846,0.142857,1
2,2,2,0.146,4,1,1,0.599256,0.638759,"[[0.28648485013984515, 0.5287971752997236, 0.0...","[0.6744328805206595, 0.1942595026404511, 0.555...","[[0.00010832122541367065, 0.3255219308190927, ...","[0.3378821187171721, 0.7592386879573534, 0.419...",0.254545,0.269231,0.241379,1
3,2,3,0.146,4,1,1,0.521216,0.595163,"[[0.09058406339075939, 0.4256104572864877, 0.2...","[0.38411978080997067, 0.4917987904739043, 0.67...","[[1.0, 0.6691625556148298, 0.07899345852564296...","[0.22371469511899789, 0.6629133069950652, 0.49...",0.163265,0.153846,0.173913,1
4,2,4,0.146,4,1,1,0.423821,0.590633,"[[0.08185485435428587, 0.39496908987049817, 1....","[0.7362237047847999, 0.6172189005760176, 0.408...","[[8.84439101921636e-05, 0.3255219308190927, 0....","[0.530016687003913, 0.8134797283439593, 0.6238...",0.109091,0.115385,0.103448,1
