# iForest

In [1]:
import warnings
# Silence every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this hides all library warnings as well — consider narrowing the filter.
warnings.filterwarnings('ignore')

## General libraries

In [2]:
import os
import sys

import pandas as pd
import numpy as np

from os.path import join
import json
import datetime

import time

### Load environment variables

In [3]:
from dotenv import load_dotenv

# Read project settings from the .env file one directory above (relative path).
load_dotenv('../.env')

# Fail early with a single readable error naming every missing variable,
# instead of a bare KeyError for only the first one.
required_vars = ('CODE_ROOT', 'CFG_PATH', 'DATA_ROOT')
missing_vars = [name for name in required_vars if name not in os.environ]
if missing_vars:
    raise KeyError(f"Missing environment variables: {', '.join(missing_vars)}")

code_root = os.environ['CODE_ROOT']
cfg_path = os.environ['CFG_PATH']
data_root = os.environ['DATA_ROOT']

# Make the project's `src` package importable. The guard keeps the cell
# idempotent: re-running it no longer stacks duplicate entries on sys.path.
if code_root not in sys.path:
    sys.path.insert(0, code_root)

### Specific libraries

In [4]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams

from src.model.functions import train_and_evaluate_iforest

## General parameters

In [5]:
# Experiment settings; each value is forwarded to train_and_evaluate_iforest further down.
trees = [2, 3, 4]               # passed as `n_tree_estimators`
contamination_percentage = [1]  # passed as `contamination_percentage`
n_iter = 1                      # passed as `n_iter`
n_iter_fs = 1                   # passed as `n_iter_fs`

## Cardio

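**Dataset source**: http://odds.cs.stonybrook.edu/arrhythmia-dataset/ (the data was converted from .mat to .csv format)

**TODO**: this link points to the ODDS *arrhythmia* page, but the cells below load the `cardio` dataset — confirm and update the link.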

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources**:

Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation forest.” 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

K. M. Ting, J. T. S. Chuan, and F. T. Liu. “Mass: A New Ranking Measure for Anomaly Detection.”, IEEE Transactions on Knowledge and Data Engineering, 2009.

F. Keller, E. Muller, K. Bohm. “HiCS: High-contrast subspaces for density-based outlier ranking.” ICDE, 2012.

In [6]:
dataset_id = 'cardio'

# Load the dataset only. The hyperparameters are fetched in the "Parameters"
# section before their first use, so the duplicate lookup here was dropped.
data = get_fs_dataset(dataset_id, data_root)
data.shape

(1831, 22)

In [7]:
# Count the non-null `Col1` entries for each value of `y`.
data.pivot_table(index='y', values='Col1', aggfunc='count')

Unnamed: 0_level_0,Col1
y,Unnamed: 1_level_1
0.0,1655
1.0,176


In [8]:
# Empty list for this run; forwarded as `excluded_cols` to train_and_evaluate_iforest.
excluded_cols = []

### iForest

## Parameters

In [9]:
# Results file location: <data_root>/outputs/<dataset_id>_results_if.parquet
path = join(data_root, "outputs", f"{dataset_id}_results_if.parquet")
path

'/Users/allianz/workspace_github_pers/ad_shap_stability/test/data/outputs/cardio_results_if.parquet'

In [10]:
# Hyperparameters looked up for `dataset_id`; passed as `hyper` to train_and_evaluate_iforest below.
hyper = fs_datasets_hyperparams(dataset_id)
hyper

{'contamination': 0.1, 'max_samples': 256, 'n_estimators': 100}

### iForest full features

In [12]:
# Wall-clock timestamp of when the run began (kept for reference).
start_time = datetime.datetime.now()
# Elapsed time is measured with a monotonic, high-resolution timer, so it is
# not distorted by system clock adjustments during the run.
timer_start = time.perf_counter()

df = train_and_evaluate_iforest(
    data,
    dataset_id=dataset_id,
    hyper=hyper,
    n_tree_estimators=trees,
    contamination_percentage=contamination_percentage,
    excluded_cols=excluded_cols,
    n_iter_fs=n_iter_fs,
    n_iter=n_iter,
)

elapsed_seconds = time.perf_counter() - timer_start
# Wall-clock timestamp of when the run ended.
finish_time = datetime.datetime.now()

# Keep `duration` a timedelta so the printed format stays the same as before.
duration = datetime.timedelta(seconds=elapsed_seconds)

print(f"Duration: {duration}")


Iteration by tree number: 2
  Iteration by contamination: 0.1
    Number of featured: 4
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 3
     Iteration by feat number: 4
Iteration by tree number: 3
  Iteration by contamination: 0.1
    Number of featured: 4
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 3
     Iteration by feat number: 4
Iteration by tree number: 4
  Iteration by contamination: 0.1
    Number of featured: 4
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 3
     Iteration by feat number: 4
Duration: 0:00:08.543103


In [13]:
# Preview the first rows of the results returned by train_and_evaluate_iforest.
df.head()

Unnamed: 0,n_estimators,max_feats,contamination,n_feats,n_iter,n_iter_fs,roc_auc,model_stab,model_stab_list,shap_stab,shap_stab_list,shap_stab_ad,f1_median,recall,precision,confusion_matrix
0,2,2,0.1,4,1,1,0.667098,0.713119,"[[0.16956872405415568, 0.9994306632244201, 0.5...","[0.36866120744841635, 0.6327583460291016, 0.22...","[[0.3580241583334105, 0.1681488332420652, 0.66...","[0.6327583460291016, 0.6918957487954589, 0.710...",0.167832,0.171429,0.164384,1
1,2,2,0.1,4,1,1,0.639086,0.713119,"[[0.16956872405415568, 0.9994306632244201, 0.5...","[0.36866120744841635, 0.6327583460291016, 0.22...","[[0.3580241583334105, 0.1681488332420652, 0.66...","[0.6327583460291016, 0.6918957487954589, 0.710...",0.111888,0.114286,0.109589,1
2,2,2,0.1,4,1,1,0.628744,0.713119,"[[0.16956872405415568, 0.9994306632244201, 0.5...","[0.36866120744841635, 0.6327583460291016, 0.22...","[[0.3580241583334105, 0.1681488332420652, 0.66...","[0.6327583460291016, 0.6918957487954589, 0.710...",0.106061,0.1,0.112903,1
3,2,3,0.1,4,1,1,0.471644,0.762555,"[[0.08882670781340307, 0.9999768225585975, 0.7...","[0.5692286953617829, 0.4407270445443692, 0.496...","[[0.9170000681138529, 0.053692003875251526, 0....","[0.4407270445443692, 0.30537526832666595, 0.69...",0.190476,0.171429,0.214286,1
4,2,4,0.1,4,1,1,0.605613,0.783385,"[[0.3004919494302173, 0.9999734845227269, 0.85...","[0.47185201402526966, 0.5650255427296373, 0.38...","[[0.30110522698804054, 0.32706800249791523, 0....","[0.5650255427296373, 0.7193382925508174, 0.799...",0.142857,0.142857,0.142857,1
