# iForest

In [1]:
import warnings
# Silence every Python warning raised from this point on.
# NOTE(review): this blanket filter also hides deprecation and numerical warnings
# coming from the libraries used below — consider narrowing it by category or module.
warnings.filterwarnings('ignore')

## General libraries

In [2]:
import os
import sys

import pandas as pd
import numpy as np

from os.path import join
import json
import datetime

import shap
from shap_selection import feature_selection

import time

### Load environment variables

In [3]:
from dotenv import load_dotenv

# Read the .env file located one directory above the working directory.
load_dotenv('../.env')

# All three settings are mandatory: a missing variable raises KeyError right here.
code_root, cfg_path, data_root = (
    os.environ[name] for name in ('CODE_ROOT', 'CFG_PATH', 'DATA_ROOT')
)

# Put the code root first on the import path (presumably where the `src` package lives).
sys.path.insert(0, code_root)

### Specific libraries

In [4]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams
from src.feature_selection.functions import fs_iforest_with_shap, shap_ranks, process_fi

## General parameters

In [5]:
# Repetition counts. n_iter_fs is forwarded to the SHAP feature-selection call;
# the consumer of n_iter is not visible in this part of the notebook.
n_iter_fs = n_iter = 1
# Contamination values handed to the feature-selection call (a single value here).
contamination_percentage = [0.8]
# Presumably the numbers of isolation trees to evaluate — confirm against later cells.
trees = [25, 50, 100]
group = 0

def calculate_median(numbers_list):
    """Return the median of the values in ``numbers_list``.

    Thin wrapper around ``np.median``; the argument is passed through unchanged.
    """
    median_value = np.median(numbers_list)
    return median_value

def calculate_mean(numbers_list):
    """Return the arithmetic mean of the values in ``numbers_list``.

    Thin wrapper around ``np.mean``; the argument is passed through unchanged.
    """
    mean_value = np.mean(numbers_list)
    return mean_value

# Name of the aggregation to apply to each result column
# (presumably consumed by a pandas .agg() call further down — confirm).
aggregation_rules = dict(
    n_iter='max',
    n_iter_fs='max',
    f1_median='mean',
    recall_median='mean',
    precision_median='mean',
    roc_auc='mean',
    iforest_stab_unif_median='median',
    shap_stab_median='median',
    shap_stab_mean='mean',
)

## Arrhythmia

**Dataset source**: http://odds.cs.stonybrook.edu/arrhythmia-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources**:

Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation forest.” 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

K. M. Ting, J. T. S. Chuan, and F. T. Liu. “Mass: A New Ranking Measure for Anomaly Detection.”, IEEE Transactions on Knowledge and Data Engineering, 2009.

F. Keller, E. Müller, K. Böhm. “HiCS: High-contrast subspaces for density-based outlier ranking.” ICDE, 2012.

In [6]:
# Identifier of the dataset analysed in this section; it is passed to the loader and
# the hyper-parameter lookup, and becomes part of the output file name.
dataset_id = 'arrhythmia'

In [7]:
# Fetch the dataset through the project loader, given its id and the data root.
data = get_fs_dataset(dataset_id, data_root)

In [8]:
# Presumably the hyper-parameters registered for this dataset — confirm in src.load.functions.
# NOTE(review): `hyper` is not referenced in the cells visible here.
hyper = fs_datasets_hyperparams(dataset_id)

In [9]:
# Quick size check of the loaded frame: (rows, columns).
data.shape

(452, 275)

In [10]:
# Class balance: number of non-null Col1 entries for each value of y.
data.pivot_table(values='Col1', index='y', aggfunc='count')

Unnamed: 0_level_0,Col1
y,Unnamed: 1_level_1
0,386
1,66


In [11]:
# Columns handed to the feature-selection step as `excluded_cols`.
# NOTE(review): the reason these columns are excluded is not recorded here.
excluded_cols = [
    f'Col{col_number}'
    for col_number in (
        15, 63, 65, 79, 127, 128, 135, 137, 139,
        141, 147, 152, 153, 160, 200, 260, 270,
    )
]

### iForest

## Parameters

In [12]:
# Destination of the SHAP feature-importance table: <data_root>/outputs/<dataset_id>_fi_shap.
# NOTE(review): the file name carries no extension even though it is written with to_parquet.
path_fi_shap = os.path.join(data_root, "outputs", f"{dataset_id}_fi_shap")

## Feature selection by SHAP

In [13]:
# Run the project's iForest + SHAP feature-selection helper on the dataset.
# The third returned value is not needed here and is discarded.
fs_shap, fi_shap, _ = fs_iforest_with_shap(
    data,
    contamination_percentage=contamination_percentage,
    excluded_cols=excluded_cols,
    n_iter_fs=n_iter_fs,
)

In [14]:
# Post-process the feature importances with the project helper.
# NOTE(review): the meaning of 10 is not visible here — the resulting table has 10 rows in
# roughly 10% cumulative-importance steps, so it is presumably the number of buckets; confirm.
fi_shap_all = process_fi(fi_shap, 10)

In [15]:
# Display the processed feature-importance table.
fi_shap_all

Unnamed: 0,n_feats,cum_value,cum_value_percentage,n_feats_percentage,feat_selected
0,8,0.2904,9.930181,3.404255,"[Col88, Col216, Col197, Col177, Col106, Col40,..."
1,18,0.592406,20.257248,7.659574,"[Col88, Col216, Col197, Col177, Col106, Col40,..."
2,29,0.888173,30.370954,12.340426,"[Col88, Col216, Col197, Col177, Col106, Col40,..."
3,42,1.179028,40.316713,17.87234,"[Col88, Col216, Col197, Col177, Col106, Col40,..."
4,57,1.471199,50.307466,24.255319,"[Col88, Col216, Col197, Col177, Col106, Col40,..."
5,73,1.746819,59.732258,31.06383,"[Col88, Col216, Col197, Col177, Col106, Col40,..."
6,94,2.05202,70.168542,40.0,"[Col88, Col216, Col197, Col177, Col106, Col40,..."
7,119,2.33564,79.866889,50.638298,"[Col88, Col216, Col197, Col177, Col106, Col40,..."
8,152,2.628842,89.892909,64.680851,"[Col88, Col216, Col197, Col177, Col106, Col40,..."
9,235,2.924416,100.0,100.0,"[Col88, Col216, Col197, Col177, Col106, Col40,..."


In [16]:
# Persist the feature-importance table. The destination folder is created first so
# that a fresh checkout (no outputs/ directory yet) does not fail at the very last
# step, after the feature-selection run has already been paid for.
if "://" not in path_fi_shap:  # leave remote URLs (e.g. s3://...) to the parquet writer
    os.makedirs(os.path.dirname(path_fi_shap), exist_ok=True)
fi_shap_all.to_parquet(path_fi_shap)