# iForest

In [1]:
import warnings
warnings.filterwarnings('ignore')

## General libraries

In [2]:
import os
import sys

import pandas as pd
import numpy as np

from os.path import join
import json
import datetime

import shap
from shap_selection import feature_selection

import time

### Load environment variables

In [3]:
from dotenv import load_dotenv

# Read project settings from the .env file one level above the notebook's
# working directory.
# NOTE(review): the result of load_dotenv is not checked, so a missing file
# only becomes visible through the variable check below.
load_dotenv('../.env')

# Fail once, up front, naming every missing variable, instead of surfacing a
# bare KeyError for whichever one happens to be read first.
_missing_env = [
    name for name in ('CODE_ROOT', 'CFG_PATH', 'DATA_ROOT')
    if name not in os.environ
]
if _missing_env:
    raise KeyError(
        f"Missing environment variable(s) {_missing_env}; "
        "expected them in '../.env' or in the process environment"
    )

code_root = os.environ['CODE_ROOT']
cfg_path = os.environ['CFG_PATH']
data_root = os.environ['DATA_ROOT']

# Put the project root first on sys.path so the `src.*` imports resolve.
# The guard keeps re-runs of this cell from stacking duplicate entries.
if sys.path[:1] != [code_root]:
    sys.path.insert(0, code_root)

### Specific libraries

In [4]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams
from src.feature_selection.functions import fs_iforest_with_shap, shap_ranks, process_fi

## General parameters

In [5]:
# Repetition counts; `n_iter_fs` is forwarded to the SHAP feature-selection call.
n_iter_fs, n_iter = 1, 1

# Contamination level(s) forwarded to the SHAP feature-selection call.
# NOTE(review): the name says "percentage" but the value is 0.8 — confirm the expected unit.
contamination_percentage = [0.8]

# Not referenced in the cells shown here — TODO confirm downstream use.
trees = [25, 50, 100]
group = 0

## Arrhythmia

**Dataset source**: http://odds.cs.stonybrook.edu/arrhythmia-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources**:

Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation forest.” 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

K. M. Ting, J. T. S. Chuan, and F. T. Liu. “Mass: A New Ranking Measure for Anomaly Detection.” IEEE Transactions on Knowledge and Data Engineering, 2009.

F. Keller, E. Muller, K. Bohm. “HiCS: High-contrast subspaces for density-based outlier ranking.” ICDE, 2012.

In [6]:
# Identifier of the dataset for this section; it keys both helper calls below
# and is reused later to build the output file name.
dataset_id = 'arrhythmia'
# Load the dataset from `data_root` through the project helper.
data = get_fs_dataset(dataset_id, data_root)
# Per-dataset settings returned by the project helper; not used in the cells
# shown in this section — TODO confirm downstream use.
hyper = fs_datasets_hyperparams(dataset_id)
# Show (rows, columns) as a quick sanity check on the load.
data.shape

(452, 275)

In [7]:
# Class balance: number of rows per value of the label column `y`
# (counted over the non-null entries of `Col1`).
data.pivot_table(index='y', values='Col1', aggfunc='count')

Unnamed: 0_level_0,Col1
y,Unnamed: 1_level_1
0,386
1,66


In [8]:
# Columns passed as `excluded_cols` to the feature-selection call, listed by
# their numeric suffix.
# NOTE(review): the reason these columns are excluded is not stated — document it.
excluded_cols = [
    f'Col{idx}'
    for idx in (
        15, 63, 65, 79, 127, 128, 135, 137, 139,
        141, 147, 152, 153, 160, 200, 260, 270,
    )
]

### iForest

## Parameters

In [11]:
# Destination of the SHAP feature-importance table:
# <data_root>/outputs/<dataset_id>_fi_shap
path_fi_shap = join(data_root, "outputs", f"{dataset_id}_fi_shap")

## Feature selection by SHAP

In [12]:
# Run the project's iForest + SHAP feature-selection helper on the loaded data.
# The helper returns three values; only the first two are kept.
# NOTE(review): no random seed is set in the cells shown — confirm whether the
# helper seeds itself, otherwise results will differ between runs.
fs_shap, fi_shap, _ = fs_iforest_with_shap(
    data,
    contamination_percentage=contamination_percentage,
    excluded_cols=excluded_cols,
    n_iter_fs=n_iter_fs,
)



In [13]:
# Post-process the SHAP feature importances with the project helper.
# NOTE(review): the second argument (10) is an unexplained magic number; the
# table displayed below has 10 rows, so it presumably sets the number of
# cumulative-importance steps — confirm against process_fi.
fi_shap_all = process_fi(fi_shap, 10)

In [14]:
# Display the processed feature-importance table.
fi_shap_all

Unnamed: 0,n_feats,cum_value,cum_value_percentage,n_feats_percentage,feat_selected
0,2,0.270339,10.370627,7.142857,"[V4, V9]"
1,4,0.519524,19.929758,14.285714,"[V4, V9, V6, V26]"
2,6,0.756666,29.026903,21.428571,"[V4, V9, V6, V26, V19, V13]"
3,9,1.081206,41.476748,32.142857,"[V4, V9, V6, V26, V19, V13, V3, V25, V16]"
4,11,1.287827,49.403074,39.285714,"[V4, V9, V6, V26, V19, V13, V3, V25, V16, V5, ..."
5,14,1.575716,60.446941,50.0,"[V4, V9, V6, V26, V19, V13, V3, V25, V16, V5, ..."
6,17,1.845828,70.808864,60.714286,"[V4, V9, V6, V26, V19, V13, V3, V25, V16, V5, ..."
7,20,2.082343,79.881979,71.428571,"[V4, V9, V6, V26, V19, V13, V3, V25, V16, V5, ..."
8,24,2.367412,90.817666,85.714286,"[V4, V9, V6, V26, V19, V13, V3, V25, V16, V5, ..."
9,28,2.606775,100.0,100.0,"[V4, V9, V6, V26, V19, V13, V3, V25, V16, V5, ..."


In [15]:
# Persist the processed feature-importance table as Parquet.
# Create the parent directory first so the write does not fail on a fresh
# checkout where <data_root>/outputs does not exist yet.
os.makedirs(os.path.dirname(path_fi_shap), exist_ok=True)
fi_shap_all.to_parquet(path_fi_shap)