# iForest

In [1]:
import warnings
# Install a blanket 'ignore' filter: from here on every warning is hidden
# until the filters are changed again.
# NOTE(review): this also hides deprecation and numerical warnings raised by
# the libraries used below — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

## General libraries

In [2]:
import os
import sys

import pandas as pd
import numpy as np

from os.path import join
import json
import datetime

import shap
from shap_selection import feature_selection

import time

### Load environment variables

In [3]:
from dotenv import load_dotenv
# Read the .env file located one directory above the current working directory.
# The path is relative, so this depends on where the kernel was started.
load_dotenv('../.env')

# Required settings: os.environ[...] raises KeyError if a variable is missing.
code_root = os.environ['CODE_ROOT']
# NOTE(review): cfg_path is not referenced in the visible cells — confirm it is needed.
cfg_path = os.environ['CFG_PATH']
data_root = os.environ['DATA_ROOT']

# Put the project code first on the import path — presumably so the `src.*`
# imports in the following cell resolve.
sys.path.insert(0, code_root)

### Specific libraries

In [4]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams
from src.feature_selection.functions import fs_iforest_with_shap, shap_ranks, process_fi

## General parameters

In [5]:
# Passed to fs_iforest_with_shap as `n_iter_fs`.
n_iter_fs = 1
# NOTE(review): n_iter, trees and group are not referenced in the visible
# cells — confirm they are used elsewhere or remove them.
n_iter = 1
# Passed to fs_iforest_with_shap as `contamination_percentage`.
# NOTE(review): it is not visible here whether 0.8 means 0.8 % or a fraction
# of 0.8 — confirm against the helper.
contamination_percentage = [0.8] 
trees = [25, 50, 100]
group = 0

## Data set

In [6]:
# Identifier of the dataset to analyse; also used to build the output file name.
dataset_id = 'mammography'
# Load the dataset from data_root through the project helper.
data = get_fs_dataset(dataset_id, data_root)
# NOTE(review): `hyper` is not referenced in the visible cells — confirm it is needed.
hyper = fs_datasets_hyperparams(dataset_id)
# Show the shape of the loaded data as a quick sanity check.
data.shape

(11183, 7)

In [7]:
# Empty list: no columns are explicitly excluded. Passed to
# fs_iforest_with_shap as `excluded_cols`.
excluded_cols = []

In [8]:
# Display the loaded data; the rendered output is abbreviated to the first
# and last rows.
data

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,y
0,0.230020,5.072578,-0.276061,0.832444,-0.377866,0.480322,0
1,0.155491,-0.169390,0.670652,-0.859553,-0.377866,-0.945723,0
2,-0.784415,-0.443654,5.674705,-0.859553,-0.377866,-0.945723,0
3,0.546088,0.131415,-0.456387,-0.859553,-0.377866,-0.945723,0
4,-0.102987,-0.394994,-0.140816,0.979703,-0.377866,1.013566,0
...,...,...,...,...,...,...,...
11178,-0.250012,-0.377300,-0.321142,1.269157,3.652984,1.092791,1
11179,0.281343,-0.417112,-0.366224,0.851010,2.789649,1.345700,1
11180,1.204988,1.763724,-0.501468,1.562408,6.489072,0.931294,1
11181,0.736644,-0.222474,-0.050653,1.509665,0.539269,1.315229,1


### iForest

## Parameters

In [9]:
# Destination of the SHAP feature-importance table, under "<data_root>/outputs".
# NOTE(review): the file name carries no extension although it is later
# written with to_parquet — confirm downstream readers expect this exact name.
path_fi_shap = os.path.join(
    data_root,
    "outputs",
    f"{dataset_id}_fi_shap",
)

## Feature selection by SHAP

In [10]:
# Run the project's iForest + SHAP feature-selection helper on the loaded data.
# The helper returns three values; only the first two are kept.
fs_shap, fi_shap, _ = fs_iforest_with_shap(
    data,
    contamination_percentage=contamination_percentage,
    excluded_cols=excluded_cols,
    n_iter_fs=n_iter_fs,
)

In [11]:
# Post-process the second result of fs_iforest_with_shap with the project helper.
# NOTE(review): the meaning of the positional argument 10 is not visible
# here — confirm against process_fi and consider giving it a named constant.
fi_shap_all = process_fi(fi_shap, 10)

In [12]:
# Display the processed table (feature count, cumulative value and selected
# features per row in the rendered output).
# NOTE(review): the last two rendered rows are identical (n_feats = 6) —
# check whether process_fi should drop the duplicate.
fi_shap_all

Unnamed: 0,n_feats,cum_value,cum_value_percentage,n_feats_percentage,feat_selected
0,1,0.909303,34.975729,16.666667,[Col2]
1,2,1.392368,53.556512,33.333333,"[Col2, Col5]"
2,3,1.755912,67.539975,50.0,"[Col2, Col5, Col3]"
3,4,2.108139,81.088183,66.666667,"[Col2, Col5, Col3, Col1]"
4,5,2.367333,91.057897,83.333333,"[Col2, Col5, Col3, Col1, Col6]"
5,6,2.599811,100.0,100.0,"[Col2, Col5, Col3, Col1, Col6, Col4]"
6,6,2.599811,100.0,100.0,"[Col2, Col5, Col3, Col1, Col6, Col4]"


In [13]:
# Persist the processed importance table as parquet.
# Create the target directory first: to_parquet does not create missing parent
# directories, so the write would fail when "<data_root>/outputs" does not
# exist yet (for example on a fresh checkout).
os.makedirs(os.path.dirname(path_fi_shap), exist_ok=True)
fi_shap_all.to_parquet(path_fi_shap)