# iForest

In [1]:
import warnings
warnings.filterwarnings('ignore')

## General libraries

In [9]:
import os
import sys

import pandas as pd
import numpy as np

from os.path import join
import json
import datetime

from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

import shap
from shap_selection import feature_selection

import time

### Load enviroment variables

In [10]:
from dotenv import load_dotenv
load_dotenv('../.env')

code_root = os.environ['CODE_ROOT']
cfg_path = os.environ['CFG_PATH']
data_root = os.environ['DATA_ROOT']

sys.path.insert(0, code_root)

### Specific libraries

In [11]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams

from src.feature_selection.functions import fs_iforest_with_shap, process_fi

## General parameters

In [12]:
n_iter_fs = 1
n_iter = 1
contamination_percentage = [0.8] 
trees = [25, 50, 100]
group = 0

## Parameters

In [13]:
seed = 123
np.random.seed(seed)

dataset_id = 'example'
df = get_fs_dataset(dataset_id, data_root)
hyper = fs_datasets_hyperparams(dataset_id)

## Paths

In [14]:
path_fi_shap = os.path.join(data_root, "outputs", f"{dataset_id}_fi_shap")

## Generate/Load data

In [15]:
# Simulated data and settings
gamma, iterations = 0.146, 10

# Split the DataFrame into features (X) and target (y)
X = df.drop('y', axis=1)  # Features (all columns except 'y')
y = df['y']  # Target (the 'y' column)

# Split into training and testing sets
xtr, xte , ytr, yte = train_test_split(X, y, test_size=0.1, random_state=seed)

## Generate the model

In [16]:
# Initialize and train the IsolationForest model
model = IsolationForest(**hyper, random_state=seed)
model.fit(xtr)

## Features names

In [17]:
feature_names = np.array(X.columns.tolist())
feature_names

array(['feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5'],
      dtype='<U9')

## Feature selection by SHAP

In [18]:
def shap_feature_selection(model, X_train, X_test, feature_names, agnostic=False):
    """
    Perform feature selection using SHAP values and return a DataFrame with selected features
    and their corresponding SHAP values, percentages, and cumulative values.
    
    Parameters:
    - model: Trained model (e.g., RandomForest, XGBoost).
    - X_train: Training dataset.
    - X_test: Testing dataset.
    - feature_names: List of feature names.
    - agnostic: Whether to use model-agnostic SHAP (e.g., KernelExplainer).
    
    Returns:
    - df: DataFrame with selected features, SHAP values, and calculated statistics.
    """
    
    # Calculate SHAP values
    if agnostic:
        explainer = shap.KernelExplainer(model.predict, X_train)
    else:
        explainer = shap.TreeExplainer(model)
    
    shap_values = explainer.shap_values(X_test)[1]  # Assuming binary classification; use [0] for negative class
    
    # Compute mean absolute SHAP values for each feature
    mean_shap_values = np.mean(np.abs(shap_values), axis=0)
    
    # Create a DataFrame
    df = pd.DataFrame({
        'feature': feature_names,
        'value': mean_shap_values
    })
    
    # Calculate per_value (percentage of each value in the total)
    total_value = df['value'].sum()
    df['per_value'] = df['value'] / total_value * 100
    
    # Calculate cumulative values and cumulative percentages
    df['cum_value'] = df['value'].cumsum()
    df['cum_value_percentage'] = df['per_value'].cumsum()
    
    # Sorting by value in descending order
    df = df.sort_values(by='value', ascending=False).reset_index(drop=True)
    
    return df

In [20]:
# Example usage:
selected_features_df = shap_feature_selection(model, xtr, xte, feature_names, agnostic=False)

In [21]:
selected_features_df

Unnamed: 0,feature,value,per_value,cum_value,cum_value_percentage
0,feature_1,0.270965,20.0,0.270965,20.0
1,feature_2,0.270965,20.0,0.54193,40.0
2,feature_3,0.270965,20.0,0.812895,60.0
3,feature_4,0.270965,20.0,1.083861,80.0
4,feature_5,0.270965,20.0,1.354826,100.0


In [22]:
fi_shap_all = process_fi(selected_features_df, 10)

In [23]:
fi_shap_all

Unnamed: 0,n_feats,cum_value,cum_value_percentage,n_feats_percentage,feat_selected
0,1,0.270965,20.0,20.0,[feature_1]
1,2,0.54193,40.0,40.0,"[feature_1, feature_2]"
2,3,0.812895,60.0,60.0,"[feature_1, feature_2, feature_3]"
3,4,1.083861,80.0,80.0,"[feature_1, feature_2, feature_3, feature_4]"
4,5,1.354826,100.0,100.0,"[feature_1, feature_2, feature_3, feature_4, f..."
5,5,1.354826,100.0,100.0,"[feature_1, feature_2, feature_3, feature_4, f..."
