# iForest

In [1]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides warnings raised by any library
# used later in the notebook — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

## General libraries

In [2]:
import os
import sys

import pandas as pd
import numpy as np

from os.path import join
import json
import datetime

import time

### Load environment variables

In [3]:
from dotenv import load_dotenv
# Load variables from the .env file one level above the working directory.
# The path is relative, so this depends on where the kernel was started.
load_dotenv('../.env')

# Required settings: a missing variable raises KeyError at this point.
code_root = os.environ['CODE_ROOT']
cfg_path = os.environ['CFG_PATH']  # NOTE(review): not referenced in the cells visible here
data_root = os.environ['DATA_ROOT']

# Put code_root first on the import path; presumably this is what makes the
# `src.*` imports in the following cell resolvable — confirm.
sys.path.insert(0, code_root)

### Specific libraries

In [4]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams

from src.model.functions import train_and_evaluate_iforest

## General parameters

In [5]:
# Values forwarded to train_and_evaluate_iforest in the "full features" cell.
# Their exact semantics are defined in src.model.functions (not shown here).
n_iter_fs = 1  # passed as n_iter_fs
n_iter = 1  # passed as n_iter
contamination_percentage = [1]  # passed as contamination_percentage
trees = [2]  # passed as n_tree_estimators

## Data-set

In [6]:
# Identifier used to select the dataset and its hyper-parameters, and to
# build the results file name further down.
dataset_id = 'bank'

# Both helpers come from src.load.functions; their internals are not shown here.
data = get_fs_dataset(dataset_id, data_root)
hyper = fs_datasets_hyperparams(dataset_id)
# Last expression of the cell: display the shape of the loaded data.
data.shape

(41188, 11)

In [7]:
# Passed as excluded_cols to train_and_evaluate_iforest; empty for this run.
excluded_cols = []

### iForest

## Parameters

In [8]:
# Location of the iForest results parquet for this dataset:
#   <data_root>/outputs/<dataset_id>_results_if.parquet
# NOTE(review): nothing in the cells visible here reads from or writes to it.
path = os.path.join(
    data_root,
    "outputs",
    f"{dataset_id}_results_if.parquet",
)
path

'/Users/allianz/workspace_github_pers/ad_shap_stability/test/data/outputs/bank_results_if.parquet'

In [9]:
# NOTE(review): hyper was already assigned from this same call in the
# data-set cell above; re-fetching it here looks redundant.
hyper = fs_datasets_hyperparams(dataset_id)
# Display the hyper-parameters that are passed to the training call below.
hyper

{'contamination': 0.1, 'max_samples': 256, 'n_estimators': 100}

### iForest full features

In [10]:
# Capture the start time
start_time = datetime.datetime.now()

df = train_and_evaluate_iforest(data, dataset_id=dataset_id, hyper=hyper, n_tree_estimators=trees, contamination_percentage=contamination_percentage, excluded_cols=excluded_cols, n_iter_fs=n_iter_fs, n_iter=n_iter)

# Capture the finish time
finish_time = datetime.datetime.now()

# Calculate the duration
duration = finish_time - start_time

print(f"Duration: {duration}")


Iteration by tree number: 2
  Iteration by contamination: 0.1
    Number of featured: 10
     Iteration by feat number: 2
     Iteration by feat number: 4
     Iteration by feat number: 6
     Iteration by feat number: 8
     Iteration by feat number: 10
Duration: 0:02:57.476933


In [11]:
def process_data(df):
    """
    Add SHAP-stability summary columns to a results DataFrame.

    The frame is modified in place (new columns are assigned directly on
    ``df``) and the same object is returned.

    Parameters:
    - df: pandas DataFrame with a 'shap_stab' and a 'shap_stab_ad' column,
      each cell holding a sequence of numeric values.

    Returns:
    - df: the same DataFrame, with these columns added:
        * 'shap_q1': 75th percentile of each 'shap_stab' entry
        * 'shap_q2': 50th percentile of each 'shap_stab' entry
        * 'shap_q3': 25th percentile of each 'shap_stab' entry
        * 'shap_ad_q2': 50th percentile of each 'shap_stab_ad' entry
        * 'stability index': mean of each 'shap_stab' entry

    Note that 'shap_q1' holds the upper (75th) percentile and 'shap_q3' the
    lower (25th) one, i.e. the reverse of the usual quartile numbering.
    """
    def _row_percentile(series, q):
        # Per-row q-th percentile of the sequence stored in each cell.
        return series.apply(lambda values: np.percentile(values, q))

    stab = df['shap_stab']

    # Percentile summaries of the SHAP stability values
    for column, q in (('shap_q1', 75), ('shap_q2', 50), ('shap_q3', 25)):
        df[column] = _row_percentile(stab, q)

    # Median of the second stability series ('shap_stab_ad')
    df['shap_ad_q2'] = _row_percentile(df['shap_stab_ad'], 50)

    # Stability index: plain mean of the SHAP stability values
    df['stability index'] = stab.apply(lambda values: np.mean(values))

    return df

In [12]:
# process_data assigns the summary columns on df itself and returns it;
# the bare call displays the returned frame.
process_data(df)

Unnamed: 0,n_estimators,max_feats,contamination,n_feats,n_iter,n_iter_fs,roc_auc,model_stab,model_stab_list,shap_stab,...,shap_stab_ad,f1_median,recall,precision,confusion_matrix,shap_q1,shap_q2,shap_q3,shap_ad_q2,stability index
0,2,2,0.1,10,1,1,0.553965,0.867608,"[[0.9999282680061953, 0.9999999999999928, 0.71...","[0.07278852933396074, 0.14232595459664987, 0.0...",...,"[0.19999999999999996, 0.09999999999999998, 0.1...",0.275108,0.223599,0.35745,1,0.124121,0.083391,0.048414,0.1,0.104733
1,2,4,0.1,10,1,1,0.593474,0.827414,"[[0.31199874755545154, 0.999999999999992, 0.45...","[0.23103177607093284, 0.09999999999999998, 0.1...",...,"[0.1029182774491908, 0.09999999999999998, 0.10...",0.193999,0.179418,0.21116,1,0.183916,0.136344,0.098977,0.1,0.144317
2,2,6,0.1,10,1,1,0.57081,0.802916,"[[0.7903393839999806, 0.6968747807856792, 0.50...","[0.27265926483117, 0.08723892241638287, 0.1224...",...,"[0.026188113286574555, 0.006155326160975916, 0...",0.158479,0.134698,0.192456,1,0.21916,0.182491,0.14749,0.011093,0.183998
3,2,8,0.1,10,1,1,0.529411,0.748561,"[[0.5608340163197629, 0.9999988479795099, 0.99...","[0.19794988623972443, 0.13720241628757235, 0.1...",...,"[0.027633474572207772, 0.0, 0.0163629983487917...",0.125296,0.114224,0.138743,1,0.19642,0.164459,0.135161,0.008685,0.165754
4,2,10,0.1,10,1,1,0.559986,0.760318,"[[0.9531600531367082, 0.999950560681258, 0.965...","[0.12563546820952853, 0.1089580316237686, 0.17...",...,"[0.010020334435981049, 0.008605189511843081, 0...",0.144471,0.130927,0.161141,1,0.178496,0.143974,0.107276,0.009011,0.14464


In [13]:
# df already carries the summary columns here because process_data
# modified it in place in the previous cell.
df

Unnamed: 0,n_estimators,max_feats,contamination,n_feats,n_iter,n_iter_fs,roc_auc,model_stab,model_stab_list,shap_stab,...,shap_stab_ad,f1_median,recall,precision,confusion_matrix,shap_q1,shap_q2,shap_q3,shap_ad_q2,stability index
0,2,2,0.1,10,1,1,0.553965,0.867608,"[[0.9999282680061953, 0.9999999999999928, 0.71...","[0.07278852933396074, 0.14232595459664987, 0.0...",...,"[0.19999999999999996, 0.09999999999999998, 0.1...",0.275108,0.223599,0.35745,1,0.124121,0.083391,0.048414,0.1,0.104733
1,2,4,0.1,10,1,1,0.593474,0.827414,"[[0.31199874755545154, 0.999999999999992, 0.45...","[0.23103177607093284, 0.09999999999999998, 0.1...",...,"[0.1029182774491908, 0.09999999999999998, 0.10...",0.193999,0.179418,0.21116,1,0.183916,0.136344,0.098977,0.1,0.144317
2,2,6,0.1,10,1,1,0.57081,0.802916,"[[0.7903393839999806, 0.6968747807856792, 0.50...","[0.27265926483117, 0.08723892241638287, 0.1224...",...,"[0.026188113286574555, 0.006155326160975916, 0...",0.158479,0.134698,0.192456,1,0.21916,0.182491,0.14749,0.011093,0.183998
3,2,8,0.1,10,1,1,0.529411,0.748561,"[[0.5608340163197629, 0.9999988479795099, 0.99...","[0.19794988623972443, 0.13720241628757235, 0.1...",...,"[0.027633474572207772, 0.0, 0.0163629983487917...",0.125296,0.114224,0.138743,1,0.19642,0.164459,0.135161,0.008685,0.165754
4,2,10,0.1,10,1,1,0.559986,0.760318,"[[0.9531600531367082, 0.999950560681258, 0.965...","[0.12563546820952853, 0.1089580316237686, 0.17...",...,"[0.010020334435981049, 0.008605189511843081, 0...",0.144471,0.130927,0.161141,1,0.178496,0.143974,0.107276,0.009011,0.14464
