# iForest

In [1]:
import warnings
# Silence every warning for the rest of the session to keep cell output short.
# Note: this is a blanket filter, so potentially useful library warnings are hidden too.
warnings.filterwarnings('ignore')

## General libraries

In [2]:
import datetime
import json
import os
import sys
import time
from os.path import join

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from shap_selection import feature_selection
from sklearn.preprocessing import OneHotEncoder

### Load environment variables

In [3]:
from dotenv import load_dotenv

# Read the .env file located one directory above the working directory
# so that its entries become available through os.environ.
load_dotenv('../.env')

# All three settings are mandatory: a missing key raises KeyError.
code_root, cfg_path, data_root = (
    os.environ[key] for key in ('CODE_ROOT', 'CFG_PATH', 'DATA_ROOT')
)

# Give the project checkout top priority on the import path so that the
# `src.*` modules imported in the next cell can be resolved.
sys.path.insert(0, code_root)

### Specific libraries

In [4]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams
from src.model.functions import train_and_evaluate_iforest
from src.utils.functions import adjust_fi

## General parameters

In [5]:
# Repetition counts; both are forwarded unchanged to the training call below.
n_iter_fs, n_iter = 1, 1

# NOTE(review): `contamination_percentage = [1]` used to be set here and is
# disabled; presumably contamination now comes from the dataset
# hyper-parameters -- confirm.

# Numbers of trees to evaluate: 1, 5, then 25 to 200 in steps of 25.
trees = [1, 5, *range(25, 201, 25)]

## Data-set

In [6]:
# Identifier of the dataset under study; it is also used further down to
# build the names of the output files.
dataset_id = 'creditcard'
# Load the data and the hyper-parameters registered for this dataset id
# through the project helpers.
data = get_fs_dataset(dataset_id, data_root)
hyper = fs_datasets_hyperparams(dataset_id)
# Display the shape of the loaded data as a quick sanity check.
data.shape

(284807, 29)

In [7]:
# Show the hyper-parameters returned for this dataset.
hyper

{'contamination': 0.1, 'max_samples': 256, 'n_estimators': 100}

In [8]:
# Columns to exclude; handed to the training call as `excluded_cols` (none for this run).
excluded_cols = []

### iForest

## Parameters

In [8]:
# Input: stored feature-importance data for this dataset (read below).
# NOTE(review): unlike `path_shap`, this path has no ".parquet" suffix --
# presumably a parquet dataset directory; confirm.
path_fi_shap = join(data_root, "outputs", f"{dataset_id}_fi_shap")
# Output: location where the results of this notebook are written.
path_shap = join(data_root, "outputs", f"{dataset_id}_shap.parquet")

# Read the feature-importance table and post-process it with the project
# helper `adjust_fi` in a single step.
fi_shap_all = adjust_fi(pd.read_parquet(path_fi_shap))

## Model training with iterations HPO and FS

In [9]:
# Wall-clock timestamps are kept for the record, but the elapsed time is
# measured with a monotonic clock: differences of naive `datetime.now()`
# values are wrong if the system clock is adjusted (NTP step, DST change)
# during a long training run.
start_time = datetime.datetime.now()
_perf_start = time.perf_counter()

# Train and evaluate the isolation forests for every requested tree count,
# using the dataset hyper-parameters and the feature-importance table.
df = train_and_evaluate_iforest(
    data,
    dataset_id=dataset_id,
    hyper=hyper,
    fi_df=fi_shap_all,
    n_tree_estimators=trees,
    excluded_cols=excluded_cols,
    n_iter_fs=n_iter_fs,
    n_iter=n_iter,
)

_perf_elapsed = time.perf_counter() - _perf_start
finish_time = datetime.datetime.now()

# Still a timedelta, so the printed format is unchanged (H:MM:SS.ffffff).
duration = datetime.timedelta(seconds=_perf_elapsed)

print(f"Duration: {duration}")

Iteration by tree number: 1
  Iteration by contamination: 0.1
    Number of featured: 2
     Iteration by feat number: 2
    Number of featured: 4
     Iteration by feat number: 2
     Iteration by feat number: 3
     Iteration by feat number: 4
    Number of featured: 6
     Iteration by feat number: 2
     Iteration by feat number: 3
     Iteration by feat number: 4
     Iteration by feat number: 6
    Number of featured: 9
     Iteration by feat number: 2
     Iteration by feat number: 3
     Iteration by feat number: 5
     Iteration by feat number: 7
     Iteration by feat number: 9
    Number of featured: 11
     Iteration by feat number: 2
     Iteration by feat number: 4
     Iteration by feat number: 6
     Iteration by feat number: 8
     Iteration by feat number: 11
    Number of featured: 14
     Iteration by feat number: 2
     Iteration by feat number: 5
     Iteration by feat number: 8
     Iteration by feat number: 11
     Iteration by feat number: 14
    Number of feat

KeyboardInterrupt: 

In [None]:
# Keep an independent copy of the full results before columns are dropped from `df` below.
df_save = df.copy()

In [None]:
# Remove the two `*_list` columns before the results are written to disk.
# errors='ignore' makes the cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
df = df.drop(columns=['stab_model_list', 'stab_shap_list'], errors='ignore')

In [None]:
# Write the trimmed results to <data_root>/outputs/<dataset_id>_shap.parquet.
df.to_parquet(path_shap)

In [None]:
def process_data(df):
    """
    Add summary statistics of the per-row ``stab_shap`` values to a results frame.

    Parameters:
    - df: pandas DataFrame with a ``stab_shap`` column whose entries are
      array-likes of numbers (one collection per row).

    Returns:
    - A new DataFrame with the same rows and columns as ``df`` plus
      ``shap_q1`` (75th percentile), ``shap_q2`` (50th percentile),
      ``shap_q3`` (25th percentile) and ``stability index`` (mean).
      The input frame is not modified.

    Raises:
    - KeyError: if ``df`` has no ``stab_shap`` column.
    """
    stab = df['stab_shap']

    # NOTE(review): `shap_q1` holds the 75th percentile and `shap_q3` the 25th,
    # which is the reverse of the usual quartile naming. The mapping is kept
    # as-is because consumers of these columns are not visible here -- confirm
    # whether the descending order is intentional.
    summaries = {
        'shap_q1': lambda values: np.percentile(values, 75),
        'shap_q2': lambda values: np.percentile(values, 50),
        'shap_q3': lambda values: np.percentile(values, 25),
        'stability index': lambda values: np.mean(values),
    }

    # `assign` builds a new frame, so calling this function (or re-running the
    # cell that calls it) never alters the caller's DataFrame.
    return df.assign(**{name: stab.apply(fn) for name, fn in summaries.items()})

In [None]:
# Enrich the results with the summary columns, then keep only the rows for
# 100 trees at the largest `n_feats` and largest `max_feats` present.
processed = process_data(df)
keep = (
    processed["n_estimators"].eq(100)
    & processed["n_feats"].eq(processed["n_feats"].max())
    & processed["max_feats"].eq(processed["max_feats"].max())
)
test = processed[keep]
test

In [None]:
# Fail with a clear message instead of an opaque IndexError when the filter
# in the previous cell left no rows (e.g. the training run was interrupted).
if test.empty:
    raise ValueError("`test` is empty: no configuration matched the filter, nothing to plot.")

# Scores of the first selected row (presumably one value per test instance,
# as the plot title states), sorted ascending so the curve reads as a ranked
# profile.
stability_scores = np.sort(np.asarray(test["stab_shap"].iloc[0]))

# Plotting
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(stability_scores, marker='o', linestyle='-', color='blue')
ax.set_title('Local Stability Scores per Test Instance')
# The values are sorted, so the x position is a rank, not the original index.
ax.set_xlabel('Test Instance Rank (sorted by stability score)')
ax.set_ylabel('Stability Score')
ax.grid(True)
plt.show()