# iForest

In [1]:
import warnings
warnings.filterwarnings('ignore')

### Load environment variables

In [2]:
import os
import sys
from dotenv import load_dotenv

# Read settings from '../.env' (path is relative to the current working directory).
load_dotenv('../.env')

# All three variables are required: a missing one raises KeyError right here.
code_root, cfg_path, data_root = (
    os.environ[name] for name in ('CODE_ROOT', 'CFG_PATH', 'DATA_ROOT')
)

# Put the project root first on the import path (presumably so the `src`
# package imported further down can be found).
sys.path.insert(0, code_root)

## General libraries

In [3]:
import pandas as pd
import numpy as np
from os.path import join
import json
import datetime
import shap

from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split

import time

## Specific libraries

In [4]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams

from src.stability.functions import local_stability_measure

## Example local stability measure

### Parameters

In [5]:
# Set a seed for reproducibility: it seeds NumPy's global RNG here and is reused
# below as random_state for the train/test split and the IsolationForest.
seed = 123
np.random.seed(seed)

# The same dataset id selects both the data and its hyperparameters.
dataset_id = 'example'
# Project helper: loads the dataset identified by dataset_id from data_root.
df = get_fs_dataset(dataset_id, data_root)
# Project helper: keyword arguments later unpacked into IsolationForest(**hyper).
hyper = fs_datasets_hyperparams(dataset_id)

In [6]:
# Show the hyperparameters returned by fs_datasets_hyperparams for this dataset.
hyper

{'contamination': 0.1, 'max_samples': 'auto', 'n_estimators': 100}

In [7]:
# Preview the first rows of the loaded dataset (feature columns plus label 'y').
df.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,y
0,-1.085631,0.997345,0.282978,-1.506295,-0.5786,1
1,1.651437,-2.426679,-0.428913,1.265936,-0.86674,1
2,-0.678886,-0.094709,1.49139,-0.638902,-0.443982,0
3,-0.434351,2.20593,2.186786,1.004054,0.386186,1
4,0.737369,1.490732,-0.935834,1.175829,-1.253881,0


### Generate data

In [8]:
# Settings passed to local_stability_measure further down.
gamma = 0.146
iterations = 10

# Separate the label column 'y' from the feature columns.
X = df.drop(columns='y')
y = df['y']

# Hold out 10% of the rows; the shared seed keeps the split reproducible.
xtr, xte, ytr, yte = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=seed,
)

### Generate Model

In [9]:
# Build the IsolationForest from the dataset hyperparameters plus the shared
# seed, then fit it on the training features only (labels are not passed).
model = IsolationForest(random_state=seed, **hyper)
model.fit(xtr)

### Traditional Metrics

In [10]:
# Score the training split with the fitted model.
# predict() yields -1 for outliers and +1 for inliers (raw values kept in y_pred).
y_pred = model.predict(xtr)
# score_samples() and decision_function() are lower for more abnormal points,
# so both are negated: larger values now mean "more anomalous".
y_scores = -model.score_samples(xtr)
y_decision = -model.decision_function(xtr)

# One row per training sample, aligned on xtr's index (ytr shares that index).
# 'y_pred' is recoded to 1 = anomaly / 0 = normal with a vectorised comparison,
# so it uses the same 0/1 convention as the true labels in 'y_real'.
df_predictions = pd.DataFrame(
    {
        'y_pred': (y_pred == -1).astype('int64'),
        'y_scores': y_scores,
        'y_decision': y_decision,
        'y_real': ytr,
    },
    index=xtr.index,
)

In [11]:
# Preview predicted labels, anomaly scores and true labels for the training rows.
df_predictions.head()

Unnamed: 0,y_pred,y_scores,y_decision,y_real
23,1,0.529899,0.007683,0
65,0,0.446381,-0.075835,1
76,0,0.428388,-0.093828,0
60,0,0.400126,-0.122089,1
24,0,0.447465,-0.074751,1


In [12]:
# ROC AUC of the negated decision function (higher = more anomalous) against
# the true labels of the same (training-split) rows.
roc_auc = roc_auc_score(
    y_true=df_predictions['y_real'],
    y_score=df_predictions['y_decision'],
)

In [13]:
# Per-class precision / recall / F1 as a nested dict.
# `labels` pins the class order so 'Normal' always names class 0 and 'Anomaly'
# class 1. Without it, target_names is matched against whichever labels happen
# to be present, and the call fails when only one class appears.
report = classification_report(
    df_predictions.y_real,
    df_predictions.y_pred,
    labels=[0, 1],
    target_names=['Normal', 'Anomaly'],
    output_dict=True,
)

### Local stability index

In [14]:
# Compute the local stability measure for the fitted model.
# Inputs: the train/test feature splits (xtr, xte), the fitted IsolationForest,
# and gamma / iterations from the settings cell; the other keyword values are
# fixed here.
# NOTE(review): the meaning of psi, beta_flavor, subset_low/subset_high and
# rank_method is defined in src.stability.functions and is not visible from this
# notebook — confirm there before changing them.
local_scores, local_scores_list, ranking = local_stability_measure(
    xtr,
    xte,
    model,
    gamma,
    iterations=iterations,
    psi=0.8,
    beta_flavor=2,  # pick from: 1, 2
    subset_low=0.25,
    subset_high=0.75,
    rank_method=True
)

In [15]:
# First value returned by local_stability_measure; its mean is reported as 'smli'.
local_scores

array([0.89814815, 0.90481481, 0.83333333, 0.80703704, 0.69259259,
       0.89296296, 0.85037037, 0.70407407, 0.78518519, 0.67888889])

In [16]:
# Second value returned by local_stability_measure.
# NOTE(review): in the recorded output each entry of local_scores equals the mean
# of the matching row here — confirm against src.stability.functions.
local_scores_list

array([[0.85185185, 1.        , 0.83333333, 0.86111111, 0.94444444],
       [0.88888889, 0.94444444, 0.82962963, 0.88888889, 0.97222222],
       [0.83333333, 0.68518519, 0.72222222, 0.92592593, 1.        ],
       [0.92592593, 0.88888889, 0.83333333, 0.80555556, 0.58148148],
       [0.74074074, 0.7962963 , 0.65740741, 0.37222222, 0.8962963 ],
       [0.81481481, 0.78888889, 1.        , 0.97222222, 0.88888889],
       [0.77777778, 0.75      , 0.89814815, 0.94444444, 0.88148148],
       [0.94444444, 0.81481481, 0.69444444, 0.84444444, 0.22222222],
       [0.83333333, 1.        , 0.61111111, 0.77777778, 0.7037037 ],
       [0.55555556, 0.87407407, 0.64814815, 0.56111111, 0.75555556]])

In [17]:
# Summary of the run: anomaly-class metrics from the classification report,
# the ROC AUC, and the mean of the local stability scores.
metrics_dict = {
    **{key: report['Anomaly'][key] for key in ('f1-score', 'recall', 'precision')},
    'roc_auc': roc_auc,
    'smli': local_scores.mean(),
}

In [18]:
# Display the collected metrics.
metrics_dict

{'f1-score': 0.24,
 'recall': 0.14634146341463414,
 'precision': 0.6666666666666666,
 'roc_auc': 0.5156794425087108,
 'smli': 0.8047407407407408}