# iForest

In [1]:
import warnings
# Suppress every warning for the rest of the session (no category or module
# filter is given, so the 'ignore' action applies to all of them).
# NOTE(review): this also hides deprecation and numerical warnings raised by
# the libraries used below — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

## General libraries

In [2]:
import os
import sys

import pandas as pd
import numpy as np

from os.path import join
import json
import datetime

import time

from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split

### Load environment variables

In [3]:
from dotenv import load_dotenv
# Load the variables declared in the `.env` file one level above the current
# working directory, so they can be read from os.environ below.
load_dotenv('../.env')

# Required settings: indexing os.environ raises KeyError right here if any of
# them is missing.
code_root = os.environ['CODE_ROOT']
cfg_path = os.environ['CFG_PATH']
data_root = os.environ['DATA_ROOT']

# Put the project root first on the import path so that the `src.*` imports
# in the following cell resolve against it.
sys.path.insert(0, code_root)

## Specific libraries

In [4]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams

from src.utils.functions import adjust_fi

from src.model.functions import run_model_experiment

## Parameters

In [5]:
# --- Experiment settings (all forwarded to run_model_experiment) ---------
seed = 123
iterations = 10
gamma = 0.146  # NOTE(review): meaning is defined by run_model_experiment — confirm there
# Ensemble sizes to evaluate: 1, 5, 25, then every 25 from 50 up to 200.
n_estimators_list = [1, 5, 25] + list(range(50, 201, 25))

# Seed NumPy's global generator before any data is loaded.
np.random.seed(seed)

# --- Dataset and its associated hyper-parameters -------------------------
dataset_id = 'example'
df = get_fs_dataset(dataset_id, data_root)
hyper = fs_datasets_hyperparams(dataset_id)

### Paths

In [6]:
# Both artefacts live in <data_root>/outputs and are keyed by the dataset id.
outputs_dir = join(data_root, "outputs")

# Input read with pd.read_parquet in the next cell.
# NOTE(review): unlike path_shap this path has no ".parquet" suffix —
# presumably it matches what an earlier step wrote; confirm.
path_fi_shap = join(outputs_dir, f"{dataset_id}_fi_shap")

# Destination of the experiment results written further down.
path_shap = join(outputs_dir, f"{dataset_id}_shap.parquet")

### Features selected

In [7]:
# Read the stored feature-importance table and hand it straight to adjust_fi;
# the adjusted frame is what the experiment consumes.
fi_shap_all = adjust_fi(pd.read_parquet(path_fi_shap))

# Trailing expression: render the adjusted table as the cell output.
fi_shap_all

Unnamed: 0,n_feats,cum_value,cum_value_percentage,n_feats_percentage,feat_selected
0,2,0.817763,49.580866,40.0,"[feature_2, feature_4]"
2,3,1.151545,69.817997,60.0,"[feature_2, feature_4, feature_3]"
3,4,1.438979,87.245099,80.0,"[feature_2, feature_4, feature_3, feature_1]"
4,5,1.649352,100.0,100.0,"[feature_2, feature_4, feature_3, feature_1, f..."


### Generate data

In [8]:
# Run the experiment and report how long it took.
#
# The wall-clock timestamps are kept for reference, but the elapsed time is
# measured with time.perf_counter(): it is monotonic, so a system clock
# adjustment (NTP sync, DST change) during the run cannot skew or negate the
# reported duration the way a difference of datetime.now() values can.
start_time = datetime.datetime.now()
_run_started = time.perf_counter()

results = run_model_experiment(
    fi_shap_all,
    df,
    hyper,
    gamma=gamma,
    iterations=iterations,
    n_estimators_list=n_estimators_list,
    seed=seed,
    dataset_id=dataset_id,
)

_run_elapsed = time.perf_counter() - _run_started
finish_time = datetime.datetime.now()

# Still a timedelta, so the printed format (H:MM:SS.ffffff) is unchanged.
duration = datetime.timedelta(seconds=_run_elapsed)

print(f"Duration: {duration}")

Duration: 0:01:10.791910


In [9]:
# Persist the experiment results to the parquet file at path_shap.
results.to_parquet(path_shap)

In [10]:
# Preview three random rows of the results. random_state is pinned to the
# notebook seed so the same rows are shown on every run of this cell,
# independent of how much of the global NumPy RNG state earlier cells used.
results.sample(3, random_state=seed)

Unnamed: 0,dataset_id,n_feat,n_features_cum_shap_percentage,n_estimators,f1-score,recall,precision,roc_auc,smli,smli_all
0,example,2,49.580866,1,0.170213,0.097561,0.666667,0.589597,0.819444,"[1.0, 0.8333333333333333, 0.7222222222222222, ..."
19,example,3,69.817997,200,0.16,0.097561,0.444444,0.52215,0.861111,"[0.9444444444444443, 0.7716049382716049, 0.722..."
32,example,5,100.0,25,0.24,0.146341,0.666667,0.481334,0.719741,"[0.6759259259259259, 0.515925925925926, 0.8407..."
