# iForest

In [1]:
import warnings
warnings.filterwarnings('ignore')

## General libraries

In [2]:
import os
import sys

import pandas as pd
import numpy as np

from os.path import join
import json
import datetime

import shap
from shap_selection import feature_selection

from sklearn.preprocessing import OneHotEncoder
import time

### Load environment variables

In [3]:
from dotenv import load_dotenv

# Read settings from the .env file one directory above the working directory.
load_dotenv('../.env')

# Required settings: a missing variable raises KeyError naming the variable.
code_root = os.environ['CODE_ROOT']
cfg_path = os.environ['CFG_PATH']
data_root = os.environ['DATA_ROOT']

# Put the project code first on the import path (presumably where the `src`
# package imported in the next cell lives). Only insert when it is not already
# in front, so re-running this cell does not keep stacking duplicate entries.
if not sys.path or sys.path[0] != code_root:
    sys.path.insert(0, code_root)

### Specific libraries

In [4]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams
from src.model.functions import train_and_evaluate_iforest
from src.utils.functions import adjust_fi

## General parameters

In [5]:
# Repetition counts forwarded to train_and_evaluate_iforest as n_iter_fs / n_iter.
n_iter_fs, n_iter = 1, 1

# Contamination setting(s) to sweep.
# NOTE(review): the saved training log prints "Iteration by contamination: 0.1";
# confirm how this value maps to that figure.
contamination_percentage = [1]

# Numbers of trees to sweep: 1, 5, then 25 through 200 in steps of 25.
trees = [1, 5, *range(25, 201, 25)]

## Data-set

In [6]:
# Identifier of the dataset under study; it keys both helper calls below and
# the output file names built later in the notebook.
dataset_id = 'mammography'
# Project helper: presumably loads the dataset named by dataset_id from data_root.
data = get_fs_dataset(dataset_id, data_root)
# Project helper: presumably returns the hyperparameters registered for this dataset.
hyper = fs_datasets_hyperparams(dataset_id)
# Last expression of the cell, so the shape is shown as the cell output.
data.shape

(11183, 7)

In [7]:
# Forwarded to train_and_evaluate_iforest as `excluded_cols`; empty for this run.
excluded_cols: list = []

### iForest

## Parameters

In [8]:
# Both locations live under <data_root>/outputs.
# path_shap is the results file written at the end of the notebook.
path_shap = os.path.join(data_root, "outputs", f"{dataset_id}_shap.parquet")
# NOTE(review): no ".parquet" suffix here, unlike path_shap; confirm this
# matches how the feature-importance data was written.
path_fi_shap = os.path.join(data_root, "outputs", f"{dataset_id}_fi_shap")

# Read the stored SHAP feature importances and pass them straight through adjust_fi.
fi_shap_all = adjust_fi(pd.read_parquet(path_fi_shap))

## Model training with iterations HPO and FS

In [9]:
# Time the whole training/evaluation sweep with wall-clock timestamps.
start_time = datetime.datetime.now()

df = train_and_evaluate_iforest(
    data,
    dataset_id=dataset_id,
    hyper=hyper,
    fi_df=fi_shap_all,
    n_tree_estimators=trees,
    contamination_percentage=contamination_percentage,
    excluded_cols=excluded_cols,
    n_iter_fs=n_iter_fs,
    n_iter=n_iter,
)

finish_time = datetime.datetime.now()

# Elapsed time as a timedelta, reported once the sweep has finished.
duration = finish_time - start_time
print(f"Duration: {duration}")

Iteration by tree number: 1
  Iteration by contamination: 0.1
    Number of featured: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
    Number of featured: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
    Number of featured: 3
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 3
    Number of featured: 4
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 3
     Iteration by feat number: 4
    Number of featured: 5
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 3
     Iteration by feat number: 4
     Iteration b

In [10]:
# Keep an independent (deep) copy of the full results before columns are dropped from df.
df_save = df.copy(deep=True)

In [11]:
# Remove the 'model_stab_list' and 'shap_stab_list' columns ahead of the parquet export.
# Start from the untouched df_save copy rather than df itself: dropping from df
# raises KeyError when the cell is run a second time, because the columns are
# already gone. On a straight Run All the result is the same.
df = df_save.drop(columns=['model_stab_list', 'shap_stab_list'])

In [12]:
# Write the trimmed results table to path_shap
# (<data_root>/outputs/<dataset_id>_shap.parquet).
df.to_parquet(path_shap)