# iForest

In [1]:
import warnings
warnings.filterwarnings('ignore')

## General libraries

In [2]:
import os
import sys

import pandas as pd
import numpy as np

from os.path import join
import json
import datetime

import time

from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split

### Load environment variables

In [3]:
from dotenv import load_dotenv

# Load variables from the .env file one directory above the notebook's
# working directory.
load_dotenv('../.env')

# Required settings; a missing variable raises KeyError right here instead of
# failing later in the notebook.
code_root = os.environ['CODE_ROOT']
cfg_path = os.environ['CFG_PATH']
data_root = os.environ['DATA_ROOT']

# Keep the project root first on the import path (presumably so the `src.*`
# imports in the next cell resolve). The guard makes the cell idempotent:
# re-running it no longer stacks duplicate entries at the front of sys.path.
if sys.path[:1] != [code_root]:
    sys.path.insert(0, code_root)

## Specific libraries

In [4]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams

from src.feature_selection.functions import shap_feature_selection, process_fi

from src.utils.functions import adjust_fi

from src.model.functions import run_model_experiment

from src.plots.functions import plots_metrics

## Parameters and experiment loop

In [5]:
# Names used by this cell (the same names are also imported in the
# "General libraries" cell).
import os
import datetime
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

# Project helpers used below -- get_fs_dataset, fs_datasets_hyperparams,
# shap_feature_selection, process_fi, adjust_fi, run_model_experiment -- are
# imported from the src.* packages in the "Specific libraries" cell.

# Parameters
seed = 123          # random_state for the split and the model; also seeds NumPy
iterations = 10     # forwarded to run_model_experiment
gamma = 0.146       # forwarded to run_model_experiment
n_estimators_list = [25, 50, 75, 100, 125, 150, 175, 200]
np.random.seed(seed)

# List of dataset IDs to iterate over
dataset_ids = ['arrhythmia', 'cardio', 'bank', 'creditcard', 'mammography', 'musk']  # Add more dataset IDs as needed

# All artefacts are written under <data_root>/outputs. Create the directory up
# front so a fresh DATA_ROOT does not make the first to_parquet() call fail
# after the SHAP computation has already been paid for.
output_dir = os.path.join(data_root, "outputs")
os.makedirs(output_dir, exist_ok=True)

# Loop over each dataset
for dataset_id in dataset_ids:
    print(f"Processing dataset: {dataset_id}")

    df = get_fs_dataset(dataset_id, data_root)
    hyper = fs_datasets_hyperparams(dataset_id)

    # Per-dataset output locations: the feature-importance table and the
    # experiment results.
    path_fi_shap = os.path.join(output_dir, f"{dataset_id}_fi_shap")
    path_shap = os.path.join(output_dir, f"{dataset_id}_shap.parquet")

    # Split the DataFrame into features (X) and target (y)
    X = df.drop('y', axis=1)  # Features (all columns except 'y')
    y = df['y']  # Target (the 'y' column)

    # Hold out 10% of the rows; the split is reproducible via random_state.
    xtr, xte, ytr, yte = train_test_split(X, y, test_size=0.1, random_state=seed)

    # Fit the IsolationForest on the training features only (labels are not
    # passed to fit).
    model = IsolationForest(**hyper, random_state=seed)
    model.fit(xtr)

    feature_names = np.array(X.columns.tolist())

    # Perform SHAP feature selection
    selected_features_df = shap_feature_selection(model, xtr, xte, feature_names, agnostic=False)
    # NOTE(review): the meaning of the literal 10 is defined by process_fi and
    # is not visible from this notebook -- confirm and consider naming it.
    fi_shap_all = process_fi(selected_features_df, 10)
    fi_shap_all.to_parquet(path_fi_shap)

    # Reload the table that was just persisted and adjust it.
    # NOTE(review): the write-then-read round trip looks deliberate (working
    # from the on-disk form) -- confirm before simplifying.
    fi_shap_all = pd.read_parquet(path_fi_shap)
    fi_shap_all = adjust_fi(fi_shap_all)

    # Capture the start time
    start_time = datetime.datetime.now()

    # Run the model experiment
    results = run_model_experiment(fi_shap_all, df, hyper,
                                   gamma=gamma, iterations=iterations,
                                   n_estimators_list=n_estimators_list, seed=seed,
                                   dataset_id=dataset_id)

    # Capture the finish time
    finish_time = datetime.datetime.now()

    # Elapsed wall-clock time of run_model_experiment only (the SHAP step
    # above is not included).
    duration = finish_time - start_time

    print(f"Duration for {dataset_id}: {duration}")

    # Save the results to a parquet file
    results.to_parquet(path_shap)

    print(f"Completed processing for dataset: {dataset_id}\n")

Processing dataset: arrhythmia


2024-08-29 16:36:42,874 - INFO - Starting experiment with 7 features.
2024-08-29 16:36:42,877 - INFO - Starting model training with 7 features and 25 estimators.
2024-08-29 16:36:48,865 - INFO - Starting model training with 7 features and 50 estimators.
2024-08-29 16:36:50,566 - INFO - Starting model training with 7 features and 75 estimators.
2024-08-29 16:36:53,453 - INFO - Starting model training with 7 features and 100 estimators.
2024-08-29 16:36:56,630 - INFO - Starting model training with 7 features and 125 estimators.
2024-08-29 16:37:00,563 - INFO - Starting model training with 7 features and 150 estimators.
2024-08-29 16:37:05,947 - INFO - Starting model training with 7 features and 175 estimators.
2024-08-29 16:37:11,819 - INFO - Starting model training with 7 features and 200 estimators.
2024-08-29 16:37:18,260 - INFO - Starting experiment with 15 features.
2024-08-29 16:37:18,263 - INFO - Starting model training with 15 features and 25 estimators.
2024-08-29 16:37:19,358 -

Duration for arrhythmia: 0:04:30.606962
Completed processing for dataset: arrhythmia

Processing dataset: cardio


2024-08-29 16:41:14,270 - INFO - Starting experiment with 2 features.
2024-08-29 16:41:14,272 - INFO - Starting model training with 2 features and 25 estimators.
2024-08-29 16:41:14,930 - INFO - Starting model training with 2 features and 50 estimators.
2024-08-29 16:41:16,156 - INFO - Starting model training with 2 features and 75 estimators.
2024-08-29 16:41:17,914 - INFO - Starting model training with 2 features and 100 estimators.
2024-08-29 16:41:20,268 - INFO - Starting model training with 2 features and 125 estimators.
2024-08-29 16:41:23,178 - INFO - Starting model training with 2 features and 150 estimators.
2024-08-29 16:41:26,681 - INFO - Starting model training with 2 features and 175 estimators.
2024-08-29 16:41:30,732 - INFO - Starting model training with 2 features and 200 estimators.
2024-08-29 16:41:35,326 - INFO - Starting experiment with 4 features.
2024-08-29 16:41:35,329 - INFO - Starting model training with 4 features and 25 estimators.
2024-08-29 16:41:36,266 - I

Duration for cardio: 0:05:02.740539
Completed processing for dataset: cardio

Processing dataset: bank


2024-08-29 16:46:27,227 - INFO - Starting experiment with 2 features.
2024-08-29 16:46:27,229 - INFO - Starting model training with 2 features and 25 estimators.
2024-08-29 16:46:28,369 - INFO - Starting model training with 2 features and 50 estimators.
2024-08-29 16:46:30,115 - INFO - Starting model training with 2 features and 75 estimators.
2024-08-29 16:46:32,438 - INFO - Starting model training with 2 features and 100 estimators.
2024-08-29 16:46:35,442 - INFO - Starting model training with 2 features and 125 estimators.
2024-08-29 16:46:39,082 - INFO - Starting model training with 2 features and 150 estimators.
2024-08-29 16:46:43,487 - INFO - Starting model training with 2 features and 175 estimators.
2024-08-29 16:46:48,621 - INFO - Starting model training with 2 features and 200 estimators.
2024-08-29 16:46:54,499 - INFO - Starting experiment with 3 features.
2024-08-29 16:46:54,502 - INFO - Starting model training with 3 features and 25 estimators.
2024-08-29 16:46:56,394 - I

Duration for bank: 0:25:25.103344
Completed processing for dataset: bank

Processing dataset: creditcard


2024-08-29 17:12:25,432 - INFO - Starting experiment with 2 features.
2024-08-29 17:12:25,435 - INFO - Starting model training with 2 features and 25 estimators.
2024-08-29 17:12:49,002 - INFO - Starting model training with 2 features and 50 estimators.
2024-08-29 17:13:31,400 - INFO - Starting model training with 2 features and 75 estimators.
2024-08-29 17:14:31,651 - INFO - Starting model training with 2 features and 100 estimators.
2024-08-29 17:15:52,935 - INFO - Starting model training with 2 features and 125 estimators.
2024-08-29 17:17:36,887 - INFO - Starting model training with 2 features and 150 estimators.
2024-08-29 17:19:46,075 - INFO - Starting model training with 2 features and 175 estimators.
2024-08-29 17:22:10,679 - INFO - Starting model training with 2 features and 200 estimators.
2024-08-29 17:24:57,908 - INFO - Starting experiment with 4 features.
2024-08-29 17:24:57,919 - INFO - Starting model training with 4 features and 25 estimators.
2024-08-29 17:25:32,754 - I

Duration for creditcard: 6:06:34.363796
Completed processing for dataset: creditcard

Processing dataset: mammography


2024-08-29 23:19:01,706 - INFO - Starting experiment with 2 features.
2024-08-29 23:19:01,708 - INFO - Starting model training with 2 features and 25 estimators.
2024-08-29 23:19:03,021 - INFO - Starting model training with 2 features and 50 estimators.
2024-08-29 23:19:05,457 - INFO - Starting model training with 2 features and 75 estimators.
2024-08-29 23:19:08,921 - INFO - Starting model training with 2 features and 100 estimators.
2024-08-29 23:19:13,450 - INFO - Starting model training with 2 features and 125 estimators.
2024-08-29 23:19:19,120 - INFO - Starting model training with 2 features and 150 estimators.
2024-08-29 23:19:25,955 - INFO - Starting model training with 2 features and 175 estimators.
2024-08-29 23:19:34,138 - INFO - Starting model training with 2 features and 200 estimators.
2024-08-29 23:19:43,573 - INFO - Starting experiment with 3 features.
2024-08-29 23:19:43,574 - INFO - Starting model training with 3 features and 25 estimators.
2024-08-29 23:19:44,964 - I

Duration for mammography: 0:04:13.392109
Completed processing for dataset: mammography

Processing dataset: musk


2024-08-29 23:23:16,443 - INFO - Starting experiment with 9 features.
2024-08-29 23:23:16,445 - INFO - Starting model training with 9 features and 25 estimators.
2024-08-29 23:23:17,914 - INFO - Starting model training with 9 features and 50 estimators.
2024-08-29 23:23:20,563 - INFO - Starting model training with 9 features and 75 estimators.
2024-08-29 23:23:24,386 - INFO - Starting model training with 9 features and 100 estimators.
2024-08-29 23:23:29,596 - INFO - Starting model training with 9 features and 125 estimators.
2024-08-29 23:23:36,158 - INFO - Starting model training with 9 features and 150 estimators.
2024-08-29 23:23:43,970 - INFO - Starting model training with 9 features and 175 estimators.
2024-08-29 23:23:53,192 - INFO - Starting model training with 9 features and 200 estimators.
2024-08-29 23:24:03,868 - INFO - Starting experiment with 20 features.
2024-08-29 23:24:03,870 - INFO - Starting model training with 20 features and 25 estimators.
2024-08-29 23:24:05,744 -

Duration for musk: 0:09:53.679789
Completed processing for dataset: musk

