# iForest

In [1]:
import warnings
warnings.filterwarnings('ignore')

## General libraries

In [2]:
import os
import sys

import pandas as pd
import numpy as np

from os.path import join
import json
import datetime

import time

from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split

### Load environment variables

In [3]:
from dotenv import load_dotenv
# Load the settings stored in the .env file one directory above the working
# directory so that they can be read from os.environ below.
load_dotenv('../.env')

# All three settings are mandatory: indexing os.environ raises KeyError as soon
# as one of them is missing, which stops the notebook before any work is done.
code_root, cfg_path, data_root = (
    os.environ[name] for name in ('CODE_ROOT', 'CFG_PATH', 'DATA_ROOT')
)

# Put the project root first on the import path so that the `src.*` imports in
# the following cell resolve against it.
sys.path.insert(0, code_root)

## Specific libraries

In [4]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams

from src.feature_selection.functions import shap_feature_selection, process_fi

from src.utils.functions import adjust_fi

from src.model.functions import run_model_experiment

from src.plots.functions import plots_metrics

## Parameters

In [5]:
import os
import datetime
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

# The helpers used by the experiment loop (get_fs_dataset, fs_datasets_hyperparams,
# shap_feature_selection, process_fi, adjust_fi, run_model_experiment) are imported
# from the project's `src` package in the "Specific libraries" cell.

# --- Experiment configuration -------------------------------------------------
seed = 123          # shared random_state for the split, the model and the experiment
iterations = 10     # forwarded to run_model_experiment
gamma = 0.146       # forwarded to run_model_experiment; NOTE(review): document its meaning
n_estimators_list = list(range(25, 201, 25))  # 25, 50, ..., 200

# Seed NumPy's legacy global generator as well, for any code that relies on it.
np.random.seed(seed)

# Datasets to process, in this order. Add more dataset IDs as needed.
dataset_ids = [
    'arrhythmia',
    'cardio',
    'bank',
    'creditcard',
    'mammography',
    'musk',
]

# Run the SHAP feature-selection experiment once per dataset.
#
# For every dataset ID the loop: loads the data, fits an IsolationForest on a
# 90% training split, ranks features with `shap_feature_selection`, stores the
# processed importances, runs `run_model_experiment` on them and stores the
# results. Everything is written below `<data_root>/outputs`.

# Create the output directory up front. The first write only happens after the
# model fit and the SHAP computation, so a missing directory would otherwise
# abort the run after the expensive work has already been done.
output_dir = os.path.join(data_root, "outputs")
os.makedirs(output_dir, exist_ok=True)

for dataset_id in dataset_ids:
    print(f"Processing dataset: {dataset_id}")

    df = get_fs_dataset(dataset_id, data_root)
    hyper = fs_datasets_hyperparams(dataset_id)

    # NOTE(review): `path_fi_shap` has no `.parquet` suffix although it is
    # written with `to_parquet`; left unchanged in case other code reads this
    # exact path.
    path_fi_shap = os.path.join(output_dir, f"{dataset_id}_fi_shap")
    path_shap = os.path.join(output_dir, f"{dataset_id}_shap.parquet")

    # Split the DataFrame into features (X) and target (y)
    X = df.drop('y', axis=1)  # Features (all columns except 'y')
    y = df['y']  # Target (the 'y' column)

    # Hold out 10% of the rows; the fixed random_state keeps the split
    # reproducible. `ytr` / `yte` are not used further down in this cell.
    xtr, xte, ytr, yte = train_test_split(X, y, test_size=0.1, random_state=seed)

    # Fit the IsolationForest on the training features only (labels are not used).
    # NOTE(review): if `hyper` ever contains a `random_state` key this call
    # raises TypeError (duplicate keyword argument) — confirm it never does.
    model = IsolationForest(**hyper, random_state=seed)
    model.fit(xtr)

    feature_names = np.array(X.columns.tolist())

    # Perform SHAP feature selection
    selected_features_df = shap_feature_selection(model, xtr, xte, feature_names, agnostic=False)
    # NOTE(review): the literal 10 equals `iterations`; confirm whether the two
    # are meant to be linked.
    fi_shap_all = process_fi(selected_features_df, 10)
    fi_shap_all.to_parquet(path_fi_shap)

    # The importances are read back from disk right after being written, so the
    # experiment below runs on the persisted representation.
    fi_shap_all = pd.read_parquet(path_fi_shap)
    fi_shap_all = adjust_fi(fi_shap_all)

    # Capture the start time
    start_time = datetime.datetime.now()

    # Run the model experiment
    results = run_model_experiment(fi_shap_all, df, hyper,
                                   gamma=gamma, iterations=iterations,
                                   n_estimators_list=n_estimators_list, seed=seed,
                                   dataset_id=dataset_id)

    # Capture the finish time
    finish_time = datetime.datetime.now()

    # Wall-clock duration of the experiment call only (loading, fitting and the
    # SHAP step are not included).
    duration = finish_time - start_time

    print(f"Duration for {dataset_id}: {duration}")

    # Save the results to a parquet file
    results.to_parquet(path_shap)

    print(f"Completed processing for dataset: {dataset_id}\n")