# iForest

In [1]:
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides library warnings (e.g. pandas
# deprecation / chained-assignment warnings) that can point at real problems
# in later cells — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

## General libraries

In [2]:
import os
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast

from os.path import join
import json
import datetime

import time

### Load environment variables

In [3]:
from dotenv import load_dotenv
# Load the project-level .env file (one directory above the notebook) so the
# os.environ lookups below can succeed — presumably the three keys live there.
load_dotenv('../.env')

# Required settings: a missing key raises KeyError here rather than later.
code_root = os.environ['CODE_ROOT']
# cfg_path is read but not referenced again in the visible part of the notebook.
cfg_path = os.environ['CFG_PATH']
data_root = os.environ['DATA_ROOT']

# Put the project code first on the import path so local modules can be imported.
sys.path.insert(0, code_root)

### Specific libraries

### Parameters

In [4]:
# Destination of the summary table written at the end of this cell
save_path = os.path.join(data_root, 'outputs', 'resume.pq')

# Seed NumPy's global random number generator
seed = 123
np.random.seed(seed)

# Datasets whose result files are gathered below
dataset_ids = ['arrhythmia', 'cardio', 'bank', 'creditcard', 'mammography', 'musk']  # Add more dataset IDs as needed

# Every per-dataset file lives in the same outputs folder
outputs_dir = os.path.join(data_root, "outputs")

# One DataFrame per dataset is collected here and concatenated once afterwards
all_results = []

for dataset_id in dataset_ids:

    # Locations for the current dataset (path_fi_shap is built but not read in this cell)
    path_fi_shap = os.path.join(outputs_dir, f"{dataset_id}_fi_shap")
    path_shap = os.path.join(outputs_dir, f"{dataset_id}_shap.parquet")

    # Load the results and tag every row with the dataset it came from
    results = pd.read_parquet(path_shap).assign(dataset_id=dataset_id)

    all_results.append(results)

# Stack all per-dataset frames into one DataFrame with a fresh 0..n-1 index
combined_results = pd.concat(all_results, ignore_index=True)

def convert_to_list(smli_str):
    """Safely convert a string representation of a list to an actual list.

    Non-string inputs are returned unchanged. Strings are parsed with
    ``ast.literal_eval``, which only accepts Python literals and never
    executes code. The parsed value is returned as-is, so a string holding
    some other literal (e.g. ``"5"``) yields that literal rather than a list.

    Args:
        smli_str: A string such as ``"[0.1, 0.2]"``, or any already-parsed value.

    Returns:
        The parsed literal, the input itself when it is not a string, or
        ``np.nan`` when the string cannot be parsed.
    """
    if not isinstance(smli_str, str):
        return smli_str
    try:
        return ast.literal_eval(smli_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval is documented to raise any of these on malformed input
        # (e.g. TypeError for "{[1]: 2}"); treat them all as "unparseable" so a
        # single bad cell cannot abort the whole column conversion.
        return np.nan

# Parse the smli_all column element by element: string entries are converted with
# convert_to_list, entries that are already non-string values pass through unchanged.
# The column is overwritten in place on combined_results.
combined_results['smli_all'] = combined_results['smli_all'].apply(convert_to_list)

def calculate_iqr(smli_list):
    """Return the interquartile range (75th minus 25th percentile) of a sequence.

    Args:
        smli_list: A list or numpy array of numeric values.

    Returns:
        The difference between the 75th and 25th percentiles as computed by
        ``np.percentile``. ``np.nan`` is returned, after printing a short
        diagnostic, when the input is not a list/ndarray or is empty.
    """
    # Guard 1: anything other than a list or ndarray is reported and skipped.
    if not isinstance(smli_list, (list, np.ndarray)):
        print(f"Unexpected type: {type(smli_list)} with value: {smli_list}")
        return np.nan

    # Guard 2: percentiles of an empty sequence are undefined.
    if len(smli_list) == 0:
        print("Empty list encountered.")
        return np.nan

    # Arrays are handed to np.percentile as plain Python lists.
    values = smli_list.tolist() if isinstance(smli_list, np.ndarray) else smli_list

    upper_quartile, lower_quartile = np.percentile(values, [75, 25])
    return upper_quartile - lower_quartile

# Compute the IQR of each row's smli_all values
combined_results['smli_all_iqr'] = combined_results['smli_all'].apply(calculate_iqr)

# Columns on which a candidate row must match or beat its dataset's baseline
metric_columns = ['smli', 'precision', 'recall', 'roc_auc', 'f1-score']
# n_estimators value that identifies the rows eligible to be the baseline
baseline_n_estimators = 100
# Lower bound applied to the n_features_cum_shap_percentage column
min_cum_shap_percentage = 70

# Keep only the rows run with the baseline number of estimators
filtered_results = combined_results[combined_results['n_estimators'] == baseline_n_estimators]

# Baseline: for each dataset_id, the row with the maximum n_feat.
# .assign() returns a new DataFrame, so the 'hpo' label is never written into an
# object derived from a slice (item assignment here is the chained-assignment
# pattern that raises SettingWithCopyWarning).
baseline_rows = filtered_results.loc[
    filtered_results.groupby('dataset_id')['n_feat'].idxmax()
].assign(hpo='Benchmark')

# Attach each dataset's baseline metrics (suffixed '_baseline') to every row.
# Inner join: datasets without a baseline row drop out.
merged_with_baseline = pd.merge(
    combined_results,
    baseline_rows[['dataset_id'] + metric_columns],
    on='dataset_id',
    how='inner',
    suffixes=('', '_baseline')
)

# A row qualifies when every metric is >= its baseline value
# (comparisons involving NaN are False, so such rows are excluded) ...
meets_baseline = pd.concat(
    [
        merged_with_baseline[metric] >= merged_with_baseline[f'{metric}_baseline']
        for metric in metric_columns
    ],
    axis=1,
).all(axis=1)

# ... and n_features_cum_shap_percentage reaches the configured minimum.
meets_shap_threshold = (
    merged_with_baseline['n_features_cum_shap_percentage'] >= min_cum_shap_percentage
)

filtered_combined_results = merged_with_baseline[meets_baseline & meets_shap_threshold]

# Order candidates by dataset_id, then by f1-score, precision, recall and smli (best first)
sorted_results = filtered_combined_results.sort_values(
    by=['dataset_id', 'f1-score', 'precision', 'recall', 'smli'],
    ascending=[True, False, False, False, False]
)

# Keep the top-ranked row per dataset_id and label it; .assign() again avoids
# writing into the result of a chained selection.
best_per_dataset = (
    sorted_results
    .drop_duplicates(subset=['dataset_id'], keep='first')
    .assign(hpo='Our model')
)

# Restrict both frames to the columns they share (this drops the '*_baseline' helpers)
common_columns = baseline_rows.columns.intersection(best_per_dataset.columns)
baseline_rows = baseline_rows[common_columns]
best_per_dataset = best_per_dataset[common_columns]

# Baseline rows first, then the selected rows, with a fresh 0..n-1 index
final_combined_results = pd.concat([baseline_rows, best_per_dataset], ignore_index=True)

final_combined_results.to_parquet(save_path)
# `final_combined_results` now holds one 'Benchmark' row per dataset plus one
# 'Our model' row for each dataset with a qualifying candidate; `smli_all_iqr`
# is the IQR of `smli_all` for each row.


In [5]:
# Show the combined table of 'Benchmark' and 'Our model' rows that was just saved.
final_combined_results

Unnamed: 0,dataset_id,n_feat,n_features_cum_shap_percentage,n_estimators,f1-score,recall,precision,roc_auc,smli,smli_all,smli_all_iqr,hpo
0,arrhythmia,274,100.0,100,0.387755,0.333333,0.463415,0.801186,0.504454,"[0.49454311292243597, 0.5552871876992763, 0.49...",0.017841,Benchmark
1,bank,10,100.0,100,0.145662,0.137166,0.15528,0.559574,0.875934,"[0.8694179894179894, 0.8668253968253967, 0.929...",0.07446,Benchmark
2,cardio,21,100.0,100,0.539877,0.546584,0.533333,0.931234,0.71816,"[0.7708913864032053, 0.6525560107720165, 0.623...",0.076173,Benchmark
3,creditcard,28,100.0,100,0.029915,0.884354,0.015215,0.950923,0.609007,"[0.5389711012879455, 0.6395788229000655, 0.643...",0.092561,Benchmark
4,mammography,6,100.0,100,0.195793,0.528384,0.120159,0.856543,0.861278,"[0.9629629629629631, 0.9030864197530865, 0.773...",0.075926,Benchmark
5,musk,166,100.0,100,0.487671,1.0,0.322464,0.999954,0.429596,"[0.42302926496352056, 0.4329794840379505, 0.47...",0.03749,Benchmark
6,arrhythmia,115,79.951065,200,0.489796,0.421053,0.585366,0.826572,0.540944,"[0.5202528967491977, 0.6627405260868784, 0.539...",0.077368,Our model
7,bank,6,76.862084,150,0.190561,0.178197,0.20477,0.593136,0.950273,"[1.0, 0.9382716049382717, 0.9537037037037037, ...",0.070988,Our model
8,cardio,17,89.459259,150,0.601227,0.608696,0.593939,0.949433,0.762987,"[0.7887861684430311, 0.6057517464380209, 0.639...",0.08563,Our model
9,creditcard,20,79.627301,75,0.030298,0.895692,0.01541,0.954205,0.641568,"[0.5697246722098372, 0.7434945712325742, 0.569...",0.09507,Our model
