# iForest

In [1]:
import warnings
warnings.filterwarnings('ignore')

## General libraries

In [2]:
import os
import sys

import pandas as pd
import numpy as np

from os.path import join
import json
import datetime

import time

### Load environment variables

In [3]:
from dotenv import load_dotenv
# Load the .env file located one directory above the notebook.
load_dotenv('../.env')

# All three settings are required: a missing key raises KeyError right here.
_env = os.environ
code_root = _env['CODE_ROOT']
cfg_path = _env['CFG_PATH']
data_root = _env['DATA_ROOT']

# Give code_root top priority on the import path
# (presumably so the project's `src` package can be imported - confirm).
sys.path.insert(0, code_root)

### Specific libraries

In [4]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams

from src.model.functions import train_and_evaluate_iforest
#from src.stability.functions import stability_measure_model, stability_measure_shap

## General parameters

In [5]:
# Repetition counts forwarded to train_and_evaluate_iforest (both set to a single pass).
n_iter_fs = n_iter = 1
# NOTE(review): despite the name this looks like a multiplier on the dataset's
# configured contamination (0.146 * 0.8 ~= 0.117 in the recorded run) - confirm.
contamination_percentage = [0.8]
# Values passed as n_tree_estimators, i.e. the forest sizes to sweep.
trees = [25, 50, 100, 125, 150, 175]

def calculate_median(numbers_list):
    """Return the median of ``numbers_list`` as computed by ``np.median``."""
    median_value = np.median(numbers_list)
    return median_value

def calculate_mean(numbers_list):
    """Return the arithmetic mean of ``numbers_list`` as computed by ``np.mean``."""
    mean_value = np.mean(numbers_list)
    return mean_value

# Name of the aggregation to apply to each result column.
# Groups are unpacked in the original key order, so iteration order is unchanged.
aggregation_rules = {
    **dict.fromkeys(('n_iter', 'n_iter_fs'), 'max'),
    **dict.fromkeys(('f1_median', 'recall_median', 'precision_median', 'roc_auc'), 'mean'),
    **dict.fromkeys(('iforest_stab_unif_median', 'shap_stab_median'), 'median'),
    'shap_stab_mean': 'mean',
}

## Arrhythmia

**Dataset source**: http://odds.cs.stonybrook.edu/arrhythmia-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources**:

Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation forest.” 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

K. M. Ting, J. T. S. Chuan, and F. T. Liu. “Mass: A New Ranking Measure for Anomaly Detection.” IEEE Transactions on Knowledge and Data Engineering, 2009.

F. Keller, E. Muller, K. Bohm. “HiCS: High-contrast subspaces for density-based outlier ranking.” ICDE, 2012.

In [6]:
dataset_id = 'arrhythmia'

In [7]:
data = get_fs_dataset(dataset_id, data_root)

In [8]:
hyper = fs_datasets_hyperparams(dataset_id)

In [9]:
data.shape

(452, 275)

In [10]:
# Class balance: number of non-null 'Col1' entries for each value of the label 'y'.
data.pivot_table(values='Col1', index='y', aggfunc='count')

Unnamed: 0_level_0,Col1
y,Unnamed: 1_level_1
0,386
1,66


In [11]:
# Columns handed to train_and_evaluate_iforest as `excluded_cols`,
# spelled out by column number and expanded to 'Col<N>' names.
excluded_cols = [
    f'Col{col_number}'
    for col_number in (
        15, 63, 65, 79, 127, 128, 135, 137, 139,
        141, 147, 152, 153, 160, 200, 260, 270,
    )
]

### iForest

## Parameters

In [12]:
# Parquet file path for this dataset's iForest results, under <data_root>/outputs.
path = join(data_root, "outputs", f"{dataset_id}_results_if.parquet")
path

'/Users/allianz/workspace_github_pers/ad_shap_stability/test/data/outputs/arrhythmia_results_if.parquet'

In [13]:
hyper = fs_datasets_hyperparams(dataset_id)
hyper

{'contamination': 0.146, 'max_samples': 256, 'n_estimators': 100}

### iForest full features

In [14]:
# Wall-clock timestamp taken immediately before the run.
start_time = datetime.datetime.now()

# Train and evaluate with the notebook-level settings, one keyword per line.
results_if = train_and_evaluate_iforest(
    data,
    dataset_id=dataset_id,
    hyper=hyper,
    n_tree_estimators=trees,
    contamination_percentage=contamination_percentage,
    excluded_cols=excluded_cols,
    n_iter_fs=n_iter_fs,
    n_iter=n_iter,
)

# Wall-clock timestamp taken immediately after the run.
finish_time = datetime.datetime.now()

# Elapsed time as a timedelta, reported to the reader.
duration = finish_time - start_time

print(f"Duration: {duration}")


Iteration by tree number: 25
  Iteration by contamination: 0.117
    Number of featured: 257
Iteration by tree number: 50
  Iteration by contamination: 0.117
    Number of featured: 257
Iteration by tree number: 100
  Iteration by contamination: 0.117
    Number of featured: 257
Iteration by tree number: 125
  Iteration by contamination: 0.117
    Number of featured: 257
Iteration by tree number: 150
  Iteration by contamination: 0.117
    Number of featured: 257
Iteration by tree number: 175
  Iteration by contamination: 0.117
    Number of featured: 257
Duration: 0:00:30.340978


In [15]:
results_if

Unnamed: 0,n_estimators,contamination,n_feats,n_iter,n_iter_fs,roc_auc,iforest_stab_unif_median,shap_iforest_stab_unif_median,f1_median,recall_median,precision_median,confusion_matrix
0,25,0.117,257,1,1,0.767938,0.847883,"[0.202967246597208, 0.23245559446958364, 0.168...",0.436975,0.393939,0.490566,1
1,50,0.117,257,1,1,0.79149,0.877007,"[0.22640850100601062, 0.29579802453699255, 0.2...",0.436975,0.393939,0.490566,1
2,100,0.117,257,1,1,0.805582,0.933956,"[0.3365284417527524, 0.33454972441637554, 0.31...",0.470588,0.424242,0.528302,1
3,125,0.117,257,1,1,0.808447,0.94897,"[0.35533709871293284, 0.3592619933826803, 0.34...",0.487395,0.439394,0.54717,1
4,150,0.117,257,1,1,0.80884,0.958729,"[0.39213625359785154, 0.38499474663402944, 0.3...",0.487395,0.439394,0.54717,1
5,175,0.117,257,1,1,0.805268,0.960405,"[0.41002892990069595, 0.38374046480961255, 0.4...",0.470588,0.424242,0.528302,1


In [16]:
# Fit an isolation forest on every column except 'Class' and time the fit and the scoring.
#
# NOTE(review): this cell does not survive Restart & Run All. `IsolationForest` is
# never imported in the visible notebook (the recorded output is a NameError), and
# `train_data` is not assigned in any earlier visible cell. The `creditcard_*` names
# and the 'Class' column suggest the cell was carried over from another notebook -
# confirm, then either add the missing import/data loading or remove the cell.
# NOTE(review): presumably scikit-learn's IsolationForest; if so, no random_state is
# passed and results may differ between runs - confirm.

# time.process_time() measures CPU time of the process, not wall-clock time.
start = time.process_time()

clf = IsolationForest(max_samples=256, n_estimators=100)
clf.fit(train_data.loc[:, train_data.columns != 'Class'])

end = time.process_time()
creditcard_iforest_train_time = end - start
print(end - start)

# Second timing window: prediction and scoring on the same rows used for fitting.
start = time.process_time()

y_pred = clf.predict(train_data.loc[:, train_data.columns != 'Class'])
y_scores = clf.score_samples(train_data.loc[:, train_data.columns != 'Class'])

end = time.process_time()
creditcard_iforest_test_time = end - start
print(end - start)

NameError: name 'IsolationForest' is not defined

In [None]:
# Attach the model outputs to train_data as new columns (mutates train_data in place).
# NOTE(review): `def_outlier` is not defined or imported anywhere in the visible
# notebook; it is applied row by row (axis=1) to produce the 'prediction' column.
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
# The score_samples output is stored negated
# (presumably so that larger values mean "more anomalous" - confirm).
train_data['y_scores'] = -y_scores

In [None]:
confusion_matrix(train_data['Class'], train_data['prediction'])

In [None]:
# ROC curve of the 'Class' labels against the stored scores.
fpr, tpr, _ = metrics.roc_curve(train_data['Class'], train_data['y_scores'])
# Keep the area under that curve and show the stored value as the cell output.
creditcard_iforest_auc = metrics.auc(fpr, tpr)
creditcard_iforest_auc

In [None]:
# Dict form of the report, stored so individual metrics can be looked up by key.
creditcard_iforest_report = classification_report(
    train_data['Class'],
    train_data['prediction'],
    target_names=['0', '1'],
    output_dict=True,
)
# Text form of the same report, printed for reading.
print(
    classification_report(
        train_data['Class'],
        train_data['prediction'],
        target_names=['0', '1'],
    )
)

In [None]:
# Print precision, recall and F1 of the class labelled '1', one value per line.
for metric_name in ('precision', 'recall', 'f1-score'):
    print(creditcard_iforest_report['1'][metric_name])

In [None]:
# Precision/recall curve from the stored scores, then the area under it
# (recall on the x axis, precision on the y axis).
precision, recall, thresholds = precision_recall_curve(
    train_data['Class'],
    train_data['y_scores'],
)
creditcard_iforest_auc_precision_recall = metrics.auc(recall, precision)
print(creditcard_iforest_auc_precision_recall)

In [None]:
data = pd.read_csv('./bank.csv')

In [None]:
data.head()

In [None]:
# Class balance: number of non-null 'age' entries for each value of 'class'.
data.pivot_table(values='age', index='class', aggfunc='count')

### iForest

In [None]:
train_data = data.copy()

In [None]:
# Fit an isolation forest on every column except 'class' and time the fit and the scoring.
#
# NOTE(review): `IsolationForest` is never imported in the visible notebook, so this
# cell raises NameError on a fresh kernel - add the import or remove the cell.
# NOTE(review): presumably scikit-learn's IsolationForest; if so, no random_state is
# passed and results may differ between runs - confirm.

# time.process_time() measures CPU time of the process, not wall-clock time.
start = time.process_time()

clf = IsolationForest(max_samples = 256, n_estimators = 100)
clf.fit(train_data.loc[:, train_data.columns != 'class'])

end = time.process_time()
bank_iforest_train_time = end - start
print(end - start)

# Second timing window: prediction and scoring on the same rows used for fitting.
start = time.process_time()

y_pred = clf.predict(train_data.loc[:, train_data.columns != 'class'])
y_scores = clf.score_samples(train_data.loc[:, train_data.columns != 'class'])
end = time.process_time()
bank_iforest_test_time = end - start
print(end - start)

In [None]:
# Attach the model outputs to train_data as new columns (mutates train_data in place).
# NOTE(review): `def_outlier` is not defined or imported anywhere in the visible
# notebook; it is applied row by row (axis=1) to produce the 'prediction' column.
train_data['y_pred'] = y_pred
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
# The score_samples output is stored negated
# (presumably so that larger values mean "more anomalous" - confirm).
train_data['y_scores'] = -y_scores

In [None]:
confusion_matrix(train_data['class'], train_data['prediction'])

In [None]:
# ROC curve of the 'class' labels against the stored scores.
fpr, tpr, _ = metrics.roc_curve(train_data['class'], train_data['y_scores'])
# Keep the area under that curve and show the stored value as the cell output.
bank_iforest_auc = metrics.auc(fpr, tpr)
bank_iforest_auc

In [None]:
# Dict form of the report, stored so individual metrics can be looked up by key.
bank_iforest_report = classification_report(
    train_data['class'],
    train_data['prediction'],
    target_names=['0', '1'],
    output_dict=True,
)
# Text form of the same report, printed for reading.
print(
    classification_report(
        train_data['class'],
        train_data['prediction'],
        target_names=['0', '1'],
    )
)

In [None]:
# Print precision, recall and F1 of the class labelled '1', one value per line.
for metric_name in ('precision', 'recall', 'f1-score'):
    print(bank_iforest_report['1'][metric_name])

In [None]:
# Precision/recall curve from the stored scores, then the area under it
# (recall on the x axis, precision on the y axis).
precision, recall, thresholds = precision_recall_curve(
    train_data['class'],
    train_data['y_scores'],
)
bank_iforest_auc_precision_recall = metrics.auc(recall, precision)
print(bank_iforest_auc_precision_recall)