# iForest

In [None]:
import warnings
# Install a blanket "ignore" filter: every warning raised after this cell is
# suppressed (presumably to keep the notebook outputs uncluttered).
warnings.filterwarnings(action='ignore')

## General libraries

In [None]:
import os
import sys

import pandas as pd
import numpy as np

from os.path import join
import json
import datetime

import shap
from shap_selection import feature_selection

from sklearn.preprocessing import OneHotEncoder
import time

### Load environment variables

In [None]:
from dotenv import load_dotenv
# Read the variables declared in the .env file one directory above the notebook.
load_dotenv('../.env')

# All three settings are mandatory: a missing one raises KeyError right here.
code_root, cfg_path, data_root = (
    os.environ[name] for name in ('CODE_ROOT', 'CFG_PATH', 'DATA_ROOT')
)

# Put the project root first on the import path so `src.*` modules resolve.
sys.path.insert(0, code_root)

### Specific libraries

In [None]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams

## General parameters

In [None]:
# Experiment settings shared by the notebook.
# NOTE(review): none of these are read in the cells visible here; the meanings
# below are inferred from the names and should be confirmed.
n_iter_fs, n_iter = 1, 1  # presumably repetition counts (feature selection / model runs)
contamination_percentage = [0.8]  # presumably contamination levels to evaluate
trees = [25, 50, 100]  # presumably candidate forest sizes (n_estimators)

## Arrhythmia

**Dataset source**: http://odds.cs.stonybrook.edu/arrhythmia-dataset/ (data is transformed from .mat to .csv format)

Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.

**Additional sources**:

Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou. “Isolation forest.” 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

K. M. Ting, J. T. S. Chuan, and F. T. Liu. “Mass: A New Ranking Measure for Anomaly Detection.” IEEE Transactions on Knowledge and Data Engineering, 2009.

F. Keller, E. Muller, K. Bohm. “HiCS: High-contrast subspaces for density-based outlier ranking.” ICDE, 2012.

In [None]:
# Identifier of the dataset analysed in this section; it is also used further
# down to build the names of the result files.
dataset_id = 'arrhythmia'

# Load the dataset and its hyperparameters through the project helpers
# imported from src.load.functions.
data = get_fs_dataset(dataset_id, data_root)
hyper = fs_datasets_hyperparams(dataset_id)
# Display the dimensions as a quick sanity check.
# NOTE(review): `data` is presumably a pandas DataFrame (it is passed to
# pd.pivot_table in the next cell) — confirm against get_fs_dataset.
data.shape

In [None]:
# Class balance check: number of non-null 'Col1' entries for each value of 'y'.
data.pivot_table(values='Col1', index='y', aggfunc='count')

In [None]:
# Column names are generated from their numeric suffixes ('Col<N>').
# NOTE(review): this list is not applied anywhere in the cells visible here —
# confirm whether these columns are meant to be dropped before training.
excluded_cols = [
    f'Col{number}'
    for number in (
        15, 63, 65, 79, 127, 128, 135, 137, 139,
        141, 147, 152, 153, 160, 200, 260, 270,
    )
]

### iForest

## Parameters

In [None]:
# Both result files live under <data_root>/outputs and are named after the dataset.
_outputs_dir = os.path.join(data_root, "outputs")
path_fi_shap = os.path.join(_outputs_dir, f"{dataset_id}_fi_shap")
path_shap = os.path.join(_outputs_dir, f"{dataset_id}_shap.parquet")

# NOTE(review): path_fi_shap carries no ".parquet" suffix, unlike path_shap —
# presumably it points at a parquet directory; confirm against the writer.
fi_shap_all = pd.read_parquet(path_fi_shap)
df = pd.read_parquet(path_shap)

# Preview the first rows of the results table.
df.head()

## Visualization of the results

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
from scipy.interpolate import griddata

# Columns of `df` mapped onto the x, y and z axes of the surface.
x_label, y_label, z_label = 'n_estimators', 'n_feats', 'precision_median'

# Scattered sample points taken from the results table.
x = df[x_label].values
y = df[y_label].values
z = df[z_label].values

# Regular 100 x 100 grid: x is fixed to [0, 300], y spans the observed range.
xi, yi = np.meshgrid(
    np.linspace(0, 300, 100),
    np.linspace(y.min(), y.max(), 100),
)

# Linear interpolation of z onto the grid; grid points outside the convex
# hull of the (x, y) samples come back as NaN.
zi = griddata((x, y), z, (xi, yi), method='linear')

# Single 3D axes holding a greyscale surface without mesh edges.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
surf = ax.plot_surface(xi, yi, zi, cmap='Greys', edgecolor='none')

# Axis titles derived from the column names.
ax.set(
    xlabel=f'# {x_label}',
    ylabel=f'# {y_label}',
    zlabel=f'% {z_label}',
)

# X ticks every 50 units over the fixed 0-300 range.
ax.set_xticks(np.arange(0, 301, 50))

# Colour scale for the surface; `pad` pushes it away from the axes.
cbar = fig.colorbar(surf, shrink=0.5, aspect=5, pad=0.1)

plt.show()

In [None]:
from mpl_toolkits.mplot3d import Axes3D

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

%matplotlib widget

In [None]:
# Generating random data
np.random.seed(42)
x = np.random.randint(0, 100, size=100)
y = np.random.randint(0, 100, size=100)
z = np.random.randint(0, 100, size=100)

# Creating a 3D scatter plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, c='r', marker='o')

ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')

plt.show()

In [None]:
# Work on an independent (deep) copy so the columns added later do not
# modify `data`.
train_data = data.copy(deep=True)

In [None]:
# IsolationForest was used without ever being imported in this notebook.
from sklearn.ensemble import IsolationForest

# Feature matrix: every column except the label. It is built once, outside
# the timed regions, so the timings cover only the model calls.
# NOTE(review): the label column is named 'class' here, while the pivot-table
# cell above indexes on 'y'. If `data` has no 'class' column this mask keeps
# every column (labels included) — confirm the label column name.
iforest_features = train_data.loc[:, train_data.columns != 'class']

# --- Training: CPU time spent fitting the forest ---
start = time.process_time()

clf = IsolationForest(max_samples=256, n_estimators=100)
clf.fit(iforest_features)

end = time.process_time()
bank_iforest_train_time = end - start
print(bank_iforest_train_time)

# --- Scoring: CPU time spent on predictions and anomaly scores ---
start = time.process_time()

y_pred = clf.predict(iforest_features)
y_scores = clf.score_samples(iforest_features)

end = time.process_time()
bank_iforest_test_time = end - start
print(bank_iforest_test_time)

In [None]:
# Raw IsolationForest output: +1 for inliers, -1 for outliers.
train_data['y_pred'] = y_pred

# `def_outlier` is not defined or imported anywhere in this notebook, so the
# mapping is spelled out: outlier (-1) -> 1, inlier (+1) -> 0, matching the
# '0'/'1' target names used by the classification report below.
# NOTE(review): confirm this matches the original helper's intent.
train_data['prediction'] = (train_data['y_pred'] == -1).astype(int)

# score_samples is lower for more abnormal points; negate it so that higher
# values mean "more anomalous", as the ROC / PR curves expect.
train_data['y_scores'] = -y_scores

In [None]:
# confusion_matrix was used without being imported in this notebook.
from sklearn.metrics import confusion_matrix

# Rows are the true labels, columns the predicted labels.
confusion_matrix(train_data['class'], train_data['prediction'])

In [None]:
# `metrics` was used without being imported in this notebook.
from sklearn import metrics

# ROC curve of the anomaly scores against the true labels.
fpr, tpr, _ = metrics.roc_curve(train_data['class'], train_data['y_scores'])

# Compute the area once, keep it for later use and display it as the cell output.
bank_iforest_auc = metrics.auc(fpr, tpr)
bank_iforest_auc

In [None]:
# classification_report was used without being imported in this notebook.
from sklearn.metrics import classification_report

# Same inputs for both calls: true labels, predicted labels, class names.
_report_args = (train_data['class'], train_data['prediction'])
_report_names = ['0', '1']

# Dictionary form, kept so individual metrics can be read programmatically.
bank_iforest_report = classification_report(*_report_args, target_names=_report_names, output_dict=True)

# Text form, for reading in the notebook output.
print(classification_report(*_report_args, target_names=_report_names))

In [None]:
# Headline metrics for class '1', printed one per line.
for metric_name in ('precision', 'recall', 'f1-score'):
    print(bank_iforest_report['1'][metric_name])

In [None]:
# Neither `metrics` nor precision_recall_curve was imported in this notebook.
from sklearn import metrics
from sklearn.metrics import precision_recall_curve

# Precision-recall curve of the anomaly scores against the true labels.
precision, recall, thresholds = precision_recall_curve(train_data['class'], train_data['y_scores'])

# Area under the precision-recall curve (recall on x, precision on y).
bank_iforest_auc_precision_recall = metrics.auc(recall, precision)
print(bank_iforest_auc_precision_recall)