# iForest

In [4]:
import warnings
# Install a blanket 'ignore' filter: warnings raised after this point are suppressed.
warnings.filterwarnings('ignore')

### Load environment variables

In [5]:
import os
import sys
from dotenv import load_dotenv

# Load the variables declared in the .env file at the relative path '../.env'.
load_dotenv('../.env')

# Fetch the three required settings in one pass. A missing variable raises
# KeyError, just as a direct os.environ[...] lookup does.
code_root, cfg_path, data_root = (
    os.environ[name] for name in ('CODE_ROOT', 'CFG_PATH', 'DATA_ROOT')
)

# Put the project code first on the import path (presumably so the `src.*`
# imports used further down resolve).
sys.path.insert(0, code_root)

## General libraries

In [6]:
import pandas as pd
import numpy as np
from os.path import join
import json
import datetime
import shap

from sklearn.ensemble import IsolationForest

import time

## Specific libraries

In [7]:
from src.stability.functions import stability_measure_shap

In [8]:
# Simulated data and settings
ntr = 100         # number of training rows
nte = 5           # number of test rows
iterations = 7    # forwarded to stability_measure_shap as `iterations`
ft_col = 6        # number of feature columns

# Standard-normal samples stand in for real data.
Xtr = np.random.randn(ntr, ft_col)  # Training data
Xte = np.random.randn(nte, ft_col)  # Test data

gamma = 0.146  # Arbitrary value chosen for the demo
model = IsolationForest()  # Anomaly detection model with default settings

# Keyword options for the stability measure, gathered in one place.
stability_options = dict(
    unif=True,  # pick True or False
    iterations=iterations,
    psi=0.8,
    beta_flavor=2,  # pick from: 1, 2
    subset_low=0.25,
    subset_high=0.75,
    intermediate_scores=False,
)

# Left as the final expression so the notebook displays the returned value.
stability_measure_shap(Xtr, Xte, model, gamma, **stability_options)

(array([0.63286509, 0.54564208, 0.58186276, 0.6160031 , 0.83556306]),
 array([0.36713491, 0.45435792, 0.41813724, 0.3839969 , 0.16443694]))

### iForest

In [None]:
# Cross-tabulate the 'Class' column against the 'prediction' column.
# NOTE(review): neither `confusion_matrix` nor `train_data` is defined by an
# earlier cell in this view — confirm where they come from before running.
confusion_matrix(train_data['Class'], train_data['prediction'])

In [None]:
# ROC curve from the 'Class' column and the 'y_scores' column; the third
# return value (thresholds) is discarded.
fpr, tpr, _ = metrics.roc_curve(train_data['Class'], train_data['y_scores'])

# Area under that curve: stored for later use, then shown as the cell result.
creditcard_iforest_auc = metrics.auc(fpr, tpr)
creditcard_iforest_auc

In [None]:
# The same label/prediction pair and class names feed both renderings.
creditcard_report_args = (train_data['Class'], train_data['prediction'])
creditcard_report_names = ['0', '1']

# Dictionary form, kept so individual metrics can be looked up afterwards.
creditcard_iforest_report = classification_report(
    *creditcard_report_args,
    target_names=creditcard_report_names,
    output_dict=True,
)

# Plain-text form, printed for reading.
print(classification_report(*creditcard_report_args, target_names=creditcard_report_names))

In [None]:
# Print precision, recall and F1 for the class labelled '1', one per line.
for metric_name in ('precision', 'recall', 'f1-score'):
    print(creditcard_iforest_report['1'][metric_name])

In [None]:
# Precision-recall curve from the 'Class' column and the 'y_scores' column.
creditcard_pr_curve = precision_recall_curve(train_data['Class'], train_data['y_scores'])
precision, recall, thresholds = creditcard_pr_curve

# Area under the curve, with recall on the x-axis and precision on the y-axis.
creditcard_iforest_auc_precision_recall = metrics.auc(recall, precision)
print(creditcard_iforest_auc_precision_recall)

In [None]:
# Count the non-null 'age' entries for each value of 'class'.
# NOTE(review): `data` is not defined by an earlier cell in this view — confirm.
data.pivot_table(values='age', index='class', aggfunc='count')

### iForest

In [None]:
# Work on a copy so the columns added to train_data later do not modify `data`.
train_data = data.copy()

In [None]:
def _bank_features():
    """Return every column of train_data except the 'class' label."""
    return train_data.loc[:, train_data.columns != 'class']


# --- Training: CPU time spent building and fitting the forest ---
start = time.process_time()

clf = IsolationForest(n_estimators=100, max_samples=256)
clf.fit(_bank_features())

end = time.process_time()
bank_iforest_train_time = end - start
print(bank_iforest_train_time)

# --- Scoring: CPU time spent on predict plus score_samples over the same rows ---
start = time.process_time()

y_pred = clf.predict(_bank_features())
y_scores = clf.score_samples(_bank_features())

end = time.process_time()
bank_iforest_test_time = end - start
print(bank_iforest_test_time)

In [None]:
# Store the raw output of clf.predict for each row.
train_data['y_pred'] = y_pred
# Apply def_outlier to each row and keep the result as 'prediction'.
# NOTE(review): def_outlier is not defined in this view; presumably it maps
# 'y_pred' to the labels that are compared with 'class' below — confirm.
train_data['prediction'] = train_data.apply(def_outlier, axis = 1)
# Store the negated score_samples output.
# NOTE(review): presumably negated so that larger values mean "more anomalous"
# when used for the ROC / precision-recall curves — confirm.
train_data['y_scores'] = -y_scores

In [None]:
# Cross-tabulate the 'class' column against the 'prediction' column.
# NOTE(review): `confusion_matrix` is not imported in the visible cells — confirm.
confusion_matrix(train_data['class'], train_data['prediction'])

In [None]:
# ROC curve from the 'class' column and the 'y_scores' column; the third
# return value (thresholds) is discarded.
fpr, tpr, _ = metrics.roc_curve(train_data['class'], train_data['y_scores'])

# Area under that curve: stored for later use, then shown as the cell result.
bank_iforest_auc = metrics.auc(fpr, tpr)
bank_iforest_auc

In [None]:
# The same label/prediction pair and class names feed both renderings.
bank_report_args = (train_data['class'], train_data['prediction'])
bank_report_names = ['0', '1']

# Dictionary form, kept so individual metrics can be looked up afterwards.
bank_iforest_report = classification_report(
    *bank_report_args,
    target_names=bank_report_names,
    output_dict=True,
)

# Plain-text form, printed for reading.
print(classification_report(*bank_report_args, target_names=bank_report_names))

In [None]:
# Print precision, recall and F1 for the class labelled '1', one per line.
for metric_name in ('precision', 'recall', 'f1-score'):
    print(bank_iforest_report['1'][metric_name])

In [None]:
# Precision-recall curve from the 'class' column and the 'y_scores' column.
bank_pr_curve = precision_recall_curve(train_data['class'], train_data['y_scores'])
precision, recall, thresholds = bank_pr_curve

# Area under the curve, with recall on the x-axis and precision on the y-axis.
bank_iforest_auc_precision_recall = metrics.auc(recall, precision)
print(bank_iforest_auc_precision_recall)