# iForest

In [1]:
import warnings
warnings.filterwarnings('ignore')

## General libraries

In [2]:
import os
import sys

import pandas as pd
import numpy as np

from os.path import join
import json
import datetime

import shap
from shap_selection import feature_selection

from sklearn.preprocessing import OneHotEncoder
import time

### Load environment variables

In [3]:
from dotenv import load_dotenv

# Read the .env file one directory above the working directory into os.environ.
load_dotenv('../.env')

# All three settings are mandatory: a missing variable raises KeyError here.
code_root, cfg_path, data_root = (
    os.environ['CODE_ROOT'],
    os.environ['CFG_PATH'],
    os.environ['DATA_ROOT'],
)

# Put the project root first on the import path so project packages
# (e.g. the `src` package imported further down) can be resolved.
sys.path.insert(0, code_root)

### Specific libraries

In [4]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams
from src.model.functions import train_and_evaluate_iforest

## General parameters

In [5]:
# Iteration counts forwarded to train_and_evaluate_iforest as n_iter_fs / n_iter.
n_iter_fs = n_iter = 1

# Forwarded to the training helper as `contamination_percentage`.
# NOTE(review): units/scale of this value are defined by the helper — confirm.
contamination_percentage = [1]

# Forest sizes to sweep (passed as `n_tree_estimators`):
# a few small forests, then every multiple of 25 up to and including 300.
trees = [1, 5, 10, 15, 20, *range(25, 301, 25)]

## Allianz

----
**Dataset source**: Private dataset from Allianz Benelux, which contains claims data from clients for the period 201801 to 202312 (YYYYMM).

**Additional sources**:

----

In [6]:
# Identifier of the dataset handled in this section; it is passed to the
# project helpers below and reused later to build the output file names.
dataset_id = 'allianz'

# Load the dataset via the project helper.
# NOTE(review): presumably read from somewhere under data_root — confirm in src.load.functions.
data = get_fs_dataset(dataset_id, data_root)
# Hyperparameters the project helper returns for this dataset id.
hyper = fs_datasets_hyperparams(dataset_id)
# Display the dimensions of the loaded data.
data.shape

(42358, 36)

In [7]:
# Force the contamination hyperparameter to 0.1 for this run
# (modifies the `hyper` mapping returned by the helper in place).
hyper.update(contamination=0.1)
# Show the hyperparameters that will be used.
hyper

{'contamination': 0.1, 'max_samples': 256, 'n_estimators': 100}

In [8]:
# No columns are excluded; the empty list is passed to the training helper as `excluded_cols`.
excluded_cols = list()

### iForest

## Parameters

In [14]:
# Destination of the results table produced by this notebook.
path_shap = os.path.join(data_root, "outputs", f"{dataset_id}_shap.parquet")

# Previously computed SHAP feature-importance data for this dataset.
# NOTE(review): this path has no file extension — presumably a parquet
# dataset directory or an extension-less file; confirm.
path_fi_shap = os.path.join(data_root, "outputs", f"{dataset_id}_fi_shap")

# Loaded here and handed to the training helper as `fi_df`.
fi_shap_all = pd.read_parquet(path=path_fi_shap)

## Model training with iterations HPO and FS

In [10]:
# Wall-clock timestamp taken right before training starts.
start_time = datetime.datetime.now()

# Run the project's iForest training/evaluation helper with the settings
# defined in the cells above; the returned table is kept in `df`.
df = train_and_evaluate_iforest(
    data,
    dataset_id=dataset_id,
    hyper=hyper,
    fi_df=fi_shap_all,
    n_tree_estimators=trees,
    contamination_percentage=contamination_percentage,
    excluded_cols=excluded_cols,
    n_iter_fs=n_iter_fs,
    n_iter=n_iter,
)

# Wall-clock timestamp taken right after training returns.
finish_time = datetime.datetime.now()

# Elapsed time of the whole training call, as a timedelta.
duration = finish_time - start_time

print(f"Duration: {duration}")

Iteration by tree number: 1
  Iteration by contamination: 0.1
    Number of featured: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
    Number of featured: 3
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 3
    Number of featured: 5
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 3
     Iteration by feat number: 4
     Iteration by feat number: 5
    Number of featured: 7
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 4
     Iteration by feat number: 5
     Iteration by feat number: 7
    Number of featured: 9
     Iteration by feat number: 2
     Iteration by feat number: 3
     Iteration by feat number: 5
     Iteration by feat number: 7
     Iteration b

In [11]:
# Preview the first two rows of the results table.
df.head(n=2)

Unnamed: 0,n_estimators,max_feats,contamination,n_feats,n_iter,n_iter_fs,roc_auc,model_stab,model_stab_list,shap_stab,shap_stab_list,shap_stab_ad,f1_median,recall,precision,confusion_matrix
0,1,2,0.1,2,1,1,,1.0,"[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...","[0.2001977692338378, 1.0, 0.2001977692338378, ...","[[0.2001977692338378, 0.2001977692338378], [1....","[0.20019776923383792, 1.0, 0.20019776923383792...",0.058328,0.03004,1.0,1
1,1,2,0.1,2,1,1,,1.0,"[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...","[0.2001977692338378, 1.0, 0.2001977692338378, ...","[[0.2001977692338378, 0.2001977692338378], [1....","[0.20019776923383792, 1.0, 0.20019776923383792...",0.058328,0.03004,1.0,1


In [13]:
# Summarize the results table: column names, non-null counts and dtypes.
# NOTE(review): in the recorded output `roc_auc` has 0 non-null values —
# confirm whether an all-NaN ROC AUC is expected for this dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 935 entries, 0 to 934
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   n_estimators      935 non-null    int64  
 1   max_feats         935 non-null    int64  
 2   contamination     935 non-null    float64
 3   n_feats           935 non-null    int64  
 4   n_iter            935 non-null    int64  
 5   n_iter_fs         935 non-null    int64  
 6   roc_auc           0 non-null      float64
 7   model_stab        935 non-null    float64
 8   model_stab_list   935 non-null    object 
 9   shap_stab         935 non-null    object 
 10  shap_stab_list    935 non-null    object 
 11  shap_stab_ad      935 non-null    object 
 12  f1_median         935 non-null    float64
 13  recall            935 non-null    float64
 14  precision         935 non-null    float64
 15  confusion_matrix  935 non-null    int64  
dtypes: float64(6), int64(6), object(4)
memory us

In [27]:
# Keep an untouched copy of the full results (including the nested-list
# columns) before later cells trim `df`. `df_save` is not created anywhere
# else in this notebook, so the snapshot is taken here; without it a fresh
# "Restart & Run All" stops with NameError. Run this right after training,
# before any columns are dropped from `df`.
df_save = df.copy()

In [24]:
# Remove the two nested-list columns from the results table.
# A new frame is bound to `df` instead of mutating in place, and
# errors='ignore' makes the cell safe to re-run: once the columns are gone,
# a second execution is a no-op rather than a KeyError.
df = df.drop(columns=['model_stab_list', 'shap_stab_list'], errors='ignore')

In [25]:
# Show the first five rows of the current results table.
df.head(n=5)

Unnamed: 0,n_estimators,max_feats,contamination,n_feats,n_iter,n_iter_fs,roc_auc,model_stab,shap_stab,shap_stab_ad,f1_median,recall,precision,confusion_matrix
0,1,2,0.1,2,1,1,,1.0,"[0.2001977692338378, 1.0, 0.2001977692338378, ...","[0.20019776923383792, 1.0, 0.20019776923383792...",0.058328,0.03004,1.0,1
1,1,2,0.1,2,1,1,,1.0,"[0.2001977692338378, 1.0, 0.2001977692338378, ...","[0.20019776923383792, 1.0, 0.20019776923383792...",0.058328,0.03004,1.0,1
2,1,2,0.1,2,1,1,,1.0,"[0.2001977692338378, 1.0, 0.2001977692338378, ...","[0.20019776923383792, 1.0, 0.20019776923383792...",0.058328,0.03004,1.0,1
3,1,2,0.1,2,1,1,,1.0,"[0.2001977692338378, 1.0, 0.2001977692338378, ...","[0.20019776923383792, 1.0, 0.20019776923383792...",0.058328,0.03004,1.0,1
4,1,2,0.1,2,1,1,,1.0,"[0.2001977692338378, 1.0, 0.2001977692338378, ...","[0.20019776923383792, 1.0, 0.20019776923383792...",0.058328,0.03004,1.0,1


In [26]:
# Persist the current results table as a parquet file at `path_shap`.
df.to_parquet(path=path_shap)

In [28]:
# Target for the "v0" export. The following cell writes this path with
# DataFrame.to_csv, so the file carries a .csv extension; a .parquet suffix
# on CSV content would break any reader that picks the format from the name.
# NOTE(review): this rebinds `path_shap`; any consumer of the old
# "<dataset_id>_shap_v0.parquet" name must be pointed at the .csv file.
path_shap = os.path.join(data_root, "outputs", f"{dataset_id}_shap_v0.csv")

In [29]:
# Export the current contents of `df` as CSV text to `path_shap`.
df.to_csv(path_or_buf=path_shap)