# iForest

In [1]:
import warnings
# Install a global "ignore" filter: every warning raised after this point
# (any category, any module) is silenced for the rest of the kernel session.
# NOTE(review): this also hides deprecation/convergence warnings from the
# libraries used below — consider narrowing the filter if those matter.
warnings.filterwarnings('ignore')

## General libraries

In [2]:
import os
import sys

import pandas as pd
import numpy as np

from os.path import join
import json
import datetime

import shap
from shap_selection import feature_selection

from sklearn.preprocessing import OneHotEncoder
import time

### Load environment variables

In [3]:
from dotenv import load_dotenv
# Load the variables defined in `../.env` (path is relative to the current
# working directory) into the process environment.
load_dotenv('../.env')

# Required settings: a missing variable raises KeyError right here.
code_root = os.environ['CODE_ROOT']
# NOTE(review): cfg_path is not referenced by any of the cells visible in this notebook.
cfg_path = os.environ['CFG_PATH']
data_root = os.environ['DATA_ROOT']

# Put the project root at the front of the import path; presumably this is what
# lets the `src.*` imports in the next cell resolve.
sys.path.insert(0, code_root)

### Specific libraries

In [4]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams
from src.model.functions import train_and_evaluate_iforest

## General parameters

In [5]:
# Sweep settings; all four are forwarded to train_and_evaluate_iforest in the
# "Model training" cell.
n_iter_fs = 1  # forwarded as `n_iter_fs`
n_iter = 1  # forwarded as `n_iter`
# Forwarded as `contamination_percentage`.
# NOTE(review): the training log prints "Iteration by contamination: 0.1" while
# this list holds 1 — confirm how the helper combines this value with
# hyper['contamination'].
contamination_percentage = [1]
# Forwarded as `n_tree_estimators`; the log prints one "Iteration by tree number"
# section per entry.
trees = [1, 5, 25, 50, 75, 100, 125, 150, 175, 200]

## Allianz

----
**Dataset source**: Private dataset from Allianz Benelux, which contains clients' claims data for the period January 2018 to December 2023 (201801-202312)

**Additional sources**:

----

In [6]:
# Identifier used for the loader, the hyperparameter lookup and the file names below.
dataset_id = 'allianz'

# Project helpers: fetch the dataset stored under data_root and the
# hyperparameter dict registered for this dataset id.
data = get_fs_dataset(dataset_id, data_root)
hyper = fs_datasets_hyperparams(dataset_id)
# Last expression of the cell: displays (rows, columns) as the cell output.
data.shape

(42358, 48)

In [7]:
# Set the contamination entry of the hyperparameter dict in place (the dict
# object returned by fs_datasets_hyperparams is mutated), then display it.
hyper['contamination'] = 0.1
hyper

{'contamination': 0.1, 'max_samples': 256, 'n_estimators': 100}

In [8]:
# No columns are excluded for this dataset; the empty list is forwarded as
# `excluded_cols` to train_and_evaluate_iforest.
excluded_cols = []

### iForest

## Parameters

In [9]:
# Input location, read with pd.read_parquet below.
# NOTE(review): unlike path_shap this name carries no `.parquet` extension —
# confirm whether it is a Parquet directory or an extension-less file.
path_fi_shap = os.path.join(data_root, "outputs", f"{dataset_id}_fi_shap")
# Output location: the final cell of the notebook writes `df` here.
path_shap = os.path.join(data_root, "outputs", f"{dataset_id}_shap.parquet")

# Table later passed to train_and_evaluate_iforest as `fi_df`
# (presumably SHAP-based feature importances, judging by the name — verify).
fi_shap_all = pd.read_parquet(path_fi_shap)

## Model training with iterations HPO and FS

In [10]:
# Wall-clock timestamp taken immediately before the sweep starts.
start_time = datetime.datetime.now()

# Run the project's training/evaluation helper with the dataset, the
# hyperparameter dict, the importance table and the sweep settings defined in
# the cells above; its log shows nested iterations over tree count,
# contamination and number of features.
df = train_and_evaluate_iforest(
    data,
    dataset_id=dataset_id,
    hyper=hyper,
    fi_df=fi_shap_all,
    n_tree_estimators=trees,
    contamination_percentage=contamination_percentage,
    excluded_cols=excluded_cols,
    n_iter_fs=n_iter_fs,
    n_iter=n_iter,
)

# Wall-clock timestamp taken right after the helper returns.
finish_time = datetime.datetime.now()

# Elapsed time as a timedelta.
duration = finish_time - start_time

print(f"Duration: {duration}")

Iteration by tree number: 1
  Iteration by contamination: 0.1
    Number of featured: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
    Number of featured: 4
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 3
     Iteration by feat number: 4
    Number of featured: 6
     Iteration by feat number: 2
     Iteration by feat number: 2
     Iteration by feat number: 3
     Iteration by feat number: 4
     Iteration by feat number: 6
    Number of featured: 9
     Iteration by feat number: 2
     Iteration by feat number: 3
     Iteration by feat number: 5
     Iteration by feat number: 7
     Iteration by feat number: 9
    Number of featured: 11
     Iteration by feat number: 2
     Iteration by feat number: 4
     Iteration by feat number: 6
     Iteration by feat number: 8
     Iteration 

In [12]:
# Keep an independent copy of the full result frame; the next cell removes
# columns from `df`, so this copy is the only place they remain available.
df_save = df.copy()

In [13]:
# Remove the `model_stab_list` and `shap_stab_list` columns from the frame that
# is written to Parquet in the next cell.
# errors='ignore' makes this cell idempotent: because it rebinds `df`, a second
# execution would otherwise raise KeyError on the already-dropped columns.
df = df.drop(columns=['model_stab_list', 'shap_stab_list'], errors='ignore')

In [14]:
# Persist the reduced result frame to path_shap
# (<data_root>/outputs/<dataset_id>_shap.parquet, built in the Parameters cell).
df.to_parquet(path_shap)