# iForest

In [1]:
import warnings
# Silence every warning category for the rest of the session. This keeps the
# notebook output clean, but it also hides any deprecation or runtime warnings
# that the libraries used below may raise.
warnings.filterwarnings('ignore')

## General libraries

In [2]:
import os
import sys

import pandas as pd
import numpy as np

from os.path import join
import json
import datetime

import shap
from shap_selection import feature_selection

import time

### Load environment variables

In [3]:
from dotenv import load_dotenv

# Read the .env file one directory above the notebook so that the settings
# below can be looked up through os.environ.
load_dotenv('../.env')

# All three settings are required: a missing variable raises KeyError here.
code_root, cfg_path, data_root = (
    os.environ[name] for name in ('CODE_ROOT', 'CFG_PATH', 'DATA_ROOT')
)

# Put the project root first on the import path so that the project-local
# `src.*` imports that follow can be resolved.
sys.path.insert(0, code_root)

### Specific libraries

In [4]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams
from src.feature_selection.functions import fs_iforest_with_shap, shap_ranks, process_fi

## General parameters

In [5]:
# Experiment settings used by the cells below.

# Forwarded to fs_iforest_with_shap as `n_iter_fs`.
n_iter_fs = 1
# Not referenced in the cells visible here — presumably a repetition count
# for a later experiment; TODO confirm.
n_iter = 1
# Forwarded to fs_iforest_with_shap as `contamination_percentage`.
# NOTE(review): confirm whether 0.8 is meant as 0.8 % or as a 0.8 fraction.
contamination_percentage = [0.8]
# Not referenced in the cells visible here — presumably candidate numbers of
# trees for the isolation forest; TODO confirm.
trees = [25, 50, 100]
# Not referenced in the cells visible here; meaning is not evident from this
# notebook — TODO document.
group = 0

## Data set

In [6]:
# Identifier of the data set to analyse; it is also used further down to name
# the output file.
dataset_id = 'allianz'
# Load the feature-selection data set for this identifier from data_root.
data = get_fs_dataset(dataset_id, data_root)
# Hyperparameters associated with this data set. `hyper` is not referenced in
# the cells visible here.
hyper = fs_datasets_hyperparams(dataset_id)
# Display the shape of the loaded data as a quick sanity check.
data.shape

(42358, 36)

In [7]:
# Passed to fs_iforest_with_shap as `excluded_cols`; left empty here, so
# presumably no column is excluded from the feature selection — confirm
# against that function.
excluded_cols = []

In [8]:
# Preview the loaded frame as rich output (the rendering is truncated to the
# first and last rows and columns).
# NOTE(review): the rendered output contains NO_SIN identifiers — confirm they
# are safe to share before committing the notebook with its outputs.
data

Unnamed: 0_level_0,weekday_surv_sin,weekday_declaration,reporting_delay_in_days,contract_age_in_days,d_surv_sin_is_weekend_True,d_dcl_is_weekend_True,is_closed_True,is_serious_sinister_True,has_judiciary_procedure_True,C_CAU_40,...,RESP_CIE_2,RESP_CIE_4,RESP_CIE_infrequent_sklearn,C_APPLN_MALUS_1,C_APPLN_MALUS_2,C_APPLN_MALUS_3,CNT_TY_GES_P,IND_PMT_DIR_N,IND_PMT_DIR_Y,C_ORGN_OUVT_C
NO_SIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20121062512,3,2,3688,-2269,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
20141061929,6,4,3134,189,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
20151060794,3,0,2594,17,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
20151060795,6,2,2355,3212,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
20171061308,1,2,1681,164,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20231021195,1,4,17,175,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
20231021196,4,2,12,139,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
20231021197,6,3,4,89,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
20231021198,2,3,1,212,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0


### iForest

## Parameters

In [9]:
# Destination of the SHAP feature-importance table:
# <data_root>/outputs/<dataset_id>_fi_shap.parquet
path_fi_shap = os.path.join(data_root, "outputs", f"{dataset_id}_fi_shap.parquet")

## Feature selection by SHAP

In [10]:
# Run the iForest/SHAP feature selection on the loaded data set using the
# settings defined above. The call returns three values; only the first two
# are kept and the third is discarded.
fs_shap, fi_shap, _ = fs_iforest_with_shap(
    data,
    contamination_percentage=contamination_percentage,
    excluded_cols=excluded_cols,
    n_iter_fs=n_iter_fs,
)



In [11]:
# Post-process the SHAP feature importances into the summary table shown and
# saved below.
# NOTE(review): 10 is a magic number. The resulting table has 10 rows, so it
# presumably sets the number of cumulative feature subsets — confirm against
# process_fi and consider naming it in the parameters cell.
fi_shap_all = process_fi(fi_shap, 10)

In [12]:
# Display the summary table that is later written to path_fi_shap.
fi_shap_all

Unnamed: 0,n_feats,cum_value,cum_value_percentage,n_feats_percentage,feat_selected
0,2,0.561446,12.542027,5.882353,"[C_CAU_50, d_surv_sin_is_weekend_True]"
1,3,0.826335,18.459324,8.823529,"[C_CAU_50, d_surv_sin_is_weekend_True, C_CAU_40]"
2,5,1.305672,29.167147,14.705882,"[C_CAU_50, d_surv_sin_is_weekend_True, C_CAU_4..."
3,7,1.768254,39.500673,20.588235,"[C_CAU_50, d_surv_sin_is_weekend_True, C_CAU_4..."
4,9,2.210064,49.370173,26.470588,"[C_CAU_50, d_surv_sin_is_weekend_True, C_CAU_4..."
5,12,2.754617,61.53485,35.294118,"[C_CAU_50, d_surv_sin_is_weekend_True, C_CAU_4..."
6,15,3.192019,71.30588,44.117647,"[C_CAU_50, d_surv_sin_is_weekend_True, C_CAU_4..."
7,18,3.568851,79.723858,52.941176,"[C_CAU_50, d_surv_sin_is_weekend_True, C_CAU_4..."
8,23,4.064477,90.795552,67.647059,"[C_CAU_50, d_surv_sin_is_weekend_True, C_CAU_4..."
9,34,4.476516,100.0,100.0,"[C_CAU_50, d_surv_sin_is_weekend_True, C_CAU_4..."


In [13]:
# Persist the summary table as Parquet. The target folder may not exist yet
# (e.g. on a fresh checkout) and to_parquet does not create missing
# directories, so make sure it is there before writing.
os.makedirs(os.path.dirname(path_fi_shap), exist_ok=True)
fi_shap_all.to_parquet(path_fi_shap)