# iForest

In [1]:
import warnings

# Install a blanket "ignore" filter: every warning category is silenced for the
# rest of the kernel session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the libraries used below; consider narrowing it to specific categories.
warnings.simplefilter('ignore')

## General libraries

In [2]:
import os
import sys

import pandas as pd
import numpy as np

from os.path import join
import json
import datetime

import shap
from shap_selection import feature_selection

import time

### Load environment variables

In [3]:
from dotenv import load_dotenv
# Load the variables declared in ../.env (path is relative to the current
# working directory) so they can be read from os.environ below.
load_dotenv('../.env')

# Direct indexing is deliberate: a missing variable raises KeyError right here
# instead of surfacing later as a confusing path error.
code_root = os.environ['CODE_ROOT']
cfg_path = os.environ['CFG_PATH']
data_root = os.environ['DATA_ROOT']

# Make the project code importable (the notebook imports from `src.*`).
# Guarded so that re-running this cell does not keep prepending duplicate
# entries to sys.path.
if code_root not in sys.path:
    sys.path.insert(0, code_root)

### Specific libraries

In [4]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams
from src.feature_selection.functions import fs_iforest_with_shap, shap_ranks, process_fi

## General parameters

In [5]:
# Run-level settings for the notebook.

# Forwarded as `n_iter_fs` to fs_iforest_with_shap.
n_iter_fs = 1
# NOTE(review): not referenced in the visible cells — confirm it is still needed.
n_iter = 1
# Forwarded as `contamination_percentage` to fs_iforest_with_shap.
# NOTE(review): presumably expressed in percent (0.8 %), not as a fraction — confirm.
contamination_percentage = [0.8] 
# NOTE(review): presumably candidate numbers of trees for the isolation forest;
# not referenced in the visible cells — confirm.
trees = [25, 50, 100]
# NOTE(review): purpose not evident from the visible cells — confirm.
group = 0

## Data set

In [6]:
# Identifier of the dataset under study; it is also used to build the name of
# the output file written at the end of the notebook.
dataset_id = 'allianz'
# Fetch the dataset through the project loader (presumably read from below
# data_root — confirm against get_fs_dataset).
data = get_fs_dataset(dataset_id, data_root)
# Hyperparameters associated with this dataset.
# NOTE(review): `hyper` is not used in the visible cells — confirm it is needed.
hyper = fs_datasets_hyperparams(dataset_id)
# Display (rows, columns) as a quick sanity check of the load.
data.shape

(42358, 48)

In [7]:
# Passed as `excluded_cols` to fs_iforest_with_shap; presumably the columns to
# leave out of the feature selection. Empty here, so no column is listed.
excluded_cols = []

In [8]:
# Display the loaded frame (pandas truncates the view to the first/last rows).
# NOTE(review): the rendered output exposes the NO_SIN index values; consider
# clearing this output before sharing the notebook.
data

Unnamed: 0_level_0,weekday_surv_sin,weekday_declaration,reporting_delay_in_days,contract_age_in_days,d_surv_sin_is_weekend_True,d_dcl_is_weekend_True,is_closed_True,is_serious_sinister_True,has_judiciary_procedure_True,C_CAU_13,...,RESP_CIE_5,RESP_CIE_9,RESP_CIE_infrequent_sklearn,C_APPLN_MALUS_1,C_APPLN_MALUS_2,C_APPLN_MALUS_3,CNT_TY_GES_P,IND_PMT_DIR_N,IND_PMT_DIR_Y,C_ORGN_OUVT_C
NO_SIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20121062512,3,2,3688,-2269,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
20141061929,6,4,3134,189,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
20151060794,3,0,2594,17,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
20151060795,6,2,2355,3212,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
20171061308,1,2,1681,164,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20231021195,1,4,17,175,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
20231021196,4,2,12,139,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
20231021197,6,3,4,89,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
20231021198,2,3,1,212,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0


### iForest

## Parameters

In [9]:
# Destination of the SHAP feature-importance table:
#   <data_root>/outputs/<dataset_id>_fi_shap   (written without a file extension)
path_fi_shap = os.path.join(
    data_root,
    "outputs",
    f"{dataset_id}_fi_shap",
)

## Feature selection by SHAP

In [10]:
# Run the project's iForest + SHAP feature-selection routine on the dataset.
# It returns three values; the third is not needed here and is discarded.
fs_shap, fi_shap, _ = fs_iforest_with_shap(
    data,
    contamination_percentage=contamination_percentage,
    excluded_cols=excluded_cols,
    n_iter_fs=n_iter_fs,
)



In [11]:
# Post-process the SHAP feature importances into a summary table.
# NOTE(review): the second argument (10) presumably sets the number of
# cumulative-importance steps — the resulting table has 10 rows at roughly
# 10 % increments of cum_value_percentage; confirm against process_fi.
fi_shap_all = process_fi(fi_shap, 10)

In [12]:
# Display the summary table produced by process_fi.
fi_shap_all

Unnamed: 0,n_feats,cum_value,cum_value_percentage,n_feats_percentage,feat_selected
0,2,0.47116,10.10916,4.347826,"[C_APPLN_MALUS_1, C_CAU_40]"
1,4,0.914633,19.62428,8.695652,"[C_APPLN_MALUS_1, C_CAU_40, d_surv_sin_is_week..."
2,6,1.332647,28.593162,13.043478,"[C_APPLN_MALUS_1, C_CAU_40, d_surv_sin_is_week..."
3,9,1.923959,41.280298,19.565217,"[C_APPLN_MALUS_1, C_CAU_40, d_surv_sin_is_week..."
4,11,2.287247,49.074969,23.913043,"[C_APPLN_MALUS_1, C_CAU_40, d_surv_sin_is_week..."
5,14,2.782953,59.7108,30.434783,"[C_APPLN_MALUS_1, C_CAU_40, d_surv_sin_is_week..."
6,18,3.307418,70.963679,39.130435,"[C_APPLN_MALUS_1, C_CAU_40, d_surv_sin_is_week..."
7,22,3.727265,79.971873,47.826087,"[C_APPLN_MALUS_1, C_CAU_40, d_surv_sin_is_week..."
8,28,4.216176,90.461924,60.869565,"[C_APPLN_MALUS_1, C_CAU_40, d_surv_sin_is_week..."
9,46,4.660719,100.0,100.0,"[C_APPLN_MALUS_1, C_CAU_40, d_surv_sin_is_week..."


In [13]:
# Persist the SHAP feature-importance summary as a parquet file.
# The destination folder is created first: to_parquet does not create missing
# parent directories, so a data root without the output folder would make the
# write fail after the (expensive) feature selection has already run.
os.makedirs(os.path.dirname(path_fi_shap), exist_ok=True)
fi_shap_all.to_parquet(path_fi_shap)