# iForest

In [14]:
import warnings
warnings.filterwarnings('ignore')

## General libraries

In [15]:
import os
import sys

import pandas as pd
import numpy as np

from os.path import join
import json
import datetime

import shap
from shap_selection import feature_selection

import time

### Load enviroment variables

In [16]:
from dotenv import load_dotenv
load_dotenv('../.env')

code_root = os.environ['CODE_ROOT']
cfg_path = os.environ['CFG_PATH']
data_root = os.environ['DATA_ROOT']

sys.path.insert(0, code_root)

### Specific libraries

In [17]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams
from src.feature_selection.functions import fs_iforest_with_shap, shap_ranks, process_fi

## General parameters

In [18]:
n_iter_fs = 1
n_iter = 1
contamination_percentage = [1] 
trees = [25, 50, 100]
group = 10

excluded_cols = []

## Allinz claims data

----
**Dataset source**: Private dataset from Allianz Benelux, which contain the claims data from clients during 201801-202312

**Additional sources**:

----

In [19]:
dataset_id = 'allianz'
data = get_fs_dataset(dataset_id, data_root)
hyper = fs_datasets_hyperparams(dataset_id)
data.shape

(42358, 36)

### iForest

## Parameters

In [20]:
path_fi_shap = os.path.join(data_root, "outputs", f"{dataset_id}_fi_shap")

## Feature selection by SHAP

In [21]:
fs_shap, fi_shap, _ = fs_iforest_with_shap(data, contamination_percentage=contamination_percentage, excluded_cols=excluded_cols, n_iter_fs=n_iter_fs)



In [22]:
fi_shap

Unnamed: 0,feature,value,per_value,cum_value,cum_value_percentage
0,C_CAU_50,0.365445,7.859328,0.365445,7.859328
1,C_FORM_infrequent_sklearn,0.329101,7.077706,0.694545,14.937034
2,C_FAM_PROD_A02,0.270197,5.810914,0.964743,20.747947
3,IND_PMT_DIR_Y,0.229788,4.94187,1.194531,25.689817
4,d_surv_sin_is_weekend_True,0.213701,4.595892,1.408232,30.285709
5,RESP_CIE_infrequent_sklearn,0.211525,4.549101,1.619757,34.83481
6,C_FORM_17,0.204248,4.392606,1.824005,39.227416
7,C_CAU_40,0.196947,4.235588,2.020952,43.463005
8,C_APPLN_MALUS_1,0.194823,4.189906,2.215775,47.65291
9,C_FORM_11,0.188479,4.05347,2.404255,51.70638


In [9]:
fi_shap_all = process_fi(fi_shap, group)

In [10]:
fi_shap_all

Unnamed: 0,n_feats,cum_value,cum_value_percentage,n_feats_percentage,feat_selected
0,2,0.560016,12.630282,5.882353,"[C_FORM_infrequent_sklearn, C_CAU_50]"
1,3,0.814389,18.367271,8.823529,"[C_FORM_infrequent_sklearn, C_CAU_50, C_FAM_PR..."
2,5,1.301811,29.360323,14.705882,"[C_FORM_infrequent_sklearn, C_CAU_50, C_FAM_PR..."
3,7,1.747532,39.41287,20.588235,"[C_FORM_infrequent_sklearn, C_CAU_50, C_FAM_PR..."
4,9,2.13953,48.253776,26.470588,"[C_FORM_infrequent_sklearn, C_CAU_50, C_FAM_PR..."
5,12,2.630259,59.321415,35.294118,"[C_FORM_infrequent_sklearn, C_CAU_50, C_FAM_PR..."
6,15,3.074816,69.347707,44.117647,"[C_FORM_infrequent_sklearn, C_CAU_50, C_FAM_PR..."
7,19,3.568848,80.489815,55.882353,"[C_FORM_infrequent_sklearn, C_CAU_50, C_FAM_PR..."
8,23,3.977493,89.70617,67.647059,"[C_FORM_infrequent_sklearn, C_CAU_50, C_FAM_PR..."
9,34,4.433912,100.0,100.0,"[C_FORM_infrequent_sklearn, C_CAU_50, C_FAM_PR..."


In [11]:
fi_shap_all.to_parquet(path_fi_shap)

In [9]:
fi_shap_all = pd.read_parquet(path_fi_shap)

In [13]:
fi_shap_all.feat_selected[10]

array(['C_FORM_infrequent_sklearn', 'C_CAU_50', 'C_FAM_PROD_A02',
       'IND_PMT_DIR_N', 'C_CAU_40', 'd_surv_sin_is_weekend_True',
       'C_APPLN_MALUS_1', 'IND_PMT_DIR_Y', 'C_FORM_11', 'RESP_CIE_2',
       'C_FORM_17', 'C_CAU_infrequent_sklearn', 'is_closed_True',
       'contract_age_in_days', 'RESP_CIE_infrequent_sklearn', 'C_CAU_69',
       'CNT_TY_GES_P', 'C_FORM_12', 'weekday_declaration',
       'C_FAM_PROD_A15', 'C_ET_2', 'C_FAM_PROD_A03',
       'reporting_delay_in_days', 'RESP_CIE_4',
       'C_FAM_PROD_infrequent_sklearn', 'weekday_surv_sin', 'C_ET_1',
       'C_NAT_SIN_M', 'd_dcl_is_weekend_True', 'RESP_CIE_1', 'C_ET_3',
       'has_judiciary_procedure_True', 'C_APPLN_MALUS_2',
       'C_APPLN_MALUS_3', 'C_ORGN_OUVT_C', 'is_serious_sinister_True'],
      dtype=object)