# iForest

In [1]:
import warnings
# Install a global "ignore" filter: every warning raised after this cell runs
# is suppressed for the rest of the kernel session.
warnings.filterwarnings(action='ignore')

## General libraries

In [2]:
import os
import sys

import pandas as pd
import numpy as np

from os.path import join
import json
import datetime

import shap
from shap_selection import feature_selection

import time

### Load environment variables

In [3]:
from dotenv import load_dotenv
# Read the .env file (path is relative to the current working directory) and
# export its entries into the process environment.
load_dotenv('../.env')

# Required settings: a missing variable raises KeyError right here.
code_root = os.environ['CODE_ROOT']
cfg_path = os.environ['CFG_PATH']
data_root = os.environ['DATA_ROOT']

# Put code_root first on the import path (presumably so the `src` package
# imported in the next cell resolves from there).
# Drop any existing entry first: without this, every re-run of the cell would
# push one more duplicate of code_root onto sys.path.
if code_root in sys.path:
    sys.path.remove(code_root)
sys.path.insert(0, code_root)

### Specific libraries

In [4]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams
from src.feature_selection.functions import fs_iforest_with_shap, shap_ranks, process_fi

## General parameters

In [5]:
# Run configuration shared by the cells below.
# n_iter_fs and contamination_percentage are forwarded to fs_iforest_with_shap.
n_iter_fs, n_iter = 1, 1
contamination_percentage = [1]
# NOTE(review): n_iter, trees and group are not referenced in the cells visible
# here; presumably they are consumed elsewhere. Confirm before removing.
trees = [25, 50, 100]
group = 0

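## Credit card

> **Note:** the code in this section loads the dataset with `dataset_id = 'erp_fraud'`, while the heading, dataset source and references below describe the Kaggle credit card fraud dataset. Confirm which dataset is intended and update this section accordingly.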

**Dataset source**: https://www.kaggle.com/mlg-ulb/creditcardfraud

**Additional sources:**

Andrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015

Dal Pozzolo, Andrea; Caelen, Olivier; Le Borgne, Yann-Ael; Waterschoot, Serge; Bontempi, Gianluca. Learned lessons in credit card fraud detection from a practitioner perspective, Expert systems with applications,41,10,4915-4928,2014, Pergamon

Dal Pozzolo, Andrea; Boracchi, Giacomo; Caelen, Olivier; Alippi, Cesare; Bontempi, Gianluca. Credit card fraud detection: a realistic modeling and a novel learning strategy, IEEE transactions on neural networks and learning systems,29,8,3784-3797,2018,IEEE

Dal Pozzolo, Andrea Adaptive Machine learning for credit card fraud detection ULB MLG PhD thesis (supervised by G. Bontempi)

Carcillo, Fabrizio; Dal Pozzolo, Andrea; Le Borgne, Yann-Aël; Caelen, Olivier; Mazzer, Yannis; Bontempi, Gianluca. Scarff: a scalable framework for streaming credit card fraud detection with Spark, Information fusion,41, 182-194,2018,Elsevier

Carcillo, Fabrizio; Le Borgne, Yann-Aël; Caelen, Olivier; Bontempi, Gianluca. Streaming active learning strategies for real-life credit card fraud detection: assessment and visualization, International Journal of Data Science and Analytics, 5,4,285-300,2018,Springer International Publishing

Bertrand Lebichot, Yann-Aël Le Borgne, Liyun He, Frederic Oblé, Gianluca Bontempi Deep-Learning Domain Adaptation Techniques for Credit Cards Fraud Detection, INNSBDDL 2019: Recent Advances in Big Data and Deep Learning, pp 78-88, 2019

Fabrizio Carcillo, Yann-Aël Le Borgne, Olivier Caelen, Frederic Oblé, Gianluca Bontempi Combining Unsupervised and Supervised Learning in Credit Card Fraud Detection Information Sciences, 2019

Yann-Aël Le Borgne, Gianluca Bontempi Machine Learning for Credit Card Fraud Detection - Practical Handbook

In [6]:
# Identifier of the dataset analysed in this notebook; it is also used further
# down to build the output file name.
dataset_id = 'erp_fraud'
# Fetch the dataset through the project helper (presumably read from somewhere
# under data_root; confirm against get_fs_dataset).
data = get_fs_dataset(dataset_id, data_root)
# NOTE(review): presumably dataset-specific hyperparameters; `hyper` is not
# used in the cells visible here, so confirm it is still needed.
hyper = fs_datasets_hyperparams(dataset_id)
# Last expression of the cell: shows (n_rows, n_columns).
data.shape

(37407, 57)

In [7]:
# Count the non-null 'Menge' entries for each distinct value of 'y', i.e. how
# many rows fall under each value of 'y'.
data.pivot_table(index='y', values='Menge', aggfunc='count')

Unnamed: 0_level_0,Menge
y,Unnamed: 1_level_1
0,37321
1,86


In [8]:
# Show summary statistics of the dataset's columns as the cell output.
data.describe()

Unnamed: 0,Bestandskonto,Bewertungsklasse,Einzelpostenanzeige moeglich,Erfolgskontentyp,Geschaeftsbereich,Gruppenkennzeichen,KZ EKBE,Kennzeichen: Posten nicht kopierbar ?,Kostenstelle,Kreditkontr_Bereich,...,Kreditkontr_betrag,Menge in BPME,Menge in ErfassME,Menge,Skontobasis,y,Belegnummer,Position,Transaktionsart,Erfassungsuhrzeit
count,37407.0,37407.0,37407.0,37407.0,37407.0,37407.0,37407.0,37407.0,37407.0,37407.0,...,37407.0,37407.0,37407.0,37407.0,37407.0,37407.0,37407.0,37407.0,37407.0,37407.0
mean,0.576256,65.362098,0.759323,0.423744,0.195765,0.006924,0.021787,0.249873,0.804235,0.792258,...,745.307613,903.30312,1770.756858,1854.728495,636.6331,0.002299,2671314000.0,2.963804,4.388056,1970.172561
std,0.494157,437.971709,0.4275,0.494157,0.396794,0.082922,0.145991,0.432945,0.396794,0.405696,...,1647.819835,10437.692685,10883.305897,10871.196244,8995.055,0.047894,2196789000.0,2.298635,2.158177,1093.381231
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,90000000.0,1.0,0.0,0.0
25%,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,90003460.0,1.0,4.0,1021.0
50%,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,420.0,0.0,0.0,1400002000.0,2.0,6.0,1978.0
75%,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,460.0,479.0,0.0,0.0,4900002000.0,4.0,6.0,2912.0
max,1.0,3000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,12534.9,492000.0,492000.0,492000.0,1077397.0,1.0,5100000000.0,20.0,6.0,3827.0


In [9]:
# Column names handed to fs_iforest_with_shap through its excluded_cols argument.
excluded_cols = [
    'Belegnummer',
    'Position',
    'Transaktionsart',
    'Erfassungsuhrzeit',
]

### iForest

## Parameters

In [10]:
# Destination of the feature-importance table: <data_root>/outputs/<dataset_id>_fi_shap
fi_shap_stem = f"{dataset_id}_fi_shap"
path_fi_shap = os.path.join(data_root, "outputs", fi_shap_stem)

## Feature selection by SHAP

In [11]:
# Run the project's iForest + SHAP feature-selection helper on the dataset.
# It returns three values; only the first two are kept, the third is discarded.
fs_shap, fi_shap, _ = fs_iforest_with_shap(
    data,
    contamination_percentage=contamination_percentage,
    excluded_cols=excluded_cols,
    n_iter_fs=n_iter_fs,
)



In [12]:
# Post-process the SHAP feature importances with the project helper.
# NOTE(review): the meaning of the second argument (10) is not visible from
# this notebook; presumably it controls the number of steps/rows in the
# resulting table. Confirm against process_fi.
fi_shap_all = process_fi(fi_shap, 10)

In [13]:
# Display the processed feature-importance table as the cell output.
fi_shap_all

Unnamed: 0,n_feats,cum_value,cum_value_percentage,n_feats_percentage,feat_selected
0,1,0.211892,7.591682,2.040816,[Betrag Hauswaehr]
1,3,0.580258,20.789508,6.122449,"[Betrag Hauswaehr, Menge, Profitcenter]"
2,5,0.844245,30.247681,10.204082,"[Betrag Hauswaehr, Menge, Profitcenter, Werk, ..."
3,7,1.092779,39.152174,14.285714,"[Betrag Hauswaehr, Menge, Profitcenter, Werk, ..."
4,10,1.426383,51.104552,20.408163,"[Betrag Hauswaehr, Menge, Profitcenter, Werk, ..."
5,13,1.702554,60.99922,26.530612,"[Betrag Hauswaehr, Menge, Profitcenter, Werk, ..."
6,16,1.928759,69.103703,32.653061,"[Betrag Hauswaehr, Menge, Profitcenter, Werk, ..."
7,21,2.217648,79.454055,42.857143,"[Betrag Hauswaehr, Menge, Profitcenter, Werk, ..."
8,28,2.511843,89.994498,57.142857,"[Betrag Hauswaehr, Menge, Profitcenter, Werk, ..."
9,49,2.791108,100.0,100.0,"[Betrag Hauswaehr, Menge, Profitcenter, Werk, ..."


In [14]:
# Make sure the destination directory exists before writing: to_parquet does
# not create missing parent directories and would fail on a fresh data root.
fi_shap_out_dir = os.path.dirname(path_fi_shap)
if fi_shap_out_dir:
    os.makedirs(fi_shap_out_dir, exist_ok=True)

# Persist the feature-importance table in Parquet format.
fi_shap_all.to_parquet(path_fi_shap)