# iForest

In [1]:
import warnings
warnings.filterwarnings('ignore')

## General libraries

In [2]:
import os
import sys

import pandas as pd
import numpy as np

from os.path import join
import json
import datetime

import time

### Load enviroment variables

In [3]:
from dotenv import load_dotenv
load_dotenv('../.env')

code_root = os.environ['CODE_ROOT']
cfg_path = os.environ['CFG_PATH']
data_root = os.environ['DATA_ROOT']

sys.path.insert(0, code_root)

### Specific libraries

In [4]:
from src.load.functions import get_fs_dataset, fs_datasets_hyperparams

from src.model.functions import train_and_evaluate_iforest

## General parameters

In [5]:
n_iter_fs = 1
n_iter = 1
contamination_percentage = [1] 
trees = [2, 3, 4]

## ERP fraud data

**Dataset source**: https://www.informatik.uni-wuerzburg.de/datascience/projects/machine-learning-for-network-security-fraud-detection/deepscan-concluded/erp-fraud-data/

**Additional sources**:

Open ERP System Data For Occupational Fraud Detection

Occupational fraud is defined as abusing one's occupation through the deliberate abuse of an employing organization's assets, and it is estimated that companies lose 5% of their revenue to occupational fraud each year.

In our research project DeepScan, we aim to develop approaches to automatically detect this type of fraud in data recorded by Enterprise Ressource Planning (ERP) systems, that track large amounts of information of company operation. Since ERP system data is guarded by companies due to privacy and trade secrecy concerns, publicly available ERP system data is an important step for enabling reproducible and incremental progress in this domain.

In our work, we propose a data generation strategy that is able to generate synthetic ERP system data free of privacy and trade secret concerns through an existing serious game, ERPsim. We additionally describe different occupational fraud cases and commit them during data generation.

Here, we provide the data generated in five different runs of the ERPsim simulation. We offer both raw data and aggregated datasets that are ready to use for fraud detection algorithms such as machine learning approaches.

ERP fraud detection ERPsim dataset: Download (190MB)

The paper can be found here: Link (arxiv)

In [6]:
dataset_id = 'erp_fraud'

data = get_fs_dataset(dataset_id, data_root)
#hyper = fs_datasets_hyperparams(dataset_id)
data.shape

(37407, 57)

In [7]:
pd.pivot_table(data,
             values = 'Menge',
               index = 'y', 
              aggfunc = 'count')

Unnamed: 0_level_0,Menge
y,Unnamed: 1_level_1
0,37321
1,86


In [8]:
excluded_cols = []

### iForest

## Parameters

In [9]:
# path
path = os.path.join(data_root, "outputs", f"{dataset_id}_results_if.parquet")
path

'/Users/allianz/workspace_github_pers/ad_shap_stability/test/data/outputs/erp_fraud_results_if.parquet'

In [10]:
hyper = fs_datasets_hyperparams(dataset_id)
hyper

{'contamination': 0.1, 'max_samples': 256, 'n_estimators': 100}

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37407 entries, 0 to 37406
Data columns (total 57 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Bestandskonto                          37407 non-null  float64
 1   Bewertungsklasse                       37407 non-null  float64
 2   Einzelpostenanzeige moeglich           37407 non-null  float64
 3   Erfolgskontentyp                       37407 non-null  float64
 4   Geschaeftsbereich                      37407 non-null  float64
 5   Gruppenkennzeichen                     37407 non-null  int64  
 6   KZ EKBE                                37407 non-null  float64
 7   Kennzeichen: Posten nicht kopierbar ?  37407 non-null  float64
 8   Kostenstelle                           37407 non-null  float64
 9   Kreditkontr_Bereich                    37407 non-null  float64
 10  Laufende Kontierung                    37407 non-null  float64
 11  Pa

### Iforest full features

In [12]:
# Capture the start time
start_time = datetime.datetime.now()

df = train_and_evaluate_iforest(data, dataset_id=dataset_id, hyper=hyper, n_tree_estimators=trees, contamination_percentage=contamination_percentage, excluded_cols=excluded_cols, n_iter_fs=n_iter_fs, n_iter=n_iter)

# Capture the finish time
finish_time = datetime.datetime.now()

# Calculate the duration
duration = finish_time - start_time

print(f"Duration: {duration}")


Iteration by tree number: 2
  Iteration by contamination: 0.1
    Number of featured: 56
     Iteration by feat number: 11
     Iteration by feat number: 22
     Iteration by feat number: 33
     Iteration by feat number: 44
     Iteration by feat number: 56
Iteration by tree number: 3
  Iteration by contamination: 0.1
    Number of featured: 56
     Iteration by feat number: 11
     Iteration by feat number: 22
     Iteration by feat number: 33
     Iteration by feat number: 44
     Iteration by feat number: 56
Iteration by tree number: 4
  Iteration by contamination: 0.1
    Number of featured: 56
     Iteration by feat number: 11
     Iteration by feat number: 22
     Iteration by feat number: 33
     Iteration by feat number: 44
     Iteration by feat number: 56
Duration: 0:18:25.842958


In [19]:
df.head().sort_values('precision', ascending=False)

Unnamed: 0,n_estimators,max_feats,contamination,n_feats,n_iter,n_iter_fs,roc_auc,model_stab,model_stab_list,shap_stab,shap_stab_list,shap_stab_ad,f1_median,recall,precision,confusion_matrix
4,2,56,0.1,56,1,1,0.969657,0.716269,"[[0.99986256730319, 0.9970438229289682, 0.6896...","[0.39280174158718084, 0.431244665418376, 0.410...","[[0.42916691633914483, 0.696702763927013, 0.12...","[0.4490167322482156, 0.3687922466878193, 0.431...",0.062847,1.0,0.032443,1
2,2,33,0.1,56,1,1,0.958927,0.830151,"[[0.9999614073241069, 0.9960470967641789, 0.74...","[0.501648459978498, 0.5585001702255933, 0.5334...","[[0.8653286386189984, 0.98750034025187, 0.8056...","[0.5375450447511718, 0.5103155672361686, 0.505...",0.05251,1.0,0.026963,1
1,2,22,0.1,56,1,1,0.980547,0.830246,"[[0.9999765772921481, 0.8561867541213565, 0.44...","[0.5856192000182594, 0.6140002337055437, 0.646...","[[0.9002105574731415, 0.8851920574629581, 0.11...","[0.6248317398408085, 0.6060713237172727, 0.587...",0.046864,1.0,0.023994,1
3,2,44,0.1,56,1,1,0.944789,0.768116,"[[0.999999958268199, 0.9801437378262194, 0.721...","[0.44328009624513276, 0.44002748956291104, 0.4...","[[0.9373334944216796, 0.7589479207122194, 0.21...","[0.4857464117979797, 0.4461469494718381, 0.425...",0.037293,0.794118,0.019095,1
0,2,11,0.1,56,1,1,0.940355,0.870899,"[[0.9999999958313649, 0.9912341132775941, 0.99...","[0.7352352037024189, 0.7953175594782302, 0.841...","[[0.691535400087165, 0.9962296656188367, 0.0, ...","[0.7689302668734559, 0.7139897106330999, 0.760...",0.034771,0.794118,0.017775,1
