In [1]:
import numpy as np
import pandas as pd 
import random 
import copy 
import warnings
import sys
import pickle
import pyreadr as py
import itertools

from tqdm import tqdm
from copy import deepcopy
from scipy.io import loadmat
from sklearn.ensemble import IsolationForest
from pyod.models.dif import DIF
from pyod.models.loda import LODA
from pyod.models.lof import LOF
from pyod.models.ecod import ECOD
from sklearn.metrics import precision_recall_fscore_support, average_precision_score

sys.path.append('../../')
from ACME.ACME import ACME
from ACME.visual_utils import * 
sys.path.remove('../../')

warnings.filterwarnings('ignore')

# set seed for reproducibility
np.random.seed(0)
random.seed(0)

# AcME-AD to explain IF in TEP dataset

We subsample the original dataset to resort to a typical anomaly detection scenario where anomalies are rare. 

In [2]:
n_normal_simulations = 70 
n_faulty_simulations = 3

In [3]:
# load the data
normal_data = pd.read_csv('ad_industrial_datasets/TEP_FaultFree_Training_subsample_70_3.csv')
fault_data = pd.read_csv('ad_industrial_datasets/TEP_Faulty_Training_subsample_70_3_removedfirst20.csv')

In [4]:
# create 20 datasets, each one containing only 1 specific fault 
data = []
contaminations = []
for i in range(20): 
    fault_data_i = fault_data[fault_data['faultNumber'] == i+1].reset_index(drop=True)
    data.append(pd.concat([normal_data, fault_data_i], axis=0).reset_index(drop=True))
    contaminations.append(len(fault_data_i)/len(data[-1]))

In [5]:
features = data[0].columns[3:-1] 

## Training
Train Deep Isolation Forest on each dataset and evaluate it. 

In [6]:
results = []
for i in tqdm(range(20)): 
    ad_model = LODA(contamination=contaminations[i]).fit(data[i][features])
    data[i]['Prediction'] = ad_model.labels_
    

    prec, rec, f1, _ = precision_recall_fscore_support(data[i]['Target'], data[i]['Prediction'], average='binary')
    avg_prec = average_precision_score(data[i]['Target'], data[i]['Prediction'])

    results.append({'Fault': i+1, 'Precision': prec, 'Recall': rec, 'F1': f1, 'Average Precision': avg_prec})

results_LODA = pd.DataFrame(results)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:05<00:00,  3.73it/s]


In [7]:
results = []
for i in tqdm(range(20)): 
    ad_model = LOF(contamination=contaminations[i]).fit(data[i][features])
    data[i]['Prediction'] = ad_model.labels_
    

    prec, rec, f1, _ = precision_recall_fscore_support(data[i]['Target'], data[i]['Prediction'], average='binary')
    avg_prec = average_precision_score(data[i]['Target'], data[i]['Prediction'])

    results.append({'Fault': i+1, 'Precision': prec, 'Recall': rec, 'F1': f1, 'Average Precision': avg_prec})

results_LOF = pd.DataFrame(results)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:19<00:00,  1.03it/s]


In [8]:
results = []
for i in tqdm(range(20)): 
    ad_model = ECOD(contamination=contaminations[i]).fit(data[i][features])
    data[i]['Prediction'] = ad_model.labels_
    

    prec, rec, f1, _ = precision_recall_fscore_support(data[i]['Target'], data[i]['Prediction'], average='binary')
    avg_prec = average_precision_score(data[i]['Target'], data[i]['Prediction'])

    results.append({'Fault': i+1, 'Precision': prec, 'Recall': rec, 'F1': f1, 'Average Precision': avg_prec})

results_ECOD = pd.DataFrame(results)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:08<00:00,  2.46it/s]


In [9]:
results = []
for i in tqdm(range(20)): 
    ad_model = DIF(contamination=contaminations[i]).fit(data[i][features])
    data[i]['Prediction'] = ad_model.labels_
    

    prec, rec, f1, _ = precision_recall_fscore_support(data[i]['Target'], data[i]['Prediction'], average='binary')
    avg_prec = average_precision_score(data[i]['Target'], data[i]['Prediction'])

    results.append({'Fault': i+1, 'Precision': prec, 'Recall': rec, 'F1': f1, 'Average Precision': avg_prec})

results_DIF = pd.DataFrame(results)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [17:20<00:00, 52.02s/it]


In [10]:
display(results_LODA)
display(results_LOF)
display(results_ECOD)
display(results_DIF)

Unnamed: 0,Fault,Precision,Recall,F1,Average Precision
0,1,0.959722,0.959722,0.959722,0.922658
1,2,0.904,0.904,0.904,0.821161
2,3,0.058,0.058,0.058,0.042076
3,4,0.130667,0.130667,0.130667,0.0528
4,5,0.353333,0.353333,0.353333,0.15142
5,6,0.954,0.954,0.954,0.912006
6,7,0.668,0.668,0.668,0.459868
7,8,0.8,0.8,0.8,0.648219
8,9,0.07,0.07,0.07,0.043119
9,10,0.219333,0.219333,0.219333,0.080189


Unnamed: 0,Fault,Precision,Recall,F1,Average Precision
0,1,0.910417,0.910417,0.910417,0.832399
1,2,0.866667,0.866667,0.866667,0.756591
2,3,0.064667,0.064667,0.064667,0.04262
3,4,0.102,0.102,0.102,0.047308
4,5,0.344,0.344,0.344,0.145295
5,6,0.928667,0.928667,0.928667,0.865353
6,7,0.406,0.406,0.406,0.189247
7,8,0.701333,0.701333,0.701333,0.504142
8,9,0.064,0.064,0.064,0.042562
9,10,0.223333,0.223333,0.223333,0.081796


Unnamed: 0,Fault,Precision,Recall,F1,Average Precision
0,1,0.119444,0.119444,0.119444,0.049064
1,2,0.171333,0.171333,0.171333,0.06341
2,3,0.055333,0.055333,0.055333,0.041884
3,4,0.088667,0.088667,0.088667,0.045314
4,5,0.237333,0.237333,0.237333,0.08767
5,6,0.117333,0.117333,0.117333,0.050041
6,7,0.644667,0.644667,0.644667,0.430198
7,8,0.412667,0.412667,0.412667,0.194431
8,9,0.058667,0.058667,0.058667,0.042127
9,10,0.3,0.3,0.3,0.118767


Unnamed: 0,Fault,Precision,Recall,F1,Average Precision
0,1,0.792361,0.792361,0.792361,0.636041
1,2,0.869333,0.869333,0.869333,0.76111
2,3,0.066667,0.066667,0.066667,0.042801
3,4,0.092667,0.092667,0.092667,0.045875
4,5,0.384,0.384,0.384,0.172771
5,6,0.908,0.908,0.908,0.828245
6,7,0.584,0.584,0.584,0.358152
7,8,0.878,0.878,0.878,0.775898
8,9,0.072667,0.072667,0.072667,0.04339
9,10,0.242667,0.242667,0.242667,0.09001
