In [None]:
import numpy as np
import pandas as pd 
import random 
import copy 
import warnings
import sys
import pickle
import pyreadr as py
import itertools
import time

from tqdm import tqdm
from copy import deepcopy
from scipy.io import loadmat
import shap
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_recall_fscore_support, average_precision_score

sys.path.append('../../')
from ACME.ACME import ACME
from ACME.visual_utils import * 
sys.path.remove('../../')

warnings.filterwarnings('ignore')

# set seed for reproducibility
np.random.seed(0)
random.seed(0)

# AcME-AD to explain Isolation Forest (IF) on the TEP dataset

We subsample the original dataset to reproduce a typical anomaly detection scenario, in which anomalies are rare.

In [None]:
# Number of fault-free and faulty simulation runs in the subsample
# (presumably the `70_3` suffix of the CSV file names — confirm).
n_normal_simulations, n_faulty_simulations = 70, 3

In [None]:
# Load the two pre-built subsamples: one CSV with fault-free rows, one with faulty rows.
normal_csv_path = 'ad_industrial_datasets/TEP_FaultFree_Training_subsample_70_3.csv'
faulty_csv_path = 'ad_industrial_datasets/TEP_Faulty_Training_subsample_70_3_removedfirst20.csv'

normal_data = pd.read_csv(normal_csv_path)
fault_data = pd.read_csv(faulty_csv_path)

In [None]:
# Build one dataset per fault: all fault-free rows followed by the rows of that fault only.
# `faultNumber` is 1-based, so list position k holds fault k + 1.
n_faults = 20
faulty_data = [
    fault_data[fault_data['faultNumber'] == fault_id].reset_index(drop=True)
    for fault_id in range(1, n_faults + 1)
]
data = [
    pd.concat([normal_data, single_fault]).reset_index(drop=True)
    for single_fault in faulty_data
]

In [None]:
# Model inputs: every column except the first three and the last one.
first_dataset = data[0]
features = first_dataset.columns[3:-1]

# Fraction of faulty rows in the first (fault 1) dataset, later passed to the model as
# its contamination rate.
# NOTE(review): assumes every fault contributes the same number of rows — confirm.
contamination = len(faulty_data[0]) / len(first_dataset)

## Explain fault 12
Detection performance is good on fault 12, and we know from prior work that xmeas 11 is the 'root cause' of this fault.
[Harinarayan, R. Rajesh Alias, and S. Mercy Shalinie. "XFDDC: eXplainable Fault Detection Diagnosis and Correction framework for chemical process systems." Process Safety and Environmental Protection 165 (2022): 463-474.]

In [None]:
# Isolation Forest hyper-parameters used for fault 12.
best_config = dict(max_samples=256, n_estimators=200)

# Independent copy of the fault-12 dataset (list position 11); new columns are added to
# this copy further down, leaving `data` untouched.
data_12 = data[11].copy()

In [None]:
# Score function required by AcME-AD
def if_score_function(model, X):
    """Turn a model's decision function into an anomaly score.

    The decision values are negated, shifted by one and halved, so a decision
    value of 0 maps to 0.5 and lower decision values give higher scores. For
    scikit-learn's IsolationForest, where negative decision values denote
    outliers, this means higher scores are more anomalous.

    Parameters
    ----------
    model : object exposing ``decision_function(X)``
    X : samples accepted by ``model.decision_function``

    Returns
    -------
    The element-wise value ``0.5 * (1 - decision_function(X))``.
    """
    decision_values = model.decision_function(X)
    return (1 - decision_values) * 0.5

# Fit the Isolation Forest with the selected hyper-parameters on the fault-12 features.
ad_model = IsolationForest(
    contamination=contamination,
    random_state=0,
    n_jobs=-1,
    **best_config,
)
ad_model.fit(data_12[features])

# `predict` labels anomalies as -1 and inliers as +1; recode to 1 = anomaly, 0 = normal.
raw_labels = ad_model.predict(data_12[features])
data_12['Prediction'] = (raw_labels == -1).astype('int64')

# Anomaly score of every row, as defined by `if_score_function`.
data_12['Score'] = if_score_function(ad_model, data_12[features])

In [None]:
# Confusion matrix
# Boolean masks for the true class (`Target`) and the predicted class (`Prediction`).
truly_normal = data_12['Target'] == 0
truly_faulty = data_12['Target'] == 1
predicted_normal = data_12['Prediction'] == 0
predicted_faulty = data_12['Prediction'] == 1

n_normal_as_faulty = int((truly_normal & predicted_faulty).sum())
n_faulty_as_faulty = int((truly_faulty & predicted_faulty).sum())
n_normal_as_normal = int((truly_normal & predicted_normal).sum())
n_faulty_as_normal = int((truly_faulty & predicted_normal).sum())

print("Normal as faulty: ", n_normal_as_faulty, " | Faulty as faulty: ", n_faulty_as_faulty, " | Normal as normal: ", n_normal_as_normal, " | Faulty as normal: ", n_faulty_as_normal)

# Layout: rows are the predicted class (normal, faulty), columns are the true class
# (normal, faulty).
conf_matrix = np.array([
    [n_normal_as_normal, n_faulty_as_normal],
    [n_normal_as_faulty, n_faulty_as_faulty],
])
print(conf_matrix)

## Explanation time

In [None]:
# Take the first row (in dataset order) that the model flagged as anomalous.
flagged_samples = data_12.loc[data_12['Prediction'] == 1].reset_index(drop=True)
data_to_explain = flagged_samples.loc[0].squeeze()

In [None]:
# Time the whole AcME-AD pipeline: building the explainer, the global explanation on
# data_12, the local explanation of the selected sample and its feature-importance call.
# perf_counter() is a monotonic, high-resolution clock intended for measuring elapsed
# time; time.time() can jump if the system clock is adjusted.
acme_start = time.perf_counter()
acme = ACME(ad_model, 'Score', features, task='ad', score_function=if_score_function)
acme = acme.explain(data_12, robust=True)
local_exp = acme.explain_local(data_to_explain)
# The return value is not kept; the call sits inside the timed region so its cost is included.
local_exp.feature_importance(local=True, weights={'delta':0.3, 'change':0.3, 'distance':0.2, 'ratio':0.2})
acme_time = time.perf_counter() - acme_start  # elapsed seconds
print("AcME-AD time: ", acme_time)


In [None]:
# Build SHAP background sets of increasing size. For each fraction, the same share of
# rows is sampled separately from the Target == 0 and Target == 1 rows and then stacked.
background_fractions = [0.05, 0.1, 0.2, 0.5]
normal_rows = data_12[data_12['Target'] == 0]
faulty_rows = data_12[data_12['Target'] == 1]

shap_backgrounds = []
for fraction in background_fractions:
    data_12_normal = normal_rows.sample(frac=fraction, random_state=0)
    data_12_faulty = faulty_rows.sample(frac=fraction, random_state=0)
    stacked = pd.concat([data_12_normal, data_12_faulty], axis=0)
    shap_backgrounds.append(stacked.reset_index(drop=True))

def if_score_function_shap(X):
    """Single-argument anomaly score of the global `ad_model`, for use with SHAP.

    Returns ``0.5 * (1 - ad_model.decision_function(X))`` element-wise, so a
    decision value of 0 maps to a score of 0.5 and lower decision values give
    higher scores.
    """
    decision_values = ad_model.decision_function(X)
    return (1 - decision_values) * 0.5

# Time KernelSHAP on the same sample once per background set. Each measurement covers
# building the explainer and computing the SHAP values.
# perf_counter() is a monotonic, high-resolution clock intended for measuring elapsed
# time; time.time() can jump if the system clock is adjusted.
shap_times = []
for background in shap_backgrounds:
    shap_start = time.perf_counter()
    shap_explainer = shap.KernelExplainer(if_score_function_shap, background[features].values)
    shap_values = shap_explainer.shap_values(data_to_explain[features])
    shap_time = time.perf_counter() - shap_start  # elapsed seconds
    print("SHAP time: ", shap_time)
    shap_times.append(shap_time)