In [None]:
import numpy as np
import pandas as pd 
import random 
import copy 
import warnings
import sys
import pickle
import pyreadr as py
import itertools

from tqdm import tqdm
from copy import deepcopy
from scipy.io import loadmat
import shap
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_recall_fscore_support, average_precision_score

sys.path.append('../../')
from ACME.ACME import ACME
from ACME.visual_utils import * 
sys.path.remove('../../')

# Silence every warning for the rest of the session (blanket 'ignore' filter)
warnings.filterwarnings('ignore')

# Seed both NumPy's global RNG and Python's `random` module with the same
# value so that stochastic steps are repeatable from one run to the next
SEED = 0
np.random.seed(SEED)
random.seed(SEED)

# AcME-AD to explain IF in TEP dataset

We subsample the original dataset to reproduce a typical anomaly detection scenario, in which anomalies are rare.

In [None]:
# Subsampling setup: number of fault-free and of faulty simulation runs, in that order.
# NOTE(review): these look like the "70_3" suffix of the CSV file names used by this
# notebook, but no visible cell reads them -- confirm they are kept in sync.
n_normal_simulations, n_faulty_simulations = 70, 3

In [None]:
# Read the two pre-subsampled TEP tables from disk:
# one with fault-free runs, one with faulty runs
normal_csv_path = 'ad_industrial_datasets/TEP_FaultFree_Training_subsample_70_3.csv'
fault_csv_path = 'ad_industrial_datasets/TEP_Faulty_Training_subsample_70_3_removedfirst20.csv'

normal_data = pd.read_csv(normal_csv_path)
fault_data = pd.read_csv(fault_csv_path)

In [None]:
# Build one dataset per fault (fault numbers 1..20): all fault-free rows plus
# the rows of that single fault. data[k] therefore holds fault number k + 1,
# and contaminations[k] is the fraction of its rows that come from the fault.
data = []
contaminations = []
for fault_number in range(1, 21):
    is_current_fault = fault_data['faultNumber'] == fault_number
    single_fault_rows = fault_data[is_current_fault].reset_index(drop=True)

    combined = pd.concat([normal_data, single_fault_rows], axis=0).reset_index(drop=True)
    data.append(combined)
    contaminations.append(len(single_fault_rows) / len(combined))

In [None]:
# Model inputs: every column except the first three and the last one.
# NOTE(review): presumably the leading columns are run/sample identifiers and
# the trailing one is 'Target' -- confirm against the CSV header.
all_columns = data[0].columns
features = all_columns[3:-1]

## Explain fault 12
Detection performance is good on fault 12, and prior knowledge indicates that `xmeas_11` is the root cause of the fault.
[Harinarayan, R. Rajesh Alias, and S. Mercy Shalinie. "XFDDC: eXplainable Fault Detection Diagnosis and Correction framework for chemical process systems." Process Safety and Environmental Protection 165 (2022): 463-474.]

In [None]:
# Fault 12 sits at list position 11 because the per-fault lists are 0-indexed
# while fault numbers start at 1. Work on a copy so that adding columns below
# does not modify the entry stored in `data`.
fault_12_position = 11
data_12 = data[fault_12_position].copy()
contamination = contaminations[fault_12_position]

In [None]:
# Hyperparameter tuning: exhaustive search over a small Isolation Forest grid,
# keeping the configuration with the highest average precision on data_12
# (the model is fitted and evaluated on the same rows).
from sklearn.model_selection import ParameterGrid

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_samples': [64, 128, 256]
}

# Start below any attainable score so that a configuration is always retained.
# Starting at 0 with a strict '>' left best_config as None whenever no score
# was strictly positive, which made the later `**best_config` fail.
best_avg_prec = float('-inf')
best_config = None

for config in ParameterGrid(param_grid):
    iforest = IsolationForest(contamination=contamination, random_state=0, n_jobs=-1, **config).fit(data_12[features])

    # predict() marks anomalies with -1; recode to 1 = anomaly, 0 = normal.
    # The result stays in a local array: writing every candidate's predictions
    # into data_12 would leave behind a stale 'Prediction' column coming from
    # the last grid point rather than from the best one.
    candidate_pred = (iforest.predict(data_12[features]) == -1).astype(int)

    # NOTE(review): average precision is computed on hard 0/1 labels, exactly as
    # before. The metric is normally fed continuous anomaly scores; switching
    # would change which model is selected and desynchronise it from the cached
    # explanation results, so confirm before changing.
    avg_prec = average_precision_score(data_12['Target'], candidate_pred)
    if avg_prec > best_avg_prec:
        best_avg_prec = avg_prec
        best_config = config

print("Best config: ", best_config, " | Best avg prec: ", best_avg_prec)

In [None]:
# Score function needed for AcME-AD
def if_score_function(model, X):
    """Convert a detector's decision values into a score that grows as the decision value drops.

    Computes ``0.5 * (1 - d)`` where ``d = model.decision_function(X)``, so a
    decision value of 0 maps to 0.5 and negative decision values map above 0.5.

    Parameters
    ----------
    model : object exposing ``decision_function(X)`` (an IsolationForest in this notebook)
    X : samples to score, passed to ``decision_function`` unchanged

    Returns
    -------
    Whatever ``decision_function`` returns, transformed element-wise as above.
    """
    decision_values = model.decision_function(X)
    # Flip the sign so lower decision values rank higher, then shift and halve
    flipped = -decision_values
    return 0.5 * (flipped + 1)

# Refit the Isolation Forest on the fault-12 data with the configuration
# selected by the grid search
X_fault_12 = data_12[features]
ad_model = IsolationForest(contamination=contamination, random_state=0, n_jobs=-1, **best_config).fit(X_fault_12)

# predict() marks anomalies with -1; store them as 1 = anomaly, 0 = normal
raw_labels = ad_model.predict(X_fault_12)
data_12['Prediction'] = [1 if label == -1 else 0 for label in raw_labels]

# Continuous anomaly score for every row, from the AcME-AD score function
data_12['Score'] = if_score_function(ad_model, X_fault_12)

## KernelSHAP explanation

In [None]:
# Only the rows the model flagged as anomalous (Prediction == 1) are explained;
# the index is renumbered from 0
flagged_as_anomaly = data_12['Prediction'] == 1
data_to_explain = data_12.loc[flagged_as_anomaly].reset_index(drop=True)

In [None]:
# # uncomment to re-run KernelSHAP
# # draw a 10% sample of data_12 separately from the 'Target' == 0 and the 'Target' == 1 rows, so that the proportion between the two classes is preserved
# data_12_normal = data_12[data_12['Target'] == 0].sample(frac=0.1, random_state=0)
# data_12_faulty = data_12[data_12['Target'] == 1].sample(frac=0.1, random_state=0)

# shap_background = pd.concat([data_12_normal, data_12_faulty], axis=0).reset_index(drop=True)

# def if_score_function_shap(X): 
#     return 0.5 * (-ad_model.decision_function(X) + 1) 
    
# shap_explainer = shap.KernelExplainer(if_score_function_shap, shap_background[features].values)
# shap_values = shap_explainer.shap_values(data_to_explain[features])
# df_shap_abs = pd.DataFrame(np.abs(shap_values), columns = features)

# shap_rankings = df_shap_abs.rank(axis=1, ascending=False, method="min")

In [None]:
# # save data
# shap_rankings.to_csv('results/TEP_IF_rankings_SHAPKERNEL.csv', index=False)

In [None]:
# Read back the KernelSHAP feature rankings stored by a previous run
# (see the commented-out cells above to regenerate them)
shap_rankings_path = 'results/TEP_IF_rankings_SHAPKERNEL.csv'
shap_rankings = pd.read_csv(shap_rankings_path)

In [None]:
# For every column of shap_rankings, count how often each rank value occurs
# (rows = rank values, columns = the ranking columns; missing combinations -> 0)
rank_counts = shap_rankings.apply(pd.Series.value_counts).fillna(0).astype(int)

# Normalise each column by its total so the entries become fractions summing to 1
shap_rank_counting = rank_counts / rank_counts.sum()

In [None]:
# plot overall feature importance

# (feature, bar colour, hatch pattern) for the features that get a dedicated
# style in the stacked bar plot
feature_styles = [
    ('xmeas_4', 'limegreen', ''),
    ('xmeas_7', 'lightskyblue', '.'),
    ('xmeas_11', 'midnightblue', 'x'),
    ('xmeas_13', 'crimson', '/'),
    ('xmeas_38', 'mediumpurple', '\\'),
    ('xmv_2', 'aquamarine', '+'),
    ('xmeas_20', 'rosyBrown', '+'),
    ('xmv_9', 'orange', '.'),
]
visual_dict = {
    feature: {'color': color, 'pattern': pattern}
    for feature, color, pattern in feature_styles
}

# NOTE(review): n_positions and threshold are passed straight to the ACME helper;
# presumably "first 5 rank positions" and "minimum 5% share" -- confirm in ACME.visual_utils.
stacked_barplot_fig = feature_importance_distribution_barplot(
    shap_rank_counting,
    n_positions=5,
    threshold=0.05,
    color_pattern_dict=visual_dict,
)
stacked_barplot_fig.show()
