In [1]:
import numpy as np
import pandas as pd 
import random 
import copy 
import warnings
import sys
import pickle

from tqdm import tqdm
from copy import deepcopy
from pyod.models.loda import LODA

sys.path.append('../../')
from ACME.ACME import ACME
from ACME.visual_utils import * 
sys.path.remove('../../')

# Silence all warnings so the cell outputs stay uncluttered.
warnings.filterwarnings('ignore')

# Seed both global RNGs (Python's `random` and NumPy's legacy generator)
# with the same value for reproducibility.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# AcME-AD to explain LODA on the PIADE sequences dataset

In [2]:
# Read the CSV, discard the `interval_start` column and replace missing
# values with 0, all in a single chain.
data = (
    pd.read_csv('ad_industrial_datasets/piade_sequences_1h_data.csv')
    .drop(columns=['interval_start'])
    .fillna(0)
)

# Every remaining column except the equipment identifier is a model feature.
features = data.columns[data.columns != 'equipment_ID'].tolist()

In [3]:
# split data based on equipment ID
equipments = data['equipment_ID'].unique()
data_equipment = [data[data['equipment_ID'] == e] for e in equipments]

# keep only equipment 2 for the example, select a different equipment if you want.
# Index 1 is positional: it picks the second entry of `equipments`, i.e. the
# second distinct ID in order of first appearance in the file.
# `.copy()` detaches the selection from the full frame: later cells add the
# 'Prediction' and 'Score' columns to `data`, and assigning columns on a
# boolean-filtered slice is a chained-assignment hazard in pandas.
# NOTE: this rebinds `data`, so re-run the data-loading cell before re-running this one.
data = data_equipment[1].copy()

## Model training

Because this is an unsupervised task, the true contamination is not known. In the companion notebook, the scikit-learn implementation of IsolationForest computes its threshold automatically; here we instead choose a contamination value that:
1. yields an acceptable number of anomalies when deployed in real scenarios;
2. produces a number of anomalies close to the one produced by IF with the automatic threshold.

In [4]:
# Fit the detector on the feature columns and store its training labels
# (rows with label 1 are the ones treated as anomalies further down).
ad_model = LODA(contamination=0.01).fit(data[features])
data['Prediction'] = ad_model.labels_

# Rescale the raw scores onto an interval centred on the decision threshold:
# `l` is the largest absolute deviation of any raw score from the threshold,
# padded by EPS, so the threshold maps to 0.5 and every training score falls
# strictly inside (0, 1).
raw_scores = ad_model.decision_scores_
EPS = 1e-1
l = np.max(np.abs(raw_scores - ad_model.threshold_)) + EPS
lb, ub = ad_model.threshold_ - l, ad_model.threshold_ + l
data['Score'] = (raw_scores - lb) / (ub - lb)

def score_function(model, X):
    """Return the model's decision scores for `X`, rescaled with the global bounds.

    Applies the same affine map used for the 'Score' column:
    ``(raw - lb) / (ub - lb)``, where `lb` and `ub` are the module-level
    bounds computed after fitting.

    Parameters
    ----------
    model : object exposing ``decision_function(X)``
    X : samples to score, in whatever form ``model.decision_function`` accepts

    Returns
    -------
    The rescaled scores, one per value returned by ``decision_function``.
    """
    raw = model.decision_function(X)
    width = ub - lb
    return (raw - lb) / width

## Explanations

In [5]:
# Keep only the rows the detector labelled as anomalous (Prediction == 1).
data_to_explain = data.loc[data['Prediction'].eq(1)]
print('Number of anomalies:', len(data_to_explain))

Number of anomalies: 28


Compute and save explanations

In [6]:
# NOTE: this whole cell is commented out, so it performs no work when run.
# The next cell loads `acme_rankings` and `acme_local_explanations` from pickle
# files instead. Uncomment the lines below to recompute them from scratch.
# NOTE(review): the code that writes those pickle files is not present in this
# cell; confirm how the stored results were produced before relying on them.
# acme_rankings = pd.DataFrame(columns=features)
# acme_local_explanations = {}

# acme_exp = ACME(ad_model, 'Score', features=features, task = 'ad', score_function=score_function)
# acme_exp = acme_exp.explain(data, robust = True)

# for j in tqdm(range(len(data_to_explain))):
#     local_explanation = acme_exp.explain_local(data_to_explain.iloc[j])
#     feature_table = local_explanation.feature_importance(local=True, weights={"delta":0.3, "ratio":0.2, "change":0.3, "distance":0.2})
#     acme_rankings.loc[j] = feature_table['importance'].rank(ascending=False, method='min')
#     acme_local_explanations[j] = local_explanation

In [7]:
# load results
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # Unpickling can execute arbitrary code: only load files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


acme_rankings = _load_pickle('results/piade_loda_acme_rankings_equip2.pkl')
acme_local_explanations = _load_pickle('results/piade_loda_acme_local_explanations_equip2.pkl')

Plot the stacked barplot.

Note: features that are ranked in the k-th position for less than t% of the anomalies are merged into a single 'other' group. In this case t% = 5%, but a different threshold can be chosen depending on the application.

In [8]:
# custom colors for the results in the paper
# note that they are valid only for equipment 2.
# Each row is (feature name, color, pattern); the rows are expanded below into
# the {"color": ..., "pattern": ...} mapping passed to the barplot helper.
_feature_styles = (
    ("#changes", "midnightblue", ""),
    ("%idle", "limegreen", "+"),
    ("A_004", "turquoise", "-"),
    ("A_005", "#858be3", ""),
    ("A_008", "#ff8200", "-"),
    ("A_010", "deepskyblue", "."),
    ("A_012", "#a777f1", "\\"),
    ("A_017", "lightslategray", "|"),
    ("A_020", "#665cfa", "x"),
    ("A_027", "#008bff", "+"),
    ("count_sum", "crimson", ""),
    ("downtime/downtime", "#4af5f9", "/"),
    ("idle/idle", "wheat", "x"),
    ("idle/performance_loss", "lavender", "."),
    ("performance_loss/performance_loss", "rosyBrown", "|"),
    ("production/production", "steelBlue", ""),
    ("production/scheduled_downtime", "#00cc7c", ""),
    ("scheduled_downtime/downtime", "#ab63fa", "."),
)

feature_colors = {
    feature: {"color": color, "pattern": pattern}
    for feature, color, pattern in _feature_styles
}

In [9]:
# For every feature (column), count how often each rank value occurs across
# the explained anomalies, then divide by the column total to get fractions.
acme_rank_counting = (
    acme_rankings
    .apply(pd.Series.value_counts)
    .fillna(0)
    .astype(int)
    .pipe(lambda counts: counts / counts.sum())
)

# Stacked barplot of the rank distribution for the first 5 positions.
acme_stacked_barplot_fig = feature_importance_distribution_barplot(
    acme_rank_counting,
    n_positions=5,
    threshold=0.05,
    color_pattern_dict=feature_colors,
)
acme_stacked_barplot_fig.show()

Local what-if analysis tool

In [10]:
# Draw one of the stored local explanations at random; the draw uses Python's
# global `random` generator, which is seeded at the top of the notebook.
explained_keys = list(acme_local_explanations.keys())
k = random.choice(explained_keys)
selected_explanation = acme_local_explanations[k]

acme_loc_summary_plot = selected_explanation.summary_plot(local=True, n_features=10)
# acme_loc_summary_plot.update_layout(width = 100 * 15, height = 100 * 6, font = dict(size=18))
acme_loc_summary_plot.show()

Using default weights for anomaly detection feature importance
