<h1 style="color:#3da1da;font-size: 300%;" id="timeshap tutorial" align="center"  >TimeSHAP Tutorial - TensorFlow - SMD dataset</h1><p>&nbsp;

<a id='top_cell'></a>

## Table of contents
1. [Data Processing](#1.-Data-Processing)
  1. [Data Loading](#1.1-Data-Loading)
  2. [Data Treatment](#1.2-Data-Treatment)
2. [Model](#2.-Model)
  1. [Model Definition](#2.1-Model-Definition)
  2. [Model Training](#2.2-Model-Training)
3. [TimeSHAP](#3.-TimeSHAP)
  1. [Local Explanations](#3.1-Local-Explanations)
  2. [Global Explanations](#3.2-Global-Explanations)
  3. [Individual Plots](#3.3-Individual-Plots)
    

# TimeSHAP

TimeSHAP is a model-agnostic, recurrent explainer that builds upon KernelSHAP and extends it to the sequential domain. 

TimeSHAP computes local event/timestamp-, feature-, and cell-level attributions. 
    
Additionally, TimeSHAP computes global event- and feature-level explanations.
    
As sequences can be arbitrarily long, TimeSHAP also implements a pruning algorithm based on Shapley Values, 
that finds a subset of consecutive, recent events that contribute the most to the decision.

---
# 1. Data Processing
---

In [1]:
import pandas as pd
import numpy as np
import os
import re

# Seed NumPy's global RNG so NumPy-based randomness in this notebook is repeatable.
np.random.seed(42)

import warnings
# Silence all warnings to keep the cell outputs readable.
warnings.filterwarnings('ignore')

from timeshap import __version__
# A bare `__version__` expression in the middle of a cell is never displayed;
# print it so the TimeSHAP version is actually recorded in the cell output.
print(f"TimeSHAP version: {__version__}")

# `display` and `HTML` belong to the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated (since IPython 7.14) and may
# fail on newer IPython releases.
from IPython.display import display, HTML
# Let the notebook container use the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

## 1.2 Data Treatment

### Separate into train and test sets

In [2]:
# Load the pre-processed machine-metrics table (rows carry a `seq_id` and a `timestep`).
df = pd.read_csv('SMD_Dataset/machine_tshap.csv')
# Number of consecutive rows treated as one sequence; used below to split on whole sequences.
step_size = 24

In [3]:
# Preview the first ten rows of the raw table.
df.head(10)

Unnamed: 0,cpu_r,load_1,load_5,load_15,mem_shmem,mem_u,mem_u_e,total_mem,disk_q,disk_r,...,retransegs,tcp_timeouts,udp_in_dg,udp_out_dg,udp_rcv_buf_errs,udp_snd_buf_errs,label,seq_id,tcpu_r,timestep
0,0.075269,0.065678,0.070234,0.074332,0,0.933333,0.274011,0,0.031081,0.0,...,0.000386,3.4e-05,0.064432,0.0645,0,0,0,1,0.086022,1
1,0.086022,0.080508,0.075808,0.076655,0,0.930769,0.274953,0,0.031081,0.000122,...,0.000386,2.2e-05,0.065228,0.065224,0,0,0,1,0.086022,2
2,0.075269,0.064619,0.071349,0.074332,0,0.928205,0.274953,0,0.03094,0.000366,...,0.000386,4.5e-05,0.067111,0.067178,0,0,0,1,0.086022,3
3,0.086022,0.048729,0.063545,0.070848,0,0.928205,0.27307,0,0.02725,0.000244,...,0.0,3.4e-05,0.066676,0.066744,0,0,0,1,0.086022,4
4,0.086022,0.051907,0.06243,0.070848,0,0.933333,0.274011,0,0.03094,0.000244,...,0.000386,2.2e-05,0.066604,0.066671,0,0,0,1,0.086022,5
5,0.086022,0.085805,0.069119,0.072009,0,0.928205,0.27307,0,0.030798,0.000244,...,0.000386,3.4e-05,0.0674,0.067468,0,0,0,1,0.086022,6
6,0.086022,0.070975,0.06689,0.070848,0,0.925641,0.272128,0,0.029662,0.000611,...,0.000386,2.2e-05,0.065301,0.065368,0,0,0,1,0.086022,7
7,0.086022,0.059322,0.063545,0.068525,0,0.925641,0.270245,0,0.031507,0.0,...,0.000386,3.4e-05,0.064577,0.064572,0,0,0,1,0.086022,8
8,0.086022,0.043432,0.056856,0.066202,0,0.928205,0.270245,0,0.028101,0.0,...,0.0,4.5e-05,0.067255,0.067323,0,0,0,1,0.086022,9
9,0.086022,0.050847,0.055741,0.065041,0,0.930769,0.270245,0,0.031933,0.000122,...,0.000386,3.4e-05,0.06588,0.065875,0,0,0,1,0.086022,10


In [4]:
# Row count per distinct `label` value.
df[['label']].value_counts()

label
0        371400
Name: count, dtype: int64

In [5]:
# Keep only the columns used below: 16 metrics plus `seq_id`, `timestep` and `tcpu_r`.
# Note: this rebinding drops every other column from `df`, including `label`.
df = df[['cpu_r', 'load_1', 'load_5', 'load_15', 'mem_u', 'mem_u_e',
       'disk_r', 'disk_rb', 'disk_w',  'disk_wb', 'eth1_fi', 'eth1_f0',
       'eth1_pi', 'eth1_p0', 'tcp_tw', 'tcp_use', 'seq_id',
       'timestep', 'tcpu_r']]

#'active_opens', 'curr_estab', 'disk_q','disk_svc', 'disk_u','disk_wa',
# 'in_segs', 'listen_overflows', 'out_rsts', 'out_segs',
# 'passive_opens', 'retransegs', 'tcp_timeouts', 'udp_in_dg',
# 'udp_out_dg'

In [6]:
# Number of whole `step_size`-row sequences that fit in the first 80% of the rows.
train_sequence_end = int(0.8*len(df)/step_size)

In [7]:
df_len = len(df)  # not referenced again in the cells shown in this notebook
# Positional split at a multiple of `step_size`: the first `train_sequence_end`
# sequences go to training, the remaining rows to test.
# NOTE(review): assumes rows are stored sequence by sequence, `step_size` rows each — confirm.
train_len = train_sequence_end*step_size
d_train =  df[:train_len]
d_test = df[train_len:]

In [8]:
# Row counts of the training and test splits.
print(len(d_train))
print(len(d_test))

297120
74280


In [9]:
# Number of sequences assigned to the training split.
train_sequence_end

12380

###  Normalize Features

In [10]:
# class NumericalNormalizer:
#     def __init__(self, fields: list):
#         self.metrics = {}
#         self.fields = fields

#     def fit(self, df: pd.DataFrame ) -> list:
#         means = df[self.fields].mean()
#         std = df[self.fields].std()
#         for field in self.fields:
#             field_mean = means[field]
#             field_stddev = std[field]
#             self.metrics[field] = {'mean': field_mean, 'std': field_stddev}

#     def transform(self, df: pd.DataFrame) -> pd.DataFrame:
#         # Transform to zero-mean and unit variance.
#         for field in self.fields:
#             f_mean = self.metrics[field]['mean']
#             f_stddev = self.metrics[field]['std']
#             # OUTLIER CLIPPING to [avg-3*std, avg+3*std]
#             df[field] = df[field].apply(lambda x: f_mean - 3 * f_stddev if x < f_mean - 3 * f_stddev else x)
#             df[field] = df[field].apply(lambda x: f_mean + 3 * f_stddev if x > f_mean + 3 * f_stddev else x)
#             if f_stddev > 1e-5:
#                 df[f'p_{field}_normalized'] = df[field].apply(lambda x: ((x - f_mean)/f_stddev))
#             else:
#                 df[f'p_{field}_normalized'] = df[field].apply(lambda x: x * 0)
#         return df

In [11]:
# The 16 metric columns fed to the model; the id/time columns and `tcpu_r` are not included.
raw_model_features = ['cpu_r', 'load_1', 'load_5', 'load_15', 'mem_u', 'mem_u_e',
        'disk_r', 'disk_rb', 'disk_w', 'disk_wb', 'eth1_fi', 'eth1_f0',
       'eth1_pi', 'eth1_p0', 'tcp_tw', 'tcp_use']

# 'active_opens', 'curr_estab', 'disk_q','disk_svc','disk_wa', 'disk_u',
# 'in_segs', 'listen_overflows', 'out_rsts', 'out_segs',
# 'passive_opens', 'retransegs', 'tcp_timeouts', 'udp_in_dg',
# 'udp_out_dg'
# # #all features are numerical
# # normalizor = NumericalNormalizer(raw_model_features)
# # normalizor.fit(d_train)
# # d_train_normalized = normalizor.transform(d_train)
# # d_test_normalized = normalizor.transform(d_test)


In [12]:
# The normalizer is commented out in this notebook, so the "normalized" names are
# plain aliases of the raw splits (no copy is made, no scaling is applied).
d_train_normalized = d_train
d_test_normalized = d_test

### Features

In [13]:
# The `p_<feature>_normalized` columns are not produced in this notebook,
# so the model consumes the raw feature columns directly.
model_features = raw_model_features

# Column roles used by the tensor conversion and by TimeSHAP.
sequence_id_feat = 'seq_id'
time_feat = 'timestep'
label_feat = 'tcpu_r'

# Display name of each model feature in the plots; every feature keeps its own name.
plot_feats = {feature: feature for feature in model_features}
    # 'disk_u':'disk_u',
    # 'disk_q':'disk_q', 
    # 'disk_wa':'disk_wa',
    # 'disk_svc':'disk_svc',
    # 'active_opens':'active_opens',
    # 'curr_estab':'curr_estab',
    # 'in_segs':'in_segs',
    # 'listen_overflows':'listen_overflows', 
    # 'out_rsts':'out_rsts', 
    # 'out_segs':'out_segs',
    # 'passive_opens':'passive_opens', 
    # 'retransegs':'retransegs',
    # 'tcp_timeouts':'tcp_timeouts',
    # 'udp_in_dg':'udp_in_dg',
    # 'udp_out_dg':'udp_out_dg'
# plot_feats = {
#     'p_cpu_r_normalized':'cpu_r', 
#     'p_load_1_normalized':'load_1',
#     'p_load_5_normalized':'load_5',
#     'p_load_15_normalized':'load_15',
#     'p_mem_u_normalized':'mem_u',
#     'p_mem_u_e_normalized':'mem_u_e',
#     'p_disk_q_normalized':'disk_q', 
#     'p_disk_r_normalized':'disk_r', 
#     'p_disk_rb_normalized':'disk_rb',
#     'p_disk_svc_normalized':'disk_svc',
#     'p_disk_u_normalized':'disk_u',
#     'p_disk_w_normalized':'disk_w',
#     'p_disk_wa_normalized':'disk_wa',
#     'p_disk_wb_normalized':'disk_wb', 
#     'p_eth1_fi_normalized':'eth1_fi',
#     'p_eth1_f0_normalized':'eth1_f0',
#     'p_eth1_pi_normalized':'eth1_pi',
#     'p_eth1_p0_normalized':'eth1_p0', 
#     'p_tcp_tw_normalized':'tcp_tw',
#     'p_tcp_use_normalized':'tcp_use',
#     'p_active_opens_normalized':'active_opens',
#     'p_curr_estab_normalized':'curr_estab',
#     'p_in_segs_normalized':'in_segs',
#     'p_listen_overflows_normalized':'listen_overflows', 
#     'p_out_rsts_normalized':'out_rsts', 
#     'p_out_segs_normalized':'out_segs',
#     'p_passive_opens_normalized':'passive_opens', 
#     'p_retransegs_normalized':'retransegs',
#     'p_tcp_timeouts_normalized':'tcp_timeouts',
#     'p_udp_in_dg_normalized':'udp_in_dg',
#     'p_udp_out_dg_normalized':'udp_out_dg'
# }

In [14]:
# Sanity check: number of training rows.
print(len(d_train_normalized))

297120


This example notebook requires TensorFlow!

Install it if you haven't already:
```
!pip install tensorflow
```

In [15]:
# df_3 = pd.read_csv('Stocks\IBM.csv')
# df_3 = df_3[-2000:]
# df_3.reset_index()
# df_3 = df_3[['Open','Close', 'High', 'Low','Adj Close', 'Volume']]
# df_3.rename(columns = {'Adj Close':'AdjClose'}, inplace = True)

# df_3_len = len(df_3)
# split_len = int(0.8*df_3_len)
# df_3_train =  df_3[:split_len]
# df_3_test = df_3[split_len:]

# raw_model_features = ['Open','Close','High','Low','AdjClose', 'Volume']
# #all features are numerical
# raw_model_features
# normalizor = NumericalNormalizer(raw_model_features)
# normalizor.fit(df_3_train)
# df_3_train_normalized = normalizor.transform(df_3_train)
# df_3_test_normalized = normalizor.transform(df_3_test)

# df_3_train_normalized.reset_index(drop=True, inplace=True)
# df_3_test_normalized.reset_index(drop=True, inplace=True)

# df_3_test_normalized = df_3_test_normalized[['p_Open_normalized','p_Close_normalized','p_High_normalized','p_Low_normalized','p_AdjClose_normalized','p_Volume_normalized']]
# df_3_train_normalized = df_3_train_normalized[['p_Open_normalized','p_Close_normalized','p_High_normalized','p_Low_normalized','p_AdjClose_normalized','p_Volume_normalized']]

In [16]:
def df_to_numpy(df, model_feats, label_feat, group_by_feat, timestamp_Feat):
    """Convert a long-format event table into per-sequence NumPy tensors.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per event; must contain ``model_feats``, ``label_feat``,
        ``group_by_feat`` and ``timestamp_Feat`` columns.
    model_feats : list of str
        Columns copied into the feature tensor, in this order.
    label_feat : str
        Column holding the target; the value of the earliest event of each
        sequence (after sorting by ``timestamp_Feat``) is used as its label.
    group_by_feat : str
        Column identifying the sequence each row belongs to.
    timestamp_Feat : str
        Column used to order the events inside a sequence.

    Returns
    -------
    (data_tensor, labels_tensor)
        ``data_tensor`` has shape ``(n_sequences, sequence_length, len(model_feats))``
        and ``labels_tensor`` has shape ``(n_sequences, 1)``. Sequences appear in
        order of first appearance in ``df``; ``sequence_length`` is the number of
        distinct values in ``timestamp_Feat``.

    Raises
    ------
    ValueError
        If a sequence does not contain exactly ``sequence_length`` rows. Without
        this check a single-row sequence would be silently broadcast across
        every timestep of the tensor.

    Notes
    -----
    Rows whose ``group_by_feat`` value is missing are ignored (pandas ``groupby``
    default).
    """
    sequence_length = len(df[timestamp_Feat].unique())

    # Group once instead of re-filtering the whole frame for every sequence id.
    # sort=False keeps sequences in order of first appearance, like `unique()`.
    grouped = df.groupby(group_by_feat, sort=False)
    n_sequences = grouped.ngroups

    data_tensor = np.zeros((n_sequences, sequence_length, len(model_feats)))
    labels_tensor = np.zeros((n_sequences, 1))

    for i, (name, name_data) in enumerate(grouped):
        sorted_data = name_data.sort_values(timestamp_Feat)
        if len(sorted_data) != sequence_length:
            raise ValueError(
                f"Sequence {name!r} has {len(sorted_data)} rows, "
                f"expected {sequence_length}"
            )

        data_tensor[i, :, :] = sorted_data[model_feats].values
        # The label is taken from the first (earliest) event of the sequence.
        labels_tensor[i, :] = sorted_data[label_feat].values[0]
    return data_tensor, labels_tensor

In [17]:
# Convert both splits from long-format tables into (sequences, timesteps, features) tensors.
X_train, y_train = df_to_numpy(d_train_normalized, model_features, label_feat, sequence_id_feat, time_feat)

X_test, y_test = df_to_numpy(d_test_normalized, model_features, label_feat, sequence_id_feat, time_feat)

In [18]:

# Report the tensor shapes of both splits as a sanity check.
print("Training X shape: ", X_train.shape)
print("Training Y shape: ", y_train.shape)

print("Test X shape: ", X_test.shape)
print("Test Y shape: ", y_test.shape)

Training X shape:  (12380, 24, 16)
Training Y shape:  (12380, 1)
Test X shape:  (3095, 24, 16)
Test Y shape:  (3095, 1)


In [19]:
# def generate_sequences(data, window_size):
#   _l = len(data) 
#   Xs = []
#   Ys = []
#   for i in range(0, (_l - window_size)):
#     Xs.append(np.array(data[i:i+window_size]))
#     #print(data[i+window_size,:])
#     Ys.append([data['p_Close_normalized'][i+window_size]])
#   return np.array(Xs), np.array(Ys)

# TIME_STEPS = 5
# X_train, y_train= generate_sequences(df_3_train_normalized, TIME_STEPS)
# print("Training input shape: ", X_train.shape)
# print("Training input shape: ", y_train.shape)

# X_test, y_test = generate_sequences(df_3_test_normalized, TIME_STEPS)

# print("Test input shape: ", X_test.shape)
# print("Test input shape: ", y_test.shape)

___
# 2. Model


This example notebook requires TensorFlow!

Install it if you haven't already:
```
!pip install tensorflow
```

## 2.1 Model Definition

In [22]:
import tensorflow as tf

# Seed TensorFlow's global RNG so weight initialisation is repeatable across runs;
# the NumPy seed set at the top of the notebook does not cover TensorFlow.
tf.random.set_seed(42)

# Variable-length sequences (None) with one value per model feature at each event.
# The width is derived from `model_features` so the model stays in sync with the data.
inputs = tf.keras.layers.Input(shape=(None, len(model_features)))
# The LSTM returns only its final state, which summarises the whole sequence.
lstm1 = tf.keras.layers.LSTM(64)(inputs)
ff1 = tf.keras.layers.Dense(64, activation='relu')(lstm1)
# Single linear output unit: the model is a regressor (trained with MSE below).
ff2 = tf.keras.layers.Dense(1)(ff1)
model = tf.keras.models.Model(inputs=inputs, outputs=ff2)

## 2.2 Model Training

In [23]:
# Regression setup: mean-squared-error loss, Adam with a learning rate of 0.01.
model.compile(loss='mse',
              optimizer=tf.keras.optimizers.Adam(0.01))

# NOTE(review): the test split doubles as validation data here, so `val_loss`
# is not an independent estimate if it is used to pick epochs or hyper-parameters.
model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/30


2024-09-02 22:51:44.877095: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907


[1m387/387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - loss: 0.0045 - val_loss: 3.3055e-04
Epoch 2/30
[1m387/387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - loss: 7.9297e-05 - val_loss: 1.7669e-04
Epoch 3/30
[1m387/387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - loss: 7.5748e-05 - val_loss: 1.3324e-04
Epoch 4/30
[1m387/387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - loss: 6.6787e-05 - val_loss: 2.1369e-04
Epoch 5/30
[1m387/387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - loss: 6.0324e-05 - val_loss: 1.4924e-04
Epoch 6/30
[1m387/387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - loss: 5.8142e-05 - val_loss: 1.6583e-04
Epoch 7/30
[1m387/387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - loss: 5.6397e-05 - val_loss: 1.7813e-04
Epoch 8/30
[1m387/387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - loss: 5.4514e-05 - val_loss: 

<keras.src.callbacks.history.History at 0x7be66edda280>

---
# 3. TimeSHAP
---

### Model entry point

In [24]:
import timeshap


def f(x):
    """Model entry point handed to TimeSHAP: return the model's predictions for `x`.

    `verbose=0` turns off Keras' per-call progress bar. TimeSHAP calls this
    function many times, and the default progress output floods the notebook
    with hundreds of identical log lines.
    """
    return model.predict(x, verbose=0)

In [25]:
# Preview the training table that the baselines below are computed from.
d_train_normalized.head()

Unnamed: 0,cpu_r,load_1,load_5,load_15,mem_u,mem_u_e,disk_r,disk_rb,disk_w,disk_wb,eth1_fi,eth1_f0,eth1_pi,eth1_p0,tcp_tw,tcp_use,seq_id,timestep,tcpu_r
0,0.075269,0.065678,0.070234,0.074332,0.933333,0.274011,0.0,0.134132,0.067808,0.150562,0.121988,0.091978,0.09396,0.074155,0.935405,0.018077,1,1,0.086022
1,0.086022,0.080508,0.075808,0.076655,0.930769,0.274953,0.000122,0.148813,0.071395,0.16449,0.121396,0.094162,0.09616,0.076482,0.996555,0.016026,1,2,0.086022
2,0.075269,0.064619,0.071349,0.074332,0.928205,0.274953,0.000366,0.134826,0.063277,0.151546,0.115384,0.090118,0.09594,0.075746,0.96031,0.011763,1,3,0.086022
3,0.086022,0.048729,0.063545,0.070848,0.928205,0.27307,0.000244,0.131281,0.067841,0.145566,0.135121,0.097381,0.101383,0.080182,0.96153,0.016058,1,4,0.086022
4,0.086022,0.051907,0.06243,0.070848,0.933333,0.274011,0.000244,0.10269,0.075654,0.118353,0.127359,0.094495,0.098166,0.077517,0.942152,0.018397,1,5,0.086022


### Baseline event

In [26]:
from timeshap.utils import calc_avg_event
# Baseline ("average") event computed from the training data; all model
# features are declared numerical and none categorical.
average_event = calc_avg_event(d_train_normalized, numerical_feats=model_features, categorical_feats=[])

In [27]:
# Inspect the baseline event (one row, one column per model feature).
average_event

Unnamed: 0,cpu_r,load_1,load_5,load_15,mem_u,mem_u_e,disk_r,disk_rb,disk_w,disk_wb,eth1_fi,eth1_f0,eth1_pi,eth1_p0,tcp_tw,tcp_use
0,0.075269,0.065678,0.068004,0.066202,0.907692,0.266478,0.0,0.139642,0.05184,0.155217,0.102188,0.082109,0.089769,0.070933,0.909208,0.011603


### Baseline sequence

In [28]:
from timeshap.utils import calc_avg_sequence
# Alternative baseline: an "average" sequence computed per `seq_id` entity from the training data.
# Note: the explanation cells below pass `average_event`, not this sequence, as the baseline.
average_sequence = calc_avg_sequence(d_train_normalized, numerical_feats=model_features, categorical_feats=[],model_features=model_features, entity_col=sequence_id_feat)

In [29]:
# Print the baseline sequence and its shape.
print(average_sequence)
print(average_sequence.shape)

[[0.075269  0.065678  0.068004  0.066202  0.907692  0.266478  0.
  0.139604  0.051847  0.15518   0.102188  0.082109  0.0897685 0.070933
  0.909137  0.011603 ]
 [0.075269  0.065678  0.068004  0.066202  0.907692  0.266478  0.
  0.139604  0.051847  0.15518   0.102188  0.082109  0.0897685 0.070933
  0.909137  0.011603 ]
 [0.075269  0.065678  0.068004  0.066202  0.907692  0.266478  0.
  0.139604  0.051847  0.15518   0.102188  0.082109  0.0897685 0.070933
  0.909137  0.011603 ]
 [0.075269  0.065678  0.068004  0.066202  0.907692  0.266478  0.
  0.139604  0.051847  0.1551985 0.102188  0.082109  0.0897685 0.070933
  0.909137  0.011603 ]
 [0.075269  0.065678  0.068004  0.066202  0.907692  0.266478  0.
  0.139604  0.05184   0.155217  0.102188  0.082109  0.0897685 0.070933
  0.909137  0.011603 ]
 [0.075269  0.065678  0.068004  0.066202  0.907692  0.266478  0.
  0.139604  0.05184   0.155217  0.102188  0.082109  0.0897685 0.070933
  0.909137  0.011603 ]
 [0.075269  0.065678  0.068004  0.066202  0.90

### Average score over baseline

In [30]:
from timeshap.utils import get_avg_score_with_avg_event
# Model score obtained on sequences built from the baseline event.
# NOTE(review): `top=1000` presumably bounds the sequence lengths evaluated — confirm in the TimeSHAP docs.
avg_score_over_len = get_avg_score_with_avg_event(f, average_event, top=1000)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

In [31]:
# Mean of the baseline scores collected above.
np.mean(list(avg_score_over_len.values()))

0.075563817

## 3.1 Local Explanations

### Select sequences to explain

In [32]:
# Shape of one test sequence: (timesteps, features).
X_test[0].shape

(24, 16)

In [33]:
# Position (0-based) of the test sequence to explain.
data_point = 100

# Add a leading batch axis so the instance has shape (1, timesteps, features).
x_data = X_test[data_point].reshape(1,X_test.shape[1],X_test.shape[2])
# Matching `seq_id` of that sequence; relies on ids being 1-based and consecutive,
# with test ids continuing right after the training ids.
sequence_id = train_sequence_end+data_point+1

### Local Report on a selected test instance

In [34]:
from timeshap.explainer import local_report

# Settings for the pruning step and for the event-, feature- and cell-level explainers.
# NOTE(review): `rs` is presumably the random seed and `nsamples` the sampling budget — confirm in the TimeSHAP docs.
pruning_dict = {'tol': 0.025}
event_dict = {'rs': 42, 'nsamples': 32000}
feature_dict = {'rs': 42, 'nsamples': 32000, 'feature_names': model_features, 'plot_features': plot_feats}
cell_dict = {'rs': 42, 'nsamples': 32000, 'top_x_feats': 2, 'top_x_events': 2}
# Full local report for the selected instance, explained against the baseline event.
local_report(f, x_data, pruning_dict, event_dict, feature_dict, cell_dict=cell_dict, entity_uuid=sequence_id, entity_col='seq_id', baseline=average_event)

Assuming all features are model features
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 154ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━

In [35]:
# Rows of the explained sequence in table form (same id expression as `sequence_id`).
x_pd = d_test_normalized[d_test_normalized['seq_id'] == train_sequence_end+data_point+1]
x_pd

Unnamed: 0,cpu_r,load_1,load_5,load_15,mem_u,mem_u_e,disk_r,disk_rb,disk_w,disk_wb,eth1_fi,eth1_f0,eth1_pi,eth1_p0,tcp_tw,tcp_use,seq_id,timestep,tcpu_r
299520,0.387097,0.439619,0.399108,0.392567,0.946154,0.250471,0.000244,0.183223,0.059523,0.197722,0.306909,0.389833,0.37488,0.328128,0.999928,0.023173,12481,1,0.344086
299521,0.376344,0.335805,0.379041,0.38676,0.946154,0.250471,0.000122,0.184417,0.05826,0.199084,0.300084,0.389596,0.370839,0.326676,0.999928,0.021731,12481,2,0.344086
299522,0.376344,0.288136,0.364548,0.382114,0.946154,0.250471,0.0,0.173474,0.066237,0.189054,0.30017,0.385912,0.36842,0.323991,0.999928,0.020256,12481,3,0.344086
299523,0.419355,0.273305,0.350056,0.376307,0.946154,0.250471,0.000244,0.190698,0.069262,0.203815,0.318146,0.419536,0.418809,0.36992,0.999928,0.023654,12481,4,0.344086
299524,0.419355,0.231992,0.331104,0.369338,0.948718,0.251412,0.000244,0.227651,0.059775,0.242383,0.314598,0.415667,0.40916,0.360711,0.999928,0.023654,12481,5,0.344086
299525,0.408602,0.381356,0.361204,0.377468,0.94359,0.251412,0.000244,0.182298,0.065751,0.198251,0.302104,0.404746,0.385408,0.339864,0.999928,0.021506,12481,6,0.344086
299526,0.419355,0.457627,0.39019,0.387921,0.938462,0.252354,0.000366,0.267378,0.064428,0.2822,0.311653,0.409202,0.3867,0.344837,0.999928,0.025321,12481,7,0.344086
299527,0.419355,0.435381,0.399108,0.391405,0.935897,0.253296,0.0,0.215051,0.07239,0.230006,0.322224,0.418632,0.38002,0.338094,0.999928,0.023365,12481,8,0.344086
299528,0.387097,0.39089,0.403567,0.39489,0.930769,0.253296,0.000122,0.143688,0.059275,0.16063,0.313378,0.401956,0.360695,0.321425,0.999928,0.021314,12481,9,0.344086
299529,0.376344,0.391949,0.408027,0.398374,0.923077,0.253296,0.000122,0.204339,0.053907,0.220204,0.308942,0.389693,0.349232,0.308436,0.999928,0.020833,12481,10,0.344086


## 3.2 Global Explanations

### Explain all 

TimeSHAP offers methods to explain all instances and save as CSV.
This allows for global explanations and local plots with no calculation delay.

In [36]:
# Preview the test table that the global explanations are computed on.
d_test_normalized.head()

Unnamed: 0,cpu_r,load_1,load_5,load_15,mem_u,mem_u_e,disk_r,disk_rb,disk_w,disk_wb,eth1_fi,eth1_f0,eth1_pi,eth1_p0,tcp_tw,tcp_use,seq_id,timestep,tcpu_r
297120,0.354839,0.309322,0.319955,0.311266,1.0,0.254237,0.0,0.189234,0.066312,0.203853,0.277327,0.353358,0.357451,0.312056,0.999928,0.021731,12381,1,0.419355
297121,0.354839,0.423729,0.351171,0.32288,1.0,0.255179,0.0,0.183338,0.073583,0.198024,0.288453,0.350945,0.355307,0.307899,0.999928,0.020705,12381,2,0.419355
297122,0.354839,0.412076,0.361204,0.328688,1.0,0.254237,0.0,0.273389,0.068238,0.286515,0.27952,0.349577,0.357561,0.312454,0.999928,0.021506,12381,3,0.419355
297123,0.344086,0.29661,0.341137,0.325203,1.0,0.253296,0.000122,0.165922,0.041451,0.18069,0.274321,0.343173,0.344861,0.299903,0.999928,0.018333,12381,4,0.419355
297124,0.354839,0.300847,0.331104,0.32288,1.0,0.253296,0.0,0.168966,0.076379,0.184172,0.285595,0.347875,0.356104,0.309033,0.999928,0.018526,12381,5,0.419355


In [37]:
# Number of rows in the test table.
len(d_test_normalized)

74280

In [38]:
from timeshap.explainer import global_report

#pos_dataset = d_test_normalized[d_test_normalized[label_feat] > 120]
# Explain only the tail of the test set. The start row is rounded down to a
# multiple of `step_size` so the subset begins on a sequence boundary: with
# step_size = 24, slicing at row 70000 directly starts 16 rows into a sequence
# and hands TimeSHAP a truncated 8-event sequence.
pos_dataset = d_test_normalized[(70000 // step_size) * step_size:]
# Column layout of the rows passed to TimeSHAP.
schema = list(pos_dataset.columns)
# Each explainer writes its results to the CSV file given under 'path'.
pruning_dict = {'tol': [0.05, 0.075], 'path': 'outputs/prun_all_tf.csv'}
event_dict = {'path': 'outputs/event_all_tf.csv', 'rs': 42, 'nsamples': 32000}
feature_dict = {'path': 'outputs/feature_all_tf.csv', 'rs': 42, 'nsamples': 32000, 'feature_names': model_features, 'plot_features': plot_feats}


In [39]:
# Run pruning plus event- and feature-level explanations over every sequence in `pos_dataset`.
# NOTE(review): the saved output of this cell ends in an AssertionError raised during
# pruning ("Pruning idx must be smaller than the sequence length"), so `prun_stats`
# and `global_plot` were never assigned in the recorded run — investigate before relying on them.
prun_stats, global_plot = global_report(f, pos_dataset, pruning_dict, event_dict, feature_dict, average_event, model_features, schema, sequence_id_feat, time_feat)
prun_stats

Calculating pruning algorithm
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━

AssertionError: Pruning idx must be smaller than the sequence length. If not all events are pruned

In [None]:
# Display the global report plot returned by `global_report`.
global_plot

## 3.3 Individual Plots

### Local Plots

In [None]:
from timeshap.plot import plot_temp_coalition_pruning, plot_event_heatmap, plot_feat_barplot, plot_cell_level
from timeshap.explainer import local_pruning, local_event, local_feat, local_cell_level
# select model features only

x_data = x_pd[model_features]
# convert the instance to numpy so TimeSHAP receives it
# (expand_dims adds the leading batch axis: (1, timesteps, features))
x_data = np.expand_dims(x_data.to_numpy().copy(), axis=0)

##### Pruning algorithm

In [None]:
pruning_dict = {'tol': 0.025,}
# Explain the instance selected earlier (`x_data` / `sequence_id`). The names
# `pos_x_data` and `positive_sequence_id` used here before are never defined in
# this notebook and raise NameError on a fresh kernel.
coal_plot_data, coal_prun_idx = local_pruning(f, x_data, pruning_dict, average_event, sequence_id, sequence_id_feat, False)
# coal_prun_idx is in negative terms
pruning_idx = x_data.shape[1] + coal_prun_idx
pruning_plot = plot_temp_coalition_pruning(coal_plot_data, coal_prun_idx, plot_limit=40)
pruning_plot

##### Event-level explanation

In [None]:
event_dict = {'rs': 42, 'nsamples': 32000}
# Event-level attributions for the selected instance (`x_data` / `sequence_id`);
# the previously used `pos_x_data` / `positive_sequence_id` are undefined in this notebook.
event_data = local_event(f, x_data, event_dict, sequence_id, sequence_id_feat, average_event, pruning_idx)
event_plot = plot_event_heatmap(event_data)
event_plot

In [None]:
# Re-display the event-level heatmap built in the previous cell.
event_plot

##### Feature-level explanation

In [None]:
feature_dict = {'rs': 42, 'nsamples': 32000, 'feature_names': model_features, 'plot_features': plot_feats}
# Feature-level attributions for the selected instance (`x_data` / `sequence_id`);
# the previously used `pos_x_data` / `positive_sequence_id` are undefined in this notebook.
feature_data = local_feat(f, x_data, feature_dict, sequence_id, sequence_id_feat, average_event, pruning_idx)
# 'top_feats' is not set in `feature_dict`, so `.get` passes None here.
feature_plot = plot_feat_barplot(feature_data, feature_dict.get('top_feats'), feature_dict.get('plot_features'))
feature_plot

##### Cell-level explanation

In [None]:
cell_dict = {'rs': 42, 'nsamples': 32000, 'top_x_events': 3, 'top_x_feats': 3}
# Cell-level attributions for the selected instance (`x_data` / `sequence_id`);
# the previously used `pos_x_data` / `positive_sequence_id` are undefined in this notebook.
cell_data = local_cell_level(f, x_data, cell_dict, event_data, feature_data, sequence_id, sequence_id_feat, average_event, pruning_idx)
feat_names = list(feature_data['Feature'].values)[:-1] # exclude pruned events
cell_plot = plot_cell_level(cell_data, feat_names, feature_dict.get('plot_features'))
cell_plot

### Global Plots

In [None]:
from timeshap.explainer import prune_all, pruning_statistics, event_explain_all, feat_explain_all
from timeshap.plot import plot_global_event, plot_global_feat

# `d_test_normalized` has no `label` column: it was dropped when the feature
# columns were selected, and the raw data only contained label 0. Filtering on
# `label == 1` therefore raises KeyError. Explain the tail of the test set
# instead, starting on a sequence boundary (a multiple of `step_size`) so no
# truncated sequence is included.
pos_dataset = d_test_normalized[(70000 // step_size) * step_size:]

##### Pruning statistics

In [None]:
# Prune every sequence in `pos_dataset` for both tolerances; results go to the CSV under 'path'.
pruning_dict = {'tol': [0.05, 0.075], 'path': 'outputs/prun_all_tf.csv'}
prun_indexes = prune_all(f, pos_dataset, pruning_dict, average_event, model_features, schema, sequence_id_feat, time_feat)
# Summary statistics of the pruning indexes for the tolerances above.
pruning_stats = pruning_statistics(prun_indexes, pruning_dict.get('tol'))
pruning_stats

##### Global event-level

In [None]:
# Event-level explanations for all sequences, reusing the pruning indexes computed above.
# Note: this rebinds `event_data`, which the local cells use for a single instance.
event_dict = {'path': 'outputs/event_all_tf.csv', 'rs': 42, 'nsamples': 32000}
event_data = event_explain_all(f, pos_dataset, event_dict, prun_indexes, average_event, model_features, schema, sequence_id_feat, time_feat)
event_global_plot = plot_global_event(event_data)
event_global_plot

##### Global feature-level

In [None]:
# Feature-level explanations for all sequences, reusing the pruning indexes computed above.
feature_dict = {'path': 'outputs/feature_all_tf.csv', 'rs': 42, 'nsamples': 32000, 'feature_names': model_features, 'plot_features': plot_feats, }
feat_data = feat_explain_all(f, pos_dataset, feature_dict, prun_indexes, average_event, model_features, schema, sequence_id_feat, time_feat)
# The whole settings dict is forwarded as keyword arguments to the plotting helper.
feat_global_plot = plot_global_feat(feat_data, **feature_dict)
feat_global_plot