## Imports

In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras import layers, callbacks, optimizers
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.preprocessing import sequence

import shap

import warnings
warnings.filterwarnings("ignore")

## Read Data

In [2]:
### Read Train csv
# Load the training split from a CSV file in the working directory.
# `index_col=False` tells pandas not to use the first CSV column as the row index.

imputed_train = pd.read_csv('data_timeshap_train.csv', index_col=False)
# Bare last expression: the frame is rendered as the cell output.
imputed_train

Unnamed: 0,Label,Sequence,Timestamp,feature_1,feature_2,feature_3
0,0.0,p000001,5.0,83.14,37.143754,245.289318
1,0.0,p000001,6.0,83.14,37.143754,245.289318
2,0.0,p000001,7.0,83.14,36.110000,245.289318
3,0.0,p000001,8.0,83.14,37.143754,245.289318
4,0.0,p000001,9.0,83.14,37.143754,245.289318
...,...,...,...,...,...,...
5345,0.0,p000134,28.0,64.60,37.143754,164.000000
5346,0.0,p000134,29.0,64.60,36.670000,164.000000
5347,0.0,p000134,30.0,64.60,37.143754,164.000000
5348,0.0,p000134,31.0,64.60,37.143754,164.000000


In [3]:
### Read Test csv
# Load the test split from a CSV file in the working directory.
# `index_col=False` tells pandas not to use the first CSV column as the row index.

imputed_test = pd.read_csv('data_timeshap_test.csv', index_col=False)
# Bare expression: the frame is rendered as the cell output (only comments follow it).
imputed_test

# Columns
#imputed_test.columns

Unnamed: 0,Label,Sequence,Timestamp,feature_1,feature_2,feature_3
0,0.0,p000032,4.0,82.32,37.143754,245.289318
1,0.0,p000032,5.0,82.32,36.720000,245.289318
2,0.0,p000032,6.0,82.32,36.810000,245.289318
3,0.0,p000032,7.0,82.32,37.143754,245.289318
4,0.0,p000032,8.0,82.32,37.143754,245.289318
...,...,...,...,...,...,...
1345,0.0,p000128,20.0,73.09,37.143754,467.000000
1346,0.0,p000128,21.0,73.09,36.670000,467.000000
1347,0.0,p000128,22.0,73.09,37.143754,467.000000
1348,0.0,p000128,23.0,73.09,37.143754,467.000000


In [4]:
### Define Feature list
# The model inputs are the columns sitting at positions 3, 4 and 5 of the training frame.

features = [imputed_train.columns[position] for position in (3, 4, 5)]
features

['feature_1', 'feature_2', 'feature_3']

## Transformation Functions

In [5]:
# Define a lookback
# Number of time steps in one sequence window (the `n_steps` argument of the
# commented-out `split_sequences` calls); it is also reused further down as
# the training batch size.
lookback = 50

#### Transformation adapted to my use case - Shap doesn't work

In [6]:
# Error: Input 0 of layer "forward_lstm" is incompatible with the layer: expected 
# shape=(None, None, 4), found shape=(None, 1, 3)

In [7]:
# ##### DESIRED TRANSFORMATION #####

# # Create a df without the 'Sequences' column
# dl_train = imputed_train[['Timestamp', 'feature_1', 'feature_2','feature_3','Label']].copy()

# # Create a df without the 'Sequences' column
# dl_test = imputed_test[['Timestamp', 'feature_1', 'feature_2','feature_3','Label']].copy()

# def split_sequences(sequences, n_steps):
#     X, y = list(), list()
#     for i in range(len(sequences)):
#         # find the end of this pattern
#         end_ix = i + n_steps
#         # check if we are beyond the dataset
#         if end_ix > len(sequences):
#             break
#         # gather input and output parts of the pattern
#         seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix-1, -1]
#         X.append(seq_x)
#         y.append(seq_y)
#     return np.array(X), np.array(y)


# # Transform both train & test dataframes to numpy arrays because the split_sequences functions takes a numpy array as an input
# X_train_reshaped = dl_train.to_numpy()
# X_test_reshaped = dl_test.to_numpy()

# print('Shape before transformation, train:', X_train_reshaped.shape)
# print('Shape before transformation, test:', X_test_reshaped.shape)

# # Create the LSTM input sequence in the shape (n_rows, lookback, n_features) from the dataframes
# X_train_seq, Y_train_seq = split_sequences(X_train_reshaped, lookback)
# X_test_seq, Y_test_seq = split_sequences(X_test_reshaped, lookback)

# # Check the input sequence shape for the train & test datasets
# print('A sequence training data shape:', X_train_seq.shape, Y_train_seq.shape)
# print('A sequence test data shape:', X_test_seq.shape, Y_test_seq.shape)

#### Transformation Function taken from the AReM Tutorial - Shap works

In [8]:
# ##### USING THE FUNCTION FROM THE TUTORIAL #####

# Both splits are used with all of their columns. The new names refer to the
# very same DataFrame objects (no copy is made).
dl_train, dl_test = imputed_train, imputed_test


def df_to_numpy(df, model_feats, label_feat, group_by_feat, timestamp_Feat, sequence_length=50):
    """Turn a long-format event table into fixed-length sequence tensors.

    Rows of ``df`` are grouped by ``group_by_feat`` (groups keep their order of
    first appearance), each group is sorted by ``timestamp_Feat``, and the
    ``model_feats`` columns of the sorted group become one sequence.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per event.
    model_feats : list of str
        Columns used as model inputs, in the order they should appear on the
        last axis of the data tensor.
    label_feat : str
        Column holding the per-event label; the sequence label is its maximum
        over the group.
    group_by_feat : str
        Column identifying the sequence an event belongs to.
    timestamp_Feat : str
        Column used to order the events inside a sequence.
    sequence_length : int, default 50
        Number of events every sequence must contain.

    Returns
    -------
    data_tensor : numpy.ndarray
        Float array of shape ``(n_sequences, sequence_length, len(model_feats))``.
    labels_tensor : numpy.ndarray
        Float array of shape ``(n_sequences,)``.

    Raises
    ------
    ValueError
        If a sequence does not contain exactly ``sequence_length`` events.
        Without this check a one-row sequence would be silently broadcast
        over every time step by NumPy.
    """
    # Compute the group ids once instead of on every use.
    group_ids = df[group_by_feat].unique()

    data_tensor = np.zeros((len(group_ids), sequence_length, len(model_feats)))
    labels_tensor = np.zeros((len(group_ids), ))

    for i, name in enumerate(group_ids):

        name_data = df[df[group_by_feat] == name]
        sorted_data = name_data.sort_values(timestamp_Feat)

        data_x = sorted_data[model_feats].values
        labels = sorted_data[label_feat].values

        # Fail loudly on a wrong length rather than relying on (or being
        # fooled by) NumPy broadcasting in the assignment below.
        if data_x.shape[0] != sequence_length:
            raise ValueError(
                "Sequence {!r} has {} events, expected {}".format(
                    name, data_x.shape[0], sequence_length
                )
            )

        data_tensor[i, :, :] = data_x
        labels_tensor[i] = np.max(labels)

    return data_tensor, labels_tensor


# Build the (n_sequences, n_steps, n_features) inputs and per-sequence labels for both splits
X_train_seq, Y_train_seq = df_to_numpy(
    df=dl_train,
    model_feats=features,
    label_feat='Label',
    group_by_feat='Sequence',
    timestamp_Feat='Timestamp',
)
X_test_seq, Y_test_seq = df_to_numpy(
    df=dl_test,
    model_feats=features,
    label_feat='Label',
    group_by_feat='Sequence',
    timestamp_Feat='Timestamp',
)

# Shapes of the long-format frames that went in
print('Shape before transformation, train:', dl_train.shape)
print('Shape before transformation, test:', dl_test.shape)

# Shapes of the tensors that came out
print('A sequence training data shape:', X_train_seq.shape, Y_train_seq.shape)
print('A sequence test data shape:', X_test_seq.shape, Y_test_seq.shape)

Shape before transformation, train: (5350, 6)
Shape before transformation, test: (1350, 6)
A sequence training data shape: (107, 50, 3) (107,)
A sequence test data shape: (27, 50, 3) (27,)


## Define LSTM 

In [9]:
# Fix TensorFlow's global random seed so that runs can be re-created
seed = 42
tf.random.set_seed(seed)

# Number of input features; the batch size is tied to the lookback window
n_features = len(features)
batch_size = lookback
print('Training with batch_size & lookback =', batch_size)

Training with batch_size & lookback = 50


In [10]:
# Ensure that the previous sessions in the tensorflow backend are cleared
tf.keras.backend.clear_session()

# Define the model architecture
model = Sequential()

# Declare the input explicitly. An `input_shape` given to the LSTM that is
# wrapped inside `Bidirectional` is not picked up by `Sequential`, so the model
# used to be built lazily on its first call. The time dimension is left as
# None on purpose: the TimeSHAP calls further down score sequences of
# different lengths, so only the number of features is fixed here.
model.add(keras.Input(shape=(None, n_features)))

# Layer 1 - returns the full sequence so that the next recurrent layer can consume it
model.add(Bidirectional(LSTM(26, activation='tanh', return_sequences=True)))

# Layer 2 - returns only the final state
model.add(Bidirectional(LSTM(12, activation='tanh')))

# Layer 3 - Prediction layer (single sigmoid unit for the binary label)
model.add(Dense(1, activation='sigmoid'))


## Compile the LSTM model & define the optimization function

# Adam optimizer with an explicit learning rate
opt = optimizers.Adam(learning_rate=0.01)

# Early stopping: watch the validation loss, stop after 10 epochs without an
# improvement of at least 0.001 and roll back to the best weights seen
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0.001,
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

# Binary cross-entropy loss; accuracy, precision and recall are tracked as metrics
model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['binary_accuracy', keras.metrics.Precision(), keras.metrics.Recall()],
)

## Train the model

In [11]:
# Train for at most 400 epochs; the early-stopping callback decides when to stop.
# NOTE(review): the test split doubles as the validation set here, so the
# weights kept by early stopping are selected on it - confirm this is intended.
history = model.fit(
    x=X_train_seq,
    y=Y_train_seq,
    batch_size=batch_size,
    epochs=400,
    shuffle=False,  # keep the sequences in their original order within an epoch
    validation_data=(X_test_seq, Y_test_seq),
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 12: early stopping


In [12]:
# Evaluation

# history_df = pd.DataFrame(history.history)
# print(('Best Validation Loss: {}'.format(round(history_df['val_loss'].min(),2))))
# print(('Best Validation Accuracy: {}'.format(round(history_df['val_binary_accuracy'].max(),2))))
# print(('Best Validation Precision: {}'.format(round(history_df['val_precision'].max(),2))))
# print(('Best Validation Recall: {}'.format(round(history_df['val_recall'].max(),2))))

## SHAP Values

###### Define Naming

In [13]:
## Define the naming of the variables used by the explanation cells

model_features = features  # same list object as `features`, not a copy
label = 'Label'
sequence_id_feat = 'Sequence' 
time_feat = 'Timestamp'

In [14]:
model_features

['feature_1', 'feature_2', 'feature_3']

###### What Sequence to be explained

In [15]:
## For our local explanations, we will be explaining the sequence defined as 'what_to_explain'

what_to_explain = 'p000009'
positive_sequence_id = what_to_explain

# Pick the events of that sequence and order them by time, exactly as
# df_to_numpy orders every sequence the model was trained on; otherwise the
# explained instance could reach the model in a different event order.
# The column names come from the naming variables instead of repeated literals.
sequence_events = imputed_train.loc[imputed_train[sequence_id_feat] == what_to_explain]
pos_x_data = sequence_events.sort_values(time_feat)[model_features]

pos_x_data.head(10)

Unnamed: 0,feature_1,feature_2,feature_3
50,27.92,37.94,448.0
51,27.92,37.143754,448.0
52,27.92,37.143754,448.0
53,27.92,38.78,448.0
54,27.92,37.143754,448.0
55,27.92,37.143754,448.0
56,27.92,39.0,448.0
57,27.92,37.143754,448.0
58,27.92,37.143754,448.0
59,27.92,38.33,448.0


##### Model entry point

In [16]:
def f(x):
    """Model entry point handed to TimeSHAP: return the model's predictions for ``x``."""
    return model.predict(x)

###### Baseline event

In [17]:
from timeshap.utils import calc_avg_event
# Baseline event computed from the training frame: every model feature is
# declared numerical and none categorical. The result is passed to the
# TimeSHAP calls in the cells below.
# NOTE(review): how the "average" is aggregated is defined by TimeSHAP - confirm in its docs.
average_event = calc_avg_event(dl_train, numerical_feats=model_features, categorical_feats=[])

In [18]:
average_event

Unnamed: 0,feature_1,feature_2,feature_3
0,51.14,37.143754,158.0


###### Average score over baseline

In [19]:
from timeshap.utils import get_avg_score_with_avg_event
# Score the baseline event with the model. `top` is tied to `lookback` instead
# of repeating the literal 50, so the two cannot drift apart.
# NOTE(review): `top` is presumably the longest sequence length to score - confirm in the TimeSHAP docs.
avg_score_over_len = get_avg_score_with_avg_event(f, average_event, top=lookback)

### Local Explanations

In [20]:
from timeshap.plot import plot_temp_coalition_pruning, plot_event_heatmap, plot_feat_barplot, plot_cell_level
from timeshap.explainer import local_pruning, local_event, local_feat, local_cell_level

# TimeSHAP receives the instance as a NumPy batch: copy the frame's values
# into an array and prepend a batch axis of size 1
pos_x_data = pos_x_data.to_numpy().copy()[np.newaxis, ...]
pos_x_data.shape

(1, 50, 3)

###### Pruning Algorithm

In [21]:
# Pruning settings; `tol` is presumably the tolerance of the pruning algorithm - confirm in the TimeSHAP docs.
pruning_dict = {'tol': 0.025}
# NOTE(review): the arguments are positional; confirm which parameter of
# `local_pruning` the trailing `False` binds to in the installed TimeSHAP version.
coal_plot_data, coal_prun_idx = local_pruning(f, pos_x_data, pruning_dict, average_event, positive_sequence_id, False)
# coal_prun_idx is in negative terms
# Adding it to the number of time steps gives the same position counted from the start of the sequence.
pruning_idx = pos_x_data.shape[1] + coal_prun_idx
pruning_plot = plot_temp_coalition_pruning(coal_plot_data, coal_prun_idx) #, plot_limit=40)
# Bare last expression: the plot is rendered as the cell output.
pruning_plot

No path to explainer data provided. Calculating data


##### Event-level explanation

In [22]:
# Event-level explainer settings: `rs` is presumably the random seed and
# `nsamples` the sampling budget - confirm in the TimeSHAP docs.
event_dict = {'rs': 42, 'nsamples': 32000}
# Explain the selected instance per event, reusing the baseline event and the
# pruning index computed in the pruning cell (arguments are positional).
event_data = local_event(f, pos_x_data, event_dict, positive_sequence_id, sequence_id_feat, average_event, pruning_idx)
event_plot = plot_event_heatmap(event_data)
# Bare last expression: the plot is rendered as the cell output.
event_plot

No path to event data provided. Calculating data


##### Feature-level explanation

In [23]:
# Feature-level explainer settings: `rs` and `nsamples` are presumably the
# random seed and sampling budget (confirm in the TimeSHAP docs);
# `feature_names` is the list of model input columns.
feature_dict = {'rs': 42, 'nsamples': 32000, 'feature_names': model_features}
# Explain the selected instance per feature, reusing the baseline event and the
# pruning index computed in the pruning cell (arguments are positional).
feature_data = local_feat(f, pos_x_data, feature_dict, positive_sequence_id, sequence_id_feat, average_event, pruning_idx)
# 'top_feats' and 'plot_features' are not keys of feature_dict, so both .get() calls pass None.
feature_plot = plot_feat_barplot(feature_data, feature_dict.get('top_feats'), feature_dict.get('plot_features'))
# Bare last expression: the plot is rendered as the cell output.
feature_plot

No path to feature data provided. Calculating data


In [24]:
# Table view of the feature-level explanation, smallest Shapley value first
feature_data.sort_values(by='Shapley Value')

Unnamed: 0,Random seed,NSamples,Feature,Shapley Value
1,42,32000,feature_2,2.483527e-09
0,42,32000,feature_1,0.0006464149
2,42,32000,feature_3,0.0205456
3,42,32000,Pruned Events,0.02244925
