In [1]:
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from alibi_detect.cd import KSDrift, MMDDrift
from alibi_detect.cd.preprocess import UAE
from alibi_detect.models.embedding import TransformerEmbedding
from alibi_detect.utils.saving import save_detector, load_detector

# Disable GPU
tf.config.experimental.set_visible_devices([], 'GPU')

%load_ext watermark
%watermark -v -iv

tensorflow       2.4.0
tensorflow.keras 2.4.0
autopep8         1.5.4
json             2.0.9
numpy            1.19.5
CPython 3.7.4
IPython 7.8.0


# Load Preprocessed Data from the Outlier Detection Component

In [2]:
# Padded session array written by the outlier-detection notebook.
sessions_padded = np.load('../outlier_detection/list_sessions_padded_autoencoder.npy')
print(sessions_padded.shape)

# The largest value in the array sizes both the embedding table and the output layer.
n_unique_input_ids = n_output_features = int(sessions_padded.max())

# Axis 1 -> time steps per session, axis 2 -> features per time step.
window_length = sessions_padded.shape[1]
n_input_features = sessions_padded.shape[2]

(45916, 207, 1)


### LSTM Autoencoder in functional API
- Input: `window_length` rows (time steps) of item IDs per session; ID 0 is masked out
- Output: the reconstructed session, as a softmax distribution over item IDs at each time step

In [3]:
def build_autoencoder(window_length=50,
                      units_lstm_layer=100,
                      n_unique_input_ids=0,
                      embedding_dim=200,
                      n_input_features=1,
                      n_output_features=3,
                      dropout_rate=0.1):
    """Build the LSTM sequence autoencoder as a Keras functional model.

    Feature 0 of every time step is looked up in an embedding table and
    encoded by one LSTM. The encoder's final hidden and cell states seed a
    decoder LSTM that is stepped ``window_length`` times, each step feeding
    its own output back in. The hidden state of every decoder step is
    projected by a shared dense layer and normalised with a masked softmax.
    Time steps whose feature-0 value is 0 are masked.

    Args:
        window_length: Number of time steps per input; also the number of
            decoder steps.
        units_lstm_layer: Number of units of both the encoder and decoder LSTM.
        n_unique_input_ids: The embedding table gets
            ``n_unique_input_ids + 1`` rows.
        embedding_dim: Width of each embedding vector.
        n_input_features: Size of the last input axis. Only feature 0 is read.
        n_output_features: The output layer gets ``n_output_features + 1``
            units.
        dropout_rate: Accepted but never referenced in the body.

    Returns:
        A ``keras.models.Model`` mapping an input of shape
        ``(batch, window_length, n_input_features)`` to a softmax output of
        shape ``(batch, window_length, n_output_features + 1)``.
    """

    inputs = keras.layers.Input(
        shape=[window_length, n_input_features], dtype=np.float32)

    # Encoder
    # Embedding Layer
    embedding_layer = tf.keras.layers.Embedding(
        n_unique_input_ids+1, embedding_dim, input_length=window_length)  # , mask_zero=True)
    # Only the first feature of each time step is embedded.
    embeddings = embedding_layer(inputs[:, :, 0])

    # Boolean (batch, window_length) mask: True wherever feature 0 is non-zero.
    # It is built by hand here instead of via the embedding's mask_zero option.
    mask = inputs[:, :, 0] != 0

    # LSTM Layer 1
    lstm1_output, lstm1_state_h, lstm1_state_c = keras.layers.LSTM(units=units_lstm_layer, return_state=True,
                                                                   return_sequences=True)(embeddings, mask=mask)
    # NOTE: lstm1_state and lstm1_output are not referenced again in this function.
    lstm1_state = [lstm1_state_h, lstm1_state_c]

    # Decoder
    # input: lstm1_state_c, lstm1_state_h
    decoder_state_c = lstm1_state_c
    decoder_state_h = lstm1_state_h
    # The first decoder input is the encoder's final hidden state, reshaped
    # into a sequence of length 1.
    decoder_outputs = tf.expand_dims(lstm1_state_h, 1)

    list_states = []
    # A single LSTM layer instance is reused for every step, so all steps
    # share the same weights.
    decoder_layer = keras.layers.LSTM(
        units=units_lstm_layer, return_state=True, return_sequences=True, unroll=False)
    for i in range(window_length):
        # Feed the previous step's output and states back into the decoder.
        decoder_outputs, decoder_state_h, decoder_state_c = decoder_layer(decoder_outputs,
                                                                          initial_state=[decoder_state_h,
                                                                                         decoder_state_c])
        list_states.append(decoder_state_h)
    # (batch, window_length, units_lstm_layer): one hidden state per decoder step.
    stacked = tf.stack(list_states, axis=1)

    fc_layer = tf.keras.layers.Dense(
        n_output_features+1, kernel_initializer='he_normal', dtype=tf.float32)

    # Apply the same dense projection at every time step.
    fc_layer_output = tf.keras.layers.TimeDistributed(fc_layer)(
        stacked, mask=mask)

    # Repeat the per-step mask along the output axis so that it has the same
    # shape as fc_layer_output, as the Softmax layer's mask argument requires.
    mask_softmax = tf.tile(tf.expand_dims(mask, axis=2),
                           [1, 1, n_output_features+1])

    softmax = tf.keras.layers.Softmax(axis=2)(
        fc_layer_output, mask=mask_softmax)

    model = keras.models.Model(inputs=[inputs],
                               outputs=[softmax])
    return model

# Concept Drift

In [4]:
# The architecture has to match the checkpoint that is restored below.
autoencoder_config = dict(
    window_length=window_length,
    n_input_features=n_input_features,
    n_unique_input_ids=n_unique_input_ids,
    n_output_features=n_output_features,
    embedding_dim=200,
    units_lstm_layer=500,
    dropout_rate=0.0,
)
model = build_autoencoder(**autoencoder_config)
model.load_weights("../outlier_detection/weights")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2b715f50108>

In [5]:
def build_embedding(window_length=50,
                    n_unique_input_ids=0,
                    embedding_dim=200):
    """Build a standalone Keras model consisting of a single Embedding layer.

    Args:
        window_length: Number of IDs per input sequence.
        n_unique_input_ids: The embedding table gets
            ``n_unique_input_ids + 1`` rows.
        embedding_dim: Width of each embedding vector.

    Returns:
        A ``keras.models.Model`` mapping ``(batch, window_length)`` IDs to
        ``(batch, window_length, embedding_dim)`` embeddings.
    """
    id_sequence = keras.layers.Input(shape=[window_length], dtype=np.float32)

    # Embedding lookup is the model's only trainable layer.
    lookup = keras.layers.Embedding(
        input_dim=n_unique_input_ids + 1,
        output_dim=embedding_dim,
        input_length=window_length,
    )
    embedded_sequence = lookup(id_sequence)

    return keras.models.Model(inputs=[id_sequence],
                              outputs=[embedded_sequence])

### Load Embedding Layer from the Autoencoder

In [6]:
Embedding = build_embedding(window_length=window_length, n_unique_input_ids=n_unique_input_ids, embedding_dim=200)

# Find the trained embedding layer by type instead of by a hard-coded position,
# so the lookup does not silently pick a different layer if the autoencoder
# graph changes.
trained_embedding_layers = [layer for layer in model.layers
                            if isinstance(layer, keras.layers.Embedding)]
assert len(trained_embedding_layers) == 1, \
    "expected exactly one Embedding layer in the autoencoder"

# Copy the trained table into the standalone model's embedding layer
# (index 1 of the standalone model). get_weights() already returns the list
# of arrays that set_weights() expects.
Embedding.layers[1].set_weights(trained_embedding_layers[0].get_weights())

# Sanity check on a few sessions: expect (n_sessions, window_length, embedding_dim).
emb = Embedding(sessions_padded[:5,:])
print(emb.shape)

(5, 207, 200)


In [7]:
# The UAE (untrained autoencoder) keeps its random initial weights, so fix the
# TensorFlow seed: without it the encoding, and every drift statistic derived
# from it, changes on each run of the notebook.
tf.random.set_seed(0)

shape = tuple(emb.shape[1:])  # per-sample shape of the embedding model's output
enc_dim = 32                  # size of the encoding the drift test runs on
uae = UAE(input_layer=Embedding, shape=shape, enc_dim=enc_dim)

In [8]:
# Smoke test: encode a handful of sessions and check the output shape.
sample_sessions = sessions_padded[:5, :]
emb_uae = uae(sample_sessions)
print(emb_uae.shape)

(5, 32)


In [9]:
# Preprocessing applied to every batch before the drift test: encode with the
# UAE, 32 sessions at a time.
preprocess_kwargs = dict(model=uae, batch_size=32)

# Drift detector using the full padded data set as reference.
cd = KSDrift(
    X_ref=sessions_padded,                # reference data to test against
    p_val=.05,
    preprocess_X_ref=True,                # store preprocessed X_ref for future predict calls
    preprocess_kwargs=preprocess_kwargs,
)

In [10]:
# Persist the fitted detector to disk.
filepath = './model'
save_detector(detector=cd, filepath=filepath)

Directory ./model does not exist and is now created.


In [11]:
# Reload from the location the detector was just saved to.
cd = load_detector(filepath)



# Test Concept Drift with some Outliers

In [12]:
# Row indices into sessions_padded of the sessions used as the outlier sample.
mask = [  805,   850,  1410,  2065,  2463,  2728,  3525,  5037,  5906,
         5959,  5994,  6639,  6757,  7137,  7613,  7637,  7931,  8398,
         9452, 10298, 11209, 11436, 11574, 11735, 11755, 11865, 12416,
        12735, 13061, 13148, 13352, 13369, 13544, 13642, 14233, 14356,
        15380, 15739, 16186, 16409, 16581, 17275, 17350, 17767, 17960,
        18187, 18660, 19081, 19269, 19434, 19523, 19675, 20483, 20539,
        20595, 20604, 21410, 21713, 22416, 22695, 22991, 23023, 23994,
        24052, 24197, 24208, 24325, 24815, 24871, 25030, 25057, 25168,
        25280, 25825, 26030, 26401, 26616, 27054, 27153, 27426, 27679,
        30020, 30425, 30791, 30847, 30876, 31115, 31220, 32672, 32730,
        32753, 33017, 33074, 33152, 33183, 33850, 34458, 35150, 35686,
        36259, 36321, 37746, 37820, 37972, 38047, 38311, 38568, 38627,
        38833, 38974, 39236, 39910, 40448, 40558, 40634, 40979, 41124,
        41239, 41592, 41968, 42151, 42347, 42727, 42959, 43899, 44101,
        44613, 45188]

# Run the drift test on just those sessions.
outlier_sessions = sessions_padded[mask]
preds_ood = cd.predict(outlier_sessions, return_p_val=True)
print(preds_ood)

{'data': {'is_drift': 1, 'distance': array([0.368352  , 0.37975392, 0.41268158, 0.36084983, 0.36416024,
       0.40482894, 0.37353265, 0.20875213, 0.4468486 , 0.30124167,
       0.3088915 , 0.38088983, 0.2658605 , 0.30052364, 0.2464582 ,
       0.30943325, 0.45181417, 0.27487695, 0.22882342, 0.3953136 ,
       0.35303393, 0.41076162, 0.2986847 , 0.25553048, 0.29826817,
       0.43733734, 0.47010234, 0.44312507, 0.34793565, 0.3328524 ,
       0.35619256, 0.3159547 ], dtype=float32), 'p_val': array([1.8104736e-15, 2.0515612e-16, 2.6250678e-19, 7.3167350e-15,
       3.9647977e-15, 1.3517198e-18, 6.7868425e-16, 2.9483872e-05,
       1.4559551e-22, 1.7375296e-10, 5.2778108e-11, 1.6454715e-16,
       2.9141837e-08, 1.9401585e-10, 3.6862471e-07, 4.8452339e-11,
       4.6601658e-23, 8.3939096e-09, 3.1321515e-06, 9.4408654e-18,
       3.0404265e-14, 3.9302737e-19, 2.5704383e-10, 1.1525539e-07,
       2.7388886e-10, 1.2460515e-21, 6.2966769e-25, 3.3927592e-22,
       7.5710820e-14, 1.0414281e-12

# Test Concept Drift with Normal Data

In [13]:
# Control run: the first 100 sessions of the reference data.
normal_sessions = sessions_padded[:100]
preds_ood = cd.predict(normal_sessions, return_p_val=True)
preds_ood

{'data': {'is_drift': 0,
  'distance': array([0.10480443, 0.08111247, 0.07187386, 0.08680809, 0.07404129,
         0.05961843, 0.05158376, 0.0735639 , 0.06709905, 0.09586114,
         0.04634114, 0.0570372 , 0.08720359, 0.10140082, 0.06235909,
         0.08118738, 0.07082412, 0.10679328, 0.07208642, 0.09112292,
         0.07191916, 0.10646659, 0.04683335, 0.09178151, 0.12237477,
         0.06620698, 0.08474606, 0.05472515, 0.0566295 , 0.0677228 ,
         0.09936493, 0.07004704], dtype=float32),
  'p_val': array([0.22307067, 0.5275708 , 0.6811714 , 0.43966496, 0.6446806 ,
         0.87013763, 0.9533246 , 0.6527243 , 0.759998  , 0.3182817 ,
         0.9828912 , 0.90162665, 0.4338617 , 0.2564216 , 0.8325692 ,
         0.52636766, 0.69876456, 0.2051651 , 0.67759955, 0.3787502 ,
         0.68041044, 0.20802791, 0.98090535, 0.3699333 , 0.1007021 ,
         0.7742269 , 0.4705918 , 0.926135  , 0.90621316, 0.74992853,
         0.2780507 , 0.71171975], dtype=float32),
  'threshold': 0.0015625},