In [1]:
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from alibi_detect.cd import KSDrift, MMDDrift
from alibi_detect.cd.preprocess import UAE
from alibi_detect.models.embedding import TransformerEmbedding
from alibi_detect.utils.saving import save_detector, load_detector

# Enable memory growth on every GPU TensorFlow can see, so that GPU memory is
# allocated on demand. The loop body never runs when no GPU is present.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

# Mixed precision is currently disabled; uncomment the next line to enable it.
# tf.keras.mixed_precision.set_global_policy('mixed_float16')

# Turn on XLA JIT compilation in the TensorFlow optimizer.
tf.config.optimizer.set_jit(True)

# Print the Python/IPython versions and the versions of the imported packages.
%load_ext watermark
%watermark -v -iv

autopep8         1.5.4
numpy            1.19.5
json             2.0.9
tensorflow.keras 2.4.0
tensorflow       2.4.0
CPython 3.7.4
IPython 7.8.0


# Load the preprocessed data from the outlier detection component

In [2]:
# Load the padded session array written by the outlier detection component.
sessions_padded = np.load('../outlier_detection/list_sessions_padded_autoencoder.npy')
print(sessions_padded.shape)

# The largest ID in the data sizes both the embedding vocabulary and the output
# layer, so scan the array once and reuse the value instead of reducing twice.
n_unique_input_ids = int(sessions_padded.max())
n_output_features = n_unique_input_ids

# Axis 1 holds the time steps of a session, axis 2 the features per time step.
window_length = sessions_padded.shape[1]
n_input_features = sessions_padded.shape[2]

(30941, 31, 1)


### LSTM Autoencoder in functional API
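- Input: a session as a sequence of `window_length` time steps, each holding an item ID (time steps with ID 0 are masked out)
- Output: the reconstructed session, as a softmax distribution over item IDs for every time step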

In [3]:
def build_autoencoder(window_length=50,
                      units_lstm_layer=100,
                      n_unique_input_ids=0,
                      embedding_dim=200,
                      n_input_features=1,
                      n_output_features=3,
                      dropout_rate=0.1):
    """Build an LSTM encoder-decoder autoencoder over sequences of IDs.

    Feature 0 of every time step is embedded and encoded by an LSTM. The
    decoder LSTM starts from the encoder's final states, receives the
    encoder's final hidden state as its first input, and is then stepped
    `window_length` times, feeding each step's output back in as the next
    input. The hidden state of every step is projected by one shared dense
    layer and converted into a softmax distribution. Time steps whose ID is 0
    are masked in the encoder, the dense projection and the softmax.

    NOTE: the order in which layers are created here determines
    `model.layers` indices and must match the saved weights; do not reorder.

    Args:
        window_length: Number of time steps per input sequence.
        units_lstm_layer: Number of units of the encoder and decoder LSTMs.
        n_unique_input_ids: Largest ID expected in the input; the embedding
            table has `n_unique_input_ids + 1` rows.
        embedding_dim: Size of each embedding vector.
        n_input_features: Size of the last input axis. Only feature 0 is read.
        n_output_features: The output layer has `n_output_features + 1` units.
        dropout_rate: Accepted for interface compatibility; currently unused
            by the model.

    Returns:
        A `keras.models.Model` mapping an input of shape
        `(batch, window_length, n_input_features)` to softmax outputs of shape
        `(batch, window_length, n_output_features + 1)`.
    """

    inputs = keras.layers.Input(
        shape=[window_length, n_input_features], dtype=np.float32)

    # Encoder
    # Embedding Layer
    embedding_layer = tf.keras.layers.Embedding(
        n_unique_input_ids+1, embedding_dim, input_length=window_length)  # , mask_zero=True)
    embeddings = embedding_layer(inputs[:, :, 0])

    # True for real time steps, False where the ID is 0.
    mask = inputs[:, :, 0] != 0

    # LSTM Layer 1
    lstm1_output, lstm1_state_h, lstm1_state_c = keras.layers.LSTM(units=units_lstm_layer, return_state=True,
                                                                   return_sequences=True)(embeddings, mask=mask)

    # Decoder
    # input: lstm1_state_c, lstm1_state_h
    decoder_state_c = lstm1_state_c
    decoder_state_h = lstm1_state_h
    # The first decoder input is the encoder's final hidden state, expanded to
    # a sequence of length 1.
    decoder_outputs = tf.expand_dims(lstm1_state_h, 1)

    list_states = []
    # One decoder layer is reused for every step, so all steps share weights.
    decoder_layer = keras.layers.LSTM(
        units=units_lstm_layer, return_state=True, return_sequences=True, unroll=False)
    for i in range(window_length):
        decoder_outputs, decoder_state_h, decoder_state_c = decoder_layer(decoder_outputs,
                                                                          initial_state=[decoder_state_h,
                                                                                         decoder_state_c])
        list_states.append(decoder_state_h)
    # Stack the per-step hidden states along a new time axis.
    stacked = tf.stack(list_states, axis=1)

    fc_layer = tf.keras.layers.Dense(
        n_output_features+1, kernel_initializer='he_normal')

    fc_layer_output = tf.keras.layers.TimeDistributed(fc_layer)(
        stacked, mask=mask)

    # Repeat the per-step mask across the class axis so it matches the logits.
    mask_softmax = tf.tile(tf.expand_dims(mask, axis=2),
                           [1, 1, n_output_features+1])

    softmax = tf.keras.layers.Softmax(axis=2, dtype=tf.float32)(
        fc_layer_output, mask=mask_softmax)

    model = keras.models.Model(inputs=[inputs],
                               outputs=[softmax])
    return model

# Concept Drift

In [4]:
# Rebuild the autoencoder with the sizes derived from the data, then restore
# the weights saved by the outlier detection component.
model = build_autoencoder(
    window_length=window_length,
    units_lstm_layer=300,
    n_unique_input_ids=n_unique_input_ids,
    embedding_dim=200,
    n_input_features=n_input_features,
    n_output_features=n_output_features,
    dropout_rate=0.0,
)
model.load_weights("../outlier_detection/weights")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2c6352d3248>

In [5]:
def build_embedding(window_length=50,
                          n_unique_input_ids=0,
                          embedding_dim=200):
    """Build a model that consists of a single embedding layer.

    Args:
        window_length: Number of time steps per input sequence.
        n_unique_input_ids: Largest ID expected in the input; the embedding
            table has `n_unique_input_ids + 1` rows.
        embedding_dim: Size of each embedding vector.

    Returns:
        A `keras.models.Model` mapping an input of shape
        `(batch, window_length)` to embeddings of shape
        `(batch, window_length, embedding_dim)`.
    """
    id_sequence = keras.layers.Input(shape=[window_length], dtype=np.float32)

    # Embedding Layer
    embedded_sequence = tf.keras.layers.Embedding(
        n_unique_input_ids + 1,
        embedding_dim,
        input_length=window_length,
    )(id_sequence)

    return keras.models.Model(inputs=[id_sequence],
                              outputs=[embedded_sequence])

### Load Embedding Layer from the Autoencoder

In [6]:
Embedding = build_embedding(window_length=window_length,
                            n_unique_input_ids=n_unique_input_ids,
                            embedding_dim=200)

# Find the autoencoder's embedding layer by type instead of by a hard-coded
# position in `model.layers`, which silently breaks if the graph changes.
trained_embedding_layer = next(
    layer for layer in model.layers
    if isinstance(layer, tf.keras.layers.Embedding))

# Copy the trained embedding matrix into the stand-alone embedding model
# (layer 0 is the input layer, layer 1 the embedding layer).
Embedding.layers[1].set_weights(trained_embedding_layer.get_weights())

# Sanity check on five sessions.
emb = Embedding(sessions_padded[:5,:])
print(emb.shape)

(5, 31, 200)


In [7]:
# Seed TensorFlow's global random generator before creating the encoder: the
# UAE is used here without any training, so (assuming its weights are randomly
# initialised, as for an untrained encoder) an unseeded run would give
# different drift statistics on every Restart & Run All.
tf.random.set_seed(0)

# Per-sample shape of the embedding output (batch axis dropped).
shape = tuple(emb.shape[1:])
# Dimensionality of the encoding the drift test is run on.
enc_dim = 32
uae = UAE(input_layer=Embedding, shape=shape, enc_dim=enc_dim)

In [8]:
# Smoke test: encode the first five sessions and check the output shape.
emb_uae = uae(sessions_padded[:5])
print(emb_uae.shape)

(5, 32)


In [9]:
# Preprocessing applied by the detector: encode the sessions with the UAE,
# in batches of 32.
preprocess_kwargs = dict(model=uae, batch_size=32)

# Kolmogorov-Smirnov drift detector with the full data set as reference.
cd = KSDrift(
    X_ref=sessions_padded,                # reference data to test against
    p_val=.05,
    preprocess_X_ref=True,                # store preprocessed X_ref for future predict calls
    preprocess_kwargs=preprocess_kwargs,
)

In [10]:
# Directory the drift detector is saved to.
filepath = './model'
save_detector(detector=cd, filepath=filepath)

Directory ./model does not exist and is now created.


In [11]:
# Reload the detector from the directory it was just saved to. Reusing
# `filepath` keeps the save and load locations from drifting apart.
cd = load_detector(filepath)



# Test Concept Drift with some Outliers

In [15]:
# Row indices into `sessions_padded` of the sessions used as the outlier sample.
# Despite its name, `mask` is a list of integer indices (fancy indexing), not a
# boolean mask.
# NOTE(review): presumably these are the sessions flagged by the outlier
# detection component - confirm where the indices come from.
mask = [  169,   246,   394,   498,   630,  1039,  1578,  2008,  2040,
         2447,  2557,  2609,  3179,  3276,  3481,  3615,  3813,  4179,
         4361,  4794,  5077,  6184,  6369,  7347,  7596,  8415,  8761,
         8773,  9011,  9404,  9504,  9613,  9880,  9907,  9978, 10050,
        10229, 10573, 10654, 11196, 11429, 11477, 11493, 11654, 11975,
        12135, 13526, 13659, 13729, 14139, 14469, 14910, 15203, 15429,
        15934, 15982, 16310, 16352, 16504, 16647, 16743, 17046, 17085,
        17302, 17342, 17449, 18584, 18702, 18711, 18770, 19204, 19642,
        19758, 19863, 19891, 20135, 20244, 20652, 20865, 20899, 21077,
        21680, 23338, 23407, 23892, 24101, 24257, 24259, 24396, 25078,
        25127, 25380, 25576, 26071, 26082, 26123, 26323, 26373, 27007,
        27629, 27664, 27833, 28388, 28739, 29576, 29588, 30381, 30529,
        30873, 30930]
# Run the drift test on the selected sessions and also request the p-values.
preds_ood = cd.predict(sessions_padded[mask], return_p_val=True)
print(preds_ood)

{'data': {'is_drift': 1, 'distance': array([0.29177997, 0.34911987, 0.23460251, 0.26946124, 0.2595077 ,
       0.4610946 , 0.36399922, 0.21240984, 0.45049787, 0.3319159 ,
       0.29141855, 0.39273134, 0.32614478, 0.4137896 , 0.22888283,
       0.21889608, 0.36555555, 0.62413096, 0.2220916 , 0.36459008,
       0.21113116, 0.4130718 , 0.5817236 , 0.27963278, 0.3472668 ,
       0.34736785, 0.24555385, 0.25989464, 0.2602046 , 0.35898882,
       0.28831676, 0.21523486], dtype=float32), 'p_val': array([1.5688768e-08, 4.9755152e-12, 1.1508869e-05, 2.4443361e-07,
       7.7523498e-07, 1.1465326e-20, 4.8599086e-13, 1.0127854e-04,
       9.5294496e-20, 6.4912270e-11, 1.6430652e-08, 4.1362677e-15,
       1.4924340e-10, 9.9915193e-17, 2.0578967e-05, 5.4849472e-05,
       3.7890286e-13, 1.6383755e-37, 4.0272535e-05, 4.4222400e-13,
       1.1404563e-04, 1.1379877e-16, 1.2105791e-32, 7.1848909e-08,
       6.6023489e-12, 6.5015094e-12, 3.6341589e-06, 7.4181969e-07,
       7.1606007e-07, 1.0752519e-12

# Test Concept Drift with normal data

In [16]:
# Control run: the first 100 sessions are part of the reference data itself.
preds_ood = cd.predict(
    sessions_padded[:100],
    return_p_val=True,
)
preds_ood

{'data': {'is_drift': 0,
  'distance': array([0.12389968, 0.05854917, 0.09096441, 0.13766782, 0.08862545,
         0.08763421, 0.12121715, 0.07145632, 0.07594777, 0.10657057,
         0.093422  , 0.09063864, 0.08938367, 0.08744029, 0.09906564,
         0.15516078, 0.09016095, 0.09892796, 0.11167997, 0.07256908,
         0.10378947, 0.09502311, 0.11840858, 0.09213923, 0.08747099,
         0.12627614, 0.11722504, 0.11798746, 0.06320578, 0.12884684,
         0.07871077, 0.10619663], dtype=float32),
  'p_val': array([0.0937343 , 0.88405776, 0.38153896, 0.04572367, 0.41401944,
         0.42826006, 0.10685763, 0.6888091 , 0.6132912 , 0.20760556,
         0.34917364, 0.3859664 , 0.4033153 , 0.4310781 , 0.28191262,
         0.01646833, 0.39251527, 0.28343564, 0.16631426, 0.67012227,
         0.2331772 , 0.32908514, 0.12219147, 0.36583787, 0.43063122,
         0.08326028, 0.12917197, 0.124639  , 0.8207168 , 0.07305764,
         0.5673453 , 0.2109131 ], dtype=float32),
  'threshold': 0.0015625},