# This Notebook uses a Session Event Dataset from E-Commerce Website (https://www.kaggle.com/mkechinov/ecommerce-behavior-data-from-multi-category-store and https://rees46.com/) to build an Outlier Detection based on an Autoencoder.

In [1]:
import mlflow
import numpy as np
import os
import shutil
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_hub as hub
from itertools import product

# enable gpu growth if gpu is available
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)
    
%load_ext watermark
%watermark -v -iv

json             2.0.9
tensorflow.keras 2.4.0
tensorflow       2.4.0
autopep8         1.5.4
numpy            1.19.5
mlflow           1.13.1
pandas           1.0.5
tensorflow_hub   0.9.0
CPython 3.7.4
IPython 7.8.0


## Setting Registry and Tracking URI for MLflow

In [2]:
# Use this registry uri when mlflow is created by docker container with a mysql db backend
#registry_uri = os.path.expandvars('mysql+pymysql://${MYSQL_USER}:${MYSQL_PASSWORD}@localhost:3306/${MYSQL_DATABASE}')

# Use this registry uri when mlflow is running locally by the command:
# "mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./mlruns --host 0.0.0.0"
registry_uri = 'sqlite:///mlflow.db'

tracking_uri = 'http://localhost:5000'

mlflow.tracking.set_registry_uri(registry_uri)
mlflow.tracking.set_tracking_uri(tracking_uri)

## yoochoose-clicks.dat - Click events. Each record/line in the file has the following fields:
1. Session ID – the id of the session. In one session there are one or many clicks.
2. Timestamp – the time when the click occurred.
3. Item ID – the unique identifier of the item.
4. Category – the category of the item.

In [3]:
for chunk in pd.read_table("2019-Dec.csv",
                           sep=",", header=0,
                           infer_datetime_format=True, low_memory=False, chunksize=500000):
    # Filter out other event types than 'view'
    chunk = chunk[chunk['event_type'] == 'view']
    # Filter out missing 'category_code' rows
    chunk = chunk[chunk['category_code'].isna() == False]
    chunk.reset_index(drop=True, inplace=True)

    # Filter out all Sessions of length 1
    count_sessions = chunk.groupby('user_session').count()
    window_length = count_sessions.max()[0]
    unique_sessions = [count_sessions.index[i] for i in range(
        count_sessions.shape[0]) if count_sessions.iloc[i, 0] == 1]
    chunk = chunk[chunk['user_session'].isin(unique_sessions) == False]
    chunk.reset_index(drop=True, inplace=True)
    
    # Text embedding based on https://tfhub.dev/google/nnlm-en-dim50/2
    last_category = []
    for i, el in enumerate(chunk['category_code']):
        last_category.append(el.split('.')[-1])
    chunk['Product'] = last_category
    embed = hub.load("https://tfhub.dev/google/nnlm-en-dim50/2")
    embeddings = embed(chunk['Product'].tolist())
    for dim in range(embeddings.shape[1]):
        chunk['embedding_'+str(dim)] = embeddings[:, dim]        
     
    # Create Item IDs starting from value 1 for Embeddings and One Hot Layer
    unique_items = pd.unique(chunk['product_id'])
    print('Number of unique Items:', unique_items.shape[0])
    dict_items = dict(zip(unique_items, [i+1 for i in range(unique_items.shape[0])]))
    chunk['product_id_mapped'] = chunk['product_id'].map(dict_items)
    d = pd.DataFrame.from_records(data=list(dict_items.items()), columns=['Item_ID', 'Mapped_ID'])
    # map product_id to category_code
    d['category_code'] = [chunk[chunk['product_id'] == i]['category_code'].values[0] for i in d['Item_ID']]
    d.to_csv('ID_Mapping.csv')

    # Standardization
    mean = chunk['price'].mean(axis=0)
    print('Mean:', mean)
    std = chunk['price'].std(axis=0)
    print('Std:', std)
    chunk['price_standardized'] = (chunk['price'] - mean) / std

    
    chunk.sort_values(by=['user_session', 'event_time'], inplace=True)
    chunk['price_standardized'] = chunk['price_standardized'].astype('float32')
    chunk['product_id_mapped'] = chunk['product_id_mapped'].astype('int32')
    chunk.reset_index(drop=True, inplace=True)
    
    
    print('Sessions:', pd.unique(chunk['user_session']).shape)
    print('Unique Products:',pd.unique(chunk['product_id']).shape)
    print('Unique category_code:',pd.unique(chunk['category_code']).shape)
    
    columns = ['embedding_'+str(i) for i in range(embeddings.shape[1])]
    columns.append('price_standardized')
    columns.append('user_session')
    columns.append('Product')
    columns.append('product_id_mapped')
    
    df = chunk[columns]
    break
df

Number of unique Items: 38515
Mean: 284.77105468660056
Std: 349.4674023158121
Sessions: (61296,)
Unique Products: (38515,)
Unique category_code: (134,)


Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_44,embedding_45,embedding_46,embedding_47,embedding_48,embedding_49,price_standardized,user_session,Product,product_id_mapped
0,0.047610,-0.125734,-0.053261,0.196848,-0.016433,0.049773,0.012852,0.263229,-0.195415,0.327227,...,0.106797,-0.214412,0.090539,0.104421,0.061444,-0.008996,-0.373028,0000afb3-2d30-4b52-84ec-07c6617efd37,light,27
1,0.047610,-0.125734,-0.053261,0.196848,-0.016433,0.049773,0.012852,0.263229,-0.195415,0.327227,...,0.106797,-0.214412,0.090539,0.104421,0.061444,-0.008996,-0.373028,0000afb3-2d30-4b52-84ec-07c6617efd37,light,27
2,0.047610,-0.125734,-0.053261,0.196848,-0.016433,0.049773,0.012852,0.263229,-0.195415,0.327227,...,0.106797,-0.214412,0.090539,0.104421,0.061444,-0.008996,0.068930,0000b83c-9b26-4881-8bca-e20d460f4194,light,887
3,0.047610,-0.125734,-0.053261,0.196848,-0.016433,0.049773,0.012852,0.263229,-0.195415,0.327227,...,0.106797,-0.214412,0.090539,0.104421,0.061444,-0.008996,0.289895,0000b83c-9b26-4881-8bca-e20d460f4194,light,605
4,0.047610,-0.125734,-0.053261,0.196848,-0.016433,0.049773,0.012852,0.263229,-0.195415,0.327227,...,0.106797,-0.214412,0.090539,0.104421,0.061444,-0.008996,-0.291904,0000f7c4-8836-4507-82a1-8a10de3fb1b2,light,1506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369299,0.174397,-0.204014,-0.175919,0.105906,-0.205940,-0.210022,0.224095,0.210598,-0.162613,0.150693,...,0.084346,0.071817,-0.129293,-0.176801,0.100340,0.119850,-0.630791,fffdfd5e-126c-409f-9c16-8224f22cb60b,cooler,7953
369300,0.047610,-0.125734,-0.053261,0.196848,-0.016433,0.049773,0.012852,0.263229,-0.195415,0.327227,...,0.106797,-0.214412,0.090539,0.104421,0.061444,-0.008996,-0.511696,fffe34dd-9537-4991-9f12-d81f1dda91cb,light,844
369301,0.047610,-0.125734,-0.053261,0.196848,-0.016433,0.049773,0.012852,0.263229,-0.195415,0.327227,...,0.106797,-0.214412,0.090539,0.104421,0.061444,-0.008996,-0.511696,fffe34dd-9537-4991-9f12-d81f1dda91cb,light,844
369302,0.047610,-0.125734,-0.053261,0.196848,-0.016433,0.049773,0.012852,0.263229,-0.195415,0.327227,...,0.106797,-0.214412,0.090539,0.104421,0.061444,-0.008996,-0.459760,fffe34dd-9537-4991-9f12-d81f1dda91cb,light,8


## Slice Sessions from the Dataframe

In [4]:
list_sessions = []
list_last_clicked = []
list_last_clicked_temp = []
current_id = df.loc[0, 'user_session']
current_index = 0

columns = ['embedding_'+str(i) for i in range(embeddings.shape[1])]
columns.append('price_standardized')
columns.insert(0, 'product_id_mapped')

for i in range(df.shape[0]):
    if df.loc[i, 'user_session'] != current_id:
        list_sessions.append(df.loc[current_index:i-2, columns])
        list_last_clicked.append(df.loc[i-1, 'product_id_mapped'])
        list_last_clicked_temp.append(df.loc[i-1, columns])
        current_id = df.loc[i, 'user_session']
        current_index = i

## Slice Sessions if label and last product from session is the same
Example:
- From: session: [ 1506  1506 11410 11410  2826  2826], ground truth: 2826
- To: session: [ 1506  1506 11410 11410], ground truth: 2826

In [5]:
print("Length before", len(list_sessions))
list_sessions_processed = []
list_last_clicked_processed = []
list_session_processed_autoencoder = []

for i, session in enumerate(list_sessions):
    if session['product_id_mapped'].values[-1] == list_last_clicked[i]:
        mask = session['product_id_mapped'].values == list_last_clicked[i]
        if session[~mask].shape[0] > 0:
            list_sessions_processed.append(session[~mask])
            list_last_clicked_processed.append(list_last_clicked[i])
            list_session_processed_autoencoder.append(np.concatenate(
                [session[~mask].values, list_last_clicked_temp[i].values[np.newaxis, :]], axis=0))
    else:
        list_sessions_processed.append(session)
        list_last_clicked_processed.append(list_last_clicked[i])
        list_session_processed_autoencoder.append(np.concatenate(
                [session.values, list_last_clicked_temp[i].values[np.newaxis, :]], axis=0))

print("Length after", len(list_sessions_processed))

Length before 61295
Length after 45916


In [6]:
# Pad all Sessions with 0. Embedding Layer and LSTM will use Masking to ignore zeros.
list_sessions_padded = []

for np_array in list_session_processed_autoencoder:
    result = np.zeros((window_length, 1), dtype=np.float32)

    result[:np_array.shape[0],:1] = np_array[:,:1]
    list_sessions_padded.append(result)


# Save the results, because the slicing can take some time
np.save('list_sessions_padded_autoencoder.npy', list_sessions_padded)

sessions_padded = np.array(list_sessions_padded)

n_output_features = int(sessions_padded.max())
n_unique_input_ids = int(sessions_padded.max())
window_length = sessions_padded.shape[1]
n_input_features = sessions_padded.shape[2]
print("n_output_features", n_output_features)
print("n_unique_input_ids", n_unique_input_ids)
print("window_length", window_length)
print("n_input_features", n_input_features)

n_output_features 38515
n_unique_input_ids 38515
window_length 207
n_input_features 1


# Training: Start here if the preprocessing was already executed

In [7]:
sessions_padded = np.load('list_sessions_padded_autoencoder.npy')
print(sessions_padded.shape)
n_output_features = int(sessions_padded.max())
n_unique_input_ids = int(sessions_padded.max())
window_length = sessions_padded.shape[1]
n_input_features = sessions_padded.shape[2]

(45916, 207, 1)


## Grid Search Hyperparameter
Dictionary with different hyperparameters to train on.
MLflow will track those in a database.

In [8]:
grid_search_dic = {'hidden_layer_size': [500],
                   'batch_size': [10],
                   'embedding_dim': [200],
                   'window_length': [window_length],
                   'dropout_fc': [0.0], #0.2
                   'n_output_features': [n_output_features],
                   'n_input_features': [n_input_features]}

# Cartesian product
grid_search_param = [dict(zip(grid_search_dic, v)) for v in product(*grid_search_dic.values())]
grid_search_param

[{'hidden_layer_size': 500,
  'batch_size': 10,
  'embedding_dim': 200,
  'window_length': 207,
  'dropout_fc': 0.0,
  'n_output_features': 38515,
  'n_input_features': 1}]

### LSTM Autoencoder in functional API
- Input: x rows (time steps) of Item IDs in a Session
- Output: reconstructed Session

In [9]:
def build_autoencoder(window_length=50,
                      units_lstm_layer=100,
                      n_unique_input_ids=0,
                      embedding_dim=200,
                      n_input_features=1,
                      n_output_features=3,
                      dropout_rate=0.1):

    inputs = keras.layers.Input(
        shape=[window_length, n_input_features], dtype=np.float32)

    # Encoder
    # Embedding Layer
    embedding_layer = tf.keras.layers.Embedding(
        n_unique_input_ids+1, embedding_dim, input_length=window_length)  # , mask_zero=True)
    embeddings = embedding_layer(inputs[:, :, 0])

    mask = inputs[:, :, 0] != 0

    # LSTM Layer 1
    lstm1_output, lstm1_state_h, lstm1_state_c = keras.layers.LSTM(units=units_lstm_layer, return_state=True,
                                                                   return_sequences=True)(embeddings, mask=mask)
    lstm1_state = [lstm1_state_h, lstm1_state_c]

    # Decoder
    # input: lstm1_state_c, lstm1_state_h
    decoder_state_c = lstm1_state_c
    decoder_state_h = lstm1_state_h
    decoder_outputs = tf.expand_dims(lstm1_state_h, 1)

    list_states = []
    decoder_layer = keras.layers.LSTM(
        units=units_lstm_layer, return_state=True, return_sequences=True, unroll=False)
    for i in range(window_length):
        decoder_outputs, decoder_state_h, decoder_state_c = decoder_layer(decoder_outputs,
                                                                          initial_state=[decoder_state_h,
                                                                                         decoder_state_c])
        list_states.append(decoder_state_h)
    stacked = tf.stack(list_states, axis=1)

    fc_layer = tf.keras.layers.Dense(
        n_output_features+1, kernel_initializer='he_normal', dtype=tf.float32)

    fc_layer_output = tf.keras.layers.TimeDistributed(fc_layer)(
        stacked, mask=mask)

    mask_softmax = tf.tile(tf.expand_dims(mask, axis=2),
                           [1, 1, n_output_features+1])

    softmax = tf.keras.layers.Softmax(axis=2)(
        fc_layer_output, mask=mask_softmax)

    model = keras.models.Model(inputs=[inputs],
                               outputs=[softmax])
    return model

### Convert Numpy Array to tf.data.Dataset for better training performance
The function will return a zipped tf.data.Dataset with the following Shapes:
- x: (batches, window_length)
- y: (batches,)

In [10]:
def array_to_tf_data_api(train_data_x, train_data_y, batch_size=64, window_length=50,
                         validate=False):
    """Applies sliding window on the fly by using the TF Data API.
    Args:
      train_data_x: Input Data as Numpy Array, Shape (rows, n_features)
      batch_size: Batch Size.
      window_length: Window Length or Window Size.
      future_length: Number of time steps that will be predicted in the future.
      n_output_features: Number of features that will be predicted.
      validate: True if input data is a validation set and does not need to be shuffled
      shift: Shifts the Sliding Window by this Parameter.
    Returns:
      tf.data.Dataset
    """

    X = tf.data.Dataset.from_tensor_slices(train_data_x)
    y = tf.data.Dataset.from_tensor_slices(train_data_y)

    if not validate:
        train_tf_data = tf.data.Dataset.zip((X, y)).cache() \
            .shuffle(buffer_size=200000, reshuffle_each_iteration=True)\
            .batch(batch_size).prefetch(1)
        return train_tf_data
    else:
        return tf.data.Dataset.zip((X, y)).batch(batch_size)\
            .prefetch(1)

## Custom TF Callback to log Metrics by MLflow

In [11]:
class MlflowLogging(tf.keras.callbacks.Callback):
    def __init__(self, **kwargs):
        super().__init__()  # handles base args (e.g., dtype)

    def on_epoch_end(self, epoch, logs=None):
        keys = list(logs.keys())
        for key in keys:
            mlflow.log_metric(str(key), logs.get(key), step=epoch)

In [12]:
class CustomCategoricalCrossentropy(keras.losses.Loss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.bce = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=False, reduction='sum')

    @tf.function
    def call(self, y_true, y_pred):
        total = 0.0
        for i in tf.range(y_pred.shape[1]):
            loss = self.bce(y_true[:, i, 0], y_pred[:, i, :])
            total = total + loss
        return total

    def get_config(self):
        base_config = super().get_config()
        return {**base_config}

    def from_config(cls, config):
        return cls(**config)

In [13]:
class CategoricalAccuracy(keras.metrics.Metric):
    def __init__(self, name="categorical_accuracy", **kwargs):
        super(CategoricalAccuracy, self).__init__(name=name, **kwargs)
        self.true = self.add_weight(name="true", initializer="zeros")
        self.count = self.add_weight(name="count", initializer="zeros")
        self.accuracy = self.add_weight(name="count", initializer="zeros")

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_true = tf.cast(y_true, "float32")
        y_pred = tf.cast(y_pred, "float32")

        mask = y_true[:, :, 0] != 0
        argmax = tf.cast(tf.argmax(y_pred, axis=2), "float32")
        temp = argmax == y_true[:, :, 0]
        true = tf.reduce_sum(tf.cast(temp[mask], dtype=tf.float32))
        self.true.assign_add(true)
        self.count.assign_add(
            tf.cast(tf.shape(temp[mask])[0], dtype="float32"))

        self.accuracy.assign(tf.math.divide(self.true, self.count))

    def result(self):
        return self.accuracy

    def reset_states(self):
        # The state of the metric will be reset at the start of each epoch.
        self.accuracy.assign(0.0)


class CategoricalSessionAccuracy(keras.metrics.Metric):
    def __init__(self, name="categorical_session_accuracy", **kwargs):
        super(CategoricalSessionAccuracy, self).__init__(name=name, **kwargs)
        self.true = self.add_weight(name="true", initializer="zeros")
        self.count = self.add_weight(name="count", initializer="zeros")
        self.accuracy = self.add_weight(name="count", initializer="zeros")

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_true = tf.cast(y_true, "float32")
        y_pred = tf.cast(y_pred, "float32")

        mask = y_true[:, :, 0] != 0
        argmax = tf.cast(tf.argmax(y_pred, axis=2), "float32")
        temp = argmax == y_true[:, :, 0]
        temp = tf.reduce_all(temp, axis=1)
        true = tf.reduce_sum(tf.cast(temp, dtype=tf.float32))
        self.true.assign_add(true)
        self.count.assign_add(tf.cast(tf.shape(temp)[0], dtype="float32"))

        self.accuracy.assign(tf.math.divide(self.true, self.count))

    def result(self):
        return self.accuracy

    def reset_states(self):
        # The state of the metric will be reset at the start of each epoch.
        self.accuracy.assign(0.0)

# Training

In [14]:
with mlflow.start_run() as parent_run:
    for params in grid_search_param:
        batch_size = params['batch_size']
        window_length = params['window_length']
        embedding_dim = params['embedding_dim']
        dropout_fc = params['dropout_fc']
        hidden_layer_size = params['hidden_layer_size']
        n_output_features = params['n_output_features']
        n_input_features = params['n_input_features']

        with mlflow.start_run(nested=True) as child_run:
            # log parameter
            mlflow.log_param('batch_size', batch_size)
            mlflow.log_param('window_length', window_length)
            mlflow.log_param('hidden_layer_size', hidden_layer_size)
            mlflow.log_param('dropout_fc_layer', dropout_fc)
            mlflow.log_param('embedding_dim', embedding_dim)
            mlflow.log_param('n_output_features', n_output_features)
            mlflow.log_param('n_unique_input_ids', n_unique_input_ids)
            mlflow.log_param('n_input_features', n_input_features)

            model = build_autoencoder(window_length=window_length,
                                             n_output_features=n_output_features,
                                             n_unique_input_ids=n_unique_input_ids,
                                             n_input_features=n_input_features,
                                             embedding_dim=embedding_dim,
                                             units_lstm_layer=hidden_layer_size,
                                             dropout_rate=dropout_fc)

            data = array_to_tf_data_api(sessions_padded,
                                        sessions_padded,
                                        window_length=window_length,
                                        batch_size=batch_size)

            model.compile(loss=CustomCategoricalCrossentropy(),#tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction='sum'),
                          optimizer=keras.optimizers.Nadam(learning_rate=1e-3),
                          metrics=[CategoricalAccuracy(), CategoricalSessionAccuracy()])

            model.fit(data, shuffle=True, initial_epoch=0, epochs=2,
                      callbacks=[MlflowLogging()])
            
            model.compile()
            model.save("./tmp")
            model.save_weights('weights')

            mlflow.tensorflow.log_model(tf_saved_model_dir='./tmp',
                                        tf_meta_graph_tags='serve',
                                        tf_signature_def_key='serving_default',
                                        artifact_path='saved_model',
                                        registered_model_name='Session Based LSTM Recommender')

            shutil.rmtree("./tmp")

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4




INFO:tensorflow:Assets written to: ./tmp/assets


INFO:tensorflow:Assets written to: ./tmp/assets
2021/02/07 17:03:26 INFO mlflow.tensorflow: Validating the specified TensorFlow model by attempting to load it in a new TensorFlow graph...
2021/02/07 17:04:11 INFO mlflow.tensorflow: Validation succeeded!
Registered model 'Autoencoder for Session Based LSTM Recommender' already exists. Creating a new version of this model...
2021/02/07 17:04:12 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Autoencoder for Session Based LSTM Recommender, version 8
Created version '8' of model 'Autoencoder for Session Based LSTM Recommender'.
