# This Notebook uses a Session Event Dataset from E-Commerce Website(https://www.kaggle.com/mkechinov/ecommerce-behavior-data-from-multi-category-store and https://rees46.com/) to build a Session Based Recommender. An LSTM-based Model will be created and the Metadata will be tracked by MLflow.

In [1]:
#import mlflow
import numpy as np
import os
import shutil
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_hub as hub
from itertools import product

# enable gpu growth if gpu is available
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

# Enable XLA.
tf.config.optimizer.set_jit(True)

%load_ext watermark
%watermark -v -iv

autopep8         1.5.4
tensorflow       2.4.0
tensorflow_hub   0.9.0
numpy            1.19.5
tensorflow.keras 2.4.0
pandas           1.0.5
json             2.0.9
CPython 3.7.4
IPython 7.8.0


## Setting Registry and Tracking URI for MLflow

In [2]:
# Use this registry uri when mlflow is created by docker container with a mysql db backend
#registry_uri = os.path.expandvars('mysql+pymysql://${MYSQL_USER}:${MYSQL_PASSWORD}@localhost:3306/${MYSQL_DATABASE}')

# Use this registry uri when mlflow is running locally by the command:
# "mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./mlruns --host 0.0.0.0"
registry_uri = 'sqlite:///mlflow.db'

tracking_uri = 'http://localhost:5000'

mlflow.tracking.set_registry_uri(registry_uri)
mlflow.tracking.set_tracking_uri(tracking_uri)

# The Data is taken from https://www.kaggle.com/mkechinov/ecommerce-behavior-data-from-multi-category-store and https://rees46.com/
## Each record/line in the file has the following fields:
1. event_time: When did the event happened (UTC)
2. event_type: Event type: one of [view, cart, remove_from_cart, purchase] 
3. product_id
4. category_id
5. category_code: Category meaningful name (if present)
6. brand: Brand name in lower case (if present)
7. price
8. user_id: Permanent user ID
9. user_session: User session ID

In [2]:
# Read first 500.000 Rows
for chunk in pd.read_table("2019-Dec.csv",
                           sep=",", header=0,
                           infer_datetime_format=True, low_memory=False, chunksize=500000):
    # Filter out other event types than 'view'
    chunk = chunk[chunk['event_type'] == 'view']
    # Filter out missing 'category_code' rows
    chunk = chunk[chunk['category_code'].isna() == False]
    chunk.reset_index(drop=True, inplace=True)

    # Filter out all Sessions of length 1
    count_sessions = chunk.groupby('user_session').count()
    window_length = count_sessions.max()[0]
    unique_sessions = [count_sessions.index[i] for i in range(
        count_sessions.shape[0]) if count_sessions.iloc[i, 0] == 1]
    chunk = chunk[~chunk['user_session'].isin(unique_sessions)]
    chunk.reset_index(drop=True, inplace=True)

    # Text embedding based on https://tfhub.dev/google/nnlm-en-dim50/2
    last_category = []
    for i, el in enumerate(chunk['category_code']):
        last_category.append(el.split('.')[-1])
    chunk['Product'] = last_category
    embed = hub.load("https://tfhub.dev/google/nnlm-en-dim50/2")
    embeddings = embed(chunk['Product'].tolist())
    for dim in range(embeddings.shape[1]):
        chunk['embedding_'+str(dim)] = embeddings[:, dim]

    # Standardization
    mean = chunk['price'].mean(axis=0)
    print('Mean:', mean)
    std = chunk['price'].std(axis=0)
    print('Std:', std)
    chunk['price_standardized'] = (chunk['price'] - mean) / std

    chunk.sort_values(by=['user_session', 'event_time'], inplace=True)
    chunk['price_standardized'] = chunk['price_standardized'].astype('float32')
    chunk['product_id'] = chunk['product_id'].astype('int32')
    chunk.reset_index(drop=True, inplace=True)

    print('Sessions:', pd.unique(chunk['user_session']).shape)
    print('Unique Products:', pd.unique(chunk['product_id']).shape)
    print('Unique category_code:', pd.unique(chunk['category_code']).shape)

    columns = ['embedding_'+str(i) for i in range(embeddings.shape[1])]
    columns.append('price_standardized')
    columns.append('user_session')
    columns.append('Product')
    columns.append('product_id')
    columns.append('category_code')

    df = chunk[columns]
    break
df

Mean: 284.77105468660056
Std: 349.4674023158121
Sessions: (61296,)
Unique Products: (38515,)
Unique category_code: (134,)


Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_45,embedding_46,embedding_47,embedding_48,embedding_49,price_standardized,user_session,Product,product_id,category_code
0,0.047610,-0.125734,-0.053261,0.196848,-0.016433,0.049773,0.012852,0.263229,-0.195415,0.327227,...,-0.214412,0.090539,0.104421,0.061444,-0.008996,-0.373028,0000afb3-2d30-4b52-84ec-07c6617efd37,light,1004838,construction.tools.light
1,0.047610,-0.125734,-0.053261,0.196848,-0.016433,0.049773,0.012852,0.263229,-0.195415,0.327227,...,-0.214412,0.090539,0.104421,0.061444,-0.008996,-0.373028,0000afb3-2d30-4b52-84ec-07c6617efd37,light,1004838,construction.tools.light
2,0.047610,-0.125734,-0.053261,0.196848,-0.016433,0.049773,0.012852,0.263229,-0.195415,0.327227,...,-0.214412,0.090539,0.104421,0.061444,-0.008996,0.068930,0000b83c-9b26-4881-8bca-e20d460f4194,light,1005252,construction.tools.light
3,0.047610,-0.125734,-0.053261,0.196848,-0.016433,0.049773,0.012852,0.263229,-0.195415,0.327227,...,-0.214412,0.090539,0.104421,0.061444,-0.008996,0.289895,0000b83c-9b26-4881-8bca-e20d460f4194,light,1004503,construction.tools.light
4,0.047610,-0.125734,-0.053261,0.196848,-0.016433,0.049773,0.012852,0.263229,-0.195415,0.327227,...,-0.214412,0.090539,0.104421,0.061444,-0.008996,-0.291904,0000f7c4-8836-4507-82a1-8a10de3fb1b2,light,1005191,construction.tools.light
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369299,0.174397,-0.204014,-0.175919,0.105906,-0.205940,-0.210022,0.224095,0.210598,-0.162613,0.150693,...,0.071817,-0.129293,-0.176801,0.100340,0.119850,-0.630791,fffdfd5e-126c-409f-9c16-8224f22cb60b,cooler,4400467,computers.components.cooler
369300,0.047610,-0.125734,-0.053261,0.196848,-0.016433,0.049773,0.012852,0.263229,-0.195415,0.327227,...,-0.214412,0.090539,0.104421,0.061444,-0.008996,-0.511696,fffe34dd-9537-4991-9f12-d81f1dda91cb,light,1004903,construction.tools.light
369301,0.047610,-0.125734,-0.053261,0.196848,-0.016433,0.049773,0.012852,0.263229,-0.195415,0.327227,...,-0.214412,0.090539,0.104421,0.061444,-0.008996,-0.511696,fffe34dd-9537-4991-9f12-d81f1dda91cb,light,1004903,construction.tools.light
369302,0.047610,-0.125734,-0.053261,0.196848,-0.016433,0.049773,0.012852,0.263229,-0.195415,0.327227,...,-0.214412,0.090539,0.104421,0.061444,-0.008996,-0.459760,fffe34dd-9537-4991-9f12-d81f1dda91cb,light,1004856,construction.tools.light


## Delete Rows with equal or less than 6 Product Occurrences

In [3]:
count_product_id_mapped = df.groupby('product_id').count()
products_to_delete = count_product_id_mapped.loc[count_product_id_mapped['embedding_0'] <= 6].index
products_to_delete

Int64Index([  1000978,   1002102,   1002367,   1002876,   1002877,   1002996,
              1003001,   1003014,   1003048,   1003050,
            ...
            100028638, 100028646, 100028653, 100028659, 100028661, 100028663,
            100028668, 100028717, 100028774, 100028794],
           dtype='int64', name='product_id', length=29774)

## Slice Sessions from the Dataframe

In [4]:
list_sessions = []
list_last_clicked = []
current_id = df.loc[0, 'user_session']
current_index = 0

columns = ['embedding_'+str(i) for i in range(embeddings.shape[1])]
columns.append('price_standardized')
columns.insert(0, 'product_id')

for i in range(df.shape[0]):
    if df.loc[i, 'user_session'] != current_id:
        list_sessions.append(df.loc[current_index:i-2, columns])
        list_last_clicked.append(df.loc[i-1, 'product_id'])
        current_id = df.loc[i, 'user_session']
        current_index = i

## Delete Sessions with Length larger than 30

In [16]:
print(len(list_sessions))
list_sessions_filtered = []
list_last_clicked_filtered = []

for index, session in enumerate(list_sessions):
    if not (session.shape[0] > 30):
        if not (session['product_id'].isin(products_to_delete).any()):
            list_sessions_filtered.append(session)
            list_last_clicked_filtered.append(list_last_clicked[index])
            
len(list_sessions_filtered)

61295


44551

## Slice Sessions if label and last product from session is the same
Example:
- From: session: [ 1506  1506 11410 11410  2826  2826], ground truth: 2826
- To: session: [ 1506  1506 11410 11410], ground truth: 2826

In [6]:
print("Length before", len(list_sessions_filtered))
list_sessions_processed = []
list_last_clicked_processed = []

for i, session in enumerate(list_sessions_filtered):
    if session['product_id'].values[-1] == list_last_clicked_filtered[i]:
        mask = session['product_id'].values == list_last_clicked_filtered[i]
        if session[~mask].shape[0] > 0:
            list_sessions_processed.append(session[~mask])
            list_last_clicked_processed.append(list_last_clicked_filtered[i])
    else:
        list_sessions_processed.append(session)
        list_last_clicked_processed.append(list_last_clicked_filtered[i])

print("Length after", len(list_sessions_processed))

Length before 44551
Length after 30941


## Create Item IDs starting from value 1 for Embeddings and One Hot Layer

In [7]:
products = pd.DataFrame()

for index, session in enumerate(list_sessions_processed):
    products = pd.concat([products, session['product_id']], ignore_index=True)

products = pd.concat([products, pd.DataFrame(
    list_last_clicked_processed)], ignore_index=True)

unique_items = pd.unique(products[0])

print('Number of unique Items:', unique_items.shape[0])
dict_items = dict(
    zip(unique_items, [i+1 for i in range(unique_items.shape[0])]))

for index, session in enumerate(list_sessions_processed):
    session['product_id'] = session['product_id'].map(dict_items)

list_last_clicked_processed = pd.DataFrame(list_last_clicked_processed)[
    0].map(dict_items).tolist()

d = pd.DataFrame.from_records(data=list(dict_items.items()), columns=[
    'Item_ID', 'Mapped_ID'])
# map product_id to category_code
d['category_code'] = [df[df['product_id'] == i]
                      ['category_code'].values[0] for i in d['Item_ID']]
d.to_csv('ID_Mapping.csv')

Number of unique Items: 9494


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [24]:
# Pad all Sessions with 0. Embedding Layer and LSTM will use Masking to ignore zeros.
list_sessions_padded = []
window_length = 30

for df in list_sessions_processed:
    np_array = df.values
    result = np.zeros((window_length, np_array.shape[1]), dtype=np.float32)

    result[:np_array.shape[0], :np_array.shape[1]] = np_array
    list_sessions_padded.append(result)


# Save the results, because the slicing can take some time
np.save('list_sessions_padded.npy', list_sessions_padded)
np.save('list_last_clicked.npy', list_last_clicked_processed)

sessions_padded = np.array(list_sessions_padded)
last_clicked = np.array(list_last_clicked_processed)

n_output_features = int(last_clicked.max())
n_unique_input_ids = int(sessions_padded[:, :, 0].max())
window_length = sessions_padded.shape[1]
n_input_features = sessions_padded.shape[2]
print("n_output_features", n_output_features)
print("n_unique_input_ids", n_unique_input_ids)
print("window_length", window_length)
print("n_input_features", n_input_features)

n_output_features 9494
n_unique_input_ids 7030
window_length 30
n_input_features 52


# Training: Start here if the preprocessing was already executed

In [26]:
sessions_padded = np.load('list_sessions_padded.npy')
print(sessions_padded.shape)
last_clicked = np.load('list_last_clicked.npy')
print(last_clicked.shape)
n_output_features = int(last_clicked.max())
n_unique_input_ids = int(sessions_padded[:, :, 0].max())
window_length = sessions_padded.shape[1]
n_input_features = sessions_padded.shape[2]

(30941, 30, 52)
(30941,)


## Grid Search Hyperparameter
Dictionary with different hyperparameters to train on.
MLflow will track those in a database.

In [27]:
grid_search_dic = {'hidden_layer_size': [5],
                   'batch_size': [32],
                   'embedding_dim': [200],
                   'window_length': [window_length],
                   'dropout_fc': [0.0],  # 0.2
                   'n_output_features': [n_output_features],
                   'n_input_features': [n_input_features]}

# Cartesian product
grid_search_param = [dict(zip(grid_search_dic, v))
                     for v in product(*grid_search_dic.values())]
grid_search_param

[{'hidden_layer_size': 5,
  'batch_size': 32,
  'embedding_dim': 200,
  'window_length': 30,
  'dropout_fc': 0.0,
  'n_output_features': 9494,
  'n_input_features': 52}]

### LSTM Model in functional API
2 layer LSTM model for predicting the next clicked product.
- Input: x rows (time steps) of Item IDs in a Session. Shape: (batches, window_length, features)
- Output: Prediction of the next Clicked Item. Shape: (batches,)

In [28]:
def build_lstm_2_layer_model(window_length=50, embedding_dim=200, n_input_features=5, n_unique_input_ids=0,
                             n_output_features=3, units_lstm_layer=30, dropout_rate=0.2):
    """Builds 2 Layer LSTM-based TF Model in functional API.
    Args:
        window_length: Input Data as Numpy Array, Shape (rows, n_features)
        embedding_dim: Number Dimensions of the Embedding Layer.
        n_output_features: Number (Classes) of Items.
        units_lstm_layer: Number of Neurons for the LSTM Layers.
        dropout_rate: Dropout Rate for the last Fully Connected Dense Layer.
    Returns:
        keras.models.Model
    """
    inputs = keras.layers.Input(
        shape=[window_length, n_input_features], dtype=np.float32)

    # Embedding Layer
    embedding_layer = tf.keras.layers.Embedding(
        n_unique_input_ids+1, embedding_dim, input_length=window_length, mask_zero=True)
    embeddings = embedding_layer(inputs[:, :, 0])

    concat = tf.concat([embeddings, inputs[:, :, 1:]], axis=2)

    # LSTM Layer 1
    lstm1_output, lstm1_state_h, lstm1_state_c = keras.layers.LSTM(units=units_lstm_layer, return_state=True,
                                                                   return_sequences=True)(concat)
    lstm1_state = [lstm1_state_h, lstm1_state_c]

    # LSTM Layer 2
    lstm2_output, lstm2_state_h, lstm2_state_c = keras.layers.LSTM(units=units_lstm_layer, return_state=True,
                                                                   return_sequences=True)(lstm1_output,
                                                                                          initial_state=lstm1_state)

    reshaped = tf.reshape(lstm2_output,
                          [-1, window_length * units_lstm_layer])
    #concat = tf.concat([lstm2_state_h, lstm2_state_c], axis=1)

    # Dropout
    dropout = tf.keras.layers.Dropout(dropout_rate)(reshaped)

    fc_layer = keras.layers.Dense(n_output_features+1, kernel_initializer='he_normal', dtype=tf.float32)(
        dropout)

    softmax = tf.keras.layers.Softmax(axis=1)(fc_layer)

    model = keras.models.Model(inputs=[inputs],
                               outputs=[softmax])
    return model

### Convert Numpy Array to tf.data.Dataset for better training performance
The function will return a zipped tf.data.Dataset with the following Shapes:
- x: (batches, window_length, features)
- y: (batches,)

In [29]:
def array_to_tf_data_api(train_data_x, train_data_y, batch_size=64, window_length=50,
                         validate=False):
    """Applies sliding window on the fly by using the TF Data API.
    Args:
      train_data_x: Input Data as Numpy Array, Shape (rows, n_features)
      batch_size: Batch Size.
      window_length: Window Length or Window Size.
      future_length: Number of time steps that will be predicted in the future.
      n_output_features: Number of features that will be predicted.
      validate: True if input data is a validation set and does not need to be shuffled
      shift: Shifts the Sliding Window by this Parameter.
    Returns:
      tf.data.Dataset
    """

    X = tf.data.Dataset.from_tensor_slices(train_data_x)
    y = tf.data.Dataset.from_tensor_slices(train_data_y)

    if not validate:
        train_tf_data = tf.data.Dataset.zip((X, y)).cache() \
            .shuffle(buffer_size=200000, reshuffle_each_iteration=True)\
            .batch(batch_size).prefetch(1)
        return train_tf_data
    else:
        return tf.data.Dataset.zip((X, y)).batch(batch_size)\
            .prefetch(1)

## Custom TF Callback to log Metrics by MLflow

In [11]:
class MlflowLogging(tf.keras.callbacks.Callback):
    def __init__(self, **kwargs):
        super().__init__()  # handles base args (e.g., dtype)

    def on_epoch_end(self, epoch, logs=None):
        keys = list(logs.keys())
        for key in keys:
            mlflow.log_metric(str(key), logs.get(key), step=epoch)

# Training

In [12]:
with mlflow.start_run() as parent_run:
    for params in grid_search_param:
        batch_size = params['batch_size']
        window_length = params['window_length']
        embedding_dim = params['embedding_dim']
        dropout_fc = params['dropout_fc']
        hidden_layer_size = params['hidden_layer_size']
        n_output_features = params['n_output_features']
        n_input_features = params['n_input_features']

        with mlflow.start_run(nested=True) as child_run:
            # log parameter
            mlflow.log_param('batch_size', batch_size)
            mlflow.log_param('window_length', window_length)
            mlflow.log_param('hidden_layer_size', hidden_layer_size)
            mlflow.log_param('dropout_fc_layer', dropout_fc)
            mlflow.log_param('embedding_dim', embedding_dim)
            mlflow.log_param('n_output_features', n_output_features)
            mlflow.log_param('n_unique_input_ids', n_unique_input_ids)
            mlflow.log_param('n_input_features', n_input_features)

            model = build_lstm_2_layer_model(window_length=window_length,
                                             n_output_features=n_output_features,
                                             n_unique_input_ids=n_unique_input_ids,
                                             n_input_features=n_input_features,
                                             embedding_dim=embedding_dim,
                                             units_lstm_layer=hidden_layer_size,
                                             dropout_rate=dropout_fc)

            data = array_to_tf_data_api(sessions_padded,
                                        last_clicked,
                                        window_length=window_length,
                                        batch_size=batch_size)

            model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                          optimizer=keras.optimizers.Nadam(learning_rate=1e-3),
                          metrics=['accuracy'])

            model.fit(data, shuffle=True, initial_epoch=0, epochs=50,
                      callbacks=[MlflowLogging()])

            model.save("./tmp")

            mlflow.tensorflow.log_model(tf_saved_model_dir='./tmp',
                                        tf_meta_graph_tags='serve',
                                        tf_signature_def_key='serving_default',
                                        artifact_path='saved_model',
                                        registered_model_name='Session Based LSTM Recommender')

            shutil.rmtree("./tmp")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50




INFO:tensorflow:Assets written to: ./tmp/assets


INFO:tensorflow:Assets written to: ./tmp/assets
2021/01/15 15:34:05 INFO mlflow.tensorflow: Validating the specified TensorFlow model by attempting to load it in a new TensorFlow graph...
2021/01/15 15:34:08 INFO mlflow.tensorflow: Validation succeeded!
Successfully registered model 'Session Based LSTM Recommender'.
2021/01/15 15:34:08 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Session Based LSTM Recommender, version 1
Created version '1' of model 'Session Based LSTM Recommender'.
