# **GRU4Rec**

In order to train the GRU4Rec model, a dataset is needed with the following columns: **session**, **timestamp**, **item**. Each row corresponds to a single interaction. The dataset is sorted by session and timestamp to facilitate session retrieval.

We use the SessionDataset class to store the training data plus some ready-to-use information, like session offsets (= indices where each session starts in the dataset) and item-to-integer mappings. We also define a function to extract a single specific session.

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

class SessionDataset:
    """Interaction log ordered by session and time, with ready-to-use lookup helpers.

    The constructor sorts the interactions, records where each session starts
    and assigns an integer id to every distinct item.
    """

    def __init__(self, df):
        """Sort the interactions and precompute session offsets and item ids.

        Args:
            df: DataFrame with the columns ``session``, ``timestamp`` and
                ``item``; each row is a single interaction.
        """
        # session (int) | timestamp (int) | item (string)
        self.df = df.sort_values(by=['session', 'timestamp']).reset_index(drop=True)

        # offsets[i] = row index in self.df where session i starts;
        # offsets[-1] = total number of rows, so session i spans offsets[i]:offsets[i+1]
        session_sizes = self.df.groupby('session').size()
        self.offsets = np.concatenate((np.zeros(1, dtype=np.int32), session_sizes.cumsum().values))
        self.n_sessions = len(self.offsets) - 1

        # item ids follow the order of first appearance in the sorted data
        self.item_to_id = {item: i for i, item in enumerate(self.df['item'].unique())}
        self.n_items = len(self.item_to_id)

    def item_to_one_hot(self, item):
        """Return the one-hot tensor of length ``n_items`` for ``item``.

        Raises:
            KeyError: if ``item`` does not occur in the dataset.
        """
        return tf.one_hot(self.item_to_id[item], depth=self.n_items)

    def extract_session(self, i, one_hot_encoded=True):
        """Return the items of the i-th session in chronological order.

        Args:
            i: position of the session (0 <= i < n_sessions) in the sorted data.
            one_hot_encoded: if True each item is returned as a one-hot tensor,
                otherwise the raw item values are returned.

        Returns:
            A list with one entry per interaction of the session.

        Raises:
            IndexError: if ``i`` is not a valid session position.
        """
        # A negative index would silently select a wrong (usually empty) slice.
        if not 0 <= i < self.n_sessions:
            raise IndexError("session index {} out of range [0, {})".format(i, self.n_sessions))

        start, stop = self.offsets[i], self.offsets[i + 1]
        items = self.df['item'].iloc[start:stop].tolist()
        if one_hot_encoded:
            # Build the list directly instead of writing tensors back into a DataFrame column.
            return [self.item_to_one_hot(item) for item in items]
        return items


## Loss functions: TOP1 and BPR

In [2]:
# y_true = (BATCH_SIZE, n_classes)   one-hot representations of the target items (ground truths)
# y_pred = (BATCH_SIZE, n_classes)   model output = next item scores (logits) for each item in the batch

sampling = False

if sampling:  # = the negative items considered in the loss computation are those within the same batch

    def BPR(y_true, y_pred):
        """BPR loss using the target items of the other batch samples as negatives.

        Returns the mean over all (negative, sample) pairs of
        -log(sigmoid(positive score - negative score)).
        """
        to_lookup = tf.argmax(y_true, axis=1)   # = indices of the target items
        # embedding_lookup is the same as "extract_rows": scores[i, j] = y_pred[j, target_i],
        # so the positive item of each sample ends up on the diagonal
        scores = tf.nn.embedding_lookup(tf.transpose(y_pred), to_lookup)
        # log_sigmoid avoids the -inf produced by log(sigmoid(x)) when sigmoid underflows to 0
        return tf.reduce_mean(-tf.math.log_sigmoid(tf.linalg.diag_part(scores) - scores))

    def TOP1(y_true, y_pred):
        """TOP1 loss using the target items of the other batch samples as negatives."""
        to_lookup = tf.argmax(y_true, axis=1)
        scores = tf.nn.embedding_lookup(tf.transpose(y_pred), to_lookup)  # scores[i, j] = y_pred[j, target_i]
        diag_scores = tf.linalg.diag_part(scores)                         # positive score of each sample
        loss_by_sample = tf.reduce_mean(tf.nn.sigmoid(scores - diag_scores) + tf.nn.sigmoid(tf.square(scores)), axis=0)
        # only sigmoids of squares of negative items had to be added: remove those of positive items
        loss_by_sample -= tf.nn.sigmoid(tf.square(diag_scores)) / tf.reduce_sum(tf.ones_like(diag_scores))
        return tf.reduce_mean(loss_by_sample)

else:  # = consider all negative items in the loss computation (only makes sense if the number of items is small, like the same order as the batch size)

    def BPR(y_true, y_pred):  # both inputs have shape (BATCH_SIZE, n_classes)
        """BPR loss over all items: -sum of log(sigmoid(positive score - item score)).

        The sum runs over every sample and every item (the target item itself
        contributes the constant log(0.5) term).
        """
        # y_true is one-hot, so this picks the score of the target item: (BATCH_SIZE, 1)
        pos_scores = tf.reduce_sum(y_true * y_pred, axis=-1, keepdims=True)
        score_diffs = pos_scores - y_pred  # (BATCH_SIZE, n_classes)
        # log_sigmoid avoids the -inf produced by log(sigmoid(x)) when sigmoid underflows to 0
        return -tf.reduce_sum(tf.math.log_sigmoid(score_diffs))

    def TOP1(y_true, y_pred):
        """TOP1 loss over all items.

        For every sample: sum over items of sigmoid(item score - positive score)
        plus sigmoid(item score ** 2), with the squared-score term of the
        positive item removed. The result is summed over the batch.
        """
        pos_scores = tf.reduce_sum(y_true * y_pred, axis=-1)     # (BATCH_SIZE,) score of the target item
        score_diffs = tf.expand_dims(pos_scores, -1) - y_pred     # (BATCH_SIZE, n_classes)
        loss_by_sample = (
            tf.reduce_sum(tf.nn.sigmoid(tf.square(y_pred)), axis=-1)
            + tf.reduce_sum(tf.sigmoid(-score_diffs), axis=-1)
            - tf.nn.sigmoid(tf.square(pos_scores))                # regularisation applies to negatives only
        )
        return tf.reduce_sum(loss_by_sample)

The Gru4Rec class is used to instantiate the model and train it on a SessionDataset object.

In [3]:
class Gru4Rec:
    """Stateful GRU next-item model trained on several sessions in parallel.

    Each batch row follows one session at a time; when a row moves on to a new
    session its hidden state is reset to zero.
    """

    def __init__(self, n_classes, n_layers = 1, n_hidden = 64, loss = TOP1, batch_size = 8):
        """Store the hyper-parameters and build the Keras model.

        Args:
            n_classes: number of distinct items (size of the one-hot input and of the output).
            n_layers: number of stacked GRU layers.
            n_hidden: dimension of each GRU hidden state.
            loss: callable ``loss(y_true, y_pred)`` passed to ``model.compile``.
            batch_size: fixed batch size (the model is stateful).
        """
        self.n_classes  = n_classes   # = number of items

        self.n_layers = n_layers  # number of stacked GRU layers
        self.n_hidden = n_hidden  # dimension of GRU cell's hidden state
        self.loss     = loss
        self.batch_size = batch_size

        self.model = self.build_model()

    def build_model(self):
        """Build, compile and return the stacked-GRU + dense (logits) Keras model."""
        model = tf.keras.models.Sequential()
        for i in range(self.n_layers):
            model.add(tf.keras.layers.GRU(name = 'GRU_{}'.format(i+1),
                                          units      = self.n_hidden,
                                          activation = 'relu',
                                          stateful   = True,
                                          return_sequences = (i < self.n_layers - 1)))
        model.add(tf.keras.layers.Dense(units = self.n_classes, activation = 'linear'))   # class logits

        # track top 3 accuracy (= how often the true item is among the top 3 recommended)
        def top3accuracy(y_true, y_pred):
            return tf.keras.metrics.top_k_categorical_accuracy(y_true, y_pred, k = 3)

        model.compile(loss = self.loss, optimizer = 'adam', metrics = ['accuracy', top3accuracy])

        model.build(input_shape = (self.batch_size, 1, self.n_classes))
        model.summary()   # summary() prints by itself; wrapping it in print() only adds a stray "None"

        return model

    def _reset_hidden(self, i):
        """Zero the hidden state of batch row ``i`` in every GRU layer."""
        for layer in self.model.layers:   # session has changed: reset corresponding hidden state
            if self._is_GRU_layer(layer) and layer.states[0] is not None:
                hidden_updated = layer.states[0].numpy()
                hidden_updated[i, :] = 0.
                layer.reset_states(hidden_updated)

    def _is_GRU_layer(self, layer):
        """Return True for the layers created by build_model with a 'GRU_' name prefix."""
        return layer.name.startswith('GRU_')

    def train_batch_generator(self, dataset):  # session | item | timestamp
        """Yield training batches (X, y) = (current session item, next session item) forever.

        Args:
            dataset: object exposing ``n_sessions``, ``offsets`` and
                ``extract_session(i)`` (see SessionDataset).

        Yields:
            Tuple of float32 tensors: X with shape (batch_size, 1, n_classes)
            and y with shape (batch_size, n_classes).

        Raises:
            ValueError: if no session has at least two items.
        """
        assert dataset.n_sessions > self.batch_size, "Training set is too small. Reduce batch size or collect more training data"
        # Without at least one usable session the refill loop below would never terminate.
        if not np.any(np.diff(dataset.offsets) >= 2):
            raise ValueError("No session with at least two items: nothing to train on")

        ixs = np.arange(dataset.n_sessions)

        # stacks containing batch_size REVERSED (pieces of) sessions at once. Will be emptied progressively.
        # One independent list per row ([[]] * n would alias a single list).
        stacks = [[] for _ in range(self.batch_size)]
        next_session_id = 0

        X = np.empty(shape = (self.batch_size, 1, self.n_classes))
        y = np.empty(shape = (self.batch_size, self.n_classes))
        while True:
            X[:], y[:] = np.nan, np.nan
            for i in range(self.batch_size): # fill in X, y (current batch)
                # 1. If stack i is empty (only happens at first round) or has only one element: fill it with a new session
                if len(stacks[i]) <= 1:
                    while True:
                        # The wrap-around check must run before EVERY draw: skipping short
                        # sessions can otherwise push next_session_id past the last session.
                        if next_session_id >= dataset.n_sessions: # no more sessions available: shuffle sessions and restart
                            np.random.shuffle(ixs)
                            next_session_id = 0
                        # the data does not have to be all in memory at the same time: we could e.g. load a session at once
                        stacks[i] = dataset.extract_session(ixs[next_session_id])[::-1]
                        next_session_id += 1
                        if len(stacks[i]) >= 2:   # ignore sessions with only one element (cannot contribute to the training)
                            break
                    self._reset_hidden(i)   # if session changes, the corresponding hidden state must be reset
                # 2. Stack i is now valid: set input + target variables
                X[i, 0] = stacks[i].pop()
                y[i]    = stacks[i][-1]

            yield tf.constant(X, dtype = tf.float32), tf.constant(y, dtype = tf.float32)

    def fit(self, dataset, steps_per_epoch = 10000, epochs = 5):
        """Train the model on ``dataset``, saving a checkpoint after every epoch.

        Args:
            dataset: training data, see train_batch_generator.
            steps_per_epoch: number of generated batches per epoch.
            epochs: number of epochs.
        """
        checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath = "gru-chkpt-{epoch:02d}.hdf5")
        # Model.fit accepts generators; fit_generator is deprecated.
        # NOTE(review): Keras may pull batches from the generator ahead of the training step
        # (version-dependent); the state resets done inside the generator would then not line up
        # exactly with the batch being trained - TODO confirm for the TF version in use.
        self.model.fit(x               = self.train_batch_generator(dataset),
                       steps_per_epoch = steps_per_epoch,
                       epochs          = epochs,
                       callbacks       = [checkpoint],
                       shuffle         = False)

## Test the model

We test the model on a simple workout recommendation task.

We use the workout records dataset available at https://sites.google.com/eng.ucsd.edu/fitrec-project/home (see the paper cited in the Readme). We extracted the plain chronological list of workouts and performed some cleaning (removal of one-item sessions and of duplicate or overlapping sessions). The result is a dataset of workout sequences for slightly more than 1000 users. We treat each sequence (i.e., each user) as a distinct session. The goal is to predict a user's next workout given all the previous ones. Session length varies widely (from 2 to about 200 items). The number of distinct workout types (i.e., classes) is only 48, so we use no sampling in the losses.

The dataset is split into a training set and a test set. The training set contains every session with its last item removed. The test set consists of the last two items of each session: the penultimate item is the input and the last item is the target to predict.

In [8]:
import pandas as pd
# Load the interactions and order them by session, then by time
df = (
    pd.read_csv("workouts_clean_2.csv")
      .sort_values(by=['session', 'timestamp'])
      .reset_index(drop=True)
)

# offsets[i] = first row of session i; offsets[i + 1] = one past its last row
session_ends = df.groupby('session').size().cumsum().values
offsets = np.concatenate((np.zeros(1, dtype=np.int32), session_ends))

# Training set: every session without its last interaction
last_rows = offsets[1:] - 1
is_last_row = df.index.isin(last_rows)
dataset_train = SessionDataset(df.iloc[~is_last_row])

# Test set: x = penultimate item in each session, y = last item in each session
test_columns = ['session', 'item']
X_test = df.iloc[last_rows - 1][test_columns].sort_values(by=['session']).reset_index(drop=True)
y_test = df.iloc[last_rows][test_columns].sort_values(by=['session']).reset_index(drop=True)

# Preview both frames, separated by an empty line
print("X_test", X_test.head(), '', "y_test", y_test.head(), sep='\n')

X_test
   session              item
0       69               run
1     2358               run
2     3808               run
3     4101     mountain bike
4     4434  bike (transport)

y_test
   session              item
0       69               run
1     2358               run
2     3808               run
3     4101               run
4     4434  bike (transport)


In [9]:
# Build the model with one output class per training item; every other hyper-parameter
# keeps the Gru4Rec default (n_layers = 1, n_hidden = 64, loss = TOP1, batch_size = 8)
g4r = Gru4Rec(n_classes = dataset_train.n_items)
# Train with the defaults of Gru4Rec.fit (steps_per_epoch = 10000, epochs = 5);
# a checkpoint file is written after each epoch
g4r.fit(dataset_train)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
GRU_1 (GRU)                  multiple                  21888     
_________________________________________________________________
dense (Dense)                multiple                  3120      
Total params: 25,008
Trainable params: 25,008
Non-trainable params: 0
_________________________________________________________________
None
Instructions for updating:
Please use Model.fit, which supports generators.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Evaluation: calculate and visualize top-3 test accuracy

To evaluate the model, we predict the last item in each session based on the previous items. Since the network is stateful, the batch size cannot be modified (at least in Keras), so we must always predict batch_size elements at once.

First, we calculate and store the final hidden states for all sessions in the training set. This is done in sequence (although it could be parallelized) and takes a few minutes to complete.

In [13]:
# Compute, for every training session, the hidden state of each GRU layer after the model has
# been fed all the items of the session except the last one. The last training item is left on
# the stack on purpose: it is never fed to the model in this cell.
# Sessions are processed batch_size at a time, one item per predict() call; whenever a batch row
# finishes its session, its state is saved and the row moves on to the next session.
final_states = np.empty(shape = (dataset_train.n_sessions, g4r.n_layers, g4r.n_hidden)) # final states will be stored here
final_states[:] = None   # fill with NaN so that rows never written are recognisable
done = [False] * dataset_train.n_sessions   # keep track of the sessions for which the last state has already been calculated

# one REVERSED session per batch row, so that pop() returns the items in chronological order
# (requires at least batch_size sessions in the training set)
stacks = [dataset_train.extract_session(i)[::-1] for i in range(g4r.batch_size)]
next_session_id = g4r.batch_size
batch_idx_to_session = np.arange(g4r.batch_size)   # keep track of which session is in each batch element
X = np.empty(shape = (g4r.batch_size, 1, g4r.n_classes))

g4r.model.reset_states()    # all hidden states set to 0 (starting point)

n_done = 0
while n_done < dataset_train.n_sessions:
    for i in range(g4r.batch_size):
        # a loop rather than an "if": the session loaded next may itself contain a single item
        while len(stacks[i]) == 1:  # stack i is at the end
            if not done[batch_idx_to_session[i]]:
                # save final hidden state (row i of the state of every GRU layer)
                final_states[batch_idx_to_session[i], :] = np.array([layer.states[0][i, :] for layer in g4r.model.layers if g4r._is_GRU_layer(layer)])
                done[batch_idx_to_session[i]] = True
                n_done += 1
                if n_done % 100 == 0:
                    print("Progress: {} / {}".format(n_done, dataset_train.n_sessions))
            if next_session_id >= dataset_train.n_sessions: # restart from the beginning (just to reach required batch size)
                next_session_id = 0
            stacks[i] = dataset_train.extract_session(next_session_id)[::-1]
            batch_idx_to_session[i] = next_session_id
            next_session_id += 1
            g4r._reset_hidden(i)   # session has changed --> reset corresponding hidden state
        X[i, 0] = stacks[i].pop()   # next item of the session currently assigned to row i

    _ = g4r.model.predict(X)   # hidden states get updated when "predict" is called

print("All final hidden states calculated")
# stored on disk so that the evaluation can be re-run without repeating this computation
np.save('final_states.npy', final_states, allow_pickle = False)

Progress: 100 / 1082
Progress: 200 / 1082
Progress: 300 / 1082
Progress: 400 / 1082
Progress: 500 / 1082
Progress: 600 / 1082
Progress: 700 / 1082
Progress: 800 / 1082
Progress: 900 / 1082
Progress: 1000 / 1082
All final hidden states calculated


We can now calculate predictions on the test set and accuracy metrics.

In [14]:
# Predict the last item of every session from (a) the hidden state stored for the session and
# (b) the penultimate item of the session (= last item of the session in the training set).
final_states = np.load('final_states.npy')

g4r.model.reset_states()

n_sessions = dataset_train.n_sessions
batch_size = g4r.batch_size

# The stateful model only accepts full batches, so the number of batches is rounded UP and the
# last batch is completed by re-using the first sessions. (Rounding down would leave the last
# n_sessions % batch_size sessions without a prediction, i.e. NaN rows in y_pred.)
n_batches = -(-n_sessions // batch_size)
session_ids = np.arange(n_batches * batch_size) % n_sessions   # session predicted by each padded row

# Row k of X_test is assumed to belong to the k-th session of dataset_train (both ordered by session).
# X_test itself is left untouched, so that re-running this cell gives the same result.
input_items = X_test['item'].values

gru_layers = [layer for layer in g4r.model.layers if g4r._is_GRU_layer(layer)]

# Calculate next item predictions for all sessions
y_pred = np.full((n_batches * batch_size, g4r.n_classes), np.nan)
X = np.empty(shape = (batch_size, 1, g4r.n_classes))
for batch_id in range(n_batches):
    rows = slice(batch_id * batch_size, (batch_id + 1) * batch_size)
    batch_sessions = session_ids[rows]
    # X contains the penultimate item in the session (= last item in the training set)
    X[:] = np.nan
    for i, session_id in enumerate(batch_sessions):
        X[i, 0] = dataset_train.item_to_one_hot(input_items[session_id])
    # set hidden states equal to final hidden states for sessions in the batch
    for nlg, layer in enumerate(gru_layers):
        layer.reset_states(final_states[batch_sessions, nlg, :])
    # objective: predict last element in the session
    y_pred[rows, :] = g4r.model.predict(X)[:batch_size]

# drop the padding rows: exactly one prediction per session remains
y_pred = tf.constant(y_pred[:n_sessions], dtype = tf.float32)

In [15]:
# Retrieve ground truths
# Ground truths: one one-hot row per session, encoding the last (held-out) item of that session
n_targets, n_target_classes = dataset_train.n_sessions, dataset_train.n_items
target_items = y_test.item.values
one_hot_rows = [dataset_train.item_to_one_hot(target_items[k]).numpy() for k in range(n_targets)]
y_true = tf.constant(
    np.array(one_hot_rows, dtype = np.float64).reshape(n_targets, n_target_classes),
    dtype = tf.float32,
)

In [16]:
# Fraction of sessions whose true last item is ranked first / among the first three predictions
n_evaluated = y_true.shape[0]
top1_hits = tf.keras.metrics.top_k_categorical_accuracy(y_true, y_pred, k = 1)
top3_hits = tf.keras.metrics.top_k_categorical_accuracy(y_true, y_pred, k = 3)

acc = (tf.reduce_sum(top1_hits) / n_evaluated).numpy()
top_3_acc = (tf.reduce_sum(top3_hits) / n_evaluated).numpy()

print("Accuracy = {}".format(acc))
print("Top-3 accuracy = {}".format(top_3_acc))

Accuracy = 0.8133087158203125
Top-3 accuracy = 0.9611830115318298


For this particular problem, which is quite simple, the performance is not significantly better than that of simpler baselines (e.g. a linear autoregressive model).