In [254]:
import tensorflow as tf
import numpy as np
import pandas as pd

class SessionDataset:
    """Session-indexed view over an interaction log.

    The input frame must provide the columns ``session``, ``timestamp`` and
    ``item``. Rows are sorted by (session, timestamp) so that every session
    occupies one contiguous slice of ``self.df``.

    Attributes set by the constructor:
        df              -- the sorted frame, with a fresh 0..n-1 index
        offsets         -- session k occupies rows offsets[k]:offsets[k + 1] of df
        n_sessions      -- number of distinct sessions
        item_to_id      -- item -> integer id (order of first appearance in df)
        id_to_item      -- integer id -> item (inverse of item_to_id)
        n_items         -- number of distinct items
        item_to_one_hot -- item -> one-hot tensor of depth n_items
    """

    def __init__(self, df):
        self.df = df.sort_values(by = ['session', 'timestamp']).reset_index(drop = True) # session (int) | timestamp (int) | item (string)
        # Cumulative session sizes, prefixed with 0, give the slice boundaries of each session.
        self.offsets    = np.concatenate((np.zeros(1, dtype = np.int32), self.df.groupby('session').size().cumsum().values))
        self.n_sessions = len(self.offsets) - 1

        self.item_to_id = {item : i for i, item in enumerate(self.df['item'].unique())}
        # items() yields (item, id) pairs: unpack in that order so the mapping is really id -> item.
        self.id_to_item = {i : item for item, i in self.item_to_id.items()}

        self.n_items = len(self.item_to_id)
        self.item_to_one_hot = {item : tf.one_hot(i, depth = self.n_items) for item, i in self.item_to_id.items()}

    def extract_session(self, i, one_hot_encoded = True):
        """Return the items of the i-th session in chronological order.

        i               -- position of the session, 0 <= i < n_sessions
        one_hot_encoded -- if True return the one-hot tensor of each item,
                           otherwise the raw item values

        Returns a Python list with one entry per event of the session.
        """
        items = self.df['item'].values[self.offsets[i]:self.offsets[i + 1]].tolist()
        if one_hot_encoded:
            # Look the encodings up directly instead of storing tensors in a DataFrame column.
            return [self.item_to_one_hot[item] for item in items]
        return items

# Load the interaction log and order the events chronologically inside each session.
df = pd.read_csv("workouts_clean_2.csv").sort_values(by = ['session', 'timestamp']).reset_index(drop = True)

# Keep only sessions with more than two events: once the last event is held out
# for testing, every training session still has at least two events.
sizes = df.groupby('session').size()
sizes = sizes[sizes > 2].index
df = df.query("session in @sizes").sort_values(by = ['session', 'timestamp']).reset_index(drop = True)

# Session k occupies rows offsets[k]:offsets[k + 1] of df.
offsets = np.concatenate((np.zeros(1, dtype = np.int32), df.groupby('session').size().cumsum().values))

# Leave-one-out split: the last event of each session is the test target ...
dfTest  = df.iloc[offsets[1:] - 1]
# ... and everything else is training data. Dropping by label keeps the rows in
# their sorted order; a set difference has no guaranteed iteration order.
dfTrain = df.drop(index = dfTest.index)

dataset = SessionDataset(dfTrain)

In [256]:
# LOSSES

def TOP1(y_true, y_pred):
    """TOP1-style ranking loss, summed over the batch.

    y_true -- (BATCH_SIZE, n_classes) one-hot encoding of the target item
    y_pred -- (BATCH_SIZE, n_classes) raw scores, same dtype as y_true

    With r_i the score of the target item and r_j the score of item j, each
    sample contributes

        sum_j sigmoid(r_j - r_i)  +  sum_j sigmoid(r_j ** 2)  -  sigmoid(r_i ** 2)

    (the last term removes the target item from the regularisation sum).
    Returns a scalar tensor.
    """
    # Score of the target item, (BATCH_SIZE, 1). A per-sample dot product gives the
    # same value as broadcasting through a (BATCH_SIZE, n_classes, n_classes) matrix,
    # without the quadratic memory cost.
    pos_score = tf.reduce_sum(y_true * y_pred, axis = -1, keepdims = True)
    score_diffs = pos_score - y_pred   # r_i - r_j, (BATCH_SIZE, n_classes)

    ranking        = tf.reduce_sum(tf.nn.sigmoid(-score_diffs), axis = -1)
    regularization = tf.reduce_sum(tf.nn.sigmoid(tf.square(y_pred)), axis = -1)
    target_term    = tf.reduce_sum(tf.nn.sigmoid(tf.square(pos_score)), axis = -1)  # sum over the size-1 axis

    loss_by_sample = regularization + ranking - target_term
    return tf.reduce_sum(loss_by_sample)

def BPR(y_true, y_pred):  # both inputs have shape (BATCH_SIZE, n_classes)
    """BPR-style pairwise ranking loss, summed over the batch.

    y_true -- (BATCH_SIZE, n_classes) one-hot encoding of the target item
    y_pred -- (BATCH_SIZE, n_classes) raw scores, same dtype as y_true

    With r_i the score of the target item, returns the scalar
    -sum_{samples} sum_j log(sigmoid(r_i - r_j)).
    """
    # Score of the target item, (BATCH_SIZE, 1); a dot product avoids building a
    # (BATCH_SIZE, n_classes, n_classes) matrix just to broadcast it.
    pos_score = tf.reduce_sum(y_true * y_pred, axis = -1, keepdims = True)
    score_diffs = pos_score - y_pred   # r_i - r_j, (BATCH_SIZE, n_classes)
    # log_sigmoid stays finite where log(sigmoid(x)) underflows to -inf for very negative x.
    return -tf.reduce_sum(tf.math.log_sigmoid(score_diffs))


In [257]:
class Gru4Rec:
    """Session-based next-item model: stacked stateful GRU layers followed by a
    linear layer producing one score per item.

    Sessions are processed in parallel, one per batch slot, one event per step.
    When a slot moves on to a new session its rows of the GRU hidden states are
    reset to zero.
    """

    def __init__(self, n_classes, n_layers = 1, n_hidden = 128, loss = 'TOP1', batch_size = 32):
        """
        n_classes  -- number of items (size of the one-hot input and of the output)
        n_layers   -- number of stacked GRU layers
        n_hidden   -- dimension of each GRU hidden state
        loss       -- 'TOP1', 'BPR', a callable loss(y_true, y_pred), or any
                      other value accepted by Keras ``compile``
        batch_size -- number of sessions processed in parallel
        """
        self.n_classes  = n_classes   # = number of items

        self.n_layers = n_layers  # number of stacked GRU layers
        self.n_hidden = n_hidden  # dimension of GRU cell's hidden state

        self.loss = loss
        self.batch_size = batch_size

        self.model = self.build_model()

    def _resolve_loss(self):
        """Translate self.loss into something Keras ``compile`` accepts."""
        if callable(self.loss):
            return self.loss
        # Looked up lazily so the loss functions only need to exist when a model is built.
        custom_losses = {'TOP1': TOP1, 'BPR': BPR}
        return custom_losses.get(self.loss, self.loss)

    def _gru_layers(self):
        """Return the GRU layers of the model, in stacking order."""
        return [layer for layer in self.model.layers if layer.name.startswith('GRU_')]

    def build_model(self):
        """Build and compile the Keras model (GRU stack + linear output layer)."""
        model = tf.keras.models.Sequential()
        for i in range(self.n_layers):
            model.add(tf.keras.layers.GRU(name = 'GRU_{}'.format(i+1),
                                          units      = self.n_hidden,
                                          activation = 'relu',
                                          stateful   = True,
                                          # only the last GRU layer collapses the time axis
                                          return_sequences = (i < self.n_layers - 1)))
        model.add(tf.keras.layers.Dense(units = self.n_classes, activation = 'linear'))

        def top3accuracy(y_true, y_pred):
            return tf.keras.metrics.top_k_categorical_accuracy(y_true, y_pred, k = 3)

        model.compile(loss = self._resolve_loss(), optimizer = 'adam', metrics = ['accuracy', top3accuracy])

        return model

    def _reset_hidden(self, i):
        """Zero row i of the hidden state of every GRU layer (slot i changed session)."""
        for layer in self._gru_layers():
            state = layer.states[0]
            if state is None:   # layer not built yet: there is nothing to reset
                continue
            hidden_updated = state.numpy()
            hidden_updated[i, :] = 0.
            layer.reset_states(hidden_updated)

    def train_batch_generator(self, dataset):  # session | item | timestamp
        """Endless generator of (X, y) training batches.

        X has shape (batch_size, 1, n_classes): the current event of each slot.
        y has shape (batch_size, n_classes): the event that follows it.
        Sessions with fewer than two events cannot provide an (input, target)
        pair and are skipped. Once every session has been handed out, the
        session order is shuffled and iteration restarts.

        Side effect: resets the model's hidden state of a slot whenever that
        slot switches to a new session.
        Raises ValueError if no session has at least two events.
        """
        assert dataset.n_sessions > self.batch_size, "Training set is too small"

        ixs = np.arange(dataset.n_sessions)
        stacks = [dataset.extract_session(ixs[i])[::-1] for i in range(self.batch_size)]   # events are in reverse time order
        next_session_id = self.batch_size

        X = np.empty(shape = (self.batch_size, 1, self.n_classes))
        y = np.empty(shape = (self.batch_size, self.n_classes))
        while True:
            for i in range(self.batch_size):
                # 1. If stack i cannot provide an (input, target) pair: change session
                loaded = 0
                while len(stacks[i]) <= 1:
                    # 2 * n_sessions consecutive loads are enough to have tried every session
                    if loaded >= 2 * dataset.n_sessions:
                        raise ValueError("No session with at least two events")
                    if next_session_id >= dataset.n_sessions: # no more sessions available: shuffle sessions and restart
                        np.random.shuffle(ixs)
                        next_session_id = 0
                    stacks[i] = dataset.extract_session(ixs[next_session_id])[::-1]
                    next_session_id += 1
                    loaded += 1
                if loaded:
                    self._reset_hidden(i)
                # 2. Stack i is now valid: set input + target variables
                X[i, 0] = stacks[i].pop()
                y[i]    = stacks[i][-1]

            yield tf.constant(X, dtype = tf.float32), tf.constant(y, dtype = tf.float32)

    def fit(self, dataset, steps_per_epoch = 10000, epochs = 10):
        """Train the model, writing a checkpoint file after every epoch.

        Returns whatever the underlying Keras training call returns.
        """
        checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath = "gru-chkpt-{epoch:02d}.hdf5")
        # workers = 0 runs the generator on the main thread. The generator resets the
        # model's hidden states itself, so it should not run ahead of the training
        # step in a background worker.
        return self.model.fit_generator(generator       = self.train_batch_generator(dataset),
                                        steps_per_epoch = steps_per_epoch,
                                        epochs          = epochs,
                                        callbacks       = [checkpoint],
                                        workers         = 0,
                                        shuffle         = False)

    def get_final_hidden_states(self, dataset):
        """Run every session through the model and collect its hidden state.

        For each session the state is captured when only its last event is
        still unread, i.e. after all events but the last one have been fed.

        Returns an array of shape (n_sessions, n_layers, n_hidden); entries
        start as NaN and are filled in as sessions complete.
        """
        assert dataset.n_sessions >= self.batch_size, "Dataset has fewer sessions than batch_size"

        final_states = np.full((dataset.n_sessions, self.n_layers, self.n_hidden), np.nan)
        done = [False] * dataset.n_sessions

        stacks = [dataset.extract_session(i)[::-1] for i in range(self.batch_size)]   # events are in reverse time order
        next_session_id = self.batch_size
        batch_idx_to_session = np.arange(self.batch_size)
        X = np.empty(shape = (self.batch_size, 1, self.n_classes))

        self.model.reset_states()
        gru_layers = self._gru_layers()

        n_done = 0
        while n_done < dataset.n_sessions:
            for i in range(self.batch_size):
                if len(stacks[i]) <= 1:
                    session_id = batch_idx_to_session[i]
                    if not done[session_id]:
                        final_states[session_id, :] = np.array([layer.states[0].numpy()[i, :] for layer in gru_layers])
                        done[session_id] = True
                        n_done += 1
                        print("{} / {}".format(n_done, dataset.n_sessions))
                    if next_session_id >= dataset.n_sessions: # restart from the beginning
                        next_session_id = 0
                    stacks[i] = dataset.extract_session(next_session_id)[::-1]
                    batch_idx_to_session[i] = next_session_id
                    next_session_id += 1
                    self._reset_hidden(i)   # session has changed --> reset corresponding hidden state
                X[i, 0] = stacks[i].pop()

            # The stateful layers need exactly batch_size rows per call; predict's
            # default batch size would only match when batch_size == 32.
            _ = self.model.predict(X, batch_size = self.batch_size)   # hidden states get updated

        return final_states





In [258]:
# Build the recommender with its default hyper-parameters (one output class per
# item of the training set) and train it on the training sessions.
g4r = Gru4Rec(n_classes = dataset.n_items)
g4r.fit(dataset)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
 1349/10000 [===>..........................] - ETA: 35s - loss: 806.9630 - accuracy: 0.8047 - top3accuracy: 0.9848

KeyboardInterrupt: ignored

In [259]:
# Replay every training session through the trained model and collect, per session,
# the GRU hidden states as an array of shape (n_sessions, n_layers, n_hidden).
final_states = g4r.get_final_hidden_states(dataset)

1 / 1059
2 / 1059
3 / 1059
4 / 1059
5 / 1059
6 / 1059
7 / 1059
8 / 1059
9 / 1059
10 / 1059
11 / 1059
12 / 1059
13 / 1059
14 / 1059
15 / 1059
16 / 1059
17 / 1059
18 / 1059
19 / 1059
20 / 1059
21 / 1059
22 / 1059
23 / 1059
24 / 1059
25 / 1059
26 / 1059
27 / 1059
28 / 1059
29 / 1059
30 / 1059
31 / 1059
32 / 1059
33 / 1059
34 / 1059
35 / 1059
36 / 1059
37 / 1059
38 / 1059
39 / 1059
40 / 1059
41 / 1059
42 / 1059
43 / 1059
44 / 1059
45 / 1059
46 / 1059
47 / 1059
48 / 1059
49 / 1059
50 / 1059
51 / 1059
52 / 1059
53 / 1059
54 / 1059
55 / 1059
56 / 1059
57 / 1059
58 / 1059
59 / 1059
60 / 1059
61 / 1059
62 / 1059
63 / 1059
64 / 1059
65 / 1059
66 / 1059
67 / 1059
68 / 1059
69 / 1059
70 / 1059
71 / 1059
72 / 1059
73 / 1059
74 / 1059
75 / 1059
76 / 1059
77 / 1059
78 / 1059
79 / 1059
80 / 1059
81 / 1059
82 / 1059
83 / 1059
84 / 1059
85 / 1059
86 / 1059
87 / 1059
88 / 1059
89 / 1059
90 / 1059
91 / 1059
92 / 1059
93 / 1059
94 / 1059
95 / 1059
96 / 1059
97 / 1059
98 / 1059
99 / 1059
100 / 1059
101 / 10

In [260]:
# Persist the collected hidden states so the evaluation can reload them from disk.
np.save('final_states.npy', final_states, allow_pickle = False)

In [261]:
final_states = np.load('final_states.npy')

# Last training event of every session, in session order: session k ends at row
# offsets[k + 1] - 1 of the (session, timestamp)-sorted training frame.
_last_states = dataset.df.iloc[dataset.offsets[1:] - 1]['item'].values

g4r.model.reset_states()

# The stateful model only accepts full batches, so sessions beyond the last
# complete batch are not evaluated.
n_batches = len(_last_states) // g4r.batch_size
n_eval    = n_batches * g4r.batch_size
gru_layers = [layer for layer in g4r.model.layers if layer.name.startswith('GRU_')]

y = np.full((dataset.n_sessions, g4r.n_classes), np.nan)
X = np.empty(shape = (g4r.batch_size, 1, g4r.n_classes))
for batch_id in range(n_batches):
    batch = slice(batch_id * g4r.batch_size, (batch_id + 1) * g4r.batch_size)
    X[:] = np.nan
    for i, item in enumerate(_last_states[batch]):
        X[i, :] = dataset.item_to_one_hot[item]
    # Restore, for each GRU layer, the stored hidden states of the sessions in this batch.
    for nlg, layer in enumerate(gru_layers):
        layer.reset_states(final_states[batch, nlg, :])
    # Pass batch_size explicitly: the stateful layers need exactly that many rows per call.
    y[batch, :] = g4r.model.predict(X, batch_size = g4r.batch_size)

y = tf.constant(y[:n_eval], dtype = tf.float32)

In [262]:
# One-hot targets: the held-out last event of each session, in session order.
# Only the sessions covered by complete batches of g4r.batch_size are kept,
# derived from the data instead of a hard-coded row count.
n_eval = (dataset.n_sessions // g4r.batch_size) * g4r.batch_size
test_items = dfTest['item'].values

y_true = np.empty(shape = (n_eval, dataset.n_items))
for i in range(n_eval):
    y_true[i, :] = dataset.item_to_one_hot[test_items[i]]

y_true = tf.constant(y_true, dtype = tf.float32)

In [263]:
# Top-3 accuracy: fraction of evaluated sessions whose held-out item is among the 3 highest scores.
tf.reduce_sum(tf.keras.metrics.top_k_categorical_accuracy(y_true, y, k = 3)) / y_true.shape[0]

<tf.Tensor: shape=(), dtype=float32, numpy=0.9640151>