In [1]:
from typing import Dict, Text

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

from recommender import dataset
from recommender.model import ECommerceModel

tf.random.set_seed(42)

%load_ext autoreload
%autoreload 2 

In [2]:
devices = tf.config.list_physical_devices('GPU')
devices

2022-06-06 09:14:58.166846: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
2022-06-06 09:14:58.166874: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: ubuntudavid-ThinkPad-P15-Gen-1
2022-06-06 09:14:58.166881: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: ubuntudavid-ThinkPad-P15-Gen-1


[]

2022-06-06 09:14:58.166933: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 470.129.6
2022-06-06 09:14:58.166951: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.129.6
2022-06-06 09:14:58.166956: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 470.129.6


In [3]:
data = pd.read_csv(
    'data/Online-Retail.csv',
    dtype={'CustomerID': str, 'StockCode': str},
    parse_dates=['InvoiceDate'],
)
data = dataset.preprocess_data(data)

# item_id -> description
item_to_description = dict(data[['StockCode', 'Description']].values)

print(f'Data shape {data.shape}')
data.head()

Data shape (18067, 9)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,timestamp
438319,574311,23382,BOX OF 6 CHRISTMAS CAKE DECORATIONS,6,2011-11-03 16:56:00,3.75,15640,United Kingdom,1320339000.0
28901,538662,79321,CHILLI LIGHTS,24,2010-12-13 15:44:00,4.25,15159,United Kingdom,1292255000.0
466055,576301,22470,HEART OF WICKER LARGE,4,2011-11-14 14:40:00,2.95,14667,United Kingdom,1321282000.0
194702,553663,21080,SET/20 RED RETROSPOT PAPER NAPKINS,3,2011-05-18 12:13:00,0.85,14527,United Kingdom,1305721000.0
55819,540999,21633,SUNFLOWER DECORATIVE PARASOL,30,2011-01-13 10:08:00,3.95,13694,United Kingdom,1294913000.0


In [4]:
purchases = dataset.create_tf_dataset(data)

2022-06-06 09:15:00.841542: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
interactions = purchases.map(lambda x: {
    'user_id': x['user_id'], 
    'item_id': x['item_id'],
    'timestamp': x['timestamp']
})
users = purchases.map(lambda x: x['user_id'])
items = purchases.map(lambda x: x['item_id'])

Now we prepare the timestamps

In [6]:
timestamps = np.concatenate(list(purchases.map(lambda x: x["timestamp"]).batch(100)))

max_timestamp = timestamps.max()
min_timestamp = timestamps.min()

timestamp_buckets = np.linspace(
    min_timestamp, max_timestamp, num=1000,
)

unique_items_titles = np.unique(np.concatenate(list(items.batch(1000))))
unique_user_ids = np.unique(np.concatenate(list(interactions.batch(1_000).map(
    lambda x: x["user_id"]))))

In [7]:
class UserModel(tf.keras.Model):
    def __init__(self, use_timestamps):
        super().__init__()

        self._use_timestamps = use_timestamps

        self.user_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(
                    vocabulary=unique_user_ids, mask_token=None
                ),
                tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32),
            ]
        )

        if use_timestamps:
            self.timestamp_embedding = tf.keras.Sequential(
                [
                    tf.keras.layers.Discretization(timestamp_buckets.tolist()),
                    tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32),
                ]
            )
            self.normalized_timestamp = tf.keras.layers.Normalization(axis=None)

            self.normalized_timestamp.adapt(timestamps)

    def call(self, inputs):
        if not self._use_timestamps:
            return self.user_embedding(inputs["user_id"])

        return tf.concat(
            [
                self.user_embedding(inputs["user_id"]),
                self.timestamp_embedding(inputs["timestamp"]),
                tf.reshape(self.normalized_timestamp(inputs["timestamp"]), (-1, 1)),
            ],
            axis=1,
        )

In [8]:
class ItemModel(tf.keras.Model):
    def __init__(self):
        super().__init__()

        max_tokens = 10_000

        self.title_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(
                    vocabulary=unique_items_titles, mask_token=None
                ),
                tf.keras.layers.Embedding(len(unique_items_titles) + 1, 32),
            ]
        )

        self.title_vectorizer = tf.keras.layers.TextVectorization(max_tokens=max_tokens)

        self.title_text_embedding = tf.keras.Sequential(
            [
                self.title_vectorizer,
                tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
                tf.keras.layers.GlobalAveragePooling1D(),
            ]
        )

        self.title_vectorizer.adapt(items)

    def call(self, titles):
        return tf.concat(
            [
                self.title_embedding(titles),
                self.title_text_embedding(titles),
            ],
            axis=1,
        )

In [9]:
class ECommerceModel(tfrs.models.Model):
    def __init__(self, use_timestamps):
        super().__init__()
        self.query_model = tf.keras.Sequential(
            [UserModel(use_timestamps), tf.keras.layers.Dense(32)]
        )
        self.candidate_model = tf.keras.Sequential(
            [ItemModel(), tf.keras.layers.Dense(32)]
        )
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=items.batch(128).map(self.candidate_model),
            ),
        )

    def compute_loss(self, features, training=False):
        # We only pass the user id and timestamp features into the query model. This
        # is to ensure that the training inputs would have the same keys as the
        # query inputs. Otherwise the discrepancy in input structure would cause an
        # error when loading the query model after saving it.
        query_embeddings = self.query_model(
            {
                "user_id": features["user_id"],
                "timestamp": features["timestamp"],
            }
        )
        movie_embeddings = self.candidate_model(features["item_id"])

        return self.task(query_embeddings, movie_embeddings)

In [10]:
len(interactions)

18067

In [11]:
shuffled = interactions.shuffle(18_067, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(17_067)
test = shuffled.skip(17_067).take(1_000)

cached_train = train.shuffle(17_067).batch(2048)
cached_test = test.batch(512).cache()

## Baseline: no timestamp features
We're ready to try out our first model: let's start with not using timestamp features to establish our baseline.

In [None]:
model = ECommerceModel(use_timestamps=False)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs", update_freq='epoch')

model.fit(
    x=train.batch(4096),
    validation_data=test.batch(4096),
    epochs=5,
    callbacks=[tensorboard_callback]
)

Epoch 1/5

In [68]:
train_accuracy = model.evaluate(
    cached_train, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]
test_accuracy = model.evaluate(
    cached_test, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]

print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")

Top-100 accuracy (train): 0.33.
Top-100 accuracy (test): 0.31.


In [75]:
list(x)

[{'user_id': b'15640', 'item_id': b'23382', 'timestamp': 1320339360.0}]

In [76]:
interactions.take(1)

<TakeDataset element_spec={'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'item_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'timestamp': TensorSpec(shape=(), dtype=tf.float64, name=None)}>

In [None]:
x = interactions.take(1).as_numpy_iterator()

# Use brute-force search to set up retrieval using the trained representations.
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)
index.index_from_dataset(
    unique_items.batch(128).map(lambda title: (title, model.candidate_model(title)))
)

scores, items = index(np.array(42))

In [69]:
user_id = '13089'

past_purchases = data[data['CustomerID'] == user_id].sort_values(by='InvoiceDate')
print(f"History :\n {past_purchases[['StockCode', 'Description']].tail(10)}\n")

# Use brute-force search to set up retrieval using the trained representations.
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)
index.index_from_dataset(
    unique_items.batch(128).map(lambda title: (title, model.candidate_model(title)))
)

# Get some recommendations.
scores, items = index(np.array([user_id]))
recommendations = pd.DataFrame()
recommendations['Item_id'] = items.numpy().flatten()
recommendations['Description'] = recommendations['Item_id'].apply(lambda x: x.decode('utf-8')).map(item_to_description)
recommendations['Scores'] = scores.numpy().flatten()

print(f'Recommendations\n {recommendations}')

History :
        StockCode                        Description
407540     23556      LANDMARK FRAME COVENT GARDEN 
448490     22712                   CARD DOLLY GIRL 
467556     82484  WOOD BLACK BOARD ANT WHITE FINISH
478575     23389             SPACEBOY MINI BACKPACK
518061     22294         HEART FILIGREE DOVE  SMALL
518861     23084                 RABBIT NIGHT LIGHT
526603     23371    SET 36 COLOUR PENCILS SPACEBOY 
526610     23209           LUNCH BAG VINTAGE DOILY 
526619     22835    HOT WATER BOTTLE I AM SO POORLY
526658     22457    NATURAL SLATE HEART CHALKBOARD 



TypeError: Exception encountered when calling layer "user_model_7" (type UserModel).

Only integers, slices (`:`), ellipsis (`...`), tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid indices, got 'user_id'

Call arguments received:
  • inputs=tf.Tensor(shape=(1,), dtype=string)

## Capturing time dynamics with time features
Do the result change if we add time features?

In [54]:
model = ECommerceModel(use_timestamps=True)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs", update_freq='epoch')

model.fit(
    x=train_dataset.batch(4096),
    validation_data=val_dataset.batch(4096),
    epochs=5,
    callbacks=[tensorboard_callback]
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f72787f71d0>

In [55]:
train_accuracy = model.evaluate(
    cached_train, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]
test_accuracy = model.evaluate(
    cached_test, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]

print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")

Top-100 accuracy (train): 0.25.
Top-100 accuracy (test): 0.24.


In [58]:
unique_items = {'item_id': data['StockCode'].unique()}
unique_items = tf.data.Dataset.from_tensor_slices(unique_items)

unique_items = unique_items.map(lambda x: x['item_id'])

In [65]:
list(interactions.take(1).as_numpy_iterator())

[{'user_id': b'15640', 'item_id': b'23382', 'timestamp': 1320339360.0}]

In [66]:
user_id = '13089'

past_purchases = data[data['CustomerID'] == user_id].sort_values(by='InvoiceDate')
print(f"History :\n {past_purchases[['StockCode', 'Description']].tail(10)}\n")

# Use brute-force search to set up retrieval using the trained representations.
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)
index.index_from_dataset(
    unique_items.batch(128).map(lambda title: (title, model.candidate_model(title)))
)

# Get some recommendations.
scores, items = index(np.array([user_id]))
recommendations = pd.DataFrame()
recommendations['Item_id'] = items.numpy().flatten()
recommendations['Description'] = recommendations['Item_id'].apply(lambda x: x.decode('utf-8')).map(item_to_description)
recommendations['Scores'] = scores.numpy().flatten()

print(f'Recommendations\n {recommendations}')

History :
        StockCode                        Description
407540     23556      LANDMARK FRAME COVENT GARDEN 
448490     22712                   CARD DOLLY GIRL 
467556     82484  WOOD BLACK BOARD ANT WHITE FINISH
478575     23389             SPACEBOY MINI BACKPACK
518061     22294         HEART FILIGREE DOVE  SMALL
518861     23084                 RABBIT NIGHT LIGHT
526603     23371    SET 36 COLOUR PENCILS SPACEBOY 
526610     23209           LUNCH BAG VINTAGE DOILY 
526619     22835    HOT WATER BOTTLE I AM SO POORLY
526658     22457    NATURAL SLATE HEART CHALKBOARD 



TypeError: Exception encountered when calling layer "user_model_6" (type UserModel).

Only integers, slices (`:`), ellipsis (`...`), tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid indices, got 'user_id'

Call arguments received:
  • inputs=tf.Tensor(shape=(1,), dtype=string)