In [2]:
### Import necessary libraries

import os
import pprint
import tempfile
from typing import Dict, Text, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
%matplotlib inline

In [3]:
# Load the cleaned order data and preview the first three rows.
masterdf = pd.read_csv(filepath_or_buffer='../data_cleaned/brazildata_mod.csv')
masterdf.head(n=3)

Unnamed: 0.1,Unnamed: 0,order_id,order_purchase_timestamp,user_id,customer_city,customer_state,product_category,quantity,price,review_score,timestamp,product_code,product_id
0,0,e481f51cbdc54678b7cc49136f2d6af7,2017-10-02 10:56:33,9ef432eb6251297304e76186b10a928d,sao paulo,SP,housewares,1.0,29.99,4,1506942000.0,87285b34884572647811a353c7ac498a,housewares SKU 0
1,1,53cdb2fc8bc7dce0b6741e2150273451,2018-07-24 20:41:37,b0830fb4747a6c6d20dea0b8c802d7ef,barreiras,BA,perfumery,1.0,118.7,4,1532465000.0,595fac2a385ac33a80bd5114aec74eb8,perfumery SKU 0
2,2,47770eb9100c2d0c44946d9cf07ec65d,2018-08-08 08:38:49,41ce2a54c0b03bf3443c3d931a367089,vianopolis,GO,auto,1.0,159.9,5,1533718000.0,aa4383b373c6aca5d8797843e5594415,auto SKU 0


In [4]:
### Standardize column dtypes: identifier columns become strings and the
### quantity becomes a float.
###
### The dtype of `quantity` is something we experiment with; as shown later,
### it affects the accuracy of the prediction.

masterdf = masterdf.astype({
    'user_id': str,
    'product_id': str,
    'quantity': float,
})

In [5]:
### Define the interactions data and the item (candidate) data

### Interactions
### Reference table of user, product and purchase timestamp, with the
### purchased quantity summed per (user_id, product_id, timestamp) group.
interactions_df = (
    masterdf
    .groupby(['user_id', 'product_id', 'timestamp'])['quantity']
    .sum()
    .reset_index()
)

## Turn the table into a dict of column arrays and slice it into a
## tf.data.Dataset; each dataset element is a dict holding one row.
## The elements of this dataset are what the model consumes.
interactions_dict = {name: np.array(value) for name, value in interactions_df.items()}
interactions = tf.data.Dataset.from_tensor_slices(interactions_dict)

## Same steps for the items: the de-duplicated product IDs form the reference
## table of items that can be recommended.
items_df = masterdf[['product_id']].drop_duplicates()
items_dict = {name: np.array(value) for name, value in items_df.items()}
items = tf.data.Dataset.from_tensor_slices(items_dict)

## Keep only the features used downstream, under fixed keys.
## The quantity is cast explicitly with tf.cast: calling the Python builtin
## float() on a tensor inside Dataset.map only works when AutoGraph manages to
## rewrite the lambda, and fails with a type error otherwise.
interactions = interactions.map(lambda x: {
    'user_id': x['user_id'],
    'product_id': x['product_id'],
    'quantity': tf.cast(x['quantity'], tf.float32),
    'timestamp': x['timestamp'],
})

## The items dataset yields bare product-id strings rather than dicts.
items = items.map(lambda x: x['product_id'])

In [6]:
## Feature vocabularies and timestamp buckets

## The timestamp is an example of a continuous feature: it has to be rescaled,
## otherwise its values are too large for the model.
## Standardization or normalization are possible alternatives; here we use
## discretization, which places timestamps into buckets that act as
## categorical features.

timestamps = np.concatenate(
    list(interactions.map(lambda x: x["timestamp"]).batch(100))
)
min_timestamp, max_timestamp = timestamps.min(), timestamps.max()

# 1000 evenly spaced boundaries spanning the observed timestamp range.
timestamp_buckets = np.linspace(min_timestamp, max_timestamp, num=1000)

# Batched views over the ID columns of the interactions.
user_ids = interactions.batch(10_000).map(lambda x: x["user_id"])
item_titles = interactions.batch(10_000).map(lambda x: x["product_id"])

# De-duplicated IDs, used as the lookup vocabularies of the embedding models.
unique_user_ids = np.unique(np.concatenate(list(user_ids)))
unique_item_titles = np.unique(np.concatenate(list(item_titles)))

In [7]:
# Seed TensorFlow and the shuffle so the train/test split is the same on every run.
tf.random.set_seed(42)
shuffled = interactions.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

# NOTE(review): the split sizes are hard-coded and assume the dataset holds at
# least 80_000 interactions; the 100_000 shuffle buffer only gives a full
# shuffle if the dataset is no larger than that — TODO confirm.
train = shuffled.take(60_000)
test = shuffled.skip(60_000).take(20_000)

# Batch and cache both splits. The settings match the pipeline rebuilt in the
# first training cell below (batch size 8192, cached), so the name
# `cached_train` is accurate and the same pipeline is defined everywhere.
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

# Multi-Task Model with ReLU-based DNN

We can put it all together in a model class.

The new component here is that, since we have two tasks and two losses, we need to decide how important each loss is. We do this by giving each loss a weight and treating these weights as hyperparameters. If we assign a large loss weight to the rating task (which, in this notebook, predicts the purchased quantity), the model will focus on predicting ratings (while still using some information from the retrieval task); if we assign a large loss weight to the retrieval task, it will focus on retrieval instead.

In [8]:
class Model(tfrs.models.Model):
    """Multi-task recommender combining a rating task and a retrieval task.

    User and product IDs are each mapped through a string lookup followed by
    an embedding layer. The two embeddings feed:

    * a ReLU-based dense network that outputs one scalar per example, trained
      against the ``quantity`` feature with a mean-squared-error loss, and
    * a retrieval task whose top-K metrics are computed over the ``items``
      candidates.

    The two losses are summed using the weights given to the constructor.

    The class reads the notebook-level names ``unique_item_titles``,
    ``unique_user_ids`` and ``items``, which must be defined beforehand.
    """

    def __init__(self,
                 rating_weight: float, retrieval_weight: float) -> None:
        """Build the embedding models, the rating network and both tasks.

        Args:
            rating_weight: Multiplier applied to the rating (MSE) loss.
            retrieval_weight: Multiplier applied to the retrieval loss.
        """
        # We take the loss weights in the constructor: this allows us to
        # instantiate several model objects with different loss weights.
        super().__init__()

        embedding_dimension = 32

        # Item model: product-id string -> integer index -> embedding.
        # The embedding table has one more row than the vocabulary.
        self.item_model: tf.keras.layers.Layer = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(
                vocabulary=unique_item_titles, mask_token=None),
            tf.keras.layers.Embedding(len(unique_item_titles) + 1, embedding_dimension),
        ])

        # User model: same structure, built over the user-id vocabulary.
        self.user_model: tf.keras.layers.Layer = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(
                vocabulary=unique_user_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension),
        ])

        # A small model that takes in user and item embeddings and predicts
        # ratings. It can be made as complicated as we want as long as it
        # outputs a scalar as the prediction.
        # This is the ReLU-based DNN.
        self.rating_model = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(1),
        ])

        # Rating task: MSE loss, reported as RMSE.
        self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()],
        )

        # Retrieval task: top-K metrics are computed against the embeddings of
        # every candidate in `items`.
        self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=items.batch(128).map(self.item_model)
            )
        )

        # The loss weights.
        self.rating_weight = rating_weight
        self.retrieval_weight = retrieval_weight

    def call(self, features: Dict[Text, tf.Tensor]) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
        """Embed the user and product of each example and predict its rating.

        Args:
            features: Feature dict; only ``"user_id"`` and ``"product_id"``
                are read here.

        Returns:
            A 3-tuple ``(user_embeddings, item_embeddings, rating_predictions)``.
        """
        # We pick out the user features and pass them into the user model.
        user_embeddings = self.user_model(features["user_id"])

        # And pick out the item features and pass them into the item model.
        item_embeddings = self.item_model(features["product_id"])

        # The multi-layered rating model is applied to a concatenation of the
        # user and item embeddings.
        rating_predictions = self.rating_model(
            tf.concat([user_embeddings, item_embeddings], axis=1)
        )

        return user_embeddings, item_embeddings, rating_predictions

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Return the weighted sum of the rating loss and the retrieval loss.

        Args:
            features: Feature dict containing ``"user_id"``, ``"product_id"``
                and the label ``"quantity"``. The dict is not modified.
            training: Accepted as part of the signature; currently unused.

        Returns:
            ``rating_weight * rating_loss + retrieval_weight * retrieval_loss``.
        """
        # Pop the label from a shallow copy so the caller's dict keeps its
        # "quantity" entry and the same batch can be passed in again.
        features = dict(features)
        ratings = features.pop("quantity")

        user_embeddings, item_embeddings, rating_predictions = self(features)

        # We compute the loss for each task.
        rating_loss = self.rating_task(
            labels=ratings,
            predictions=rating_predictions,
        )
        retrieval_loss = self.retrieval_task(user_embeddings, item_embeddings)

        # And combine them using the loss weights.
        return (self.rating_weight * rating_loss
                + self.retrieval_weight * retrieval_loss)

#### Rating-specialized model

Depending on the weights we assign, the model will encode a different balance of the tasks. Let's start with a model that only considers ratings.

In [9]:
# Rating-only configuration: the retrieval loss is given zero weight.
model = Model(rating_weight=1.0, retrieval_weight=0.0)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

# Batched, cached input pipelines used for training and evaluation.
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

# Train for three epochs, then evaluate on the held-out split.
model.fit(x=cached_train, epochs=3)
metrics = model.evaluate(x=cached_test, return_dict=True)

# Report both task metrics, whichever task was weighted.
print(f"Retrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}.")
print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}.")

Epoch 1/3
Epoch 2/3
Epoch 3/3
Retrieval top-100 accuracy: 0.012.
Ranking RMSE: 2.022.


#### Retrieval-specialized model
Let's now try a model that focuses on retrieval only.

In [10]:
# Retrieval-only configuration: the rating loss is given zero weight.
model = Model(rating_weight=0.0, retrieval_weight=1.0)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

# Train for three epochs on the cached batches, then evaluate on the test split.
model.fit(x=cached_train, epochs=3)
metrics = model.evaluate(x=cached_test, return_dict=True)

# Report both task metrics, whichever task was weighted.
print(f"Retrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}.")
print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}.")

Epoch 1/3
Epoch 2/3
Epoch 3/3
Retrieval top-100 accuracy: 0.081.
Ranking RMSE: 2.484.


#### Joint model

Let's now train a model that assigns positive weights to both tasks.

In [11]:
# Joint configuration: both losses contribute equally.
model = Model(rating_weight=0.5, retrieval_weight=0.5)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

# Train for three epochs on the cached batches, then evaluate on the test split.
model.fit(x=cached_train, epochs=3)
metrics = model.evaluate(x=cached_test, return_dict=True)

# Report both task metrics.
print(f"Retrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}.")
print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}.")

Epoch 1/3
Epoch 2/3
Epoch 3/3
Retrieval top-100 accuracy: 0.083.
Ranking RMSE: 2.020.


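We can see that the joint model achieves both the highest retrieval top-100 accuracy (0.083) and the lowest RMSE (2.020), although its margins over the best specialized models (0.081 for retrieval and 2.022 for rating) are small.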

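But so far the only additional feature we have prepared is the timestamp, and the model above does not use it yet: it embeds only the user and product IDs. How can we incorporate all of the available features into one single model?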