# TensorFlow Recommenders: Quickstart

In this tutorial, we build a simple matrix factorization model using the [MovieLens 100K dataset](https://grouplens.org/datasets/movielens/100k/) with TFRS. We can use this model to recommend movies for a given user.

# Import TensorFlow Recommender System (TFRS)

In [1]:
!pip install -q sagemaker==2.9.2
!pip install -q sagemaker-experiments==0.1.24
!pip install -q tensorflow==2.3.0
!pip install -q tensorflow-recommenders==0.2.0
!pip install -q tensorflow-datasets==4.0.0

In [2]:
from typing import Dict, Text

import numpy as np
import tensorflow as tf

import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

# Load Ratings

In [3]:
ratings = tfds.load('movielens/100k-ratings',                     
                    download=False,
                    data_dir='./tensorflow_datasets/',
                    split='train')
print(ratings)

<PrefetchDataset shapes: {bucketized_user_age: (), movie_genres: (None,), movie_id: (), movie_title: (), raw_user_age: (), timestamp: (), user_gender: (), user_id: (), user_occupation_label: (), user_occupation_text: (), user_rating: (), user_zip_code: ()}, types: {bucketized_user_age: tf.float32, movie_genres: tf.int64, movie_id: tf.string, movie_title: tf.string, raw_user_age: tf.float32, timestamp: tf.int64, user_gender: tf.bool, user_id: tf.string, user_occupation_label: tf.int64, user_occupation_text: tf.string, user_rating: tf.float32, user_zip_code: tf.string}>


In [4]:
ratings = ratings.map(lambda x: {
    "movie_title": x["movie_title"],
    "user_id": x["user_id"]
})
print(ratings)

<MapDataset shapes: {movie_title: (), user_id: ()}, types: {movie_title: tf.string, user_id: tf.string}>


# Load Movies

In [5]:
movies = tfds.load('movielens/100k-movies', 
                   download=False,                   
                   data_dir='./tensorflow_datasets/',
                   split='train')
print('Movies BEFORE', movies)

Movies BEFORE <PrefetchDataset shapes: {movie_genres: (None,), movie_id: (), movie_title: ()}, types: {movie_genres: tf.int64, movie_id: tf.string, movie_title: tf.string}>


In [6]:
movies = movies.map(lambda x: x["movie_title"])
print('Movies AFTER', movies)

Movies AFTER <MapDataset shapes: (), types: tf.string>


# Create Vocabularies
Build vocabularies to convert user ids and movie titles into integer indices for embedding layers:

In [7]:
user_ids_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)
user_ids_vocabulary.adapt(ratings.map(lambda x: x["user_id"]))

movie_titles_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)
movie_titles_vocabulary.adapt(movies)

In [8]:
print(user_ids_vocabulary.get_vocabulary())

['[UNK]', '405', '655', '13', '450', '276', '416', '537', '303', '234', '393', '181', '279', '429', '846', '7', '94', '682', '308', '92', '293', '222', '201', '59', '435', '378', '880', '417', '896', '592', '796', '758', '561', '130', '406', '551', '334', '804', '268', '474', '889', '269', '727', '399', '642', '916', '145', '650', '363', '151', '524', '749', '194', '387', '90', '648', '291', '864', '311', '747', '85', '286', '327', '653', '328', '385', '299', '497', '95', '271', '457', '18', '301', '532', '374', '805', '178', '1', '389', '870', '716', '883', '833', '472', '437', '313', '533', '881', '280', '339', '504', '184', '788', '894', '666', '314', '506', '932', '886', '798', '244', '343', '707', '606', '454', '109', '373', '354', '782', '62', '345', '790', '487', '207', '622', '892', '407', '588', '500', '774', '660', '312', '305', '711', '43', '535', '919', '854', '456', '618', '200', '102', '49', '495', '87', '6', '851', '868', '60', '256', '643', '452', '144', '843', '807', '

# Create the Model

We can define a TFRS model by inheriting from `tfrs.Model` and implementing the `compute_loss` method:

In [9]:
class MovieLensModel(tfrs.Model):
  # We derive from a custom base class to help reduce boilerplate. Under the hood,
  # these are still plain Keras Models.

  def __init__(
      self,
      user_embedding: tf.keras.Model,
      movie_embeddings: tf.keras.Model,
      task: tfrs.tasks.Retrieval):
    super().__init__()

    # Set up user and movie representations.
    self.user_embeddings = user_embeddings
    self.movie_embeddings = movie_embeddings

    # Set up a retrieval task.
    self.task = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    # Define how the loss is computed using the retrieval task
    user_embeddings = self.user_embeddings(features['user_id'])
    movie_embeddings = self.movie_embeddings(features['movie_title'])

    return self.task(user_embeddings, movie_embeddings)

# Define User and Movie Models
Define the two models.

In [10]:
user_embeddings = tf.keras.Sequential([
    user_ids_vocabulary,
    tf.keras.layers.Embedding(user_ids_vocabulary.vocab_size(), 128)
])

movie_embeddings = tf.keras.Sequential([
    movie_titles_vocabulary,
    tf.keras.layers.Embedding(movie_titles_vocabulary.vocab_size(), 128)
])

# Define the Retrieval Task

In [11]:
task = tfrs.tasks.Retrieval(metrics=tfrs.metrics.FactorizedTopK(
    movies.batch(128).map(movie_embeddings)
  )
)

# Train the Retrieval Model

In [12]:
model = MovieLensModel(user_embeddings, movie_embeddings, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

In [13]:
# Train.
model.fit(ratings.batch(4096), epochs=1)



















<tensorflow.python.keras.callbacks.History at 0x7f57e063f250>

# Make Predictions
Use brute-force search to set up retrieval using the trained representations.

In [14]:
index = tfrs.layers.factorized_top_k.BruteForce(model.user_embeddings)
index.index(movies.batch(100).map(model.movie_embeddings), movies)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f57e02e20d0>

In [15]:
import os

local_model_dir_bruteforce_model = './exported_models/bruteforce_model/'

tensorflow_saved_model_path_bruteforce_model = os.path.join(local_model_dir_bruteforce_model, 'tensorflow/saved_model/0')

os.makedirs(tensorflow_saved_model_path_bruteforce_model, exist_ok=True)

In [16]:
k = 5
user_id = "42"

_, titles = index(np.array([user_id]))

print(f"Top {k} recommendations for user {user_id}: {titles[0, :k]}")

Top 5 recommendations for user 42: [b'Just Cause (1995)' b'Rent-a-Kid (1995)' b'Aristocats, The (1970)'
 b'Nell (1994)' b'Outbreak (1995)']


In [17]:
print('Compiled model {}'.format(index))          
print(index.summary())

Compiled model <tensorflow_recommenders.layers.factorized_top_k.BruteForce object at 0x7f57e02e20d0>
Model: "brute_force"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequential (Sequential)      (None, 128)               120832    
Total params: 337,810
Trainable params: 120,832
Non-trainable params: 216,978
_________________________________________________________________
None


In [18]:
index.save(tensorflow_saved_model_path_bruteforce_model, save_format='tf')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


INFO:tensorflow:Assets written to: ./exported_models/bruteforce_model/tensorflow/saved_model/0/assets


INFO:tensorflow:Assets written to: ./exported_models/bruteforce_model/tensorflow/saved_model/0/assets


In [19]:
!saved_model_cli show --all --dir $tensorflow_saved_model_path_bruteforce_model

2020-11-03 06:25:32.286375: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory
2020-11-03 06:25:32.286412: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.

MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:

signature_def['__saved_model_init_op']:
  The given SavedModel SignatureDef contains the following input(s):
  The given SavedModel SignatureDef contains the following output(s):
    outputs['__saved_model_init_op'] tensor_info:
        dtype: DT_INVALID
        shape: unknown_rank
        name: NoOp
  Method name is: 

signature_def['serving_default']:
  The given SavedModel SignatureDef contains the following input(s):
    inputs['input_1'] tensor_info:
        dtype: DT_STRING
        shape: (-1)
        name: serving_default_in

The following works!


```
!saved_model_cli run --input_exprs 'input_1=[str(42)]' --tag_set serve --signature_def serving_default --dir $tensorflow_saved_model_path_bruteforce_model
```

```
Result for output key output_1:
[[3.9453535 2.9104383 2.7285933 2.6687438 2.5107827 2.4038887 2.3974428
  2.386409  2.321732  2.3211298]]
Result for output key output_2:
[[b'Rent-a-Kid (1995)' b'Last Dance (1996)'
  b'Adventures of Pinocchio, The (1996)'
  b'Winnie the Pooh and the Blustery Day (1968)'
  b'Aristocats, The (1970)' b'Celtic Pride (1996)'
  b'Conan the Barbarian (1981)' b'House Arrest (1996)'
  b'Just Cause (1995)' b'Johnny 100 Pesos (1993)']]
```

In [20]:
!saved_model_cli run --input_exprs 'input_1=["$user_id"]' --tag_set serve --signature_def serving_default --dir $tensorflow_saved_model_path_bruteforce_model

2020-11-03 06:25:36.132006: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory
2020-11-03 06:25:36.132051: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2020-11-03 06:25:37.361429: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2020-11-03 06:25:37.361469: W tensorflow/stream_executor/cuda/cuda_driver.cc:312] failed call to cuInit: UNKNOWN ERROR (303)
2020-11-03 06:25:37.361505: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (datascience-1-0-ml-t3-medium-1abf3407f667f989be9d86559395): /proc/driver/nvidia/version does not exist
2020-11-03 06:25:37

# All in one cell

In [21]:
import time
import random
import pandas as pd
from glob import glob
import pprint
import argparse
import json
import subprocess
import sys
import os

# Set the directory you want to start from
#for dirName, subdirList, fileList in os.walk('/etc/ssl/certs/'):
#    print('Found directory: %s' % dirName)
#    for fname in fileList:
#        print('\t%s' % fname)        
        
from typing import Dict, Text
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
import numpy as np


class MovieLensModel(tfrs.Model):
  # We derive from a custom base class to help reduce boilerplate. Under the hood,
  # these are still plain Keras Models.

  def __init__(
      self,
      user_embedding: tf.keras.Model,
      movie_embeddings: tf.keras.Model,
      task: tfrs.tasks.Retrieval):
    super().__init__()

    # Set up user and movie representations.
    self.user_embeddings = user_embeddings
    self.movie_embeddings = movie_embeddings

    # Set up a retrieval task.
    self.task = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    # Define how the loss is computed using the retrieval task
    user_embeddings = self.user_embeddings(features['user_id'])
    movie_embeddings = self.movie_embeddings(features['movie_title'])

    return self.task(user_embeddings, movie_embeddings)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
     
    args, _ = parser.parse_known_args()
    print("Args:") 
    print(args)
    
    env_var = os.environ 
    print("Environment Variables:") 
    pprint.pprint(dict(env_var), width = 1) 

    local_model_dir = './model' # os.environ['SM_MODEL_DIR']
    output_dir = './output' # args.output_dir
    print('output_dir {}'.format(output_dir))    

    train_data = './tensorflow_datasets/' # args.train_data
    print('train_data {}'.format(train_data))
    epochs = 1 # args.epochs
    print('epochs {}'.format(epochs))    
    learning_rate = 0.5 # args.learning_rate
    print('learning_rate {}'.format(learning_rate))    
    enable_tensorboard = False # args.enable_tensorboard
    print('enable_tensorboard {}'.format(enable_tensorboard))       
    dataset_variant = '100k' # args.dataset_variant
    print('dataset_variant {}'.format(dataset_variant))
    embedding_dimension = int(256) # int(args.embedding_dimension)
    print('embedding_dimension {}'.format(embedding_dimension))       

    # Load the ratings data to use for training
    ratings = tfds.load('movielens/{}-ratings'.format(dataset_variant), 
                        download=False,
                        data_dir=train_data,
                        split='train')
    print('Ratings raw', ratings)

    # Transform the ratings data specific to our training task
    ratings = ratings.map(lambda x: {
        'movie_title': x['movie_title'],
        'user_id': x['user_id']
    })
    print('Ratings transformed', ratings)    

    # Load the movies data to use for training
    movies = tfds.load('movielens/{}-movies'.format(dataset_variant),
                       download=False,
                       data_dir=train_data,
                       split='train')
    print('Movies raw', movies)
    
    # Transform the movies data specific to our training task
    movies = movies.map(lambda x: x['movie_title'])
    print('Movies transformed', movies)

    # Create the user vocabulary and user embeddings
    user_ids_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)
    user_ids_vocabulary.adapt(ratings.map(lambda x: x['user_id']))

    user_embeddings = tf.keras.Sequential([
        user_ids_vocabulary,
        tf.keras.layers.Embedding(user_ids_vocabulary.vocab_size(),
                                  embedding_dimension)
    ])

    # Create the movie vocabulary and movie embeddings
    movie_titles_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)
    movie_titles_vocabulary.adapt(movies)

    movie_embeddings = tf.keras.Sequential([
        movie_titles_vocabulary,
        tf.keras.layers.Embedding(movie_titles_vocabulary.vocab_size(),
                                  embedding_dimension)
    ])

    # Specify the task and the top-k metric to optimize during model training
    task = tfrs.tasks.Retrieval(metrics=tfrs.metrics.FactorizedTopK(
        movies.batch(128).map(movie_embeddings)
    ))

    # Define the optimizer and hyper-parameters
    optimizer = tf.keras.optimizers.Adagrad(learning_rate)
    print('Optimizer:  {}'.format(optimizer))

    # Setup the callbacks to use during training
    callbacks = []

    # Setup the Tensorboard callback if Tensorboard is enabled
    if enable_tensorboard: 
        # Tensorboard Logs 
        tensorboard_logs_path = os.path.join(local_model_dir, 'tensorboard/')
        os.makedirs(tensorboard_logs_path, exist_ok=True)

        tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=tensorboard_logs_path)
        print('Adding Tensorboard callback {}'.format(tensorboard_callback))
        callbacks.append(tensorboard_callback)
    print('Callbacks: {}'.format(callbacks))

    # Create a custom Keras model with the user embeddings, movie embeddings, and optimization task
    model = MovieLensModel(user_embeddings, movie_embeddings, task)
    
    # Compile the model and prepare for training
    model.compile(optimizer=optimizer)

    # Train the model
    model.fit(ratings.batch(4096), epochs=epochs)

    # Make some sample predictions to test our model
    # Note:  This is required to save and server our model with TensorFlow Serving
    #        See https://github.com/tensorflow/tensorflow/issues/31057 for more  details.
    index = tfrs.layers.factorized_top_k.BruteForce(query_model=model.user_embeddings)
    index.index(movies.batch(100).map(model.movie_embeddings), movies)

    user_id = '42'
    _, titles = index(np.array([user_id]))

    k = 10
    print(f'Top {k} recommendations for user {user_id}: {titles[0, :k]}')

    # Print a summary of our recommender model
    print('Trained index {}'.format(index))
    print(index.summary())

    # Save the TensorFlow SavedModel for Serving Predictions
    # SavedModel Output
    tensorflow_saved_model_path = os.path.join(local_model_dir,
                                               'tensorflow/saved_model/0')
    os.makedirs(tensorflow_saved_model_path, exist_ok=True)
    
    print('tensorflow_saved_model_path {}'.format(tensorflow_saved_model_path))
    index.save(tensorflow_saved_model_path, save_format='tf')

Args:
Namespace()
Environment Variables:
{'AWS_ACCOUNT_ID': '835319576252',
 'AWS_CONTAINER_CREDENTIALS_RELATIVE_URI': '/_sagemaker-instance-credentials/32ca01267e64c8dc379263a641297b882d1245c4f90e6ae159a4aaf6651942d1',
 'AWS_DEFAULT_REGION': 'us-east-1',
 'AWS_INTERNAL_IMAGE_OWNER': 'Studio',
 'AWS_REGION': 'us-east-1',
 'CLICOLOR': '1',
 'GIT_PAGER': 'cat',
 'HOME': '/root',
 'HOSTNAME': 'datascience-1-0-ml-t3-medium-1abf3407f667f989be9d86559395',
 'JPY_PARENT_PID': '8',
 'JUPYTER_PATH': '/opt/conda/share/jupyter/',
 'KERNEL_GATEWAY': '1',
 'KERNEL_LAUNCH_TIMEOUT': '40',
 'KERNEL_WORKING_PATH': 'workshop/02_usecases/sagemaker_recommendations',
 'LANG': 'C.UTF-8',
 'LC_ALL': 'C.UTF-8',
 'MPLBACKEND': 'module://ipykernel.pylab.backend_inline',
 'PAGER': 'cat',
 'PATH': '/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/amazon/bin:/opt/amazon/bin:/tmp/miniconda3/condabin:/tmp/anaconda3/condabin:/tmp/miniconda2/condabin:/tmp/anaconda2/condabin',
 'PWD': '/r















Top 10 recommendations for user 42: [b'Aristocats, The (1970)' b'Deer Hunter, The (1978)'
 b'Adventures of Robin Hood, The (1938)' b"Preacher's Wife, The (1996)"
 b'Dolores Claiborne (1994)' b'Just Cause (1995)' b'M. Butterfly (1993)'
 b'Smoke (1995)' b'GoldenEye (1995)' b'Bronx Tale, A (1993)']
Trained index <tensorflow_recommenders.layers.factorized_top_k.BruteForce object at 0x7f57e008b050>
Model: "brute_force_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequential_2 (Sequential)    (None, 256)               241664    
Total params: 673,938
Trainable params: 241,664
Non-trainable params: 432,274
_________________________________________________________________
None
tensorflow_saved_model_path ./model/tensorflow/saved_model/0
INFO:tensorflow:Assets written to: ./model/tensorflow/saved_model/0/assets


INFO:tensorflow:Assets written to: ./model/tensorflow/saved_model/0/assets


In [22]:
user_id = "42"

!saved_model_cli run --input_exprs 'input_1=np.array(["$user_id"])' --tag_set serve --signature_def serving_default --dir ./model/tensorflow/saved_model/0

2020-11-03 06:26:28.684049: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory
2020-11-03 06:26:28.684089: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2020-11-03 06:26:31.199874: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2020-11-03 06:26:31.199918: W tensorflow/stream_executor/cuda/cuda_driver.cc:312] failed call to cuInit: UNKNOWN ERROR (303)
2020-11-03 06:26:31.199959: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (datascience-1-0-ml-t3-medium-1abf3407f667f989be9d86559395): /proc/driver/nvidia/version does not exist
2020-11-03 06:26:31