# TensorFlow Recommenders: Quickstart

In this tutorial, we build a simple matrix factorization model using the [MovieLens 100K dataset](https://grouplens.org/datasets/movielens/100k/) with TFRS. We can use this model to recommend movies for a given user.

# Install and Import TensorFlow Recommenders (TFRS)

In [1]:
# Install pinned versions of TensorFlow, TensorFlow Recommenders and
# TensorFlow Datasets; -q keeps pip's output quiet.
!pip install -q tensorflow==2.3.0
!pip install -q tensorflow-recommenders==0.2.0
!pip install -q tensorflow-datasets==4.0.0

In [2]:
from typing import Dict, Text

import numpy as np
import tensorflow as tf

import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

# Load Ratings

In [20]:
# /home/ec2-user/tensorflow_datasets/train/movielens/100k-ratings/0.1.0/

# /opt/ml/input/data/train/          movielens/100k-ratings/0.1.0

# Load the "train" split of the MovieLens 100K ratings, downloading it into
# the local ./tensorflow_datasets/ directory when needed, then show the dataset.
ratings = tfds.load(
    name='movielens/100k-ratings',
    split='train',
    data_dir='./tensorflow_datasets/',
    download=True,
)
print(ratings)

[1mDownloading and preparing dataset movielens/25m-ratings/0.1.0 (download: 249.84 MiB, generated: 3.89 GiB, total: 4.13 GiB) to ./tensorflow_datasets/movielens/25m-ratings/0.1.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Extraction completed...', max=1.0, styl…









HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

KeyboardInterrupt: 

In [None]:
# ratings = tf.data.TFRecordDataset(
#     './data/movielens/100k-ratings/0.1.0/movielens-train.tfrecord-00000-of-00001', 
# #    './data/movielens/100k-ratings/0.1.0/', 
#     compression_type=None, 
#     buffer_size=None, 
#     num_parallel_reads=None
# )

# list(ratings.batch(10).as_numpy_iterator())

In [4]:
# Keep only the two features the retrieval model reads from each rating.
ratings = ratings.map(
    lambda rating: {
        feature: rating[feature] for feature in ("movie_title", "user_id")
    }
)
print(ratings)

<MapDataset shapes: {movie_title: (), user_id: ()}, types: {movie_title: tf.string, user_id: tf.string}>


# Load Movies

In [5]:
# Load the "train" split of the MovieLens 100K movie catalogue from (or into)
# the same local ./tensorflow_datasets/ directory used for the ratings.
movies = tfds.load(
    name='movielens/100k-movies',
    split='train',
    data_dir='./tensorflow_datasets/',
    download=True,
)
print('Movies BEFORE', movies)

Movies BEFORE <PrefetchDataset shapes: {movie_genres: (None,), movie_id: (), movie_title: ()}, types: {movie_genres: tf.int64, movie_id: tf.string, movie_title: tf.string}>


# TODO: Shuffle the ratings and split them into train/test sets (see https://github.com/tensorflow/recommenders/blob/main/docs/examples/basic_retrieval.ipynb)

In [None]:
# tf.random.set_seed(42)
# shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

# train = shuffled.take(80_000)
# test = shuffled.skip(80_000).take(20_000)

In [None]:
# movies = tf.data.TFRecordDataset(
#     './data/movielens/100k-movies/0.1.0/movielens-train.tfrecord-00000-of-00001', 
#     compression_type=None, 
#     buffer_size=None, 
#     num_parallel_reads=None
# )

# list(movies.batch(10).as_numpy_iterator())

In [6]:
# Reduce every movie record to its title string; the title is the only movie
# feature used from here on.
movies = movies.map(
    lambda movie: movie["movie_title"]
)
print('Movies AFTER', movies)

Movies AFTER <MapDataset shapes: (), types: tf.string>


In [None]:
#print(type(ratings))
#print(type(movies))

# Create Vocabularies
Build vocabularies to convert user ids and movie titles into integer indices for embedding layers:

In [7]:
# One string -> integer-index lookup layer per feature, created without a
# mask token.
user_ids_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(
    mask_token=None
)
movie_titles_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(
    mask_token=None
)

# Learn each vocabulary from the data: user ids come from the ratings,
# movie titles from the movie catalogue.
user_ids_vocabulary.adapt(ratings.map(lambda rating: rating["user_id"]))
movie_titles_vocabulary.adapt(movies)

In [None]:
#print(user_ids_vocabulary.get_vocabulary())

# Create the Model

We can define a TFRS model by inheriting from `tfrs.Model` and implementing the `compute_loss` method:

In [8]:
class MovieLensModel(tfrs.Model):
    """Retrieval model pairing a user tower with a movie tower.

    Deriving from ``tfrs.Model`` trims boilerplate; underneath, the result is
    still a plain Keras model.
    """

    def __init__(
            self,
            user_model: tf.keras.Model,
            movie_model: tf.keras.Model,
            task: tfrs.tasks.Retrieval):
        """Store the two representation models and the retrieval task.

        Args:
            user_model: Model applied to the ``"user_id"`` feature.
            movie_model: Model applied to the ``"movie_title"`` feature.
            task: Retrieval task that turns the two embeddings into a loss.
        """
        super().__init__()

        # User and movie representations.
        self.user_model = user_model
        self.movie_model = movie_model

        # Retrieval task (loss and metrics).
        self.task = task

    # Related reading on calling a model before saving it:
    # https://github.com/tensorflow/tensorflow/issues/31057

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Return the task loss for one batch of features.

        Args:
            features: Mapping holding the ``"user_id"`` and ``"movie_title"``
                entries of the batch.
            training: Accepted for interface compatibility; not used here.

        Returns:
            The value produced by ``self.task`` for the two embeddings.
        """
        query_embeddings = self.user_model(features["user_id"])
        candidate_embeddings = self.movie_model(features["movie_title"])
        return self.task(query_embeddings, candidate_embeddings)

# Define User and Movie Models
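Define the user and movie models: each one maps a raw string (a user id or a movie title) to an integer index through its vocabulary, and then to a 128-dimensional embedding.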

In [9]:
# User tower: vocabulary lookup followed by a 128-dimensional embedding table
# sized to the vocabulary.
user_model = tf.keras.Sequential()
user_model.add(user_ids_vocabulary)
user_model.add(
    tf.keras.layers.Embedding(user_ids_vocabulary.vocab_size(), 128)
)

# Movie tower: same structure, keyed on movie titles.
movie_model = tf.keras.Sequential()
movie_model.add(movie_titles_vocabulary)
movie_model.add(
    tf.keras.layers.Embedding(movie_titles_vocabulary.vocab_size(), 128)
)

# Define the Retrieval Task

In [10]:
# The FactorizedTopK metric is given the embeddings of the whole movie
# catalogue, computed by the movie tower over batches of 128 titles.
task = tfrs.tasks.Retrieval(
    metrics=tfrs.metrics.FactorizedTopK(
        movies.batch(128).map(movie_model)
    )
)

# Train the Retrieval Model

In [11]:
# Assemble the two towers and the task into one trainable model and compile
# it with Adagrad at a learning rate of 0.5.
model = MovieLensModel(
    user_model=user_model,
    movie_model=movie_model,
    task=task,
)
model.compile(
    optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.5)
)

In [12]:
# Train for a single epoch, feeding the ratings in batches of 4096 examples.
model.fit(
    x=ratings.batch(4096),
    epochs=1,
)



















<tensorflow.python.keras.callbacks.History at 0x7f14d82d9a90>

# Make Predictions
Use brute-force search to set up retrieval using the trained representations.

In [13]:
# Wrap the trained user tower in a brute-force top-K layer, then register the
# movie embeddings (computed in batches of 100) together with their titles.
index = tfrs.layers.factorized_top_k.BruteForce(
    model.user_model
)
index.index(
    movies.batch(100).map(model.movie_model),
    movies,
)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f1534e632b0>

In [14]:
import os

# Export root for the brute-force index, and the versioned ("0") SavedModel
# directory beneath it; the directory tree is created up front.
local_model_dir_bruteforce_model = './exported_models/bruteforce_model/'
tensorflow_saved_model_path_bruteforce_model = os.path.join(
    local_model_dir_bruteforce_model,
    'tensorflow/saved_model/0',
)
os.makedirs(
    tensorflow_saved_model_path_bruteforce_model,
    exist_ok=True,
)

In [17]:
user_id = "42"
k = 5

# Query the index with a batch holding one user id. The first output is
# discarded; the second holds the recommended titles, of which the first k
# are shown.
_, titles = index(np.array([user_id]))
top_titles = titles[0, :k]

print(f"Top {k} recommendations for user {user_id}: {top_titles}")

Top 5 recommendations for user 42: [b'Just Cause (1995)' b'House Arrest (1996)' b'Aristocats, The (1970)'
 b'Far From Home: The Adventures of Yellow Dog (1995)' b'Nell (1994)']


In [18]:
print('Compiled model {}'.format(index))
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
index.summary()

Compiled model <tensorflow_recommenders.layers.factorized_top_k.BruteForce object at 0x7f1534e632b0>
Model: "brute_force"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequential (Sequential)      (None, 128)               120832    
Total params: 337,810
Trainable params: 120,832
Non-trainable params: 216,978
_________________________________________________________________
None


In [19]:
# Export the brute-force index in TensorFlow SavedModel format to the
# versioned directory created earlier.
index.save(
    tensorflow_saved_model_path_bruteforce_model,
    save_format='tf',
)

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


INFO:tensorflow:Assets written to: ./exported_models/bruteforce_model/tensorflow/saved_model/0/assets


INFO:tensorflow:Assets written to: ./exported_models/bruteforce_model/tensorflow/saved_model/0/assets


In [None]:
# Inspect the exported SavedModel (all tag sets and signatures) in the
# directory held by tensorflow_saved_model_path_bruteforce_model.
!saved_model_cli show --all --dir $tensorflow_saved_model_path_bruteforce_model

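The following command works (the input is passed as a Python expression that evaluates to a list containing the user id as a string):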


```
!saved_model_cli run --input_exprs 'input_1=[str(42)]' --tag_set serve --signature_def serving_default --dir $tensorflow_saved_model_path_bruteforce_model
```

```
Result for output key output_1:
[[3.9453535 2.9104383 2.7285933 2.6687438 2.5107827 2.4038887 2.3974428
  2.386409  2.321732  2.3211298]]
Result for output key output_2:
[[b'Rent-a-Kid (1995)' b'Last Dance (1996)'
  b'Adventures of Pinocchio, The (1996)'
  b'Winnie the Pooh and the Blustery Day (1968)'
  b'Aristocats, The (1970)' b'Celtic Pride (1996)'
  b'Conan the Barbarian (1981)' b'House Arrest (1996)'
  b'Just Cause (1995)' b'Johnny 100 Pesos (1993)']]
```

In [None]:
# Run the serving_default signature of the exported model for `user_id`.
# NOTE(review): `$user_id` is presumably substituted by IPython before the
# shell command runs - confirm the expanded expression is a quoted string.
!saved_model_cli run --input_exprs 'input_1=["$user_id"]' --tag_set serve --signature_def serving_default --dir $tensorflow_saved_model_path_bruteforce_model

In [None]:
import time
import random
import pandas as pd
from glob import glob
import pprint
import argparse
import json
import subprocess
import sys
import os

# Install the pinned runtime dependencies, in this order, with the same
# interpreter that runs this script. check_call raises on the first failing
# install, so later packages are not attempted after an error.
for requirement in (
    'scikit-learn==0.23.1',
    'sagemaker-tensorflow==2.3.0.1.0.0',
    'tensorflow-recommenders==0.2.0',
    'tensorflow-datasets==4.0.0',
    'matplotlib==3.2.1',
):
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', requirement])
#subprocess.check_call([sys.executable, '-m', 'ls', '-al', '/etc/ssl/certs/'])
#subprocess.check_call([sys.executable, '-m', 'ln', '-s', '/etc/ssl/certs/ca-bundle.crt', '/etc/ssl/certs/ca-certificates.crt'])

# Set the directory you want to start from
#for dirName, subdirList, fileList in os.walk('/etc/ssl/certs/'):
#    print('Found directory: %s' % dirName)
#    for fname in fileList:
#        print('\t%s' % fname)        
        
from typing import Dict, Text
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
import numpy as np

class MovieLensModel(tfrs.Model):
    """Retrieval model built from a user model, a movie model and a task.

    The ``tfrs.Model`` base class is used to cut down on boilerplate; the
    resulting object is still a plain Keras model.
    """

    def __init__(
            self,
            user_model: tf.keras.Model,
            movie_model: tf.keras.Model,
            task: tfrs.tasks.Retrieval):
        """Keep references to the two representation models and the task.

        Args:
            user_model: Model applied to the ``"user_id"`` feature.
            movie_model: Model applied to the ``"movie_title"`` feature.
            task: Retrieval task used to compute the loss.
        """
        super().__init__()

        # User and movie representations.
        self.user_model = user_model
        self.movie_model = movie_model

        # Retrieval task (loss and metrics).
        self.task = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Compute the task loss for one batch.

        Args:
            features: Mapping with the batch's ``"user_id"`` and
                ``"movie_title"`` entries.
            training: Accepted for interface compatibility; not used here.

        Returns:
            The value returned by ``self.task`` for the two embeddings.
        """
        query_embeddings = self.user_model(features["user_id"])
        candidate_embeddings = self.movie_model(features["movie_title"])
        return self.task(query_embeddings, candidate_embeddings)

if __name__ == '__main__':
    # Script entry point. Trains the MovieLens retrieval model with the
    # hard-coded settings below, exports a brute-force top-K index as a
    # SavedModel under ./model, and stages inference.py in ./model/code/.
    #
    # The SageMaker-driven arguments and environment lookups are kept as
    # commented-out code; each hard-coded value notes the argument it replaces.
    parser = argparse.ArgumentParser()

#     parser.add_argument('--train_data',
#                         type=str,
#                         default=os.environ['SM_CHANNEL_TRAIN'])
#     parser.add_argument('--validation_data',
#                         type=str,
#                         default=os.environ['SM_CHANNEL_VALIDATION'])
#     parser.add_argument('--test_data',
#                         type=str,
#                         default=os.environ['SM_CHANNEL_TEST'])
#     parser.add_argument('--output_dir',
#                         type=str,
#                         default=os.environ['SM_OUTPUT_DIR'])
#     parser.add_argument('--hosts',
#                         type=list,
#                         default=json.loads(os.environ['SM_HOSTS']))
#     parser.add_argument('--current_host',
#                         type=str,
#                         default=os.environ['SM_CURRENT_HOST'])
#     parser.add_argument('--num_gpus',
#                         type=int,
#                         default=os.environ['SM_NUM_GPUS'])
#     parser.add_argument('--use_xla',
#                         type=eval,
#                         default=False)
#     parser.add_argument('--use_amp',
#                         type=eval,
#                         default=False)
#     parser.add_argument('--epochs',
#                         type=int,
#                         default=100)
#     parser.add_argument('--learning_rate',
#                         type=float,
#                         default=0.5)
#     parser.add_argument('--enable_tensorboard',
#                         type=eval,
#                         default=False)
#     parser.add_argument('--output_data_dir', # This is unused
#                         type=str,
#                         default=os.environ['SM_OUTPUT_DATA_DIR'])

    # This points to the S3 location - this should not be used by our code
    # We should use /opt/ml/model/ instead
    # parser.add_argument('--model_dir',
    #                     type=str,
    #                     default=os.environ['SM_MODEL_DIR'])

    # parse_known_args tolerates extra command-line options the parser does
    # not declare.
    args, _ = parser.parse_known_args()
    print("Args:")
    print(args)

    # NOTE(review): this dumps every environment variable to the log, which
    # may include credentials - confirm that is acceptable where this runs.
    env_var = os.environ
    print("Environment Variables:")
    pprint.pprint(dict(env_var), width = 1)

#    print('SM_TRAINING_ENV {}'.format(env_var['SM_TRAINING_ENV']))
#    sm_training_env_json = json.loads(env_var['SM_TRAINING_ENV'])
#    is_master = sm_training_env_json['is_master']
#    print('is_master {}'.format(is_master))

#     train_data = args.train_data
#     print('train_data {}'.format(train_data))
#     validation_data = args.validation_data
#     print('validation_data {}'.format(validation_data))
#     test_data = args.test_data
#     print('test_data {}'.format(test_data))

    local_model_dir = './model' # os.environ['SM_MODEL_DIR']
    output_dir = './output' # args.output_dir
    print('output_dir {}'.format(output_dir))
#    hosts = args.hosts
#    print('hosts {}'.format(hosts))
#    current_host = args.current_host
#    print('current_host {}'.format(current_host))
#    num_gpus = args.num_gpus
#    print('num_gpus {}'.format(num_gpus))
#    job_name = 'job' # os.environ['SAGEMAKER_JOB_NAME']
#    print('job_name {}'.format(job_name))

    use_xla = True # args.use_xla
    print('use_xla {}'.format(use_xla))
    use_amp = True # args.use_amp
    print('use_amp {}'.format(use_amp))
    epochs = 1 # args.epochs
    print('epochs {}'.format(epochs))
    learning_rate = 0.5 # args.learning_rate
    print('learning_rate {}'.format(learning_rate))
    enable_tensorboard = False # args.enable_tensorboard
    print('enable_tensorboard {}'.format(enable_tensorboard))

    # Determine if PipeMode is enabled
#     pipe_mode_str = os.environ.get('SM_INPUT_DATA_CONFIG', '')
#     pipe_mode = (pipe_mode_str.find('Pipe') >= 0)
#     print('Using pipe_mode: {}'.format(pipe_mode))

    # SavedModel Output
    tensorflow_saved_model_path = os.path.join(local_model_dir, 'tensorflow/saved_model/0')
    os.makedirs(tensorflow_saved_model_path, exist_ok=True)

    # Tensorboard Logs
    tensorboard_logs_path = os.path.join(local_model_dir, 'tensorboard/')
    os.makedirs(tensorboard_logs_path, exist_ok=True)

    # Commented out due to incompatibility with transformers library (possibly)
    # Set the global precision mixed_precision policy to "mixed_float16"
#    mixed_precision_policy = 'mixed_float16'
#    print('Mixed precision policy {}'.format(mixed_precision_policy))
#    policy = mixed_precision.Policy(mixed_precision_policy)
#    mixed_precision.set_policy(policy)

    import shutil
    from typing import Dict, Text

    import numpy as np
    import tensorflow as tf

    import tensorflow_datasets as tfds
    import tensorflow_recommenders as tfrs

    def _print_directory_tree(root, label):
        # Log every directory (tagged with `label`) and file found under `root`.
        for dir_name, _subdirs, file_names in os.walk(root):
            print('Found directory %s: %s' % (label, dir_name))
            for file_name in file_names:
                print('\t%s' % file_name)

    distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
    with distributed_strategy.scope():
        tf.config.optimizer.set_jit(use_xla)
        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": use_amp})

        # Show the training input channel before and after the ratings load.
        _print_directory_tree('/opt/ml/input/data/train/', 'BEFORE')

        ratings = tfds.load('movielens/100k-ratings',
                            download=True,
                            data_dir='./tensorflow_datasets/',
                            split="train")
        print('Ratings BEFORE', ratings)

        _print_directory_tree('/opt/ml/input/data/train/', 'AFTER')

        movies = tfds.load('movielens/100k-movies',
                           download=True,
                           data_dir='./tensorflow_datasets/',
                           split="train")
        print('Movies BEFORE', movies)

        # Keep only the features the model reads.
        ratings = ratings.map(lambda x: {
            "movie_title": x["movie_title"],
            "user_id": x["user_id"]
        })
        # This is the post-map dataset, so it is labelled AFTER (the original
        # label repeated BEFORE).
        print('Ratings AFTER', ratings)

        movies = movies.map(lambda x: x["movie_title"])
        print('Movies AFTER', movies)

        # Vocabularies mapping raw strings to integer indices for the
        # embedding layers.
        user_ids_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)
        user_ids_vocabulary.adapt(ratings.map(lambda x: x["user_id"]))

        movie_titles_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)
        movie_titles_vocabulary.adapt(movies)

        user_model = tf.keras.Sequential([
            user_ids_vocabulary,
            tf.keras.layers.Embedding(user_ids_vocabulary.vocab_size(), 128)
        ])

        movie_model = tf.keras.Sequential([
            movie_titles_vocabulary,
            tf.keras.layers.Embedding(movie_titles_vocabulary.vocab_size(), 128)
        ])

        task = tfrs.tasks.Retrieval(metrics=tfrs.metrics.FactorizedTopK(
            movies.batch(128).map(movie_model)
          )
        )

        optimizer = tf.keras.optimizers.Adagrad(learning_rate)
        print('** use_amp {}'.format(use_amp))
        if use_amp:
            # loss scaling is currently required when using mixed precision
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')

        callbacks = []

        if enable_tensorboard:
            tensorboard_callback = tf.keras.callbacks.TensorBoard(
                                                        log_dir=tensorboard_logs_path)
            print('*** TENSORBOARD CALLBACK {} ***'.format(tensorboard_callback))
            callbacks.append(tensorboard_callback)

        print('*** OPTIMIZER {} ***'.format(optimizer))

        model = MovieLensModel(user_model, movie_model, task)
        model.compile(optimizer=optimizer)

        # Pass the collected callbacks; previously the list was built but never
        # handed to fit(), so enabling TensorBoard had no effect.
        model.fit(ratings.batch(4096), epochs=epochs, callbacks=callbacks)

        index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
        index.index(movies.batch(100).map(model.movie_model), movies)

        # Must make a prediction before we can save the model.
        _, titles = index(np.array(["42"]))
        print(f"Top 10 recommendations for user 42: {titles[0, :10]}")

        print('Compiled model {}'.format(index))
        # summary() prints the table itself and returns None; wrapping it in
        # print() only added a stray "None" line.
        index.summary()

        # Save the TensorFlow SavedModel for Serving Predictions
        # Note:  We must call index() above before we save().
        #        See https://github.com/tensorflow/tensorflow/issues/31057 for more details.
        print('tensorflow_saved_model_path {}'.format(tensorflow_saved_model_path))
        index.save(tensorflow_saved_model_path, save_format='tf')

        # Copy inference.py and requirements.txt to the code/ directory
        #   Note: This is required for the SageMaker Endpoint to pick them up.
        #         This appears to be hard-coded and must be called code/
        inference_path = os.path.join(local_model_dir, 'code/')
        print('Copying inference source files to {}'.format(inference_path))
        os.makedirs(inference_path, exist_ok=True)
        # Copy without a shell. The step stays best-effort (a missing file does
        # not abort the run), but the miss is now reported instead of being
        # hidden behind an ignored exit status.
        inference_source = 'inference.py'
        if os.path.isfile(inference_source):
            shutil.copy(inference_source, inference_path)
        else:
            print('WARNING: {} not found; nothing copied to {}'.format(
                inference_source, inference_path))
        # List the directory's contents; globbing the bare directory path only
        # ever matched the directory itself.
        print(glob(os.path.join(inference_path, '*')))
#        os.system('cp requirements.txt {}/code'.format(inference_path))
    

In [None]:
user_id = "42"

# Run the serving_default signature of the SavedModel under
# ./model/tensorflow/saved_model/0 for this user id.
# NOTE(review): `$user_id` is presumably substituted by IPython before the
# shell command runs - confirm.
!saved_model_cli run --input_exprs 'input_1=np.array(["$user_id"])' --tag_set serve --signature_def serving_default --dir ./model/tensorflow/saved_model/0