In [1]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_recommenders as tfrs
from model import MovielensModel


ratings = tfds.load('movielens/100k-ratings', split="train")
movies = tfds.load('movielens/100k-movies', split="train")

[1mDownloading and preparing dataset movielens/100k-ratings/0.1.0 (download: 4.70 MiB, generated: 32.41 MiB, total: 37.10 MiB) to /root/tensorflow_datasets/movielens/100k-ratings/0.1.0...[0m


Dl Completed...: |          | 0/0 [00:00<?, ? url/s]

Dl Size...: |          | 0/0 [00:00<?, ? MiB/s]

Extraction completed...: |          | 0/0 [00:00<?, ? file/s]






  0%|          | 0/1 [00:00<?, ? splits/s]

  0%|          | 0/100000 [00:00<?, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/movielens/100k-ratings/0.1.0.incomplete0FCPVO/movielens-train.tfrecord


  0%|          | 0/100000 [00:00<?, ? examples/s]

[1mDataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-ratings/0.1.0. Subsequent calls will reuse this data.[0m
[1mDownloading and preparing dataset movielens/100k-movies/0.1.0 (download: 4.70 MiB, generated: 150.35 KiB, total: 4.84 MiB) to /root/tensorflow_datasets/movielens/100k-movies/0.1.0...[0m


Dl Completed...: |          | 0/0 [00:00<?, ? url/s]

Dl Size...: |          | 0/0 [00:00<?, ? MiB/s]

Extraction completed...: |          | 0/0 [00:00<?, ? file/s]






  0%|          | 0/1 [00:00<?, ? splits/s]

  0%|          | 0/1682 [00:00<?, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/movielens/100k-movies/0.1.0.incompleteBU84JQ/movielens-train.tfrecord


  0%|          | 0/1682 [00:00<?, ? examples/s]

[1mDataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-movies/0.1.0. Subsequent calls will reuse this data.[0m


In [16]:
import tensorflow as tf
import tensorflow_recommenders as tfrs

from typing import Dict, Text


class MovielensModel(tfrs.models.Model):

    def __init__(self, movies, unique_user_ids, unique_movie_titles, rating_weight: float, retrieval_weight: float) -> None:
        # We take the loss weights in the constructor: this allows us to instantiate
        # several model objects with different loss weights.

        super().__init__()

        embedding_dimension = 32

        # User and movie models.
        self.movie_model: tf.keras.layers.Layer = tf.keras.Sequential([
        tf.keras.layers.experimental.preprocessing.StringLookup(
            vocabulary=unique_movie_titles, mask_token=None),
        tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
        ])
        self.user_model: tf.keras.layers.Layer = tf.keras.Sequential([
        tf.keras.layers.experimental.preprocessing.StringLookup(
            vocabulary=unique_user_ids, mask_token=None),
        tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
        ])

        # A small model to take in user and movie embeddings and predict ratings.
        # We can make this as complicated as we want as long as we output a scalar
        # as our prediction.
        self.rating_model = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(1),
        ])

        # The tasks.
        self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()],
        )
        self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=movies.batch(128).map(self.movie_model)
            )
        )

        # The loss weights.
        self.rating_weight = rating_weight
        self.retrieval_weight = retrieval_weight

    def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
        # We pick out the user features and pass them into the user model.
        user_embeddings = self.user_model(features["user_id"])
        # And pick out the movie features and pass them into the movie model.
        movie_embeddings = self.movie_model(features["movie_title"])

        return (
            user_embeddings,
            movie_embeddings,
            # We apply the multi-layered rating model to a concatentation of
            # user and movie embeddings.
            self.rating_model(
                tf.concat([user_embeddings, movie_embeddings], axis=1)
            ),
        )

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:

        ratings = features.pop("user_rating")

        user_embeddings, movie_embeddings, rating_predictions = self(features)

        # We compute the loss for each task.
        rating_loss = self.rating_task(
            labels=ratings,
            predictions=rating_predictions,
        )
        print('\n')
        print('user_embeddings')
        print(user_embeddings)
        print('movie_embeddings')
        print(movie_embeddings)
        print('\n')
        retrieval_loss = self.retrieval_task(user_embeddings, movie_embeddings)

        # And combine them using the loss weights.
        return (self.rating_weight * rating_loss

                + self.retrieval_weight * retrieval_loss)


In [3]:
# Select the basic features.
ratings = ratings.map(lambda x: {
    "movie_title": x["movie_title"],
    "user_id": x["user_id"],
    "user_rating": x["user_rating"],
})
movies = movies.map(lambda x: x["movie_title"])

# Randomly shuffle data and split between train and test.
tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)


In [None]:
movie_titles = movies.batch(1_000)
user_ids = ratings.batch(1_000_000).map(lambda x: x["user_id"])

unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

In [7]:
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

In [18]:
model = MovielensModel(movies, unique_user_ids, unique_movie_titles, rating_weight=0.5, retrieval_weight=0.5)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

model.fit(cached_train, epochs=3)
metrics = model.evaluate(cached_test, return_dict=True)

print(f"Retrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}.")
#print(f"Ranking RMSE: {metrics['root_mean_squared_error+']:.3f}.")

Epoch 1/3


user_embeddings
Tensor("movielens_model_3/sequential_10/embedding_7/embedding_lookup/Identity:0", shape=(None, 32), dtype=float32)
movie_embeddings
Tensor("movielens_model_3/sequential_9/embedding_6/embedding_lookup/Identity_1:0", shape=(None, 32), dtype=float32)












user_embeddings
Tensor("movielens_model_3/sequential_10/embedding_7/embedding_lookup/Identity:0", shape=(None, 32), dtype=float32)
movie_embeddings
Tensor("movielens_model_3/sequential_9/embedding_6/embedding_lookup/Identity_1:0", shape=(None, 32), dtype=float32)










Epoch 2/3
Epoch 3/3
 2/10 [=====>........................] - ETA: 12s - root_mean_squared_error: 1.1503 - factorized_top_k/top_1_categorical_accuracy: 0.0029 - factorized_top_k/top_5_categorical_accuracy: 0.0207 - factorized_top_k/top_10_categorical_accuracy: 0.0437 - factorized_top_k/top_50_categorical_accuracy: 0.1825 - factorized_top_k/top_100_categorical_accuracy: 0.3148 - loss: 35246.1602 - regularization_loss: 0.0000e+00 - total_loss: 35246.1602

KeyboardInterrupt: 

In [15]:
model = MovielensModel(movies, unique_user_ids, unique_movie_titles, rating_weight=0.0, retrieval_weight=1.0)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

model.fit(cached_train, epochs=3)
metrics = model.evaluate(cached_test, return_dict=True)

print(f"Retrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}.")
#print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}.")

model = MovielensModel(movies, unique_user_ids, unique_movie_titles, rating_weight=1.0, retrieval_weight=1.0)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

model.fit(cached_train, epochs=3)
metrics = model.evaluate(cached_test, return_dict=True)

print(f"Retrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}.")
#print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}.")

Epoch 1/3
Epoch 2/3
Epoch 3/3
Retrieval top-100 accuracy: 0.205.
Epoch 1/3
















Epoch 2/3
Epoch 3/3
Retrieval top-100 accuracy: 0.235.
Epoch 1/3
















Epoch 2/3
Epoch 3/3
Retrieval top-100 accuracy: 0.235.


In [20]:
test.__dict__

{'_input_dataset': <SkipDataset shapes: {movie_title: (), user_id: (), user_rating: ()}, types: {movie_title: tf.string, user_id: tf.string, user_rating: tf.float32}>,
 '_count': <tf.Tensor: shape=(), dtype=int64, numpy=20000>,
 '_variant_tensor_attr': <tf.Tensor: shape=(), dtype=variant, numpy=<unprintable>>,
 '_self_setattr_tracking': True,
 '_self_unconditional_checkpoint_dependencies': [TrackableReference(name='_variant_tracker', ref=<tensorflow.python.data.ops.dataset_ops._VariantTracker object at 0x7fe3f0351860>)],
 '_self_unconditional_dependency_names': {'_variant_tracker': <tensorflow.python.data.ops.dataset_ops._VariantTracker at 0x7fe3f0351860>},
 '_self_unconditional_deferred_dependencies': {},
 '_self_update_uid': -1,
 '_self_name_based_restores': set(),
 '_self_saveable_object_factories': {},
 '_variant_tracker': <tensorflow.python.data.ops.dataset_ops._VariantTracker at 0x7fe3f0351860>,
 '_graph_attr': <tensorflow.python.framework.ops.Graph at 0x7fe470350198>,
 '_options