reference: cmpe258 course PPT

In [4]:
!pip install tensorflow-ranking
!pip install tensorflow-recommenders

Collecting tensorflow-ranking
  Downloading tensorflow_ranking-0.5.0-py2.py3-none-any.whl (141 kB)
[?25l[K     |██▎                             | 10 kB 21.1 MB/s eta 0:00:01[K     |████▋                           | 20 kB 10.8 MB/s eta 0:00:01[K     |███████                         | 30 kB 9.5 MB/s eta 0:00:01[K     |█████████▎                      | 40 kB 8.5 MB/s eta 0:00:01[K     |███████████▋                    | 51 kB 4.5 MB/s eta 0:00:01[K     |██████████████                  | 61 kB 5.3 MB/s eta 0:00:01[K     |████████████████▎               | 71 kB 5.6 MB/s eta 0:00:01[K     |██████████████████▌             | 81 kB 5.6 MB/s eta 0:00:01[K     |████████████████████▉           | 92 kB 6.2 MB/s eta 0:00:01[K     |███████████████████████▏        | 102 kB 5.2 MB/s eta 0:00:01[K     |█████████████████████████▌      | 112 kB 5.2 MB/s eta 0:00:01[K     |███████████████████████████▉    | 122 kB 5.2 MB/s eta 0:00:01[K     |██████████████████████████████▏ | 133 kB

In [5]:
import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs

In [7]:
!pip install tensorflow-datasets
!pip install tfds-nightly

Collecting tfds-nightly
  Downloading tfds_nightly-4.5.2.dev202205200045-py3-none-any.whl (4.3 MB)
[K     |████████████████████████████████| 4.3 MB 5.2 MB/s 
Collecting etils[epath-no-tf]
  Downloading etils-0.5.1-py3-none-any.whl (87 kB)
[K     |████████████████████████████████| 87 kB 4.7 MB/s 
Collecting toml
  Downloading toml-0.10.2-py2.py3-none-any.whl (16 kB)
Installing collected packages: etils, toml, tfds-nightly
Successfully installed etils-0.5.1 tfds-nightly-4.5.2.dev202205200045 toml-0.10.2


In [9]:
import tensorflow_datasets as tfds

In [11]:
# Load the "train" split of the MovieLens 100K ratings and movies datasets.
ratings, movies = (
    tfds.load(dataset_name, split="train")
    for dataset_name in ("movielens/100k-ratings", "movielens/100k-movies")
)

In [12]:
# Project every rating record down to the title, the user id and the rating.
ratings = ratings.map(
    lambda rating: {
        key: rating[key]
        for key in ("movie_title", "user_id", "user_rating")
    }
)

In [16]:
# Reduce each movie record to just its title.
movies = movies.map(
    lambda movie: movie["movie_title"]
)

In [18]:
import numpy as np

In [19]:
# Materialise the datasets in batches of 1000 and collect the distinct values
# that later serve as lookup vocabularies.
movie_title_batches = list(movies.batch(1000))
unique_movie_titles = np.unique(np.concatenate(movie_title_batches))

user_id_batches = list(
    ratings.batch(1_000).map(lambda batch: batch["user_id"])
)
unique_user_id = np.unique(np.concatenate(user_id_batches))

In [20]:
# Shuffle once with a fixed seed; reshuffle_each_iteration=False keeps the
# order stable across iterations so the take/skip split below is consistent.
shuffled = ratings.shuffle(
    buffer_size=100_000,
    seed=42,
    reshuffle_each_iteration=False,
)

# First 80,000 shuffled ratings for training, the following 20,000 for testing.
train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

In [22]:
# Convert the per-rating records into per-user lists for listwise training.
# Every list holds 5 movies sampled from those the user rated; the training
# data gets 50 lists per user, the test data a single list per user.
listwise_options = dict(num_examples_per_list=5, seed=42)

train = tfrs.examples.movielens.sample_listwise(
    train, num_list_per_user=50, **listwise_options
)
test = tfrs.examples.movielens.sample_listwise(
    test, num_list_per_user=1, **listwise_options
)

In [24]:
import pprint

In [25]:
# Pretty-print the first listwise training example to inspect its structure.
first_example_only = train.take(1)
for sample in first_example_only:
  pprint.pprint(sample)

{'movie_title': <tf.Tensor: shape=(5,), dtype=string, numpy=
array([b'All About Eve (1950)', b'Much Ado About Nothing (1993)',
       b'Heathers (1989)', b'Real Genius (1985)',
       b'Clear and Present Danger (1994)'], dtype=object)>,
 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'42'>,
 'user_rating': <tf.Tensor: shape=(5,), dtype=float32, numpy=array([3., 4., 2., 3., 5.], dtype=float32)>}


In [27]:
import tensorflow as tf

In [69]:
class RankingModel(tfrs.Model):
  """Scores every movie in a per-user list and trains with a ranking task.

  The user id and each movie title are embedded, the user embedding is
  repeated along the list axis, concatenated with the movie embeddings and
  passed through a small dense network that emits one score per movie.
  """

  def __init__(self, loss, embedding_dimension=32):
    """Builds the embedding towers, the scoring network and the ranking task.

    Args:
      loss: loss object handed to `tfrs.tasks.Ranking`.
      embedding_dimension: width of the user and movie embedding vectors.
    """
    super().__init__()

    # compute embedding for users
    # (the table is sized len(vocabulary) + 2, leaving headroom beyond the
    # known ids for indices the lookup layer adds)
    self.user_embeddings = tf.keras.Sequential([
          tf.keras.layers.StringLookup(
              vocabulary = unique_user_id),
          tf.keras.layers.Embedding(len(unique_user_id) + 2, embedding_dimension)

    ])

    # compute embedding for movies
    self.movie_embeddings = tf.keras.Sequential([
          tf.keras.layers.StringLookup(
              vocabulary = unique_movie_titles),
          tf.keras.layers.Embedding(len(unique_movie_titles) + 2, embedding_dimension)

    ])

    # compute predictions
    self.score_model = tf.keras.Sequential([
          tf.keras.layers.Dense(256, activation="relu"),
          tf.keras.layers.Dense(64, activation="relu"),
          # make prediction at final layer
          tf.keras.layers.Dense(1)
    ])

    self.task = tfrs.tasks.Ranking(
        loss=loss,
        metrics=[
            tfr.keras.metrics.NDCGMetric(name="ndcg_metric"),
            tf.keras.metrics.RootMeanSquaredError()
        ]
    )

  def call(self, features):
    """Returns one score per movie in each list.

    Args:
      features: mapping with a "user_id" entry and a "movie_title" entry whose
        second axis is the list axis.

    Returns:
      The output of the scoring network, with a trailing axis of size 1.
    """
    # convert the id features into embeddings
    user_embeddings = self.user_embeddings(features["user_id"])

    # convert the movie title features into embeddings
    movie_embeddings = self.movie_embeddings(features["movie_title"])

    # reshape the embeddings so that we can pass them to the model
    list_length = features["movie_title"].shape[1]
    if list_length is None:
      # Static list length is unknown (e.g. variable-length batches): fall
      # back to the runtime shape instead of passing None to tf.repeat.
      list_length = tf.shape(features["movie_title"])[1]
    user_embeddings_repeated = tf.repeat(
        tf.expand_dims(user_embeddings, 1),
        [list_length],
        axis = 1
    )

    concatenated_embeddings = tf.concat([user_embeddings_repeated, movie_embeddings], 2)

    return self.score_model(concatenated_embeddings)

  def compute_loss(self, features, training=False):
    """Computes the ranking task loss for one batch.

    The label is read from `features["user_rating"]` without modifying
    `features`, so the same batch can safely be passed in again.

    Args:
      features: mapping holding the model inputs plus the "user_rating" labels.
      training: accepted for interface compatibility; it is not forwarded to
        the model call.

    Returns:
      The value returned by the ranking task for these labels and scores.
    """
    labels = features["user_rating"]

    # Build a label-free view rather than pop()-ing from the caller's dict.
    model_inputs = {
        name: value for name, value in features.items()
        if name != "user_rating"
    }

    scores = self(model_inputs)

    return self.task(
        labels = labels,
        predictions = tf.squeeze(scores, axis=-1)
    )

In [70]:
# Batch the datasets and cache the resulting batches for reuse.
# cache() is applied after shuffle(), so it stores the already-shuffled batches.
train_batches = train.shuffle(100_000).batch(8192)
cached_train = train_batches.cache()

test_batches = test.batch(4096)
cached_test = test_batches.cache()

In [71]:
# Ranking model trained with a mean-squared-error loss.
mse_loss = tf.keras.losses.MeanSquaredError()
mse_model = RankingModel(mse_loss)

mse_optimizer = tf.keras.optimizers.Adagrad(0.1)
mse_model.compile(optimizer=mse_optimizer)

In [72]:
# Train for a single epoch without progress output.
mse_model.fit(
    cached_train,
    epochs=1,
    verbose=False,
)

<keras.callbacks.History at 0x7fd6b8f01310>

In [73]:
# Evaluate on the cached test batches and report the NDCG metric.
mse_model_result = mse_model.evaluate(cached_test, return_dict=True)
mse_ndcg = mse_model_result["ndcg_metric"]
print("NDCG of the MSE Model: {:.4f}".format(mse_ndcg))

NDCG of the MSE Model: 0.8453


----

In [77]:
# Ranking model trained with the pairwise hinge loss.
hinge_loss = tfr.keras.losses.PairwiseHingeLoss()
hinge_model = RankingModel(hinge_loss)

hinge_optimizer = tf.keras.optimizers.Adagrad(0.1)
hinge_model.compile(optimizer=hinge_optimizer)

In [78]:
# Train for a single epoch without progress output.
hinge_model.fit(
    cached_train,
    epochs=1,
    verbose=False,
)

<keras.callbacks.History at 0x7fd6bcee9d50>

In [79]:
# Evaluate on the cached test batches and report the NDCG metric.
hinge_model_result = hinge_model.evaluate(cached_test, return_dict=True)
hinge_ndcg = hinge_model_result["ndcg_metric"]
print("NDCG of the pairwise hinge loss Model: {:.4f}".format(hinge_ndcg))

NDCG of the pairwise hinge loss Model: 0.8488


----

In [80]:
# Ranking model trained with the listwise ListMLE loss.
listwise_loss = tfr.keras.losses.ListMLELoss()
listwise_model = RankingModel(listwise_loss)

listwise_optimizer = tf.keras.optimizers.Adagrad(0.1)
listwise_model.compile(optimizer=listwise_optimizer)

# Train for a single epoch without progress output.
listwise_model.fit(
    cached_train,
    epochs=1,
    verbose=False,
)

<keras.callbacks.History at 0x7fd6bb2a9910>

In [81]:
# Evaluate on the cached test batches and report the NDCG metric.
listwise_model_result = listwise_model.evaluate(cached_test, return_dict=True)
listwise_ndcg = listwise_model_result["ndcg_metric"]
print("NDCG of the ListMLE Model: {:.4f}".format(listwise_ndcg))

NDCG of the ListMLE Model: 0.8581
