<a href="https://colab.research.google.com/github/fatih260/ML-DL-Projects/blob/main/Content_based_recommender_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

In [None]:
import os
import tempfile

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_recommenders as tfrs

In [None]:
# Rating events and the movie catalogue of MovieLens 100k, both read from
# their "train" split.
ratings, _movies = (
    tfds.load(dataset_name, split="train")
    for dataset_name in ("movielens/100k-ratings", "movielens/100k-movies")
)

In [None]:
# Print a handful of raw rating records to see which features they carry.
for sample in ratings.take(5):
  print(sample)


{'bucketized_user_age': <tf.Tensor: shape=(), dtype=float32, numpy=45.0>, 'movie_genres': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([7])>, 'movie_id': <tf.Tensor: shape=(), dtype=string, numpy=b'357'>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b"One Flew Over the Cuckoo's Nest (1975)">, 'raw_user_age': <tf.Tensor: shape=(), dtype=float32, numpy=46.0>, 'timestamp': <tf.Tensor: shape=(), dtype=int64, numpy=879024327>, 'user_gender': <tf.Tensor: shape=(), dtype=bool, numpy=True>, 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'138'>, 'user_occupation_label': <tf.Tensor: shape=(), dtype=int64, numpy=4>, 'user_occupation_text': <tf.Tensor: shape=(), dtype=string, numpy=b'doctor'>, 'user_rating': <tf.Tensor: shape=(), dtype=float32, numpy=4.0>, 'user_zip_code': <tf.Tensor: shape=(), dtype=string, numpy=b'53211'>}
{'bucketized_user_age': <tf.Tensor: shape=(), dtype=float32, numpy=25.0>, 'movie_genres': <tf.Tensor: shape=(2,), dtype=int64, numpy=array([ 4, 14])>, '

In [None]:
#ratings_df = tfds.as_dataframe(ratings)
#ratings_df.head()

In [None]:
# Features kept from every rating record (the dict keeps this key order).
# Note: "movie_genres" is a variable-length list per record, so a plain
# `.batch()` over this dataset cannot stack it.
kept_rating_features = (
    "movie_genres",
    "movie_title",
    "user_gender",
    "bucketized_user_age",
    "user_occupation_label",
    "timestamp",
)
ratings = ratings.map(
    lambda x: {feature: x[feature] for feature in kept_rating_features})

for record in ratings.take(5):
  print(record)

# Split the movie catalogue into two parallel datasets: titles and genre ids.
movies = _movies.map(lambda x: x["movie_title"])
genres = _movies.map(lambda x: x["movie_genres"])

for movie in movies.take(5):
  print(movie)

{'movie_genres': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([7])>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b"One Flew Over the Cuckoo's Nest (1975)">, 'user_gender': <tf.Tensor: shape=(), dtype=bool, numpy=True>, 'bucketized_user_age': <tf.Tensor: shape=(), dtype=float32, numpy=45.0>, 'user_occupation_label': <tf.Tensor: shape=(), dtype=int64, numpy=4>, 'timestamp': <tf.Tensor: shape=(), dtype=int64, numpy=879024327>}
{'movie_genres': <tf.Tensor: shape=(2,), dtype=int64, numpy=array([ 4, 14])>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Strictly Ballroom (1992)'>, 'user_gender': <tf.Tensor: shape=(), dtype=bool, numpy=True>, 'bucketized_user_age': <tf.Tensor: shape=(), dtype=float32, numpy=25.0>, 'user_occupation_label': <tf.Tensor: shape=(), dtype=int64, numpy=5>, 'timestamp': <tf.Tensor: shape=(), dtype=int64, numpy=875654590>}
{'movie_genres': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([4])>, 'movie_title': <tf.Tensor: shape=(), dtype=str

In [None]:
# Collect every rating timestamp into a single NumPy array.
timestamp_batches = ratings.map(lambda x: x["timestamp"]).batch(100)
timestamps = np.concatenate(list(timestamp_batches))

min_timestamp = timestamps.min()
max_timestamp = timestamps.max()

# 1000 evenly spaced boundaries spanning the observed timestamp range.
timestamp_buckets = np.linspace(min_timestamp, max_timestamp, num=1000)
len(timestamp_buckets)

1000

In [None]:
def _unique_rating_values(feature_name):
  """Return the sorted unique values of one rating feature as a NumPy array.

  The feature is selected *before* batching: the rating records also carry the
  variable-length "movie_genres" list, and batching whole records would fail
  because lists of different lengths cannot be stacked.
  """
  column = ratings.map(lambda x: x[feature_name]).batch(1000)
  return np.unique(np.concatenate(list(column.as_numpy_iterator())))


# Genre lists differ in length from movie to movie, so they cannot go through
# `.batch()`; concatenate the per-movie arrays directly instead.
unique_movie_genres = np.unique(
    np.concatenate(list(genres.as_numpy_iterator())))
unique_movie_titles = np.unique(np.concatenate(list(movies.batch(1000))))

# No try/except here on purpose: if a vocabulary cannot be built, the models
# defined further down cannot work either, so the error must surface here.
unique_user_gender = _unique_rating_values("user_gender")
unique_bucketized_user_age = _unique_rating_values("bucketized_user_age")
unique_user_occupation_label = _unique_rating_values("user_occupation_label")

Error: {{function_node __wrapped__IteratorGetNext_output_types_1_device_/job:localhost/replica:0/task:0/device:CPU:0}} Cannot add tensor to the batch: number of elements does not match. Shapes are: [tensor]: [2], [batch]: [1] [Op:IteratorGetNext]


In [None]:
class UserModel(tf.keras.Model):
  """Turns the user/context features of a rating into one feature vector.

  Age bucket, occupation label and gender each get a 32-d embedding, the
  timestamp gets a 32-d bucket embedding plus one normalized scalar column.
  """

  def __init__(self):
    super().__init__()

    def categorical_embedding(vocabulary):
      # Lookup of the raw value followed by an embedding table that has one
      # more row than the vocabulary.
      return tf.keras.Sequential([
          tf.keras.layers.IntegerLookup(
              vocabulary=vocabulary, mask_token=None),
          tf.keras.layers.Embedding(len(vocabulary) + 1, 32),
      ])

    self.age_embedding = categorical_embedding(unique_bucketized_user_age)
    self.occupation_embedding = categorical_embedding(
        unique_user_occupation_label)
    self.gender_embedding = categorical_embedding(unique_user_gender)

    # The timestamp is used twice: bucketized and embedded, and as a
    # normalized continuous value.
    self.timestamp_embedding = tf.keras.Sequential([
        tf.keras.layers.Discretization(timestamp_buckets.tolist()),
        tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32),
    ])
    self.normalized_timestamp = tf.keras.layers.Normalization(axis=None)
    self.normalized_timestamp.adapt(timestamps)

  def call(self, inputs):
    """Embeds each feature of the `inputs` dict and concatenates on axis 1."""
    timestamp = inputs["timestamp"]
    feature_columns = [
        self.age_embedding(inputs["bucketized_user_age"]),
        self.occupation_embedding(inputs["user_occupation_label"]),
        self.gender_embedding(inputs["user_gender"]),
        self.timestamp_embedding(timestamp),
        # Reshaped to a single column so it can be concatenated on axis 1.
        tf.reshape(self.normalized_timestamp(timestamp), (-1, 1)),
    ]
    return tf.concat(feature_columns, axis=1)


In [None]:
class QueryModel(tf.keras.Model):
  """Query tower: user feature embeddings followed by a stack of dense layers."""

  def __init__(self, layer_sizes):
    """Builds the embedding model and the dense stack.

    Args:
      layer_sizes:
        A list of integers; entry i is the number of units of the i-th dense
        layer.
    """
    super().__init__()

    # Raw features are first embedded by the user model.
    self.embedding_model = UserModel()

    # Dense stack: ReLU on every layer except the last, which stays linear.
    self.dense_layers = tf.keras.Sequential()
    last_position = len(layer_sizes) - 1
    for position, units in enumerate(layer_sizes):
      activation = "relu" if position < last_position else None
      self.dense_layers.add(
          tf.keras.layers.Dense(units, activation=activation))

  def call(self, inputs):
    """Embeds the input features and passes them through the dense stack."""
    return self.dense_layers(self.embedding_model(inputs))

In [None]:
class MovieModel(tf.keras.Model):
  """Embeds movies from their title strings.

  For a batch of titles the output concatenates, along axis 1:
    * a 32-d embedding of the title treated as a single token,
    * a 32-d average of the embeddings of the words in the title,
    * a 32-d embedding of the genres the movie catalogue lists for the title.

  The genres are looked up from the title inside the model, so callers only
  ever pass titles.
  """

  def __init__(self):
    super().__init__()

    max_tokens = 10_000

    # Shared by the title embedding and by the genre lookup in `call`.
    self.title_lookup = tf.keras.layers.StringLookup(
        vocabulary=unique_movie_titles, mask_token=None)

    self.title_embedding = tf.keras.Sequential([
      self.title_lookup,
      tf.keras.layers.Embedding(len(unique_movie_titles) + 1, 32)
    ])

    self.title_vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=max_tokens)

    self.title_text_embedding = tf.keras.Sequential([
      self.title_vectorizer,
      tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
      tf.keras.layers.GlobalAveragePooling1D(),
    ])

    # Row i holds the genre weights of the title with lookup index i.
    self.genre_features = tf.constant(self._build_genre_features())
    # A bias-free dense layer applied to averaged multi-hot genre weights is
    # the mean of one learned 32-d vector per genre.
    self.genre_embedding = tf.keras.layers.Dense(32, use_bias=False)

    self.title_vectorizer.adapt(movies)

  def _build_genre_features(self):
    """Builds the title-index -> genre-weight table from the movie catalogue.

    Returns:
      A float32 NumPy array with one row per title lookup index (vocabulary
      size + 1) and one column per entry of `unique_movie_genres`. Each row
      has weight 1/k on each of the k genres listed for that title, or is all
      zeros when no catalogue entry maps to that index.
    """
    genre_column = {
        int(genre): column for column, genre in enumerate(unique_movie_genres)}
    catalogue = list(_movies.as_numpy_iterator())
    title_rows = self.title_lookup(
        tf.constant([entry["movie_title"] for entry in catalogue])).numpy()

    features = np.zeros(
        (len(unique_movie_titles) + 1, len(genre_column)), dtype=np.float32)
    for row, entry in zip(title_rows, catalogue):
      for genre in entry["movie_genres"]:
        # Catalogue entries that share a title are merged into the same row.
        features[row, genre_column[int(genre)]] = 1.0

    # Average instead of sum so that movies with many genres do not get
    # larger activations; max(..., 1) avoids dividing all-zero rows by zero.
    features /= np.maximum(features.sum(axis=1, keepdims=True), 1.0)
    return features

  def call(self, titles):
    """Returns the concatenated title, title-text and genre embeddings."""
    # Previously the module-level `genres` tf.data.Dataset was fed to the
    # genre embedding here, which is not a per-title tensor. Gather the genre
    # row for each title in the batch instead.
    genre_features = tf.gather(self.genre_features, self.title_lookup(titles))
    return tf.concat([
        self.title_embedding(titles),
        self.title_text_embedding(titles),
        self.genre_embedding(genre_features),
    ], axis=1)

In [None]:
class CandidateModel(tf.keras.Model):
  """Candidate tower: movie embeddings followed by a stack of dense layers."""

  def __init__(self, layer_sizes):
    """Builds the embedding model and the dense stack.

    Args:
      layer_sizes:
        A list of integers; entry i is the number of units of the i-th dense
        layer.
    """
    super().__init__()

    # Movie titles are first embedded by the movie model.
    self.embedding_model = MovieModel()

    # Dense stack: ReLU on every layer except the last, which stays linear.
    self.dense_layers = tf.keras.Sequential()
    last_position = len(layer_sizes) - 1
    for position, units in enumerate(layer_sizes):
      activation = "relu" if position < last_position else None
      self.dense_layers.add(
          tf.keras.layers.Dense(units, activation=activation))

  def call(self, inputs):
    """Embeds the input titles and passes them through the dense stack."""
    return self.dense_layers(self.embedding_model(inputs))

In [None]:
class MovielensModel(tfrs.models.Model):
  """Retrieval model pairing a query model with a movie candidate model."""

  def __init__(self, layer_sizes):
    """Creates both models with the same `layer_sizes` and the retrieval task."""
    super().__init__()
    self.query_model = QueryModel(layer_sizes)
    self.candidate_model = CandidateModel(layer_sizes)

    # Top-k metrics are computed against the embeddings of all movie titles.
    candidate_embeddings = movies.batch(128).map(self.candidate_model)
    top_k_metrics = tfrs.metrics.FactorizedTopK(
        candidates=candidate_embeddings)
    self.task = tfrs.tasks.Retrieval(metrics=top_k_metrics)

  def compute_loss(self, features, training=False):
    """Returns the retrieval task's result for one batch of `features`.

    Metrics are only computed when `training` is False.
    """
    query_feature_names = (
        "bucketized_user_age",
        "user_occupation_label",
        "user_gender",
        "timestamp",
    )
    query_inputs = {name: features[name] for name in query_feature_names}

    query_embeddings = self.query_model(query_inputs)
    movie_embeddings = self.candidate_model(features["movie_title"])

    evaluate_metrics = not training
    return self.task(
        query_embeddings, movie_embeddings, compute_metrics=evaluate_metrics)

In [None]:
tf.random.set_seed(42)


def _without_genres(example):
  """Returns `example` without its "movie_genres" entry (if it has one).

  "movie_genres" is a variable-length list per rating, which `.batch()` cannot
  stack, and the model's loss does not read it from the rating features.
  """
  return {
      name: value for name, value in example.items() if name != "movie_genres"}


# Shuffle once with a fixed seed so the train/test split is stable across
# epochs, then split 80k / 20k.
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

# Drop the ragged feature before batching; batching it raises
# "Cannot add tensor to the batch: number of elements does not match".
cached_train = train.map(_without_genres).shuffle(100_000).batch(2048)
cached_test = test.map(_without_genres).batch(4096).cache()

In [None]:
num_epochs = 50

# Dense layers of 64 then 32 units in both the query and candidate model;
# `MovielensModel([32])` would be the single-layer variant.
model = MovielensModel([64, 32])
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

# Validation runs every 5th epoch; per-epoch logging is silenced.
fit_options = dict(
    validation_data=cached_test,
    validation_freq=5,
    epochs=num_epochs,
    verbose=0,
)
one_layer_history = model.fit(cached_train, **fit_options)

# Report the most recent validation value of the top-100 metric.
metric_name = "val_factorized_top_k/top_100_categorical_accuracy"
accuracy = one_layer_history.history[metric_name][-1]
print(f"Top-100 accuracy: {accuracy:.2f}.")

Top-100 accuracy: 0.16.


In [None]:
# Brute-force top-k index that embeds queries with the trained query model.
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)

# Index (title, candidate embedding) pairs for every movie title.
title_batches = movies.batch(100)
embedding_batches = title_batches.map(model.candidate_model)
index.index_from_dataset(
  tf.data.Dataset.zip((title_batches, embedding_batches))
)

# A single example query; keep the 50 best-scoring titles.
query_features = {
    "bucketized_user_age": np.array([50]),
    "user_occupation_label": np.array([4]),
    "user_gender": np.array([False]),
    "timestamp": np.array([879024327]),
}
_, titles = index(query_features, k=50)

In [None]:
# Titles returned for the first query in the batch, as a NumPy array.
titles[0].numpy()

array([b'Secrets & Lies (1996)', b'Roman Holiday (1953)',
       b'Sum of Us, The (1994)', b'Cold Comfort Farm (1995)',
       b'Love in the Afternoon (1957)', b'Postino, Il (1994)',
       b"Star Maker, The (Uomo delle stelle, L') (1995)", b'Emma (1996)',
       b"Antonia's Line (1995)", b'Spitfire Grill, The (1996)',
       b'Big Night (1996)', b'Jane Eyre (1996)', b'Citizen Ruth (1996)',
       b'Lone Star (1996)', b'A Chef in Love (1996)',
       b'Cat on a Hot Tin Roof (1958)', b'Sense and Sensibility (1995)',
       b'Love & Human Remains (1993)', b'To Catch a Thief (1955)',
       b'Murder in the First (1995)', b'Little Women (1994)',
       b'Eat Drink Man Woman (1994)', b'Fargo (1996)',
       b'Godfather, The (1972)', b'Crucible, The (1996)',
       b'Leaving Las Vegas (1995)', b'Mighty Aphrodite (1995)',
       b'Band Wagon, The (1953)',
       b'Treasure of the Sierra Madre, The (1948)',
       b'Unhook the Stars (1996)', b'Laura (1944)',
       b'Ruling Class, The (1972)',