In [1]:
from keras.utils import Sequence
from tensorflow_datasets.core.features import FeaturesDict, ClassLabel
import numpy as np
import tensorflow as tf

import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
import tensorflow_recommenders as tfrs

In [2]:
dataset_movies = "movielens/100k-movies"
movies = tfds.load(dataset_movies, split="train")
dataset_ratings = "movielens/100k-ratings"
ratings = tfds.load(dataset_ratings, split="train")

In [3]:
ratings = ratings.map(lambda x: {"movie_title": x["movie_title"], "user_id": x["user_id"]})

In [4]:
movies = movies.map(lambda x: x["movie_title"])

In [5]:
for i in ratings:
    print(i["user_id"] + "," + i["movie_title"])

2022-08-24 18:34:53.323543: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


tf.Tensor(b"138,One Flew Over the Cuckoo's Nest (1975)", shape=(), dtype=string)
tf.Tensor(b'92,Strictly Ballroom (1992)', shape=(), dtype=string)
tf.Tensor(b'301,Very Brady Sequel, A (1996)', shape=(), dtype=string)
tf.Tensor(b'60,Pulp Fiction (1994)', shape=(), dtype=string)
tf.Tensor(b'197,Scream 2 (1997)', shape=(), dtype=string)
tf.Tensor(b'601,Crash (1996)', shape=(), dtype=string)
tf.Tensor(b'710,Aladdin (1992)', shape=(), dtype=string)
tf.Tensor(b'833,True Romance (1993)', shape=(), dtype=string)
tf.Tensor(b'916,Bob Roberts (1992)', shape=(), dtype=string)
tf.Tensor(b'940,Starship Troopers (1997)', shape=(), dtype=string)
tf.Tensor(b'611,Sphere (1998)', shape=(), dtype=string)
tf.Tensor(b'707,Tin Men (1987)', shape=(), dtype=string)
tf.Tensor(b'699,Arrival, The (1996)', shape=(), dtype=string)
tf.Tensor(b'16,Beavis and Butt-head Do America (1996)', shape=(), dtype=string)
tf.Tensor(b'314,Last Dance (1996)', shape=(), dtype=string)
tf.Tensor(b'217,In the Line of Fire (1993)', sh

In [6]:
tf.random.set_seed(42)
shufffled = ratings.shuffle(100000, seed=42, reshuffle_each_iteration=False)
train = shufffled.take(80000)
test = shufffled.skip(80000).take(20000)

In [7]:
movie_titles = movies.batch(1000)
user_ids = ratings.batch(1000000).map(lambda x: x['user_id'])

In [8]:
unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))

In [9]:
print(type(movie_titles))
print(list(movie_titles))
# for i in movie_titles:
#     print(i)

<class 'tensorflow.python.data.ops.dataset_ops.BatchDataset'>
[<tf.Tensor: shape=(1000,), dtype=string, numpy=
array([b'You So Crazy (1994)', b'Love Is All There Is (1996)',
       b'Fly Away Home (1996)', b'In the Line of Duty 2 (1987)',
       b'Niagara, Niagara (1997)',
       b"Young Poisoner's Handbook, The (1995)",
       b'Age of Innocence, The (1993)', b'Flirt (1995)', b'Frisk (1995)',
       b'unknown', b'Girls Town (1996)', b'Stonewall (1995)',
       b'African Queen, The (1951)', b'Bloody Child, The (1996)',
       b'Executive Decision (1996)', b'Batman Returns (1992)',
       b'Canadian Bacon (1994)', b'FairyTale: A True Story (1997)',
       b'Body Snatcher, The (1945)', b'Sabrina (1954)', b'Sphere (1998)',
       b'Magic Hour, The (1998)', b'Raise the Red Lantern (1991)',
       b'Homeward Bound II: Lost in San Francisco (1996)',
       b'Grosse Pointe Blank (1997)', b"Smilla's Sense of Snow (1997)",
       b'Princess Caraboo (1994)', b'Wings of Desire (1987)',
       b'C

In [10]:
unique_user_ids = np.unique(np.concatenate(list(user_ids)))
print(type(user_ids))
print(list(user_ids))

<class 'tensorflow.python.data.ops.dataset_ops.MapDataset'>
[<tf.Tensor: shape=(100000,), dtype=string, numpy=array([b'138', b'92', b'301', ..., b'262', b'911', b'276'], dtype=object)>]


In [None]:
len(np.concatenate(list(user_ids)))
# temp = tf.reshape(list(user_ids), [10000, 5, 2])
# print(temp)


## Model

In [13]:
embedding_dimension = 32

In [14]:
user_model = tf.keras.Sequential(
    [
        tf.keras.layers.StringLookup(
            vocabulary=unique_user_ids, mask_token=None
        ),
        tf.keras.layers.Embedding(
            len(unique_user_ids) + 1, embedding_dimension
        )
    ]
)

In [15]:
movie_model = tf.keras.Sequential(
    [
        tf.keras.layers.StringLookup(
            vocabulary=unique_movie_titles, mask_token=None
        ),
        tf.keras.layers.Embedding(
            len(unique_movie_titles) + 1, embedding_dimension
        )
    ]
)

In [16]:
metrics = tfrs.metrics.FactorizedTopK(candidates=movies.batch(128).map(movie_model))

In [17]:
task = tfrs.tasks.Retrieval(metrics=metrics)

In [19]:
class MovieLensModel(tfrs.Model):

    def __init__(self, user_model, movie_model):
        super().__init__()
        self.movie_model = movie_model
        self.user_model = user_model
        self.task = task

    def compute_loss(self, features, training=False):
        user_embeddings = self.user_model(features["user_id"])
        positive_movie_embeddings = self.movie_model(features["movie_title"])
        return self.task(user_embeddings, positive_movie_embeddings)

In [20]:
model = MovieLensModel(user_model, movie_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [21]:
cached_train = train.shuffle(100000).batch(8192).cache()
cached_test = test.batch(4096).cache()

In [24]:
model.fit(cached_train, epochs=3)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x16da5aa70>

In [25]:
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.00019999999494757503,
 'factorized_top_k/top_5_categorical_accuracy': 0.004000000189989805,
 'factorized_top_k/top_10_categorical_accuracy': 0.011149999685585499,
 'factorized_top_k/top_50_categorical_accuracy': 0.08715000003576279,
 'factorized_top_k/top_100_categorical_accuracy': 0.18490000069141388,
 'loss': 28654.71484375,
 'regularization_loss': 0,
 'total_loss': 28654.71484375}