<a href="https://colab.research.google.com/github/efcor/tf-sandbox/blob/main/model-attempt-1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import tensorflow_datasets as tfds

# Load the "train" split of the MovieLens 100k ratings dataset.
ratings = tfds.load("movielens/100k-ratings", split="train")

# Project every example down to the three features used in this notebook;
# all other features of the example are dropped.
ratings = ratings.map(
    lambda example: {
        feature: example[feature]
        for feature in ("movie_title", "user_id", "user_rating")
    }
)

Downloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 32.41 MiB, total: 37.10 MiB) to /root/tensorflow_datasets/movielens/100k-ratings/0.1.1...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/100000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/movielens/100k-ratings/incomplete.BTYPNT_0.1.1/movielens-train.tfrecord*..…

Dataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-ratings/0.1.1. Subsequent calls will reuse this data.


In [5]:
import tensorflow as tf
import numpy as np

# Seed TensorFlow globally, then shuffle once with an explicit seed.
# reshuffle_each_iteration=False keeps the shuffled order fixed across
# iterations, so the take/skip slices below do not overlap.
tf.random.set_seed(524)
shuffled = ratings.shuffle(100_000, seed=524, reshuffle_each_iteration=False)

# First 80,000 shuffled examples for training, the next 20,000 for testing.
train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

# Batched views over a single feature each, used to build the vocabularies.
movie_titles = ratings.map(lambda x: x["movie_title"]).batch(100_000)
user_ids = ratings.map(lambda x: x["user_id"]).batch(100_000)

# Materialise the batches as NumPy arrays and de-duplicate them.
unique_movie_titles = np.unique(
    np.concatenate([title_batch.numpy() for title_batch in movie_titles])
)
unique_user_ids = np.unique(
    np.concatenate([id_batch.numpy() for id_batch in user_ids])
)

In [49]:
# Width of both the user and the movie embedding vectors.
embedding_dimension = 32

# User tower: map the raw user-id string to an integer index, then to a
# learned embedding. The "+ 1" leaves room for the out-of-vocabulary index
# that StringLookup adds on top of the supplied vocabulary.
user_embeddings = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None),
    tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension),
])

# Movie tower: same construction, keyed on the movie title.
movie_embeddings = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=unique_movie_titles, mask_token=None),
    tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension),
])

# Rating head: a small MLP that turns the concatenated embeddings into a
# single predicted rating.
# NOTE: deliberately NOT named `ratings` -- that name is the tf.data.Dataset
# loaded in the first cell, and rebinding it to a model breaks any re-run of
# the shuffle/split cell.
rating_model = tf.keras.Sequential([
    # Learn multiple dense layers.
    tf.keras.layers.Dense(256, activation="relu"),
    tf.keras.layers.Dense(64, activation="relu"),
    # Make rating predictions in the final layer.
    tf.keras.layers.Dense(1),
])

# Smoke test: see what rating the (still untrained) model outputs for
# user "42" and "One Flew Over the Cuckoo's Nest (1975)".
user_input = tf.constant(["42"])
movie_input = tf.constant(["One Flew Over the Cuckoo's Nest (1975)"])
user_embedding = user_embeddings(user_input)
movie_embedding = movie_embeddings(movie_input)

# Concatenate the user and movie embeddings into a single vector per example
# and pass it through the rating model. As the last expression of the cell,
# the prediction is displayed as the cell output.
rating_model(tf.concat([user_embedding, movie_embedding], axis=1))



<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.0215638]], dtype=float32)>