In [None]:
!pip install -q tensorflow-recommenders

from typing import Dict, Text
 
import numpy as np
import pandas as pd
import tensorflow as tf
 
import tensorflow_recommenders as tfrs 

In [2]:
import pandas as pd
 
# load the rating data
 
columns = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv('../../datasets/recommenders/movielens/ml-100k/u.data', sep='\t', names=columns)
ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
columns = ['item_id', 'movie title', 'release date', 'video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
          'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
          'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
 
movies = pd.read_csv('../../datasets/recommenders/movielens/ml-100k/u.item', sep='|', names=columns, encoding='latin-1')
movies = movies[['item_id', 'movie title']]
movies.head()

Unnamed: 0,item_id,movie title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [4]:
# join the ratings with the movies
 
ratings = pd.merge(ratings, movies, on='item_id')
 
 
# keep only moviews with a rating greater than 3
 
ratings = ratings[ratings.rating>3]
 
 
# keep only the user id and the movie title columns
 
ratings = ratings[['movie title', 'user_id']].reset_index(drop=True)
 
ratings

Unnamed: 0,movie title,user_id
0,Kolya (1996),226
1,Kolya (1996),306
2,Kolya (1996),296
3,Kolya (1996),34
4,Kolya (1996),271
...,...,...
55370,Brothers in Trouble (1995),655
55371,Everest (1998),532
55372,Everest (1998),416
55373,"Butcher Boy, The (1998)",655


In [5]:
# save to a csv file
 
ratings.to_csv('../../datasets/recommenders/movielens/ratings.csv', index=False)
movies.to_csv('../../datasets/recommenders/movielens/movies.csv', index=False)

In [7]:
# read the csv files as pandas data frames
ratings_df = pd.read_csv('../../datasets/recommenders/movielens/ratings.csv')
movies_df = pd.read_csv('../../datasets/recommenders/movielens/movies.csv')
 
 
ratings_df.rename(columns = {'movie title': 'movie_title'}, inplace=True)
movies_df.rename(columns = {'movie title': 'movie_title'},  inplace=True)

In [8]:
# convert them to tf datasets
ratings = tf.data.Dataset.from_tensor_slices(dict(ratings_df))
movies = tf.data.Dataset.from_tensor_slices(dict(movies_df))

In [9]:
# get the first rows of the movies dataset
for m in movies.take(5):
  print(m)

{'item_id': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Toy Story (1995)'>}
{'item_id': <tf.Tensor: shape=(), dtype=int64, numpy=2>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'GoldenEye (1995)'>}
{'item_id': <tf.Tensor: shape=(), dtype=int64, numpy=3>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Four Rooms (1995)'>}
{'item_id': <tf.Tensor: shape=(), dtype=int64, numpy=4>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Get Shorty (1995)'>}
{'item_id': <tf.Tensor: shape=(), dtype=int64, numpy=5>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Copycat (1995)'>}


In [10]:
# get the first rows of the ratings dataset
for r in ratings.take(5):
  print(r)

{'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Kolya (1996)'>, 'user_id': <tf.Tensor: shape=(), dtype=int64, numpy=226>}
{'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Kolya (1996)'>, 'user_id': <tf.Tensor: shape=(), dtype=int64, numpy=306>}
{'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Kolya (1996)'>, 'user_id': <tf.Tensor: shape=(), dtype=int64, numpy=296>}
{'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Kolya (1996)'>, 'user_id': <tf.Tensor: shape=(), dtype=int64, numpy=34>}
{'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Kolya (1996)'>, 'user_id': <tf.Tensor: shape=(), dtype=int64, numpy=271>}


In [11]:
# Select the basic features.
ratings = ratings.map(lambda x: {
    "movie_title": x["movie_title"],
    "user_id": x["user_id"]
})
movies = movies.map(lambda x: x["movie_title"])

In [12]:
user_ids_vocabulary = tf.keras.layers.IntegerLookup(mask_token=None)
user_ids_vocabulary.adapt(ratings.map(lambda x: x["user_id"]))
 
 
movie_titles_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
movie_titles_vocabulary.adapt(movies)

In [13]:
class MovieLensModel(tfrs.Model):
  # We derive from a custom base class to help reduce boilerplate. Under the hood,
  # these are still plain Keras Models.
 
  def __init__(
      self,
      user_model: tf.keras.Model,
      movie_model: tf.keras.Model,
      task: tfrs.tasks.Retrieval):
    super().__init__()
 
    # Set up user and movie representations.
    self.user_model = user_model
    self.movie_model = movie_model
 
    # Set up a retrieval task.
    self.task = task
 
  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    # Define how the loss is computed.
 
    user_embeddings = self.user_model(features["user_id"])
    movie_embeddings = self.movie_model(features["movie_title"])
 
    return self.task(user_embeddings, movie_embeddings)

In [14]:
# Define user and movie models.
user_model = tf.keras.Sequential([
    user_ids_vocabulary,
    tf.keras.layers.Embedding(user_ids_vocabulary.vocabulary_size(), 64)
])
movie_model = tf.keras.Sequential([
    movie_titles_vocabulary,
    tf.keras.layers.Embedding(movie_titles_vocabulary.vocabulary_size(), 64)
])
 
# Define your objectives.
task = tfrs.tasks.Retrieval(metrics=tfrs.metrics.FactorizedTopK(
    movies.batch(128).map(movie_model)
  )
)

In [15]:
# Create a retrieval model.
model = MovieLensModel(user_model, movie_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))
 
# Train for 3 epochs.
model.fit(ratings.batch(4096), epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1c8911d90a0>

In [16]:
# Use brute-force search to set up retrieval using the trained representations.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
index.index_from_dataset(
    movies.batch(100).map(lambda title: (title, model.movie_model(title))))
 
# Get some recommendations.
_, titles = index(np.array([42]))
print(f"Top 10 recommendations for user 42: {titles[0, :10]}")

Top 10 recommendations for user 42: [b'Clean Slate (1994)' b'Associate, The (1996)'
 b'Far From Home: The Adventures of Yellow Dog (1995)'
 b'Black Sheep (1996)' b'Sabrina (1995)' b'Up Close and Personal (1996)'
 b"Preacher's Wife, The (1996)" b'Paper, The (1994)'
 b'Thin Line Between Love and Hate, A (1996)'
 b'Murder in the First (1995)']


In [17]:
# get the users embeddings
users_embdeddings = user_model.weights[1].numpy()
 
# get the mapping of the user ids from the vocabulary
users_idx_name = user_ids_vocabulary.get_vocabulary()
 
# print the shape
users_embdeddings.shape

(943, 64)

In [18]:
# get the movies embeddings
movies_embdeddings = movie_model.weights[1].numpy()
 
# get the mapping of the movie tiles from the vocabulary
movie_idx_name = movie_titles_vocabulary.get_vocabulary()
 
# print the shape of the movies embeddings
movies_embdeddings.shape

(1665, 64)

In [19]:
movie_model.predict(["Star Wars (1977)"])



array([[-0.02771879,  0.41517752, -0.7183174 , -0.08569853,  0.22901823,
        -0.01375752,  0.3248306 , -0.04039951, -0.91948783,  0.33954352,
        -0.36360085,  0.573454  , -0.4454371 , -0.5330843 ,  0.59141564,
        -0.36127123,  0.10168876,  0.0200608 , -0.3292813 , -0.19542597,
         0.76961344, -0.53036   , -0.4655985 ,  0.02680063,  0.52619517,
         0.18460542, -0.43391418,  0.3998139 ,  0.5227519 , -0.45526993,
         0.1482386 ,  0.8776466 ,  0.58037966,  0.12655509,  0.23584375,
        -0.5286832 ,  0.21982975,  0.69950664,  0.19448185, -0.08116973,
         0.4344967 ,  1.0476643 , -0.7323046 , -0.09634478, -0.5594352 ,
         0.39252827, -0.31433088,  0.5441465 ,  0.6291814 , -0.11081322,
         0.02911995, -0.7902099 ,  0.16210036,  0.21492562,  0.7441211 ,
        -0.13226599,  0.15668312, -0.5504327 , -0.18645768, -0.10596377,
         0.06674239, -0.5128635 , -0.9772445 ,  0.6190326 ]],
      dtype=float32)

In [20]:
from sklearn.metrics import pairwise_distances
 
# get the cosine similarity of all pairs
movies_similarity = 1-pairwise_distances(movies_embdeddings, metric='cosine')
 
# get the upper triangle in order to take the unique pairs
movies_similarity = np.triu(movies_similarity)

In [21]:
Movie_A = np.take(movie_idx_name, np.where((movies_similarity>0.8))[0])
Movie_B = np.take(movie_idx_name, np.where((movies_similarity>0.8))[1])
 
similar_movies = pd.DataFrame({'Movie_A':Movie_A, 'Movie_B':Movie_B})
similar_movies.head(100)

Unnamed: 0,Movie_A,Movie_B
0,[UNK],[UNK]
1,Ulee's Gold (1997),Ulee's Gold (1997)
2,That Darn Cat! (1997),That Darn Cat! (1997)
3,"Substance of Fire, The (1996)","Substance of Fire, The (1996)"
4,Sliding Doors (1998),Sliding Doors (1998)
...,...,...
95,Weekend at Bernie's (1989),Weekend at Bernie's (1989)
96,"Wedding Singer, The (1998)","Wedding Singer, The (1998)"
97,"Wedding Gift, The (1994)","Wedding Gift, The (1994)"
98,Wedding Bell Blues (1996),Wedding Bell Blues (1996)


In [22]:
# get the product of users and movies embeddings
product_matrix = np.matmul(users_embdeddings, np.transpose(movies_embdeddings))
 
# get the shape of the product matrix 
product_matrix.shape

(943, 1665)

In [23]:
# score of movies for user 42
user_42_movies = product_matrix[users_idx_name.index(42),:]
 
# return the top 10 movies 
np.take(movie_idx_name, user_42_movies.argsort()[::-1])[0:10]

array(['Clean Slate (1994)', 'Associate, The (1996)',
       'Far From Home: The Adventures of Yellow Dog (1995)',
       'Black Sheep (1996)', 'Sabrina (1995)',
       'Up Close and Personal (1996)', "Preacher's Wife, The (1996)",
       'Paper, The (1994)', 'Thin Line Between Love and Hate, A (1996)',
       'Murder in the First (1995)'], dtype='<U81')

In [24]:
seen_movies = ratings_df.query('user_id==42')['movie_title'].values
 
np.setdiff1d(np.take(movie_idx_name, user_42_movies.argsort()[::-1]), seen_movies, assume_unique=True)[0:10]

array(['Far From Home: The Adventures of Yellow Dog (1995)',
       'Thin Line Between Love and Hate, A (1996)', 'Jack (1996)',
       'In the Line of Duty 2 (1987)', 'First Kid (1996)',
       'To Gillian on Her 37th Birthday (1996)', 'Congo (1995)',
       'Dirty Dancing (1987)', 'Houseguest (1994)',
       'Homeward Bound II: Lost in San Francisco (1996)'], dtype='<U81')

In [25]:
import tempfile
import os
# Export the query model.
with tempfile.TemporaryDirectory() as tmp:
  path = os.path.join(tmp, "model")
 
  # Save the index.
  tf.saved_model.save(index, path)
 
  # Load it back; can also be done in TensorFlow Serving.
  loaded = tf.saved_model.load(path)
 
  # Pass a user id in, get top predicted movie titles back.
  scores, titles = loaded([42])
 
  print(f"Recommendations: {titles[0][:10]}")



INFO:tensorflow:Assets written to: C:\Users\rbrid\AppData\Local\Temp\tmprrocy0qk\model\assets


INFO:tensorflow:Assets written to: C:\Users\rbrid\AppData\Local\Temp\tmprrocy0qk\model\assets


Recommendations: [b'Clean Slate (1994)' b'Associate, The (1996)'
 b'Far From Home: The Adventures of Yellow Dog (1995)'
 b'Black Sheep (1996)' b'Sabrina (1995)' b'Up Close and Personal (1996)'
 b"Preacher's Wife, The (1996)" b'Paper, The (1994)'
 b'Thin Line Between Love and Hate, A (1996)'
 b'Murder in the First (1995)']
