#### Imports

import required libraries

In [1]:
from typing import Dict, Text

import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs

#### Data pre-processing

In [2]:
# read and merge our data

ratings_df = pd.read_csv('../netflix-project/data/Netflix_Dataset_Rating.csv')
ratings_df.head()

Unnamed: 0,User_ID,Rating,Movie_ID
0,712664,5,3
1,1331154,4,3
2,2632461,3,3
3,44937,5,3
4,656399,4,3


In [3]:
movies_df = pd.read_csv('../netflix-project/data/Netflix_Dataset_Movie.csv')
movies_df.head()

Unnamed: 0,Movie_ID,Year,Name
0,1,2003,Dinosaur Planet
1,2,2004,Isle of Man TT 2004 Review
2,3,1997,Character
3,4,1994,Paula Abdul's Get Up & Dance
4,5,2004,The Rise and Fall of ECW


In [4]:
temp_movies_df = pd.read_csv('../netflix-project/data/Netflix_Dataset_Movie.csv')
ratings_df = ratings_df.merge(temp_movies_df, on='Movie_ID')

In [5]:
ratings_df.head()

Unnamed: 0,User_ID,Rating,Movie_ID,Year,Name
0,712664,5,3,1997,Character
1,1331154,4,3,1997,Character
2,2632461,3,3,1997,Character
3,44937,5,3,1997,Character
4,656399,4,3,1997,Character


In [6]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17337458 entries, 0 to 17337457
Data columns (total 5 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   User_ID   int64 
 1   Rating    int64 
 2   Movie_ID  int64 
 3   Year      int64 
 4   Name      object
dtypes: int64(4), object(1)
memory usage: 661.4+ MB


In [7]:
# convert 'User ID' to prepare for user embedding layer in the model

ratings_df['User_ID'] = ratings_df['User_ID'].astype('str')

ratings = tf.data.Dataset.from_tensor_slices(dict(ratings_df[['User_ID', 'Rating', 'Name']]))
movies = tf.data.Dataset.from_tensor_slices(dict(movies_df[['Name']]))

In [8]:
ratings = ratings.map(lambda x: {
    "Name": x["Name"],
    "User_ID": x["User_ID"],
    "Rating": x["Rating"]
})

movies = movies.map(lambda x: x["Name"])

In [9]:
print("Total Data: {}".format(len(ratings)))

Total Data: 17337458


In [10]:
# prep for building vocabularies and splitting data into a train and test set

tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

movie_titles = movies.batch(1_000)
user_ids = ratings.batch(1_000_000).map(lambda x: x["User_ID"])

unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

In [11]:
print('Unique movie titles: {}'.format(len(unique_movie_titles)))
print('Unique user ids: {}'.format(len(unique_user_ids)))

unique_movie_titles[:10]

Unique movie titles: 17297
Unique user ids: 143458


array([b"'Allo 'Allo!: Series 1", b"'Allo 'Allo!: Series 2",
       b"'Allo 'Allo!: Series 3", b"'N Sync: 'N the Mix",
       b"'N Sync: Live at Madison Square Garden",
       b"'N Sync: Making of the Tour", b"'N Sync: PopOdyssey Live",
       b"'N Sync: Unauthorized Biography", b"'Round Midnight",
       b'... And God Spoke'], dtype=object)

#### Model Implementation

<figure>
    <center> <img src="../netflix-project/two-tower-model.png"   style="width:500px;height:280px;" ></center>
</figure>

The focus on this model would be on two critical-parts:
* optimize for two objectives (retrieval and ranking), thus, having two losses
* share variables between tasks, allowing for transfer learning

The graph shows the architecture of the two-tower model that will be used for this project: retrieval and ranking using the dataset with ratings of movies given by the user. It is a neural network with 2 sub-models using representations for queries('User_ID') and candidates('Name') separately.

With this model, it may use representations learned from abundant task to improve its predictions on the sparse task via transfer learning.

The two-tower model will include the following:
* A user-tower: turns 'User_ID's into user-embeddings (high-dimensional vector representations)
* A movie-tower: turns movie tiles 'Name's into movie-embeddings
* Task (Rating/Ranking): MSE (Loss to predict ratings), RMSE (metrics)
* Task (Retrieval): the Retrieval task object is a wrapper that bundles together the loss function and metric computation. top-k metrics is used

Top-k metrics: given a user and a known watched movie, how highly would the model rank the true movie out of all possible movies?

The model graph also shows the score of the given query-candidate pair, which is the dot product of the output of the two towers.

The 'compute_loss' method describe how the model should be trained, since this is a multi-task model, loss weights will be combined in both tasks, and can be adjusted depending on the weights assigned.

In [12]:
class MovieModel(tfrs.models.Model):
    def __init__(self, rating_weight: float, retrieval_weight: float) -> None:
        # we take the loss weights in the constructor: this allows us to instantiate
        # several model objects with different loss weights
        super().__init__()

        embedding_dimension = 32

        # user and movie models
        self.movie_model: tf.keras.layers.Layer = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
            vocabulary=unique_movie_titles, mask_token=None
            ),
            # we add an additional embedding to account for unknown tokens
            tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
        ])
        self.user_model: tf.keras.layers.Layer = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
            vocabulary=unique_user_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
        ])

        # A small model to take in user and movie embeddings and predict ratings
        # we can make this as complicated as we want as long as we output a scalar
        # as our prediction
        self.rating_model = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(1)
        ])

        # the tasks
        self.rating_task: tf.keras.layer.Layer = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()]
        )
        self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
            candidates=movies.batch(128).map(self.movie_model)
            )
        )

        # The loss weights
        self.rating_weight = rating_weight
        self.retrieval_weight = retrieval_weight

    def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
        # we pick out the user features and pass them into the user model
        user_embeddings = self.user_model(features["User_ID"])
        # and pick out the movie features and pass them into the movie model
        movie_embeddings = self.movie_model(features["Name"])

        return(
            user_embeddings,
            movie_embeddings,
            # we apply the multi-layered rating model to a concatenation of 
            # user and movie embeddings
            self.rating_model(
            tf.concat([user_embeddings, movie_embeddings], axis=1)
            )
        )
    
    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        ratings = features.pop("Rating")
        user_embeddings, movie_embeddings, rating_predictions = self(features)

        # we compute the loss for each task
        rating_loss = self.rating_task(
            labels=ratings,
            predictions=rating_predictions
        )
        retrieval_loss = self.retrieval_task(user_embeddings, movie_embeddings)

        # and combine them using the loss weights
        return (self.rating_weight * rating_loss
                + self.retrieval_weight * retrieval_loss)

#### Fitting and Evaluating

In [13]:
# instantiate the model

model = MovieModel(rating_weight=1.0, retrieval_weight=1.0)
model.compile(optimizer=tf.keras.optimizers.legacy.Adagrad(0.1))

In [14]:
# shuffle, batch, and cache the training and evaluation data

cached_train = train.shuffle(100_000).batch(8_192).cache()
cached_test = test.batch(4_096).cache()

In [15]:
# train the model

model.fit(cached_train, epochs=3)
metrics = model.evaluate(cached_test, return_dict=True)

print(f"Retrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}")
print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}")

Epoch 1/3
Epoch 2/3
Epoch 3/3
Retrieval top-100 accuracy: 0.393
Ranking RMSE: 1.040


#### Making predictions

'tfrs.layers.factorized_top_k.BruteForce' layer will be used to make predictions.
The BruteForce layer may be slower when serving a model with many possible candidates, in which other layer may be used to speed this up, such as the 'TFRS ScaNN' layer

In [16]:
def predict_movie(user, top_n=5):
    # create a model that takes in raw query ft user
    index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

    # recommends movies out of the entire movies dataset
    index.index_from_dataset(
        tf.data.Dataset.zip((movies.batch(100), movies.batch(100).map(model.movie_model)))
    )

    # get recommendations
    _, titles = index(tf.constant([str(user)]))

    print('Top {} recommendations for user {}:\n'.format(top_n, user))
    for i, title in enumerate(titles[0, :top_n].numpy()):
        print('{}. {}'.format(i + 1, title.decode("utf-8")))

def predict_rating(user, movie):
    trained_movie_embeddings, trained_user_embeddings, predicted_rating = model({
        "User_ID": np.array([str(user)]),
        "Name": np.array([movie])
    })
    print("Predicted rating for {}: {}".format(movie, predicted_rating))

In [17]:
predict_movie(169999, 5)

Top 5 recommendations for user 169999:

1. Jade
2. Never Die Alone
3. 7 Seconds
4. The Weather Underground
5. Carandiru


In [18]:
predict_rating(169999, 'Pride and Prejudice')

Predicted rating for Pride and Prejudice: [[3.4194937]]


In [19]:
# let's look at what user 169999 rating history
# to see if they would enjoy the top 10 movie recommendations

ratings_df[ratings_df['User_ID'] == '169999']

Unnamed: 0,User_ID,Rating,Movie_ID,Year,Name
49979,169999,3,28,2002,Lilo and Stitch
108463,169999,5,30,2003,Something's Gotta Give
187719,169999,3,58,1996,Dragonheart
559965,169999,5,191,2003,X2: X-Men United
899753,169999,5,273,2004,Taxi
1036234,169999,5,299,2001,Bridget Jones's Diary
1188597,169999,5,313,2000,Pay It Forward
1323587,169999,4,331,1997,Chasing Amy
1355086,169999,4,334,2005,The Pacifier
1475628,169999,5,361,2004,The Phantom of the Opera: Special Edition
