<h1 style="font-family: Trebuchet MS; padding: 12px; font-size: 30px; color: #081c15; text-align: center; line-height: 1.25;">Recommender System<br><span style="color: #1b4332; font-size: 48px"><b>🎥Movies Recommendation📹</b></span><br><span style="color: #52b788; font-size: 20px">📀Using Deep Learning</span></h1>
<hr>

In [None]:
pip install -q tensorflow-recommenders

# Imports

In [None]:
import string
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_recommenders as tfrs
from collections import Counter
from typing import Dict, Text
from ast import literal_eval
from datetime import datetime
from wordcloud import WordCloud
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')

# Code

In [None]:
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')
movies = pd.read_csv('movies_metadata.csv').\
                     drop(['belongs_to_collection', 'homepage', 'imdb_id', 'poster_path', 'status', 'title', 'video'], axis=1).\
                     drop([19730, 29503, 35587]) # Incorrect data type

movies['id'] = movies['id'].astype('int64')

df = movies.merge(keywords, on='id').\
    merge(credits, on='id')

df['original_language'] = df['original_language'].fillna('')
df['runtime'] = df['runtime'].fillna(0)
df['tagline'] = df['tagline'].fillna('')

df.dropna(inplace=True)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df['release_date'] = pd.to_datetime(df['release_date'])
df['budget'] = df['budget'].astype('float64')
df['popularity'] = df['popularity'].astype('float64')

<a id=2 a/>
<p style="background-color:#368f8b;font-family:Trebuchet MS;font-weight:bold;color:#eff7f6;font-size:40px;text-align:center;border-radius:100px 100px">Recommender System</p>

<a id=4 a/>
<h1 style="font-family: Trebuchet MS; font-size: 25px; color: #3a5a40; text-align: left; "><b>Using Deep Learning</b></h1>

Official documentation: https://www.tensorflow.org/recommenders

Tensorflow comes with a library called TensorFlow Recommenders (TFRS) for building a recommender system. It's built on Keras and aims to have a gentle learning curve while still giving you the flexibility to build complex models.

In [15]:
ratings_df = pd.read_csv('ratings_small.csv')

ratings_df['date'] = ratings_df['timestamp'].apply(lambda x: datetime.fromtimestamp(x))
ratings_df.drop('timestamp', axis=1, inplace=True)

ratings_df = ratings_df.merge(df[['id', 'original_title', 'genres', 'overview']], left_on='movieId',right_on='id', how='left')
ratings_df = ratings_df[~ratings_df['id'].isna()]
ratings_df.drop('id', axis=1, inplace=True)
ratings_df.reset_index(drop=True, inplace=True)

ratings_df.head()

Unnamed: 0,userId,movieId,rating,date,original_title,genres,overview
0,1,1371,2.5,2009-12-13 20:52:15,Rocky III,"[{'id': 18, 'name': 'Drama'}]","Now the world champion, Rocky Balboa is living..."
1,1,1405,1.0,2009-12-13 20:53:23,Greed,"[{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...",Greed is the classic 1924 silent film by Erich...
2,1,2105,4.0,2009-12-13 20:52:19,American Pie,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...","At a high-school party, four friends find that..."
3,1,2193,2.0,2009-12-13 20:53:18,My Tutor,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",High school senior Bobby Chrystal fails his Fr...
4,1,2294,2.0,2009-12-13 20:51:48,Jay and Silent Bob Strike Back,"[{'id': 35, 'name': 'Comedy'}]",When Jay and Silent Bob learn that their comic...


In [16]:
movies_df = df[['id', 'original_title']]
movies_df.rename(columns={'id':'movieId'}, inplace=True)
movies_df.head()

Unnamed: 0,movieId,original_title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
3,31357,Waiting to Exhale
4,11862,Father of the Bride Part II


In [17]:
ratings_df['userId'] = ratings_df['userId'].astype(str)

ratings = tf.data.Dataset.from_tensor_slices(dict(ratings_df[['userId', 'original_title', 'rating']]))
movies = tf.data.Dataset.from_tensor_slices(dict(movies_df[['original_title']]))

ratings = ratings.map(lambda x: {
    "original_title": x["original_title"],
    "userId": x["userId"],
    "rating": float(x["rating"])
})

movies = movies.map(lambda x: x["original_title"])

In [18]:
print('Total Data: {}'.format(len(ratings)))

tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = ratings.take(35_000)
test = ratings.skip(35_000).take(8_188)

Total Data: 44875


In [19]:
movie_titles = movies.batch(1_000)
user_ids = ratings.batch(1_000).map(lambda x: x["userId"])

unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

print('Unique Movies: {}'.format(len(unique_movie_titles)))
print('Unique users: {}'.format(len(unique_user_ids)))

Unique Movies: 42373
Unique users: 671


In [20]:
class MovieModel(tfrs.models.Model):

    def __init__(self, rating_weight: float, retrieval_weight: float) -> None:
        # We take the loss weights in the constructor: this allows us to instantiate
        # several model objects with different loss weights.

        super().__init__()

        embedding_dimension = 64

        # User and movie models.
        self.movie_model: tf.keras.layers.Layer = tf.keras.Sequential([
          tf.keras.layers.StringLookup(
            vocabulary=unique_movie_titles, mask_token=None),
          tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
        ])
        self.user_model: tf.keras.layers.Layer = tf.keras.Sequential([
          tf.keras.layers.StringLookup(
            vocabulary=unique_user_ids, mask_token=None),
          tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
        ])

        # A small model to take in user and movie embeddings and predict ratings.
        # We can make this as complicated as we want as long as we output a scalar
        # as our prediction.
        self.rating_model = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(1),
        ])

        # The tasks.
        self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()],
        )
        self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=movies.batch(128).map(self.movie_model)
            )
        )

        # The loss weights.
        self.rating_weight = rating_weight
        self.retrieval_weight = retrieval_weight

    def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
        # We pick out the user features and pass them into the user model.
        user_embeddings = self.user_model(features["userId"])
        # And pick out the movie features and pass them into the movie model.
        movie_embeddings = self.movie_model(features["original_title"])

        return (
            user_embeddings,
            movie_embeddings,
            # We apply the multi-layered rating model to a concatentation of
            # user and movie embeddings.
            self.rating_model(
                tf.concat([user_embeddings, movie_embeddings], axis=1)
            ),
        )


    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        ratings = features.pop("rating")

        user_embeddings, movie_embeddings, rating_predictions = self(features)

        # We compute the loss for each task.
        rating_loss = self.rating_task(
            labels=ratings,
            predictions=rating_predictions,
        )
        retrieval_loss = self.retrieval_task(user_embeddings, movie_embeddings)

        # And combine them using the loss weights.
        return (self.rating_weight * rating_loss + self.retrieval_weight * retrieval_loss)

In [21]:
model = MovieModel(rating_weight=1.0, retrieval_weight=1.0)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

cached_train = train.shuffle(100_000).batch(1_000).cache()
cached_test = test.batch(1_000).cache()

model.fit(cached_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x2d776a37280>

In [22]:
metrics = model.evaluate(cached_test, return_dict=True)

print(f"\nRetrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}")
print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}")


Retrieval top-100 accuracy: 0.050
Ranking RMSE: 1.050


In [23]:
def predict_movie(user, top_n=3):
    # Create a model that takes in raw query features, and
    index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
    # recommends movies out of the entire movies dataset.
    index.index_from_dataset(
      tf.data.Dataset.zip((movies.batch(100), movies.batch(100).map(model.movie_model)))
    )

    # Get recommendations.
    _, titles = index(tf.constant([str(user)]))
    
    print('Top {} recommendations for user {}:\n'.format(top_n, user))
    for i, title in enumerate(titles[0, :top_n].numpy()):
        print('{}. {}'.format(i+1, title.decode("utf-8")))

def predict_rating(user, movie):
    trained_movie_embeddings, trained_user_embeddings, predicted_rating = model({
          "userId": np.array([str(user)]),
          "original_title": np.array([movie])
      })
    print("Predicted rating for {}: {}".format(movie, predicted_rating.numpy()[0][0]))

# Prediction

### UserId: 567

In [29]:
predict_movie(567, 10)

Top 10 recommendations for user 567:

1. What's New Pussycat?
2. The Last Samurai
3. Cat on a Hot Tin Roof
4. Don Q Son of Zorro
5. The Killing
6. Infinity
7. Gyakufunsha kazoku
8. The In-Laws
9. The In-Laws
10. Grizzly Man


In [28]:
predict_rating(567,'Minions')

Predicted rating for Minions: 2.9398865699768066


So, le't examine **User 123** from historical data

In [30]:
ratings_df[ratings_df['userId'] == '567']

Unnamed: 0,userId,movieId,rating,date,original_title,genres,overview
38065,567,260,3.5,2015-07-13 15:35:18,The 39 Steps,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...","While on vacation in London, Canadian Richard ..."
38066,567,318,5.0,2015-07-13 15:35:11,The Million Dollar Hotel,"[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",The Million Dollar Hotel starts with a jump fr...
38067,567,593,4.0,2015-07-13 15:36:09,Солярис,"[{'id': 18, 'name': 'Drama'}, {'id': 878, 'nam...",Ground control has been receiving strange tran...
38068,567,750,4.0,2015-07-13 15:36:35,Murder She Said,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",Miss Marple believes she’s seen a murder in a ...
38069,567,858,5.0,2015-07-13 15:34:48,Sleepless in Seattle,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",A young boy who tries to set his dad up on a d...
38070,567,924,4.0,2015-07-13 15:38:51,Dawn of the Dead,"[{'id': 14, 'name': 'Fantasy'}, {'id': 27, 'na...",A group of surviving people take refuge in a s...
38071,567,1089,5.0,2015-07-13 15:36:18,Point Break,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...","In the coastal town of Los Angeles, a gang of ..."
38072,567,1213,4.5,2015-07-13 15:36:14,The Talented Mr. Ripley,"[{'id': 53, 'name': 'Thriller'}, {'id': 80, 'n...",Tom Ripley is a calculating young man who beli...
38073,567,1265,5.0,2015-07-13 15:37:30,Bridge to Terabithia,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...",Jesse Aarons trained all summer to become the ...
38074,567,1732,5.0,2015-07-13 15:38:40,The Prisoner of Zenda,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",Anthony Hope's classic tale gets a decidedly '...


In [32]:
# Get meta data for predicted movie
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
# recommends movies out of the entire movies dataset.
index.index_from_dataset(
  tf.data.Dataset.zip((movies.batch(100), movies.batch(100).map(model.movie_model)))
)

# Get recommendations.
_, titles = index(tf.constant(['567']))
pred_movies = pd.DataFrame({'original_title': [i.decode('utf-8') for i in titles[0,:5].numpy()]})

pred_df = pred_movies.merge(ratings_df[['original_title', 'genres', 'overview']], on='original_title', how='left')
pred_df = pred_df[~pred_df['original_title'].duplicated()]
pred_df.reset_index(drop=True, inplace=True)
pred_df.index = np.arange(1, len(pred_df)+1)

pred_df

Unnamed: 0,original_title,genres,overview
1,What's New Pussycat?,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",A playboy who refuses to give up his hedonisti...
2,The Last Samurai,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",Nathan Algren is an American hired to instruct...
3,Cat on a Hot Tin Roof,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...","Brick, an alcoholic ex-football player, drinks..."
4,Don Q Son of Zorro,"[{'id': 37, 'name': 'Western'}, {'id': 12, 'na...",Returning to the legend that inspired his firs...
5,The Killing,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",The Killing was Stanley Kubrick’s first film w...


At a glance, we can see if **User 567** love watching Drama movies most of the time. He/She also gives a good rating for that genre. In our recommendation, We give 5 more Drama movies that we expect him/her to love the movies in a similar way with the previous watched movies.

In our dataset, we don't see any Animation movies that have been watched by **User 567**. So, it's not a surprise if the estimated rating for Minions is quite low

### User: 99

In [34]:
predict_movie(99, 10)

Top 10 recommendations for user 99:

1. Bonjour Tristesse
2. Shall We Dance?
3. Double Dragon
4. Dracula vs. Frankenstein
5. The Final Cut
6. The Final Cut
7. Crank
8. 밀양
9. Zodiac
10. Zodiac


In [35]:
predict_rating(99,'Toy Story')

Predicted rating for Toy Story: 2.889596462249756


**Lets examine User:99 from historical data**

In [36]:
ratings_df[ratings_df['userId'] == '99']

Unnamed: 0,userId,movieId,rating,date,original_title,genres,overview
6589,99,2,2.0,2000-02-19 16:39:18,Ariel,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",Taisto Kasurinen is a Finnish coal miner whose...
6590,99,17,3.0,1999-09-29 00:36:39,The Dark,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",Adèle and her daughter Sarah are traveling on ...
6591,99,28,3.0,1999-09-29 00:36:39,Apocalypse Now,"[{'id': 18, 'name': 'Drama'}, {'id': 10752, 'n...","At the height of the Vietnam war, Captain Benj..."
6592,99,105,3.0,1999-09-29 00:43:27,Back to the Future,"[{'id': 12, 'name': 'Adventure'}, {'id': 35, '...",Eighties teenager Marty McFly is accidentally ...
6593,99,110,3.0,2000-01-10 09:51:40,Trois couleurs : Rouge,"[{'id': 18, 'name': 'Drama'}, {'id': 9648, 'na...",Red This is the third film from the trilogy by...
...,...,...,...,...,...,...,...
6689,99,4641,5.0,2002-03-01 20:57:04,Read It and Weep,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",A young girl turns into an A-List celebrity ov...
6690,99,4973,5.0,2002-03-01 20:56:51,Sous le Sable,"[{'id': 18, 'name': 'Drama'}, {'id': 9648, 'na...","When her husband goes missing at the beach, a ..."
6691,99,4993,5.0,2003-02-09 04:35:19,5 Card Stud,"[{'id': 28, 'name': 'Action'}, {'id': 37, 'nam...",The players in an ongoing poker game are being...
6692,99,4995,5.0,2002-03-01 20:54:22,Boogie Nights,"[{'id': 18, 'name': 'Drama'}]","Set in 1977, back when sex was safe, pleasure ..."


In [37]:
# Get meta data for predicted movie
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
# recommends movies out of the entire movies dataset.
index.index_from_dataset(
  tf.data.Dataset.zip((movies.batch(100), movies.batch(100).map(model.movie_model)))
)

# Get recommendations.
_, titles = index(tf.constant(['99']))
pred_movies = pd.DataFrame({'original_title': [i.decode('utf-8') for i in titles[0,:5].numpy()]})

pred_df = pred_movies.merge(ratings_df[['original_title', 'genres', 'overview']], on='original_title', how='left')
pred_df = pred_df[~pred_df['original_title'].duplicated()]
pred_df.reset_index(drop=True, inplace=True)
pred_df.index = np.arange(1, len(pred_df)+1)

pred_df

Unnamed: 0,original_title,genres,overview
1,Bonjour Tristesse,"[{'id': 18, 'name': 'Drama'}]","Cecile, decadent young girl who lives with her..."
2,Shall We Dance?,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...","Upon first sight of a beautiful instructor, a ..."
3,Double Dragon,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",Set 15 years in the future in post-earthquake ...
4,Dracula vs. Frankenstein,"[{'id': 27, 'name': 'Horror'}, {'id': 878, 'na...",Dracula conspires with a mad doctor to resurre...
5,The Final Cut,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",Mysterious bomber is planting explosive device...


At a glance, we can see if **User 99** love watching Drama, Action genre movies most of the time. He/She also gives a good rating for that genre. In our recommendation, We give 5 more Drama movies that we expect him/her to love the movies in a similar way with the previous watched movies.

In our dataset, we don't see any Animation movies that have been watched by **User 99**. So, it's not a surprise if the estimated rating for Toy Story is quite low

<h1 style="font-family: Trebuchet MS; font-size: 60px; color: #1b4332; text-align: center;"><b>THE END</b></h1>