In [2]:
import pandas as pd
pd.set_option('display.max_columns', 20)

movie = pd.read_csv('../input/movielens-20m-dataset/movie.csv')
rating = pd.read_csv('../input/movielens-20m-dataset/rating.csv')
df = movie.merge(rating, how="left", on="movieId")
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11.0,4.5,2009-01-02 01:13:41


In [3]:
print(df.shape)

df = df.sample(frac=0.1, random_state=1)

(20000797, 6)


In [4]:
df.shape

(2000080, 6)

## Clean up unpopular movie 

In [5]:
comment_counts = pd.DataFrame(df["title"].value_counts())
comment_counts

Unnamed: 0,title
Pulp Fiction (1994),6844
Forrest Gump (1994),6694
"Silence of the Lambs, The (1991)",6267
"Shawshank Redemption, The (1994)",6237
Jurassic Park (1993),5961
...,...
Camille (2007),1
Madonna's Pig (2011),1
W Delta Z (a.k.a. The Killing Gene) (2007),1
Tungsten (2011),1


In [6]:
len(comment_counts[comment_counts['title'] == 1]), len(comment_counts[comment_counts['title'] > 1]), len(comment_counts[comment_counts['title'] > 100])

(3680, 14006, 3138)

In [7]:
rare_movies = comment_counts[comment_counts["title"] <= 100].index

common_movies = df[~df["title"].isin(rare_movies)]
common_movies.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
9758582,2099,Song of the South (1946),Adventure|Animation|Children|Musical,37530.0,2.0,2000-11-20 00:40:34
17191398,7701,Look Who's Talking Too (1990),Comedy|Romance,104380.0,2.5,2005-11-27 21:52:18
7711912,1387,Jaws (1975),Action|Horror,118032.0,3.5,2005-05-09 21:46:26
18447286,47629,"Queen, The (2006)",Drama,5329.0,3.5,2011-08-17 22:03:14
1296908,185,"Net, The (1995)",Action|Crime|Thriller,1165.0,3.0,2006-04-11 17:44:46


In [8]:
common_movies.shape

(1775729, 6)

In [9]:
common_movies["title"].nunique()

3138

In [10]:
 user_movie_df = common_movies.pivot_table(index=["userId"], columns=["title"], values="rating")

In [11]:
user_movie_df.head(20)

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zelig (1983),Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),[REC] (2007),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,,,,,,,,,...,,,,,,,,,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,,,,,,,,,,
4.0,,,,,,,,,,,...,,,,,,,,,,
5.0,,,,,,,,,,,...,,,,,,,,,,
6.0,,,,,,,,,,,...,,,,,,,,,,
7.0,,,,,,,,,,,...,,,,,,,,,,
8.0,,,,,,,,,,,...,,,,,,,,,,
9.0,,,,,,,,,,,...,,,,,,,,,,
10.0,,,,,,,,,,,...,,,,,,,,,,


# Memory based collaborative filtering 
Use correlation to calculate similarity. We can also use k-nearest neighbor [here](https://www.kaggle.com/code/sns5154/movie-recommendation-system-nearest-neighbors/notebook)


## Item based recommendation 

In [12]:
def item_based_recommender(movie_name, user_movie_df):
    movie_name = user_movie_df[movie_name]
    return user_movie_df.corrwith(movie_name).sort_values(ascending=False).head(10)

In [13]:
dfr = item_based_recommender("Matrix, The (1999)", user_movie_df).to_frame()
dfr.columns=["Corr"]
dfr.head()

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


Unnamed: 0_level_0,Corr
title,Unnamed: 1_level_1
"Preacher's Wife, The (1996)",1.0
Getting Even with Dad (1994),1.0
That Darn Cat! (1965),1.0
"House of the Spirits, The (1993)",1.0
"Matrix, The (1999)",1.0


## User-user based

In [14]:
random_user = int(pd.Series(user_movie_df.index).sample(1, random_state=45).values)
random_user_df = user_movie_df[user_movie_df.index == random_user]
random_user_df.head()

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zelig (1983),Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),[REC] (2007),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
70636.0,,,,,,,,,,,...,,,,,,,,,,


In [15]:
#moive that a user watched
movies_watched = random_user_df.columns[random_user_df.notna().any()].tolist()
movies_watched

['Addicted to Love (1997)',
 'Airplane! (1980)',
 'Before Sunrise (1995)',
 'Capturing the Friedmans (2003)',
 'Donnie Brasco (1997)',
 "Ferris Bueller's Day Off (1986)",
 'Fight Club (1999)',
 'Firm, The (1993)',
 'Gods Must Be Crazy, The (1980)',
 'Good Girl, The (2002)',
 'Hate (Haine, La) (1995)',
 'I Shot Andy Warhol (1996)',
 'In & Out (1997)',
 'Kicking and Screaming (1995)',
 'Lady and the Tramp (1955)',
 'Mighty Wind, A (2003)',
 'Muse, The (1999)',
 'Notting Hill (1999)',
 'Pretty in Pink (1986)',
 'Pulp Fiction (1994)',
 'Repo Man (1984)',
 'Royal Tenenbaums, The (2001)',
 'Seven (a.k.a. Se7en) (1995)',
 'Snow White and the Seven Dwarfs (1937)',
 'South Park: Bigger, Longer and Uncut (1999)',
 'Spice World (1997)',
 'Weird Science (1985)',
 'Women on the Verge of a Nervous Breakdown (Mujeres al borde de un ataque de nervios) (1988)']

In [16]:
#other users who also watch the movies
movies_watched_df = user_movie_df[movies_watched]
#count the watches of selected movies per user
user_movie_count = movies_watched_df.T.notnull().sum() 
user_movie_count = user_movie_count.reset_index()

user_movie_count.columns = ["userId", "movie_count"]
#User Id's of users watched 10 or more movies in common are listed below.
users_same_movies = user_movie_count[user_movie_count["movie_count"] > 5]["userId"]
users_same_movies.head(10)

27117      27749.0
67807      69378.0
69028      70636.0
69273      70883.0
90773      92919.0
93826      96024.0
101605    103986.0
133128    136293.0
Name: userId, dtype: float64

In [17]:
final_df = pd.concat([movies_watched_df[movies_watched_df.index.isin(users_same_movies)],
                      random_user_df[movies_watched]])
final_df.head()

title,Addicted to Love (1997),Airplane! (1980),Before Sunrise (1995),Capturing the Friedmans (2003),Donnie Brasco (1997),Ferris Bueller's Day Off (1986),Fight Club (1999),"Firm, The (1993)","Gods Must Be Crazy, The (1980)","Good Girl, The (2002)",...,Pretty in Pink (1986),Pulp Fiction (1994),Repo Man (1984),"Royal Tenenbaums, The (2001)",Seven (a.k.a. Se7en) (1995),Snow White and the Seven Dwarfs (1937),"South Park: Bigger, Longer and Uncut (1999)",Spice World (1997),Weird Science (1985),Women on the Verge of a Nervous Breakdown (Mujeres al borde de un ataque de nervios) (1988)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27749.0,,,,,,4.0,,3.0,,,...,,,5.0,,,,,,2.0,
69378.0,,3.5,,4.0,,5.0,,3.5,3.5,,...,,,,,,,,,,
70636.0,3.0,5.0,4.0,4.5,4.0,4.0,4.0,3.0,3.0,4.0,...,5.0,5.0,4.0,4.0,4.0,3.0,3.0,3.0,3.0,5.0
70883.0,,,,,,3.5,,,3.0,3.0,...,,5.0,,3.5,2.5,,,,,
92919.0,,4.0,,,,,,,4.0,,...,,4.0,,,,,,,,4.0


In [18]:
corr_df = final_df.T.corr().unstack().sort_values(ascending =False).drop_duplicates()
corr_df.head(10)

userId    userId  
27749.0   27749.0     1.000000
69378.0   27749.0     0.866025
70883.0   70636.0     0.734388
96024.0   70883.0     0.596040
103986.0  70636.0     0.485071
          70636.0     0.485071
70636.0   96024.0     0.402911
          96024.0     0.402911
          136293.0    0.196147
          136293.0    0.196147
dtype: float64

In [19]:
corr_df = pd.DataFrame(corr_df, columns=["corr"])
corr_df.index.names = ['user_id_1', 'user_id_2']
corr_df = corr_df.reset_index()

corr_df

Unnamed: 0,user_id_1,user_id_2,corr
0,27749.0,27749.0,1.0
1,69378.0,27749.0,0.866025
2,70883.0,70636.0,0.734388
3,96024.0,70883.0,0.59604
4,103986.0,70636.0,0.485071
5,103986.0,70636.0,0.485071
6,70636.0,96024.0,0.402911
7,70636.0,96024.0,0.402911
8,70636.0,136293.0,0.196147
9,70636.0,136293.0,0.196147


In [20]:
#Here, top_users who have 65 percent or higher correlation with our random user.
corr_df = corr_df[corr_df["corr"] >= 0.65]
top_users = corr_df[(corr_df["user_id_1"] == random_user)|(corr_df["user_id_2"] == random_user)]
top_user = top_users.copy()
top_user.rename(columns={"user_id_1": "userId"}, inplace=True)
top_user
# top_users = top_users.sort_values(by='corr', ascending=False)
# top_users.head()

Unnamed: 0,userId,user_id_2,corr
2,70883.0,70636.0,0.734388


In [21]:
top_users_ratings = top_user.merge(rating[["userId", "movieId", "rating"]], how='inner')
top_users_ratings

Unnamed: 0,userId,user_id_2,corr,movieId,rating
0,70883.0,70636.0,0.734388,6,3.5
1,70883.0,70636.0,0.734388,10,3.0
2,70883.0,70636.0,0.734388,12,1.5
3,70883.0,70636.0,0.734388,21,3.0
4,70883.0,70636.0,0.734388,25,3.0
...,...,...,...,...,...
1223,70883.0,70636.0,0.734388,112290,3.5
1224,70883.0,70636.0,0.734388,112556,4.0
1225,70883.0,70636.0,0.734388,112852,5.0
1226,70883.0,70636.0,0.734388,115569,4.0


In [22]:
top_users_ratings['weighted_rating'] = top_users_ratings['corr'] * top_users_ratings['rating']
top_users_ratings.sort_values(by='rating', ascending=False)

Unnamed: 0,userId,user_id_2,corr,movieId,rating,weighted_rating
455,70883.0,70636.0,0.734388,2716,5.0,3.671940
584,70883.0,70636.0,0.734388,3481,5.0,3.671940
468,70883.0,70636.0,0.734388,2791,5.0,3.671940
1219,70883.0,70636.0,0.734388,93840,5.0,3.671940
108,70883.0,70636.0,0.734388,735,5.0,3.671940
...,...,...,...,...,...,...
1065,70883.0,70636.0,0.734388,7016,0.5,0.367194
856,70883.0,70636.0,0.734388,4934,0.5,0.367194
688,70883.0,70636.0,0.734388,3997,0.5,0.367194
950,70883.0,70636.0,0.734388,5673,0.5,0.367194


In [23]:
#only need to group when multiple top users exist
recommendation_df = top_users_ratings.groupby('movieId').agg({"weighted_rating": "mean"})

recommendation_df = recommendation_df.reset_index()
recommendation_df

Unnamed: 0,movieId,weighted_rating
0,6,2.570358
1,10,2.203164
2,12,1.101582
3,21,2.203164
4,25,2.203164
...,...,...
1223,112290,2.570358
1224,112556,2.937552
1225,112852,3.671940
1226,115569,2.937552


In [24]:
movies_to_be_recommend = recommendation_df[recommendation_df["weighted_rating"] > 3.6].sort_values("weighted_rating", ascending=False)

In [25]:
movies_to_be_recommend

Unnamed: 0,movieId,weighted_rating
44,296,3.67194
1120,8622,3.67194
610,3671,3.67194
630,3727,3.67194
698,4027,3.67194
949,5669,3.67194
1062,7000,3.67194
1119,8528,3.67194
1121,8636,3.67194
48,327,3.67194


## Model based approach

The steps in the model are as follows:

1. Map user ID to a "user vector" via an embedding matrix
2. Map movie ID to a "movie vector" via an embedding matrix
3. Compute the dot product between the user vector and movie vector, to obtain the a match score between the user and the movie (predicted rating).
4. Train the embeddings via gradient descent using all known user-movie pairs.

In [26]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [27]:
movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [28]:
# Map user ID to a "user vector" via an embedding matrix
# make it start from 0 - n
user_ids = rating["userId"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}

# Map movies ID to a "movies vector" via an embedding matrix
# make it start from 0 - n
movie_ids = rating["movieId"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}


rating["user"] = rating["userId"].map(user2user_encoded)
rating["movie"] = rating["movieId"].map(movie2movie_encoded)

num_users = len(user2user_encoded)
num_movies = len(movie_encoded2movie)
rating['rating'] = rating['rating'].values.astype(np.float32)

# min and max ratings will be used to normalize the ratings later
min_rating = min(df["rating"])
max_rating = max(df["rating"])

print(f"Number of users: {num_users}, Number of Movies: {num_movies}, Min Rating: {min_rating}, Max Rating: {max_rating}")

Number of users: 138493, Number of Movies: 26744, Min Rating: 0.5, Max Rating: 5.0


In [29]:
rating = rating.sample(frac=1, random_state=42)
x = rating[["user", "movie"]].values

# Normalizing the targets between 0 and 1. Makes it easy to train.
y = rating["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values

# Assuming training on 90% of the data and validating on 100%
train_indices = int(0.9 * df.shape[0])
x_train, x_val, y_train, y_val = (
    x[:train_indices],
    x[train_indices:],
    y[:train_indices],
    y[train_indices:],
)
#x_train = (userid, movieid)

### Create the model

We embed both users and movies in to 50-dimentional vectors. The model computes a match score between user and movie embeddings via a dot product, and adds a per-movie and per-user bias. The match score is scalled to the[0, 1] interval via a sigmoid (since our ratings are normalized to this range).

`layers.Embedding` = Turns positive integers (indexes) into dense vectors of fixed size. It creates an architecture, so we will need to feed in the input to it. 

Example 
```
import tensorflow as tf

samples = 2
texts = tf.random.uniform((samples, 10), maxval=30, dtype=tf.int32)

embedding_layer = tf.keras.layers.Embedding(30, 7, input_length=10)
print(embedding_layer(texts))
```

Output 
```
tf.Tensor(
[[[ 0.0225671   0.02347589  0.00979777  0.00041901 -0.00628462
    0.02810872 -0.00962182]
  [-0.00848696 -0.04342243 -0.02836052 -0.00517335 -0.0061365
   -0.03012114  0.01677728]
  [ 0.03311044  0.00556745 -0.00702027  0.03381392 -0.04623893
    0.04987461 -0.04816799]
  [-0.03521906  0.0379228   0.03005264 -0.0020758  -0.0384485
    0.04822161 -0.02092661]....
```

Example 

In [30]:
embedding_size = 50
embedding = layers.Embedding(
            num_users,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )

In [31]:
print(embedding(x_train))

tf.Tensor(
[[[-3.9663203e-03 -3.9475187e-04 -2.1161628e-03 ... -3.7183762e-03
   -1.1387502e-04  8.8707916e-04]
  [ 7.5619710e-03  1.5741457e-03  3.8471716e-03 ... -2.3069577e-03
    2.9086631e-03 -3.3338004e-04]]

 [[-1.3696038e-03  1.7134460e-03  5.3330581e-03 ...  8.2881106e-03
    1.3382006e-03 -6.6395109e-03]
  [ 2.1122654e-03 -6.9354088e-03 -4.7350437e-03 ...  7.3360879e-04
    5.9476406e-06  2.0184552e-03]]

 [[-4.6522924e-04  2.9906589e-03  4.8041501e-04 ... -1.1193336e-03
    1.6638360e-03 -8.8621659e-04]
  [-4.4109845e-03 -3.6652136e-04  5.7656672e-03 ... -2.7591006e-03
   -2.8169234e-03 -3.0303893e-03]]

 ...

 [[-3.2612819e-03 -1.0861302e-03 -3.3330009e-03 ...  2.5685932e-04
    3.0494316e-03 -2.8580928e-03]
  [-6.4935591e-03  2.1818187e-03 -2.3694683e-03 ... -1.3059817e-03
   -5.3405594e-03 -2.9090422e-03]]

 [[ 1.6877138e-03 -5.9112313e-04  4.2079836e-03 ... -5.4111995e-04
   -1.4835803e-04 -3.1129045e-03]
  [ 3.3400070e-03  5.3883642e-03  4.9302154e-03 ...  5.6519378e-03

In [32]:
EMBEDDING_SIZE = 50

class RecommenderNet(keras.Model):
    def __init__(self, num_users, num_movies, embedding_size, **kwargs):
        super(RecommenderNet, self).__init__(**kwargs)
        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size
        self.user_embedding = layers.Embedding(
            num_users, #input dimension
            embedding_size, #output dimension
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.user_bias = layers.Embedding(num_users, 1)
        self.movie_embedding = layers.Embedding(
            num_movies,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6)
        )
        self.movie_bias = layers.Embedding(num_movies, 1)
        
    def call(self, inputs):
        user_vector = self.user_embedding(inputs[:, 0]) #all userid
        user_bias = self.user_bias(inputs[:, 0]) 
        movie_vector = self.movie_embedding(inputs[:, 1]) #all movieid
        movie_bias = self.movie_bias(inputs[:, 1])
        dot_user_movie = tf.tensordot(user_vector, movie_vector, 2)
        # Add all the components (including bias)
        x = dot_user_movie + user_bias + movie_bias
        # The sigmoid activation forces the rating to be between 0 and 11
        return tf.nn.sigmoid(x)
    
model = RecommenderNet(num_users, num_movies, EMBEDDING_SIZE)
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(), optimizer=keras.optimizers.Adam(lr=0.001)
)

Explain `inputs`

In this code, the input is not explicitly specified. However, it is assumed that the input data x_train and x_val are a numpy array of pairs of user and movie IDs, where each row represents a user-movie rating.

During the training phase, the model.fit() method will use these input data pairs to train the model to predict the corresponding rating for each user-movie pair. The model will try to minimize the binary cross-entropy loss between the predicted rating and the true rating (y_train and y_val).

Resource: 
[Customize tensorflow layers](https://studymachinelearning.com/tensorflow-prepare-custom-neural-network-model-with-custom-layers/)

In [33]:
x_train, x_train[:, 0]

(array([[122269,   1154],
        [ 49017,      2],
        [ 89526,   2662],
        ...,
        [ 47815,   1364],
        [ 14183,    589],
        [ 17725,   2504]]),
 array([122269,  49017,  89526, ...,  47815,  14183,  17725]))

In [34]:
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=256,
    epochs=2,
    validation_data=(x_val, y_val)
)

Epoch 1/2

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.title('model_loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend()

### Top 10 movie recs for a user 

In [None]:
movie_df = pd.read_csv(movielens_dir / 'movies.csv')

user_id = df.userId.sample(1).iloc[0]
movies_watched_by_user = df[df.userId == user_id]
movies_not_watched = movie_df[~movie_df['movieId'].isin(movies_watched_by_user.movieId.values)]['movieId']

movies_not_watched = list(set(movies_not_watched).intersection(set(movie2movie_encoded.keys())))

movies_not_watched = [[movie2movie_encoded.get(x)] for x in movies_not_watched]

user_encoder = user2user_encoded.get(user_id)

user_movie_array = np.hstack(
    ([[user_encoder]] * len(movies_not_watched), movies_not_watched)
)

ratings = model.predict(user_movie_array).flatten()
top_ratings_indices = ratings.argsort()[-10:][::-1]
recommended_movie_ids = [
    movie_encoded2movie.get(movies_not_watched[x][0]) for x in top_ratings_indices
]

In [None]:
print("Showing recommendations for user: {}".format(user_id))
print("====" * 9)
print("Movies with high ratings from user")
print("----" * 8)
top_movies_user = (
    movies_watched_by_user.sort_values(by="rating", ascending=False)
    .head(5)
    .movieId.values
)
movie_df_rows = movie_df[movie_df["movieId"].isin(top_movies_user)]
for row in movie_df_rows.itertuples():
    print(row.title, ":", row.genres)

print("----" * 8)
print("Top 10 movie recommendations")
print("----" * 8)
recommended_movies = movie_df[movie_df["movieId"].isin(recommended_movie_ids)]
for row in recommended_movies.itertuples():
    print(row.title, ":", row.genres)