In [1]:
#importing packages
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


In [2]:
# loading datasets
tag = pd.read_csv(r'C:\Users\ck261\Documents\data-mining\tp\ml-latest-small\ml-latest-small\tags.csv')
link = pd.read_csv(r'C:\Users\ck261\Documents\data-mining\tp\ml-latest-small\ml-latest-small\links.csv')
movie = pd.read_csv(r'C:\Users\ck261\Documents\data-mining\tp\ml-latest-small\ml-latest-small\movies.csv')
rating = pd.read_csv(r'C:\Users\ck261\Documents\data-mining\tp\ml-latest-small\ml-latest-small\ratings.csv')

In [34]:
# default value for embedding size parameter
EMBEDDING_SIZE = 50

#input is a keras model instance
class RecommenderNet(keras.Model):

    def __init__(self, num_users, num_movies, embedding_size, **kwargs):
        super(RecommenderNet, self).__init__(**kwargs)       
        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size
        
        self.user_embedding = layers.Embedding(num_users, embedding_size, embeddings_initializer="he_normal", embeddings_regularizer=keras.regularizers.l1(0.01))
        
        self.user_bias = layers.Embedding(num_users, 1)

        self.movie_embedding = layers.Embedding(num_movies, embedding_size, embeddings_initializer="he_normal", embeddings_regularizer=keras.regularizers.l1(0.01))
        
        self.movie_bias = layers.Embedding(num_movies, 1)

    def call(self, inputs):
        
        user_vector = self.user_embedding(inputs[:, 0])
        user_bias = self.user_bias(inputs[:, 0])

        movie_vector = self.movie_embedding(inputs[:, 1])
        movie_bias = self.movie_bias(inputs[:, 1])

        dot_user_movie = tf.tensordot(user_vector, movie_vector, 2)
        # Add all the components (including bias)

        x = dot_user_movie + user_bias + movie_bias
        # The sigmoid activation forces the rating to between 0 and 1
        
        return tf.nn.relu(x)

In [35]:
#baseline models
# user means as prediction

# item means as prediction

In [36]:
# grabbing unique values from the user id column and convert to a list
user_ids = rating["userId"].unique().tolist()

user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}

movie_ids = rating["movieId"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}

rating["user"] = rating["userId"].map(user2user_encoded)
rating["movie"] = rating["movieId"].map(movie2movie_encoded)

num_users = len(user2user_encoded)
num_movies = len(movie_encoded2movie)

rating["rating"] = rating["rating"].values.astype(np.float32)

# min and max ratings will be used to normalize the ratings later
min_rating = min(rating["rating"])
max_rating = max(rating["rating"])

print(
    "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_movies, min_rating, max_rating
    )
)
# output is unique values for the respective variables above

Number of users: 610, Number of Movies: 9724, Min rating: 0.5, Max rating: 5.0


In [37]:
df = rating.sample(frac=1, random_state=421)

x = df[["user", "movie"]].values

# Normalize the targets between 0 and 1. Makes it easy to train.
y = df["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values

# Assuming training on 75% of the data and validating on 25%.
train_indices = int(0.75 * df.shape[0])

x_train, x_val, y_train, y_val = (
    x[:train_indices],
    x[train_indices:],
    y[:train_indices],
    y[train_indices:],
)


Model 1:
* 50 latent factors
* evaluating using cross-entropy

Model 2:
* 64 latent factors
* evaluating using root-mean-squared-error (RMSE) as mentioned in project description

both models use the same Adam optimizer with the same hyperparameters

In [38]:
# 50 dimensional vectors, latent factors, representation of the real space
model = RecommenderNet(num_users, num_movies, embedding_size=50)

# sgd = tf.keras.optimizers.SGD(
#     learning_rate=0.01, momentum=0.0, nesterov=False, name="SGD"
# )

# rmse = tf.keras.metrics.RootMeanSquaredError(
#     name='RMSE', dtype=None
# )

# model.compile(
#     loss=rmse, optimizer=sgd
# )


model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(), optimizer=keras.optimizers.Adam(learning_rate=0.01)
)


In [39]:
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=64,
    epochs=5,
    verbose=1,
    validation_data=(x_val, y_val),
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [50]:
model2 = RecommenderNet(num_users, num_movies, embedding_size=64)
model2.compile(
    loss=tf.keras.metrics.RootMeanSquaredError(name = 'rmse'), optimizer=keras.optimizers.Adam(learning_rate=0.001)
)


In [51]:
history2 = model2.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=5,
    verbose=1,
    validation_data=(x_val, y_val),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [52]:
model.summary()

Model: "recommender_net_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_16 (Embedding)    multiple                  30500     
                                                                 
 embedding_17 (Embedding)    multiple                  610       
                                                                 
 embedding_18 (Embedding)    multiple                  486200    
                                                                 
 embedding_19 (Embedding)    multiple                  9724      
                                                                 
Total params: 527,036
Trainable params: 527,034
Non-trainable params: 2
_________________________________________________________________


In [55]:
model2.summary()

Model: "recommender_net_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_32 (Embedding)    multiple                  39040     
                                                                 
 embedding_33 (Embedding)    multiple                  610       
                                                                 
 embedding_34 (Embedding)    multiple                  622336    
                                                                 
 embedding_35 (Embedding)    multiple                  9724      
                                                                 
Total params: 671,712
Trainable params: 671,710
Non-trainable params: 2
_________________________________________________________________


In [57]:
movie_df = movie

# Let us get a user and see the top recommendations.
user_id = df.userId.sample(2).iloc[0]
movies_watched_by_user = df[df.userId == user_id]

movies_not_watched = movie_df[
    ~movie_df["movieId"].isin(movies_watched_by_user.movieId.values)
]["movieId"]

movies_not_watched = list(
    set(movies_not_watched).intersection(set(movie2movie_encoded.keys()))
)


movies_not_watched = [[movie2movie_encoded.get(x)] for x in movies_not_watched]
user_encoder = user2user_encoded.get(user_id)


user_movie_array = np.hstack(
    ([[user_encoder]] * len(movies_not_watched), movies_not_watched)
)


ratings = model.predict(user_movie_array).flatten()
top_ratings_indices = ratings.argsort()[-1000:][::-1]


recommended_movie_ids = [
    movie_encoded2movie.get(movies_not_watched[x][0]) for x in top_ratings_indices
]

print("Showing recommendations for user: {}".format(user_id))
print("====" * 9)
print("Movies with high ratings from user")
print("----" * 9)

top_movies_user = (
    movies_watched_by_user.sort_values(by="rating", ascending=False)
    .head(5)
    .movieId.values
)


movie_df_rows = movie_df[movie_df["movieId"].isin(top_movies_user)]
for row in movie_df_rows.itertuples():
    print(row.title, ":", row.genres)

print("----" * 9)
print("Top 1000 movie recommendations for user")
print("----" * 9)

recommended_movies = movie_df[movie_df["movieId"].isin(recommended_movie_ids)]

for row in recommended_movies.itertuples():
    print(row.title, ":", row.genres)


Showing recommendations for user: 435
Movies with high ratings from user
------------------------------------
Billy Madison (1995) : Comedy
Shawshank Redemption, The (1994) : Crime|Drama
Fight Club (1999) : Action|Crime|Drama|Thriller
Kill Bill: Vol. 2 (2004) : Action|Drama|Thriller
Prestige, The (2006) : Drama|Mystery|Sci-Fi|Thriller
------------------------------------
Top 1000 movie recommendations for user
------------------------------------
Persuasion (1995) : Drama|Romance
Cry, the Beloved Country (1995) : Drama
Lamerica (1994) : Adventure|Drama
White Balloon, The (Badkonake sefid) (1995) : Children|Drama
Nobody Loves Me (Keiner liebt mich) (1994) : Comedy|Drama
Catwalk (1996) : Documentary
Chungking Express (Chung Hing sam lam) (1994) : Drama|Mystery|Romance
Jeffrey (1995) : Comedy|Drama
Lord of Illusions (1995) : Horror
Party Girl (1995) : Comedy
Umbrellas of Cherbourg, The (Parapluies de Cherbourg, Les) (1964) : Drama|Musical|Romance
Burnt by the Sun (Utomlyonnye solntsem) (1