# Lesson 6a:  Matrix Factorization with Keras

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

The steps in the model are as follows:ç

+ Map user ID to a "user vector" via an embedding matrix
+ Map movie ID to a "movie vector" via an embedding matrix
+ Compute the dot product between the user vector and movie vector, to obtain the a match score between the user and the movie (predicted rating).
+ Train the embeddings via gradient descent using all known user-movie pairs.

In [2]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
import matplotlib.pyplot as plt
from tqdm import tqdm

!mkdir model

mkdir: model: File exists


## Load Data and apply preprocessing

In [3]:
# Download the actual data from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
# Use the ratings.csv file
movielens_data_file_url = (
    "http://files.grouplens.org/datasets/movielens/ml-1m.zip"
)
movielens_zipped_file = keras.utils.get_file(
    "ml-1m.zip", movielens_data_file_url, extract=False
)
keras_datasets_path = Path(movielens_zipped_file).parents[0]
movielens_dir = keras_datasets_path / "ml-1m"

# Only extract the data the first time the script is run.
if not movielens_dir.exists():
    with ZipFile(movielens_zipped_file, "r") as zip:
        # Extract files
        print("Extracting all the files now...")
        zip.extractall(path=keras_datasets_path)
        print("Done!")


def load_movies(movielens_dir):
    COL_NAME = ['mid','movie_name','movie_genre']
    df = pd.read_csv(movielens_dir / 'movies.dat',sep='::', header=None, engine='python', names=COL_NAME, encoding='latin-1')
    df.columns= ['movie_id','title','movie_genre']
    return df


def load_ratings(movielens_dir):
    COL_NAME = ['uid','mid','rating','timestamp']
    df = pd.read_csv(movielens_dir / 'ratings.dat',sep='::', header=None, engine='python', names=COL_NAME)
    df.columns=['user_id','movie_id','rating','timestamp']
    return df

df = load_ratings(movielens_dir)

First, need to perform some preprocessing to encode users and movies as integer indices.

In [4]:
user_ids = df["user_id"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}
movie_ids = df["movie_id"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}
df["user"] = df["user_id"].map(user2user_encoded)
df["movie"] = df["movie_id"].map(movie2movie_encoded)

num_users = len(user2user_encoded)
num_movies = len(movie_encoded2movie)
df["rating"] = df["rating"].values.astype(np.float32)
# min and max ratings will be used to normalize the ratings later
min_rating = min(df["rating"])
max_rating = max(df["rating"])

print(
    "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_movies, min_rating, max_rating
    )
)

Number of users: 6040, Number of Movies: 3706, Min rating: 1.0, Max rating: 5.0


In [5]:
df

Unnamed: 0,user_id,movie_id,rating,timestamp,user,movie
0,1,1193,5.0,978300760,0,0
1,1,661,3.0,978302109,0,1
2,1,914,3.0,978301968,0,2
3,1,3408,4.0,978300275,0,3
4,1,2355,5.0,978824291,0,4
...,...,...,...,...,...,...
1000204,6040,1091,1.0,956716541,6039,772
1000205,6040,1094,5.0,956704887,6039,1106
1000206,6040,562,5.0,956704746,6039,365
1000207,6040,1096,4.0,956715648,6039,152


### Prepare training and validation data

In [6]:
from sklearn.model_selection import train_test_split

train, val = train_test_split(df, test_size=0.2, random_state=7)

In [7]:
x_train = train[["user", "movie"]].values
y_train = train["rating"].values

x_val = val[["user", "movie"]].values
y_val = val["rating"].values

## Evaluation function

We make a function in order to plot the metrics/losses graphics once the model has been trained.

In [8]:
def plot_train_val_losses(history):
    plt.plot(history.history["loss"])
    plt.plot(history.history["val_loss"])
    plt.title("model loss")
    plt.ylabel("loss")
    plt.xlabel("epoch")
    plt.legend(["train", "test"], loc="upper left")
    plt.axis([0,len(history.history["loss"]),np.min(history.history["loss"]),np.max(history.history["val_loss"])])
    plt.show()

## Create MF model

We embed both users and movies in to 10-dimensional vectors.

The model computes a match score between user and movie embeddings via a dot product, and adds a per-movie and per-user bias.

In [9]:
EMBEDDING_SIZE = 10

class RecommenderNetV(keras.Model):
    def __init__(self, num_users, num_movies, embedding_size, **kwargs):
        super(RecommenderNetV, self).__init__(**kwargs)
        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size
        self.user_movie_embedding = layers.Embedding(
            num_users+num_movies,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.user_bias = layers.Embedding(num_users, 1)
        self.movie_bias = layers.Embedding(num_movies, 1)

    def call(self, inputs):
        user_vector = self.user_movie_embedding(inputs[:, 0])
        movie_vector = self.user_movie_embedding(inputs[:, 1]+num_users)
        user_bias = self.user_bias(inputs[:, 0])
        movie_bias = self.movie_bias(inputs[:, 1])
        dot_user_movie = tf.tensordot(user_vector, movie_vector, 2)
        # Add all the components (including bias)
        x = dot_user_movie + user_bias + movie_bias
        return x


mf_model = RecommenderNetV(num_users, num_movies, EMBEDDING_SIZE)
mf_model.compile(
    loss=tf.keras.losses.MeanSquaredError(), 
    optimizer=keras.optimizers.Adam(learning_rate=0.0005)
)

2023-03-08 18:15:27.553677: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Train and evaluate MF model

In [10]:
mf_history = mf_model.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=5,
    verbose=1,
    validation_data=(x_val, y_val),
)

2023-03-08 18:15:28.924464: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## UI Demonstration

Show top 10 movie recommendations to a user

In [11]:
movie_df = load_movies(movielens_dir)
# Let us get a user and see the top recommendations.
user_id = df.user_id.sample(1).iloc[0]
movies_watched_by_user = df[df.user_id == user_id]
movies_not_watched = movie_df[~movie_df["movie_id"].isin(movies_watched_by_user.movie_id.values)]["movie_id"]
movies_not_watched = list(set(movies_not_watched).intersection(set(movie2movie_encoded.keys())))
movies_not_watched = [[movie2movie_encoded.get(x)] for x in movies_not_watched]

user_encoder = user2user_encoded.get(user_id)
user_movie_array = np.hstack(([[user_encoder]] * len(movies_not_watched), movies_not_watched))

ratings = mf_model.predict(user_movie_array).flatten()
top_ratings_indices = ratings.argsort()[-10:][::-1]
recommended_movie_ids = [movie_encoded2movie.get(movies_not_watched[x][0]) for x in top_ratings_indices]

print("Showing recommendations for user: {}".format(user_id))
print("====" * 9)
print("Movies with high ratings from user")
print("----" * 8)
top_movies_user = (movies_watched_by_user.sort_values(by="rating", ascending=False).head(5).movie_id.values)

movie_df_rows = movie_df[movie_df["movie_id"].isin(top_movies_user)]
for row in movie_df_rows.itertuples():
    print(row.title, ":", row.movie_genre)

print("----" * 8)
print("Top 10 movie recommendations")
print("----" * 8)
recommended_movies = movie_df[movie_df["movie_id"].isin(recommended_movie_ids)]
for row in recommended_movies.itertuples():
    print(row.title, ":", row.movie_genre)

Showing recommendations for user: 4506
Movies with high ratings from user
--------------------------------
Monty Python and the Holy Grail (1974) : Comedy
Thin Blue Line, The (1988) : Documentary
To Kill a Mockingbird (1962) : Drama
Birds, The (1963) : Horror
Deliverance (1972) : Adventure|Thriller
--------------------------------
Top 10 movie recommendations
--------------------------------
Usual Suspects, The (1995) : Crime|Thriller
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) : Film-Noir
All About Eve (1950) : Drama
Wrong Trousers, The (1993) : Animation|Comedy
Third Man, The (1949) : Mystery|Thriller
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) : Action|Drama
Strangers on a Train (1951) : Film-Noir|Thriller
Sixth Sense, The (1999) : Thriller
General, The (1927) : Comedy
Yojimbo (1961) : Comedy|Drama|Western


In [12]:
def top_recomendations(model_object, user_id, train, movie_df,at = 5):
    movies_watched_by_user = train[train.user_id == user_id]

    movies_not_watched = movie_df[~movie_df["movie_id"].isin(movies_watched_by_user.movie_id.values)]["movie_id"]
    movies_not_watched = list(set(movies_not_watched).intersection(set(movie2movie_encoded.keys())))
    movies_not_watched = [[movie2movie_encoded.get(x)] for x in movies_not_watched]

    
    user_encoder = user2user_encoded.get(user_id)
    user_movie_array = np.hstack(([[user_encoder]] * len(movies_not_watched), movies_not_watched))

    ratings = model_object.predict(user_movie_array,verbose=0).flatten()
    top_ratings_indices = ratings.argsort()[-at:][::-1]
    recommended_movie_ids = [movie_encoded2movie.get(movies_not_watched[x][0]) for x in top_ratings_indices]
    return recommended_movie_ids
r = top_recomendations(mf_model, 4,train,movie_df,10)
print(r) 

[318, 2019, 527, 1148, 745, 858, 50, 922, 904, 1212]


In [13]:
def precision(recommended_items, relevant_items):
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    precision_score = np.sum(is_relevant, dtype=np.float32) / len(is_relevant)
    
    return precision_score

def recall(recommended_items, relevant_items):  
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    recall_score = np.sum(is_relevant, dtype=np.float32) / relevant_items.shape[0]
    
    return recall_score

def AP(recommended_items, relevant_items):
   
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    # Cumulative sum: precision at 1, at 2, at 3 ...
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))
    ap_score = np.sum(p_at_k) / np.min([relevant_items.shape[0], is_relevant.shape[0]])

    return ap_score

def evaluate_algorithm_top(train, test, recommender_object, movie_df, at=25, thr_relevant = 0.85):
    
    cumulative_precision = 0.0
    cumulative_recall = 0.0
    cumulative_AP = 0.0
    
    num_eval = 0


    for user_id in tqdm(test.user_id.unique()):
        
        relevant_items = test[test.user_id==user_id]
        thr = np.quantile(relevant_items.rating,thr_relevant)
        relevant_items = np.array(relevant_items[relevant_items.rating >=thr].movie_id.values)
        if len(relevant_items)>0:
            
            recommended_items = top_recomendations(recommender_object,user_id, train, movie_df, at=at)
            num_eval+=1

            cumulative_precision += precision(recommended_items, relevant_items)
            cumulative_recall += recall(recommended_items, relevant_items)
            cumulative_AP += AP(recommended_items, relevant_items)
            
    cumulative_precision /= num_eval
    cumulative_recall /= num_eval
    MAP = cumulative_AP / num_eval
    
    print("Recommender results are: Precision = {:.4f}, Recall = {:.4f}, MAP = {:.4f}".format(
        cumulative_precision, cumulative_recall, MAP)) 

In [14]:
evaluate_algorithm_top(train, val, mf_model, movie_df, at = 25)

100%|███████████████████████████████████████| 6037/6037 [05:14<00:00, 19.19it/s]

Recommender results are: Precision = 0.0362, Recall = 0.0927, MAP = 0.0205



