## Matrix factorization using Gradient Descent

Solution inspired from https://developers.google.com/machine-learning/recommendation/overview.

### Paths

In [1]:
bx_preprocessed_dataset_path = "bx_data/preprocessed_dataset/"

ratings_path = bx_preprocessed_dataset_path + "preprocessed_ratings_data.pkl"
book_metadata_path = bx_preprocessed_dataset_path + "preprocessed_book_metadata.pkl"

embeddings_save_path = "book_embeddings/gd_book_embeddings.pkl"


### Imports


In [2]:
import copy

import pickle
import pandas as pd
import numpy as np

from tqdm import tqdm
import plotly.graph_objects as go

import torch
import torch.nn.functional as F
from sklearn.neighbors import NearestNeighbors

### Load and prepare data

In [3]:
ratings = pd.read_pickle(ratings_path)
books = pd.read_pickle(book_metadata_path)

In [4]:
# Split the ratings dataframe into train and test parts
def split_dataframe(ratings_df, test_size=0.2):

    test_indices = np.array(
        torch.randperm(len(ratings_df)) < len(ratings_df) * test_size)

    train_df = ratings_df[~test_indices]
    test_df = ratings_df[test_indices]

    return train_df, test_df

# SparseTensor representation of the train and test datasets
def build_rating_sparse_tensor(ratings_df, users_count, book_count):

    indices = torch.tensor(
        ratings_df[['User-Embedding-ID', 'Book-Embedding-ID']].values,
        dtype=torch.long).t()
    values = torch.tensor(
        ratings_df['Book-Rating'].values,
        dtype=torch.float)
    shape = (users_count, book_count)

    sparse_rating_tensor = torch.sparse.FloatTensor(
        indices=indices,
        values=values,
        size=shape)

    return sparse_rating_tensor

In [5]:
# Calculate the mean squared error loss
def sparse_mse_loss_func(sparse_ratings, user_embeddings, movie_embeddings):
    sparse_ratings = sparse_ratings.coalesce()
    user_indices = sparse_ratings.indices()[0,:]
    movie_indices = sparse_ratings.indices()[1,:]

    user_selected = user_embeddings(user_indices)
    movie_selected = movie_embeddings(movie_indices)

    predictions = torch.sum(user_selected * movie_selected, axis=1)
    loss = F.mse_loss(predictions, sparse_ratings.values())

    return loss


# Regularization by gravity function
def gravity_loss_func(U, V):
    return 1 / (U.shape[0] * V.shape[0]) * torch.sum(
        torch.matmul(U.t(), U) * torch.matmul(V.t(), V))


# Regularization by L1 norm
def regularization_loss_func(U, V):
    return (torch.sum(U * U) / U.shape[0] +
            torch.sum(V * V) / V.shape[0])

### Define Gradient Descent model for Matrix Factorization

In [6]:
class GDModel(torch.nn.Module):
    def __init__(self,
                 num_users, num_books,
                 embedding_dim, weight_init_std=0.5):
        super(GDModel, self).__init__()

        self.user_embeddings = torch.nn.Embedding(
            num_users, embedding_dim)
        self.book_embeddings = torch.nn.Embedding(
            num_books, embedding_dim)

        self.user_embeddings.weight.data.normal_(
            std=weight_init_std)
        self.book_embeddings.weight.data.normal_(
            std=weight_init_std)

    def forward(self, input_ids):

        selected_users = self.user_embeddings(input_ids[0])
        selected_books = self.book_embeddings(input_ids[1])
        predictions = torch.sum(selected_users * selected_books, dim=1)

        return predictions

### Training loop

In [7]:
# Number of embedded items for model embedding layers
users_count = ratings["User-Embedding-ID"].nunique()
book_count = ratings["Book-Embedding-ID"].nunique()
print(f"Users {users_count} and Books {book_count}")

Users 792 and Books 657


In [8]:
# Training hyperparameters
epoch_count = 1000
learning_rate = 0.001
embedding_dim = 35

gravity_loss_ratio = 1
reg_loss_ratio = 0.1

In [9]:
train_ratings, test_ratings = split_dataframe(ratings)
train_data = build_rating_sparse_tensor(train_ratings, users_count, book_count)
test_data = build_rating_sparse_tensor(test_ratings, users_count, book_count)

# Initialize model
gd_model = GDModel(users_count, book_count, embedding_dim)

# Initialize optimizer
optimizer = torch.optim.AdamW(gd_model.parameters(), lr=learning_rate)

# Initialize loss variables
losses = []
best_loss = 10000
best_weights = None

# Training loop
for epoch in range(epoch_count):

    optimizer.zero_grad()

    train_loss = sparse_mse_loss_func(
        train_data,
        gd_model.user_embeddings,
        gd_model.book_embeddings)
    grav_loss = gravity_loss_ratio * gravity_loss_func(
        gd_model.book_embeddings.weight,
        gd_model.user_embeddings.weight)
    reg_loss = reg_loss_ratio * regularization_loss_func(
        gd_model.book_embeddings.weight,
        gd_model.user_embeddings.weight)

    loss = train_loss + reg_loss + grav_loss
    loss.backward()

    optimizer.step()

    test_loss = sparse_mse_loss_func(
        test_data,
        gd_model.user_embeddings,
        gd_model.book_embeddings)

    # Save best test weights
    if test_loss < best_loss:
        best_epoch = epoch
        train_loss_at_best = train_loss.item()
        best_loss = test_loss.item()

        del best_weights
        best_weights = copy.deepcopy(gd_model.state_dict())

    losses.append((train_loss.item(), test_loss.item()))

    # Visualize training progress
    with tqdm(total=1, desc=f'Epoch {epoch+1}/{epoch_count}', unit='epoch') as t:
        t.set_description(f'Epoch {epoch+1}/{epoch_count}')
        t.set_postfix(train_loss=train_loss.item(),
                      test_loss=test_loss.item())
        t.update()

    del train_loss
    del test_loss
    del loss

Epoch 1/1000: 100%|██████████| 1/1 [00:00<00:00, 127.34epoch/s, test_loss=18.5, train_loss=18.8]
Epoch 2/1000: 100%|██████████| 1/1 [00:00<00:00, 309.54epoch/s, test_loss=18.5, train_loss=18.8]
Epoch 3/1000: 100%|██████████| 1/1 [00:00<00:00, 215.88epoch/s, test_loss=18.5, train_loss=18.7]
Epoch 4/1000: 100%|██████████| 1/1 [00:00<00:00, 205.58epoch/s, test_loss=18.5, train_loss=18.7]
Epoch 5/1000: 100%|██████████| 1/1 [00:00<00:00, 246.74epoch/s, test_loss=18.5, train_loss=18.7]
Epoch 6/1000: 100%|██████████| 1/1 [00:00<00:00, 228.24epoch/s, test_loss=18.5, train_loss=18.7]
Epoch 7/1000: 100%|██████████| 1/1 [00:00<00:00, 156.08epoch/s, test_loss=18.5, train_loss=18.6]
Epoch 8/1000: 100%|██████████| 1/1 [00:00<00:00, 217.66epoch/s, test_loss=18.5, train_loss=18.6]
Epoch 9/1000: 100%|██████████| 1/1 [00:00<00:00, 229.23epoch/s, test_loss=18.4, train_loss=18.6]
Epoch 10/1000: 100%|██████████| 1/1 [00:00<00:00, 230.52epoch/s, test_loss=18.4, train_loss=18.5]
Epoch 11/1000: 100%|█████████

In [10]:
best_model = GDModel(users_count, book_count, embedding_dim)
best_model.load_state_dict(best_weights)
print(f"EPOCH {best_epoch} had lowest TEST LOSS: {round(best_loss, 3)}\
 with TRAIN LOSS: {round(train_loss_at_best, 3)}")

EPOCH 718 had lowest TEST LOSS: 13.494 with TRAIN LOSS: 6.411


### Save book embeddings

In [11]:
embeddings = gd_model.book_embeddings.weight.detach().numpy()
with open(embeddings_save_path, 'wb') as f:
    pickle.dump(embeddings, f)

### Plot loss progression

In [12]:
train_loss = [item[0] for item in losses]
test_loss = [item[1] for item in losses]

window_size = 20
train_loss_avg = np.convolve(
    train_loss,
    np.ones(window_size) / window_size,
    mode='valid')
test_loss_avg = np.convolve(
    test_loss,
    np.ones(window_size) / window_size,
    mode='valid')

fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(len(losses))),
                         y=train_loss, mode='lines', name='Train Loss'))
fig.add_trace(go.Scatter(x=list(range(len(losses))),
                         y=test_loss, mode='lines', name='Test Loss'))
fig.add_trace(go.Scatter(x=list(range(window_size-1, len(losses))),
                         y=train_loss_avg,
                         mode='lines', name='Averaged Train Loss',
                         line=dict(dash='dash', width=5, color='blue')))
fig.add_trace(go.Scatter(x=list(range(window_size-1, len(losses))),
                         y=test_loss_avg,
                         mode='lines', name='Averaged Test Loss',
                         line=dict(dash='dash', width=5, color='red')))

fig.update_layout(
    title=f'Average batch mean loss during training\
 from dataset with {users_count} users and {book_count} books',
    xaxis_title='Epoch',
    yaxis_title='Loss'
)
fig.show()

## Small recommendation test

In [13]:
with open(embeddings_save_path, 'rb') as f:
    loaded_embeddings = pickle.load(f)
loaded_embeddings.shape

(657, 35)

In [14]:
harry_potter_sorc_stone_emb_id = books["Book-Embedding-ID"][
    books["Book-Title"].str.contains(
        "Harry Potter and the Sorcerer's Stone")]

books[
    books["Book-Title"].str.contains(
        "Harry Potter and the Sorcerer's Stone")]

Unnamed: 0,Book-Embedding-ID,Author-Embedding-ID,Year-Embedding-ID,Book-ID,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-M
129,34,10383,11,2062,590353403,Harry Potter and the Sorcerer's Stone,J. K. Rowling,1998,Scholastic,http://images.amazon.com/images/P/0590353403.0...


In [15]:
def get_k_nearest_neighbours_model(embeddings, metric="cosine"):
    knn_model = NearestNeighbors(metric=metric, n_jobs=-1)
    knn_model.fit(embeddings)
    return knn_model

def get_k_neighbours_for_vector(vector, knn_model, k=5):
    _, cos_indices = knn_model.kneighbors(
        vector, n_neighbors=k)
    return cos_indices

def convert_emb_ids_to_book_ids(ratings, emb_ids):
  recommended_book_ids = ratings[
      ratings["Book-Embedding-ID"].isin(emb_ids[0])]

  sorted_recommended_book_ids = recommended_book_ids.sort_values(
      by=["Book-Embedding-ID"],
      key=lambda x: x.map(
          {v: i for i, v in enumerate(emb_ids[0])}))

  sorted_recommended_book_ids = sorted_recommended_book_ids["Book-ID"].unique()
  return sorted_recommended_book_ids

def get_book_titles_from_book_ids(books_metadata, book_ids):
  recommended_books = books_metadata[
      books_metadata['Book-ID'].isin(book_ids)].sort_values(
          by=["Book-ID"], key=lambda x: x.map(
              {v: i for i, v in enumerate(book_ids)}))
  return recommended_books['Book-Title'].unique()

def get_book_recommendations(
    emb_id, embeddings,
    book_ratings, book_metadata,
    number_of_recommendations):
  book_emb = embeddings[emb_id].reshape(1,-1)

  knn_model = get_k_nearest_neighbours_model(
      embeddings, metric="cosine")
  recommended_book_emb_ids = get_k_neighbours_for_vector(
      book_emb, knn_model, k=number_of_recommendations)
  recommended_book_ids = convert_emb_ids_to_book_ids(
      book_ratings, recommended_book_emb_ids)
  recommended_book_titles = get_book_titles_from_book_ids(
      book_metadata, recommended_book_ids)

  return recommended_book_titles

In [16]:
get_book_recommendations(
    harry_potter_sorc_stone_emb_id,
    loaded_embeddings, ratings, books, 5)

array(["Harry Potter and the Sorcerer's Stone",
       'Harry Potter and the Chamber of Secrets',
       'Harry Potter and the Prisoner of Azkaban', 'Jane Eyre',
       'Harry Potter and the Goblet of Fire'], dtype=object)