## Matrix factorization using SoftMax model

Solution inspired from https://developers.google.com/machine-learning/recommendation/overview.

### Paths

In [1]:
drive_path = "/content/drive/MyDrive/Colab Notebooks/Codesentics/notebooks/"
bx_preprocessed_dataset_path = drive_path + "bx_data/preprocessed_dataset/"

ratings_path = bx_preprocessed_dataset_path + "preprocessed_ratings_data.pkl"
book_metadata_path = bx_preprocessed_dataset_path + "preprocessed_book_metadata.pkl"

embeddings_save_path = drive_path + "book_embeddings/softmax_book_embeddings.pkl"

### Imports

In [2]:
import random
import copy

import pickle
import pandas as pd
import numpy as np

from tqdm import tqdm
import plotly.graph_objects as go

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from sklearn.neighbors import NearestNeighbors

### Load and prepare data

In [3]:
ratings = pd.read_pickle(ratings_path)
books = pd.read_pickle(book_metadata_path)

In [4]:
# Create dictionary: user -> [list of his highly rated books]
high_ratings = ratings[ratings["Book-Rating"] > 6]
user_to_books_df = high_ratings[["User-Embedding-ID", "Book-Embedding-ID"]] \
    .groupby("User-Embedding-ID", as_index=False) \
    .aggregate(lambda x: list(x))

In [5]:
# Create dictionary for book embedding ID to year embedding ID conversion
book_to_year_dict = {
    book: year for book, year in zip(
        books["Book-Embedding-ID"],
        books["Year-Embedding-ID"])
}
# Create dictionary for book embedding ID to author embedding ID conversion
book_to_author_dict = {
    book: author
    for book, author in zip(
        books["Book-Embedding-ID"],
        books["Author-Embedding-ID"])
}

### Define dataset implementation

In [6]:
class SoftMaxDataset(torch.utils.data.Dataset):
    def __init__(self, user_to_books_df, book_to_year_dict, book_to_author_dict):
        super(SoftMaxDataset, self).__init__()
        self.user_to_books_df = user_to_books_df
        self.book_to_year_dict = book_to_year_dict
        self.book_to_author_dict = book_to_author_dict

    # Length is defined by number of unique users that rated books
    def __len__(self):
        return len(self.user_to_books_df)

    # Generate following tuple: (book, book-author, book-year) -> similar_book
    def __getitem__(self, idx=None):

        # Get random user books
        user_book_ids = self.user_to_books_df["Book-Embedding-ID"].sample().values[0]

        # Sample his books and recommend another
        input_book_id = random.choice(user_book_ids)
        year_id = self.book_to_year_dict[input_book_id]
        author_id = self.book_to_author_dict[input_book_id]
        recommend_book_id = random.choice(user_book_ids)

        return torch.tensor([
            input_book_id, year_id, author_id,
            recommend_book_id], dtype=torch.long)

# Divide data into train/test parts and return their dataloaders
def get_dataloaders(
    data, years_dict, author_dict,
    train_batch_size, train_ratio=0.8):

    train_data = data.sample(frac=train_ratio)
    test_data = data.drop(train_data.index)

    sm_train_dataset = SoftMaxDataset(train_data, years_dict, author_dict)
    sm_test_dataset = SoftMaxDataset(test_data, years_dict, author_dict)

    train_dataloader = torch.utils.data.DataLoader(
        sm_train_dataset, batch_size=train_batch_size,
        shuffle=True)
    test_dataloader = torch.utils.data.DataLoader(
        sm_test_dataset, batch_size=len(sm_test_dataset),
        shuffle=False)

    return train_dataloader, test_dataloader

### Define model implementation

In [7]:
class SoftMaxModel(nn.Module):
    def __init__(self,
                 pub_year_count, authors_count, book_ids_count,
                 in_embed_dims=(3,5,27), init_stddev=0.5):
        super(SoftMaxModel, self).__init__()

        embedding_dim = sum(in_embed_dims)

        # Layers used to embed book for which are made recommendations
        self.year_embedding_layer = nn.Embedding(
            pub_year_count, in_embed_dims[0])
        self.author_embedding_layer = nn.Embedding(
            authors_count, in_embed_dims[1])
        self.input_book_embedding_layer = nn.Embedding(
            book_ids_count, in_embed_dims[2])

        # Layers used for computing batch of user embeddings
        self.linear_1_layer = nn.Linear(embedding_dim, embedding_dim // 2)
        self.bn1 = nn.BatchNorm1d(embedding_dim // 2)
        self.linear_2_layer = nn.Linear(embedding_dim // 2, embedding_dim)
        self.bn2 = nn.BatchNorm1d(embedding_dim)
        self.relu = nn.ReLU()

        # Layer used to store book embeddings for recommendation
        self.book_embeddings = torch.nn.Embedding(book_ids_count, embedding_dim)

        # Initialize weights
        self.initialize_weights(init_stddev)

    # Normal distribution initialization
    def initialize_weights(self, init_stddev):
        self.year_embedding_layer.weight.data.normal_(std=init_stddev)
        self.author_embedding_layer.weight.data.normal_(std=init_stddev)
        self.input_book_embedding_layer.weight.data.normal_(std=init_stddev)
        self.linear_1_layer.weight.data.normal_(std=init_stddev)
        self.linear_2_layer.weight.data.normal_(std=init_stddev)
        self.book_embeddings.weight.data.normal_(std=init_stddev)

    def forward(self, input_ids):

        # Embed input book
        year_emb = self.year_embedding_layer(input_ids[:,1])
        author_emb = self.author_embedding_layer(input_ids[:,2])
        book_emb = self.input_book_embedding_layer(input_ids[:,0])

        # Concatenate input book embeddings
        x = torch.cat((book_emb, author_emb, year_emb), dim=1)

        # Compute user embedding
        x = self.relu(self.bn1(self.linear_1_layer(x)))
        x = self.bn2(self.linear_2_layer(x))

        return x

# Compute loss with goal of having user embedding
# contain his other highly rated books (stored in labels)
def softmax_loss(user_embs, book_embs, labels):
    logits = torch.matmul(user_embs, book_embs.T)
    return F.cross_entropy(logits, labels)

### Training loop

In [8]:
# Number of embedded items for model embedding layers
book_ids_count = max(book_to_year_dict.keys()) + 1
year_ids_count = max(book_to_year_dict.values()) + 1
author_ids_count = max(book_to_author_dict.values()) + 1

In [9]:
# Training hyperparameters
train_data_ratio = 0.8
train_batch_size = 30
epochs = 1000
learning_rate = 0.1
in_embed_dims = (3,5,27)  # (Year, Author, Input book) embedding dimensions

In [10]:
# Prepare dataloaders
train_dataloader, test_dataloader = get_dataloaders(
    user_to_books_df,
    book_to_year_dict, book_to_author_dict,
    train_batch_size, train_data_ratio)

# Initialize model
model = SoftMaxModel(
    year_ids_count, author_ids_count,
    book_ids_count, in_embed_dims)

# Initialize optimizer
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Initialize loss tracking variables
epoch_losses = []
best_loss = 100000

# Training loop
for epoch in range(epochs):

    epoch_train_loss = 0
    epoch_test_loss = 0

    # Training phase
    for batch in train_dataloader:

        optimizer.zero_grad()

        batch_data = batch[:,:-1]
        labels = batch[:,-1]

        if batch_data.shape[0] < 2:
            break

        batch_of_user_embeddings = model(batch_data)

        train_loss = softmax_loss(
            batch_of_user_embeddings,
            model.book_embeddings.weight,
            labels)
        train_loss.backward()
        epoch_train_loss += train_loss.item()

        optimizer.step()

    # Test phase
    with torch.no_grad():
        for batch in test_dataloader:
            batch_data = batch[:,:-1]
            labels = batch[:,-1]

            batch_of_user_embeddings = model(batch_data)
            test_loss = softmax_loss(
                batch_of_user_embeddings,
                model.book_embeddings.weight,
                labels)
            epoch_test_loss += test_loss.item()

        # Save best test weights
        if epoch_test_loss < best_loss:
            best_epoch = epoch + 1
            train_loss_at_best = train_loss.item()
            best_loss = test_loss.item()
            best_weights = copy.deepcopy(model.state_dict())

    # Save epoch average of mean batch loss
    avg_batch_train_loss = epoch_train_loss / len(train_dataloader)
    avg_batch_test_loss = epoch_test_loss / len(test_dataloader)
    epoch_losses.append((avg_batch_train_loss, avg_batch_test_loss))

    # Visualize training progress
    with tqdm(total=1, desc=f'Epoch {epoch+1}/{epochs}', unit='epoch') as t:
        t.set_description(f'Epoch {epoch+1}/{epochs}')
        t.set_postfix(train_loss=avg_batch_train_loss,
                      test_loss=avg_batch_test_loss)
        t.update()

Epoch 1/1000: 100%|██████████| 1/1 [00:00<00:00, 121.46epoch/s, test_loss=7.88, train_loss=8.85]
Epoch 2/1000: 100%|██████████| 1/1 [00:00<00:00, 93.25epoch/s, test_loss=7.09, train_loss=7.28]
Epoch 3/1000: 100%|██████████| 1/1 [00:00<00:00, 149.17epoch/s, test_loss=6.73, train_loss=6.6]
Epoch 4/1000: 100%|██████████| 1/1 [00:00<00:00, 186.45epoch/s, test_loss=6.62, train_loss=6.32]
Epoch 5/1000: 100%|██████████| 1/1 [00:00<00:00, 146.74epoch/s, test_loss=6.52, train_loss=6.25]
Epoch 6/1000: 100%|██████████| 1/1 [00:00<00:00, 105.71epoch/s, test_loss=6.52, train_loss=6.2]
Epoch 7/1000: 100%|██████████| 1/1 [00:00<00:00, 185.02epoch/s, test_loss=6.45, train_loss=6.19]
Epoch 8/1000: 100%|██████████| 1/1 [00:00<00:00, 116.13epoch/s, test_loss=6.52, train_loss=6.15]
Epoch 9/1000: 100%|██████████| 1/1 [00:00<00:00, 189.11epoch/s, test_loss=6.53, train_loss=6.17]
Epoch 10/1000: 100%|██████████| 1/1 [00:00<00:00, 91.67epoch/s, test_loss=6.5, train_loss=6.16]
Epoch 11/1000: 100%|██████████| 1/

In [11]:
best_model = SoftMaxModel(
    year_ids_count, author_ids_count,
    book_ids_count, in_embed_dims)
best_model.load_state_dict(best_weights)
print(f"EPOCH {best_epoch} had lowest TEST LOSS: {round(best_loss, 3)} \
with TRAIN LOSS: {round(train_loss_at_best, 3)}")

EPOCH 940 had lowest TEST LOSS: 5.931 with TRAIN LOSS: 5.846


### Save book embeddings

In [12]:
embeddings = best_model.book_embeddings.weight.detach().numpy()
with open(embeddings_save_path, 'wb') as f:
    pickle.dump(embeddings, f)

### Plot loss progression

In [13]:
train_loss = [item[0] for item in epoch_losses]
test_loss = [item[1] for item in epoch_losses]

window_size = 20
train_loss_avg = np.convolve(
    train_loss,
    np.ones(window_size) / window_size,
    mode='valid')
test_loss_avg = np.convolve(
    test_loss,
    np.ones(window_size) / window_size,
    mode='valid')

fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(len(epoch_losses))),
                         y=train_loss, mode='lines', name='Train Loss'))
fig.add_trace(go.Scatter(x=list(range(len(epoch_losses))),
                         y=test_loss, mode='lines', name='Test Loss'))
fig.add_trace(go.Scatter(x=list(range(window_size-1, len(epoch_losses))),
                         y=train_loss_avg,
                         mode='lines', name='Averaged Train Loss',
                         line=dict(dash='dash', width=5, color='blue')))
fig.add_trace(go.Scatter(x=list(range(window_size-1, len(epoch_losses))),
                         y=test_loss_avg,
                         mode='lines', name='Averaged Test Loss',
                         line=dict(dash='dash', width=5, color='red')))

fig.update_layout(
    title=f'Average batch mean loss during training\
 from dataset with {len(user_to_books_df)} users and {book_ids_count} books',
    xaxis_title='Epoch',
    yaxis_title='Loss'
)
fig.show()

## Small recommendation test

In [14]:
# Load tensor from the pickle file
with open(embeddings_save_path, 'rb') as f:
    loaded_embeddings = pickle.load(f)

In [15]:
harry_potter_sorc_stone_emb_id = books["Book-Embedding-ID"][
    books["Book-Title"].str.contains(
        "Harry Potter and the Sorcerer's Stone")]

books[
    books["Book-Title"].str.contains(
        "Harry Potter and the Sorcerer's Stone")]

Unnamed: 0,Book-Embedding-ID,Author-Embedding-ID,Year-Embedding-ID,Book-ID,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-M
129,34,10383,11,2062,590353403,Harry Potter and the Sorcerer's Stone,J. K. Rowling,1998,Scholastic,http://images.amazon.com/images/P/0590353403.0...


In [16]:
def get_k_nearest_neighbours_model(embeddings, metric="cosine"):
    knn_model = NearestNeighbors(metric=metric, n_jobs=-1)
    knn_model.fit(embeddings)
    return knn_model

def get_k_neighbours_for_vector(vector, knn_model, k=5):
    _, cos_indices = knn_model.kneighbors(
        vector, n_neighbors=k)
    return cos_indices

def convert_emb_ids_to_book_ids(ratings, emb_ids):
  recommended_book_ids = ratings[
      ratings["Book-Embedding-ID"].isin(emb_ids[0])]

  sorted_recommended_book_ids = recommended_book_ids.sort_values(
      by=["Book-Embedding-ID"],
      key=lambda x: x.map(
          {v: i for i, v in enumerate(emb_ids[0])}))

  sorted_recommended_book_ids = sorted_recommended_book_ids["Book-ID"].unique()
  return sorted_recommended_book_ids

def get_book_titles_from_book_ids(books_metadata, book_ids):
  recommended_books = books_metadata[
      books_metadata['Book-ID'].isin(book_ids)].sort_values(
          by=["Book-ID"], key=lambda x: x.map(
              {v: i for i, v in enumerate(book_ids)}))
  return recommended_books['Book-Title'].unique()

def get_book_recommendations(
    emb_id, embeddings,
    book_ratings, book_metadata,
    number_of_recommendations):
  book_emb = embeddings[emb_id].reshape(1,-1)

  knn_model = get_k_nearest_neighbours_model(
      embeddings, metric="cosine")
  recommended_book_emb_ids = get_k_neighbours_for_vector(
      book_emb, knn_model, k=number_of_recommendations)
  recommended_book_ids = convert_emb_ids_to_book_ids(
      book_ratings, recommended_book_emb_ids)
  recommended_book_titles = get_book_titles_from_book_ids(
      book_metadata, recommended_book_ids)

  return recommended_book_titles

In [17]:
get_book_recommendations(
    harry_potter_sorc_stone_emb_id,
    loaded_embeddings, ratings, books, 5)

array(["Harry Potter and the Sorcerer's Stone",
       'Harry Potter and the Chamber of Secrets', 'The Horse Whisperer',
       'Harry Potter and the Goblet of Fire',
       'Harry Potter and the Prisoner of Azkaban'], dtype=object)