## Dimensionality reduction by PCA

### Paths

In [1]:
# Input directory. The trailing slash matters: the file names below are
# appended to it directly, without a path-joining helper.
bx_preprocessed_dataset_path = "bx_data/preprocessed_dataset/"

# Pickled inputs read by this notebook.
ratings_path = f"{bx_preprocessed_dataset_path}preprocessed_ratings_data.pkl"
book_metadata_path = f"{bx_preprocessed_dataset_path}preprocessed_book_metadata.pkl"

# Where the PCA book embeddings are written (and later read back from).
embeddings_save_path = "book_embeddings/pca_book_embeddings.pkl"


### Imports


In [2]:
import copy
import os
import pickle

import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

### Load and prepare data

In [3]:
# Load the two pickled DataFrames used throughout the notebook.
# Later cells read the columns "Book-Embedding-ID", "User-Embedding-ID",
# "Book-Rating" and "Book-ID" from `ratings`, and "Book-Embedding-ID",
# "Book-ID" and "Book-Title" from `books`.
# NOTE(review): unpickling can execute arbitrary code - only load files that
# come from a trusted source.
ratings = pd.read_pickle(ratings_path)
books = pd.read_pickle(book_metadata_path)

### Create the item-to-user ratings matrix

In [4]:
# One row per Book-Embedding-ID, one column per User-Embedding-ID.
# (book, user) pairs without a rating are filled with 0 before the cast to int8.
# NOTE(review): row i is the i-th *sorted* Book-Embedding-ID; later cells index
# the embeddings directly by Book-Embedding-ID, which presumably relies on the
# IDs being exactly 0..n_books-1 - confirm against the preprocessing step.
item_to_user_ratings_matrix = np.array(
    ratings
    .pivot(
        index="Book-Embedding-ID",
        columns="User-Embedding-ID",
        values="Book-Rating",
    )
    .fillna(0)
    .astype("int8")
)

### Reduce each item's user-rating vector to `embedding_dim` dimensions (user_count >> embedding_dim)

In [5]:
# Number of PCA components kept per book, i.e. the embedding size.
embedding_dim = 35
# Fixed seed so the embeddings are reproducible: depending on the input size,
# PCA may select a randomized SVD solver, whose result otherwise varies
# between runs.
pca_random_state = 42

pca = PCA(n_components=embedding_dim, random_state=pca_random_state)
# One row per row of the item-to-user matrix, `embedding_dim` columns.
embeddings = pca.fit_transform(item_to_user_ratings_matrix)

### Save book embeddings

In [6]:
# open(..., 'wb') does not create missing parent directories, so make sure the
# output directory exists before writing (a fresh checkout may not have it).
embeddings_save_dir = os.path.dirname(embeddings_save_path)
if embeddings_save_dir:  # empty when the path has no directory component
    os.makedirs(embeddings_save_dir, exist_ok=True)

# Persist the PCA embeddings array as a pickle.
with open(embeddings_save_path, 'wb') as f:
    pickle.dump(embeddings, f)

## Small recommendation test

In [7]:
# Round-trip check: read the embeddings back from the file written above and
# display their shape as the cell output.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open(embeddings_save_path, 'rb') as f:
    loaded_embeddings = pickle.load(f)
loaded_embeddings.shape

(657, 35)

In [8]:
# Build the title mask once and reuse it for both the ID lookup and the
# displayed rows. regex=False matches the title as a literal substring rather
# than interpreting it as a regular expression.
harry_potter_title_mask = books["Book-Title"].str.contains(
    "Harry Potter and the Sorcerer's Stone", regex=False)

# Deliberately kept as a Series (one entry per matching row), not a scalar.
# NOTE(review): the recommendation call further down reshapes
# embeddings[<this Series>] into a single query vector, which only has the
# right width when exactly one title matches - confirm if the metadata changes.
harry_potter_sorc_stone_emb_id = books.loc[
    harry_potter_title_mask, "Book-Embedding-ID"]

# Last expression: show the matching metadata row(s).
books[harry_potter_title_mask]

Unnamed: 0,Book-Embedding-ID,Author-Embedding-ID,Year-Embedding-ID,Book-ID,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-M
129,34,10383,11,2062,590353403,Harry Potter and the Sorcerer's Stone,J. K. Rowling,1998,Scholastic,http://images.amazon.com/images/P/0590353403.0...


In [9]:
def get_k_nearest_neighbours_model(embeddings, metric="cosine"):
    """Return a ``NearestNeighbors`` model fitted on ``embeddings``.

    Args:
        embeddings: Data passed to ``NearestNeighbors.fit``.
        metric: Distance metric forwarded to ``NearestNeighbors``
            (defaults to ``"cosine"``).

    Returns:
        The fitted ``NearestNeighbors`` instance (created with ``n_jobs=-1``).
    """
    # fit() hands back the estimator itself, so build-and-fit can be chained.
    return NearestNeighbors(metric=metric, n_jobs=-1).fit(embeddings)

def get_k_neighbours_for_vector(vector, knn_model, k=5):
    """Look up the ``k`` nearest neighbours of ``vector`` in ``knn_model``.

    Args:
        vector: Query passed unchanged to ``knn_model.kneighbors``.
        knn_model: Object exposing ``kneighbors(X, n_neighbors=...)`` that
            returns a ``(distances, indices)`` pair.
        k: Number of neighbours to request (defaults to 5).

    Returns:
        The indices part of the ``kneighbors`` result; the distances are
        discarded.
    """
    neighbours = knn_model.kneighbors(vector, n_neighbors=k)
    _distances, neighbour_indices = neighbours
    return neighbour_indices

def convert_emb_ids_to_book_ids(ratings, emb_ids):
    """Translate neighbour embedding IDs into Book-IDs, keeping neighbour order.

    Args:
        ratings: DataFrame with "Book-Embedding-ID" and "Book-ID" columns.
        emb_ids: Indexable whose first element (``emb_ids[0]``) is the sequence
            of embedding IDs to convert, in the desired output order.

    Returns:
        Array of unique "Book-ID" values from the matching rows, ordered by the
        position of their embedding ID in ``emb_ids[0]``.
    """
    neighbour_emb_ids = emb_ids[0]
    # Position of each embedding ID in the neighbour list; used as the sort key.
    rank_by_emb_id = {
        emb_id: position for position, emb_id in enumerate(neighbour_emb_ids)}

    is_recommended = ratings["Book-Embedding-ID"].isin(neighbour_emb_ids)
    ranked_rows = ratings.loc[is_recommended].sort_values(
        by=["Book-Embedding-ID"],
        key=lambda emb_id_column: emb_id_column.map(rank_by_emb_id))

    # `ratings` can hold many rows per book; collapse them to one ID each.
    return ranked_rows["Book-ID"].unique()

def get_book_titles_from_book_ids(books_metadata, book_ids):
    """Return the titles of ``book_ids`` in the order the IDs are given.

    Args:
        books_metadata: DataFrame with "Book-ID" and "Book-Title" columns.
        book_ids: Iterable of Book-IDs, in the desired output order.

    Returns:
        Array of unique "Book-Title" values for the matching rows, ordered by
        the position of their "Book-ID" in ``book_ids``.
    """
    is_requested = books_metadata['Book-ID'].isin(book_ids)
    matching_books = books_metadata[is_requested]

    ordered_books = matching_books.sort_values(
        by=["Book-ID"],
        # Sort key: the position of each Book-ID within `book_ids`.
        key=lambda id_column: id_column.map(
            {book_id: position for position, book_id in enumerate(book_ids)}))

    return ordered_books['Book-Title'].unique()

def get_book_recommendations(
        emb_id, embeddings,
        book_ratings, book_metadata,
        number_of_recommendations):
    """Recommend book titles whose embeddings are closest to a given book.

    Args:
        emb_id: Index into ``embeddings`` selecting the query book.
        embeddings: Array of book embeddings; also the data the nearest
            neighbour model is fitted on.
        book_ratings: Ratings DataFrame used to map embedding IDs to Book-IDs.
        book_metadata: Metadata DataFrame used to map Book-IDs to titles.
        number_of_recommendations: Number of neighbours to retrieve.

    Returns:
        Array of unique book titles, nearest neighbour first. The query book is
        part of the fitted data, so it can appear among its own neighbours.
    """
    # kneighbors expects a 2-D query, hence the single-row reshape.
    query_vector = embeddings[emb_id].reshape(1, -1)

    # A new cosine-distance model is fitted on every call.
    fitted_knn = get_k_nearest_neighbours_model(embeddings, metric="cosine")
    neighbour_emb_ids = get_k_neighbours_for_vector(
        query_vector, fitted_knn, k=number_of_recommendations)

    # embedding IDs -> Book-IDs -> titles, preserving neighbour order.
    neighbour_book_ids = convert_emb_ids_to_book_ids(
        book_ratings, neighbour_emb_ids)
    return get_book_titles_from_book_ids(book_metadata, neighbour_book_ids)

In [10]:
# Five nearest books to "Harry Potter and the Sorcerer's Stone", using the
# embeddings reloaded from disk.
get_book_recommendations(
    emb_id=harry_potter_sorc_stone_emb_id,
    embeddings=loaded_embeddings,
    book_ratings=ratings,
    book_metadata=books,
    number_of_recommendations=5)

array(["Harry Potter and the Sorcerer's Stone",
       'Harry Potter and the Chamber of Secrets',
       'Harry Potter and the Prisoner of Azkaban',
       'Harry Potter and the Goblet of Fire',
       'Harry Potter and the Order of the Phoenix'], dtype=object)