In [1]:
import numpy as np
import pandas as pd
import scipy
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.utils.extmath import randomized_svd

In [2]:
# Base directory for the goodbooks-10k files this notebook reads and writes.
# File names are appended to it with plain string concatenation, so keep the
# trailing slash.
data_path = '../../goodbooks-10k/'

In [3]:
def get_sparse(ratings):
    """Build a user-by-book sparse rating matrix from a long-format ratings frame.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Must provide ``user_id``, ``book_id`` and ``rating`` columns, one row
        per rating.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(n_unique_users, n_unique_books)`` with dtype
        ``uint8``. Row ``i`` / column ``j`` correspond to the i-th distinct
        user / j-th distinct book in order of first appearance in ``ratings``.
    """
    # factorize assigns integer codes in order of first appearance, the same
    # ordering Series.unique() yields. It replaces the former
    # astype('category', categories=...) call, whose ``categories`` keyword is
    # not accepted by current pandas.
    row, users = pd.factorize(ratings['user_id'])
    col, books = pd.factorize(ratings['book_id'])
    data = ratings['rating'].tolist()
    return csr_matrix(
        (data, (row, col)),
        shape=(len(users), len(books)),
        dtype=np.dtype('u1'),
    )

In [40]:
# Load in item feature matrix (description, tags, and shelves) from the .npy
# file under data_path, then display its shape as a sanity check.
filename = data_path + 'feature_matrix_reduced.npy'
feature_matrix_reduced = np.load(filename)
feature_matrix_reduced.shape

(10000, 1000)

In [5]:
# Goodreads ratings: load the CSV, coerce book_id to int, and count the
# distinct users it contains.
ratings_goodreads = pd.read_csv(data_path + 'ratings.csv').astype({'book_id': int})
users = list(ratings_goodreads['user_id'].unique())
len(users)

53424

In [7]:
# Amazon ratings: load the CSV and coerce book_id to int.
ratings_amazon = pd.read_csv('../../ratings_amazon.csv').astype({'book_id': int})

In [8]:
# Collect the users that have FEWER than 5 Amazon ratings (the ones to discard).
# user_counts is a boolean Series indexed by user_id: True when that user has
# at least 5 ratings.
user_counts = ratings_amazon['user_id'].value_counts() >= 5
# Select the failing user ids straight from the index in one vectorized step
# instead of looping over every user in Python.
to_drop = set(user_counts.index[~user_counts])

In [9]:
# Keep only the Amazon ratings whose user is not in to_drop, then count the
# users that remain.
is_dropped = ratings_amazon['user_id'].isin(to_drop)
ratings_amazon_reduced = ratings_amazon[~is_dropped]
users = list(ratings_amazon_reduced['user_id'].unique())
len(users)

53802

In [10]:
# Combine goodreads ratings with amazon ratings (the 10k books) into a single
# frame ordered by book_id. The input shapes are printed first so the row
# count of the result can be checked against their sum.
print(ratings_goodreads.shape)
print(ratings_amazon_reduced.shape)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and, like append's default, keeps each frame's original index.
df = pd.concat([ratings_goodreads, ratings_amazon_reduced]).sort_values(by=['book_id'])
df.shape

(5976479, 3)
(612249, 3)


(6588728, 3)

In [11]:
# Build the sparse rating matrix (one row per user, one column per book) from
# the combined ratings frame.
user_item = get_sparse(df)

In [12]:
# Flip to book-by-user orientation so that each row describes one book.
item_user = user_item.transpose()
item_user.shape

(10000, 107226)

In [13]:
# Reduce dimensions of item-user matrix: fit a 1000-component truncated SVD on
# it and keep the transformed rows (one 1000-dimensional vector per item).
# random_state is pinned to 42 so repeated runs use the same seed.
svd = TruncatedSVD(n_components=1000, n_iter=7, random_state=42)
item_to_concept = svd.fit_transform(item_user)

In [51]:
# Largest entry of the SVD-transformed item matrix (the fit_transform output).
item_to_concept_max_value = item_to_concept.max()
item_to_concept_max_value

487.06586324887763

In [52]:
# Scale the SVD-transformed item matrix by its largest entry.
# NOTE(review): the divisor is the signed maximum, not the maximum absolute
# value; any negative entry of larger magnitude would end up below -1.
# Confirm that is acceptable.
item_to_concept_normalized = np.divide(item_to_concept, item_to_concept_max_value)

In [53]:
# Largest entry of the loaded item-feature matrix.
feature_matrix_reduced_max_value = feature_matrix_reduced.max()
feature_matrix_reduced_max_value

1.4040590829752724

In [54]:
# Normalize item-feature matrix.
# NOTE(review): the divisor is the literal 1.0, so the values are left
# unchanged; feature_matrix_reduced_max_value is not used here. Confirm
# whether leaving the features unscaled is intentional.
feature_matrix_reduced_normalized = feature_matrix_reduced / 1.0

In [55]:
# Place the scaled rating concepts and the book features side by side, one row
# per book, and show the resulting shape.
item_matrix = np.concatenate(
    [item_to_concept_normalized, feature_matrix_reduced_normalized],
    axis=1,
)
item_matrix.shape

(10000, 2000)

In [56]:
# Save new combined item to feature and concept matrix as a .npy file under
# data_path. Note that this rebinds the module-level name `filename`.
filename = data_path + 'item_matrix.npy'
np.save(filename, item_matrix)