In [1]:
import numpy as np
import pandas as pd
import scipy
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.utils.extmath import randomized_svd

In [206]:
# Base directory for every file this notebook reads and writes under the
# goodbooks-10k folder; set this to where you save and load all data.
# Keep the trailing slash: file names are appended with plain string
# concatenation (data_path + 'name').
data_path = '../../goodbooks-10k/'

In [53]:
def get_sparse(ratings):
    """Build a sparse user-by-book rating matrix from a long-format table.

    Parameters
    ----------
    ratings : pandas.DataFrame
        One row per rating, with ``user_id``, ``book_id`` and ``rating``
        columns.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of dtype uint8 with one row per distinct ``user_id`` and one
        column per distinct ``book_id``. Rows and columns are numbered in
        order of first appearance in ``ratings`` (not sorted), so the caller
        controls the layout by ordering the frame beforehand.

    Notes
    -----
    If the same (user, book) pair occurs more than once, the coordinate to
    CSR conversion adds the ratings together.
    """
    # pd.factorize yields integer codes in order of first appearance, which is
    # the same numbering the old ``astype('category', categories=...)`` call
    # produced; that call signature is rejected by current pandas.
    row, users = pd.factorize(ratings['user_id'])
    col, books = pd.factorize(ratings['book_id'])
    data = ratings['rating'].tolist()
    sparse_matrix = csr_matrix(
        (data, (row, col)),
        shape=(len(users), len(books)),
        dtype=np.dtype('u1'),
    )
    return sparse_matrix

In [207]:
# Load in item feature matrix (description, tags, and shelves).
# The array is read from data_path + 'feature_matrix_reduced.npy'; the recorded
# run displays shape (10000, 1000).
# NOTE(review): presumably one row per book and one column per reduced
# feature -- confirm against the notebook that produced the file.
filename = data_path + 'feature_matrix_reduced.npy'
feature_matrix_reduced = np.load(filename)
feature_matrix_reduced.shape

(10000, 1000)

In [208]:
# Load the Goodreads ratings table, forcing book ids to integers, then
# display how many distinct users it contains.
ratings_goodreads = pd.read_csv(data_path + 'ratings.csv').astype({'book_id': int})
users = list(ratings_goodreads['user_id'].unique())
len(users)

53424

In [7]:
# Load the Amazon ratings table, forcing book ids to integers.
ratings_amazon = pd.read_csv('../../ratings_amazon.csv').astype({'book_id': int})

In [8]:
# Flag each Amazon user by whether they have at least 5 ratings, then collect
# the users that do NOT reach that threshold so they can be removed.
user_counts = ratings_amazon['user_id'].value_counts() >= 5
to_drop = {user for user, has_enough in user_counts.items() if not has_enough}

In [10]:
# Keep only the Amazon ratings whose user is not in to_drop (i.e. users with
# 5 or more ratings), then display how many distinct users remain.
ratings_amazon_reduced = ratings_amazon.loc[~ratings_amazon['user_id'].isin(to_drop)]
users = list(ratings_amazon_reduced['user_id'].unique())
len(users)

53802

In [42]:
# Combine goodreads ratings with amazon ratings (the 10k books).
print(ratings_goodreads.shape)
print(ratings_amazon_reduced.shape)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# row-wise and keeps each frame's original index labels, exactly as append did.
df = pd.concat([ratings_goodreads, ratings_amazon_reduced])
# Sort by book id so that get_sparse, which numbers columns in order of first
# appearance, lays the books out in ascending book_id order.
# NOTE(review): user ids from the two sources share one column; if the id
# spaces overlap, distinct people would be merged into one row -- confirm.
df = df.sort_values(by=['book_id'])
df.shape

(5976479, 3)
(612249, 3)


(6588728, 3)

In [55]:
# Build the sparse user-by-book rating matrix from the combined ratings frame.
user_item = get_sparse(df)

In [211]:
# Reduce dimensions of user-item matrix with a 100-component truncated SVD
# (fixed random_state for repeatable results).
# Note: fit() returns the fitted estimator itself, so `user_item_reduced` is
# the TruncatedSVD object (same object as `svd`), not a transformed matrix.
# Later cells read its singular_values_ and components_ attributes.
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
user_item_reduced = svd.fit(user_item)

In [212]:
# Show the first five singular values of the fitted decomposition.
user_item_reduced.singular_values_[0:5]

array([ 2711.85778101,  1374.63824031,  1286.21427593,  1050.21715457,
         942.50510365])

In [213]:
# Get the item to concept matrix using the transpose of V from SVD.
# components_ holds one row per component and one column per book, so the
# transpose has one row per book and one column per latent concept.
item_to_concept = user_item_reduced.components_.T

In [169]:
# Good old fashioned SVD
# U, Sigma, VT = randomized_svd(user_item, 
#                               n_components=100,
#                               n_iter=5,
#                               random_state=42)

In [None]:
# Scale Sigma from 0-1
# max_value = np.amax(Sigma) 
# Sigma_scaled = Sigma / max_value
# Sigma_scaled[0:10]
# Sigma[0:5]

In [200]:
# Scale VT by concept strengths
# VT_weighted_concept_strength = np.matmul(np.diag(Sigma_scaled), VT)
# item_to_concept = VT_weighted_concept_strength.T
# item_to_concept = VT.T
# item_to_concept.shape

In [214]:
# Horizontally combine book features and book concepts: concept columns come
# first, followed by the content-feature columns.
# NOTE(review): hstack pairs rows purely by position, so row i of
# item_to_concept and row i of feature_matrix_reduced must describe the same
# book -- confirm the feature matrix is ordered the same way as the
# user-item columns.
item_matrix = np.hstack((item_to_concept, feature_matrix_reduced))
item_matrix.shape

(10000, 1100)

In [215]:
# Save new combined item to feature and concept matrix as
# data_path + 'item_matrix.npy' (an existing file at that path is overwritten).
filename = data_path + 'item_matrix.npy'
np.save(filename, item_matrix)