In [1]:
import sys
import numpy as np
import pandas as pd
import scipy
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.utils.extmath import randomized_svd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from surprise import Reader, Dataset, SVD, evaluate, dump, accuracy

# Custom libraries
sys.path.append('../Util')
from reduction import get_sparse
from loader import get_book_dataframe, get_book_features
from joiner import get_ratings, get_joint
from reduction import reduce_matrix, get_sparse

In [2]:
# Root directory of the dataset; change it to wherever the data lives on
# your machine. It is handed to the book loader in the next cell.
# Note that the intermediate files written later in this notebook go to
# '../.tmp/' rather than to this directory.
data_path = '../../goodbooks-10k/'

In [3]:
# Load the book metadata through the project's `loader` helper.
# The recorded output ("found books_dataframe in file...") suggests the helper
# reuses a previously saved dataframe when one exists -- presumably under
# `data_path`; confirm in the loader module.
# Later cells read the 'description', 'popular_shelves' and 'tags' columns.
books = get_book_dataframe(data_path)

found books_dataframe in file...


In [4]:
# TF-IDF vectorizer using scikit-learn's built-in English stop-word list.
# NOTE: this single instance is re-fit by each `fit_transform` call in the
# following cells, so once the notebook has run, its vocabulary reflects only
# the last text column that was fitted.
tfidf = TfidfVectorizer(stop_words='english')

In [5]:
# Fit the vectorizer on the book descriptions and build their TF-IDF matrix
# (one row per book, one column per vocabulary term).
tfidf_matrix_description = tfidf.fit_transform(books['description'])
# Show (rows, columns) as a quick sanity check.
tfidf_matrix_description.shape

(10000, 59713)

In [6]:
# Re-fit the same vectorizer on the 'popular_shelves' text; this replaces the
# vocabulary learned from the descriptions.
tfidf_matrix_shelves = tfidf.fit_transform(books['popular_shelves'])
# Show (rows, columns) as a quick sanity check.
tfidf_matrix_shelves.shape

(10000, 11245)

In [7]:
# Re-fit the same vectorizer on the 'tags' text; this replaces the vocabulary
# learned from the shelves.
# NOTE(review): the recorded shape for this matrix is identical to the shelves
# matrix, (10000, 11245) -- confirm that 'tags' and 'popular_shelves' do not
# carry the same text, otherwise these features are counted twice.
tfidf_matrix_tags = tfidf.fit_transform(books['tags'])
# Show (rows, columns) as a quick sanity check.
tfidf_matrix_tags.shape

(10000, 11245)

In [8]:
# Scale factors for the two smaller vocabularies: the description matrix's
# column count divided by each matrix's own column count.
# float() forces true division: the shape entries are plain ints, so on a
# Python 2 kernel a bare `/` would truncate the ratio to a whole number.
# On Python 3 the result is unchanged.
shelves_weight = float(tfidf_matrix_description.shape[1]) / tfidf_matrix_shelves.shape[1]
tags_weight = float(tfidf_matrix_description.shape[1]) / tfidf_matrix_tags.shape[1]

In [9]:
# Scale every entry of the shelves and tags matrices by its weight.
# NOTE: each name is rebound to its own scaled result, so re-running this cell
# without first re-running the corresponding fit_transform cells applies the
# weight a second time.
tfidf_matrix_shelves = tfidf_matrix_shelves.multiply(shelves_weight)
tfidf_matrix_tags = tfidf_matrix_tags.multiply(tags_weight)

In [10]:
# Stack the three TF-IDF blocks side by side into one sparse feature matrix:
# description columns first, then shelves, then tags (rows stay one per book).
feature_matrix = scipy.sparse.hstack([tfidf_matrix_description, tfidf_matrix_shelves, tfidf_matrix_tags])

print('printing feature_matrix to file')
# save_npz appends '.npz' when the name lacks it, so the file written is
# '../.tmp/feature_matrix.npz'. Nothing in this cell creates '../.tmp', so the
# directory is assumed to exist already.
scipy.sparse.save_npz('../.tmp/feature_matrix', feature_matrix)

printing feature_matrix to file


In [43]:
# Decompose the combined feature matrix with 3000 components through the
# project's `reduce_matrix` helper; E is the baseline for the sum of squares
# computed in the next cell.
# NOTE(review): 3000 components is itself a truncation rather than the full
# spectrum, so the "total" derived from E is presumably a lower bound on the
# true total -- confirm against reduce_matrix before quoting the ratio.
U, E, V = reduce_matrix(feature_matrix, n_components=3000)

In [44]:
# Sum of the squared entries of E from the 3000-component decomposition
# (presumably singular values, whose squares are the eigenvalues the name
# refers to -- confirm against reduce_matrix).
# Vectorized with NumPy instead of accumulating in a Python-level loop.
total_eigen_values = np.sum(np.square(E))
total_eigen_values

567667.43158802704

In [25]:
# Reduce the same feature matrix to 300 components. `features_U` is the reduced
# representation that the last cell saves to disk; `E_reduced` feeds the
# sum of squares below; the third return value is not needed here.
features_U, E_reduced, _ = reduce_matrix(feature_matrix, n_components=300)

In [45]:
# Sum of the squared entries of E_reduced from the 300-component decomposition
# (presumably singular values, whose squares are the eigenvalues the name
# refers to -- confirm against reduce_matrix).
# Vectorized with NumPy instead of accumulating in a Python-level loop.
reduced_eigen_values = np.sum(np.square(E_reduced))
reduced_eigen_values

509514.40645793889

In [46]:
# Share of the 3000-component sum of squares that the 300-component reduction
# keeps. This ratio is the fraction RETAINED, not the fraction lost.
# (The denominator comes from a truncated 3000-component decomposition, so it
# is relative to that baseline rather than to the full spectrum.)
information_retained = reduced_eigen_values / total_eigen_values
# The loss is the complement of what is retained.
information_loss = 1 - information_retained
# Display the retained share.
information_retained

0.89755793287734087

In [29]:
# Save the reduced feature matrix so other notebooks/scripts can load it
# instead of recomputing the decomposition.
# NOTE(review): the file name says 1000, but `features_U` comes from the
# n_components=300 reduction in this notebook. Confirm which is intended
# before renaming, since other code may load this exact path.
filename = '../.tmp/feature_matrix_1000.npy'
np.save(filename, features_U)