In [1]:
import os
import sys
import numpy as np
import pandas as pd
import scipy
import math
import random
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.decomposition import TruncatedSVD
from surprise import Reader, Dataset, SVD, evaluate, dump, accuracy
from collections import defaultdict

# Custom libraries
sys.path.append('../Util')
from loader import get_books, get_book_dataframe, get_book_features
from joiner import get_ratings, get_joint, load_amazon, load_goodreads
from reduction import reduce_matrix, get_sparse

In [2]:
def get_predictions(user_to_concept, V, user_bias, global_bias):
    pred_ratings = np.zeros(len(V))
    for i in range(len(V)):
        pred = global_bias + new_user_bias + np.dot(new_user_P, qi[i])
        pred_ratings[i] = pred
    return pred_ratings

In [3]:
def get_top_n_recs(result, books, n, q):
    recs = []
    for i in range(len(result)):
        if q[i] == 0: # book user hasn't already rated
            recs.append((i, result[i]))
        else:
            recs.append((i, float('-inf'))) 
            # recs.append((i, result[i])) #leave this to verify things actually working
    recs = sorted(recs, key=lambda tup: tup[1], reverse=True)

    top_titles = []
    for i in range(n):
        book_id = recs[i][0]
        title = books.iloc[book_id]['title']
        top_titles.append(title)
    return top_titles

In [4]:
# Set this to where you save and load all data
data_path = '../../goodbooks-10k/'

In [5]:
# Get dataframe from books
books = get_book_dataframe(data_path)

found books_dataframe in file...


In [6]:
filename = '../.tmp/svd_100_300.npy'
qi = np.load(filename)

In [7]:
qi.shape

(10000, 300)

In [55]:
# user from goodreads
# sparse_q = scipy.sparse.load_npz('../.tmp/cached_users/user_likes_mystery_scifi_hates_fantasy.npz')
# sparse_q = scipy.sparse.load_npz('../.tmp/cached_users/user_likes_fantasy.npz')
sparse_q = scipy.sparse.load_npz('../.tmp/cached_users/user_nickgreenquist.npz')
q = sparse_q.toarray()
q = np.array(q[0].tolist())
new_user = np.copy(q)

In [56]:
# undo the rating mapping we usually do

# Turn 1-5 rating scale into negative - positive scale
# original mapper: ratings_mapper = {0:0, 1:-2, 2:-1, 3:1, 4:2, 5:3}
ratings_mapper = {0:0, -2:-1, -1:-2, 1:3, 2:4, 3:5}
for i in range(len(q)):
    new_user[i] = ratings_mapper[new_user[i]]
new_user

array([5, 5, 0, ..., 0, 0, 0])

In [57]:
# create array of indices of books this user has actually rated
indices = []
for i in range(len(new_user)):
    if new_user[i] != 0:
        indices.append(i)

In [58]:
# Hyperparams
learning_rate = 0.01
user_bias_reg = 0.1
P_reg = 0.001
global_bias = 3.946136

# 50 updates per rated book
iterations = len(indices) * 25

 # see total loss 10 times
show_total_loss = iterations / 10

n_factors = qi.shape[1]
cols = qi.shape[0]

# TODO: save item biases after training with Surprise
item_bias = np.full(10000, 4.0);

In [59]:
# 1. get the user_bias for this user
new_user_bias = 0

In [60]:
# 2. set up new random P
mu, sigma = 0, 0.1
new_user_P = np.random.normal(mu, (sigma / n_factors), n_factors)

In [61]:
# 3. computer small number of iterations of SGD
for iteration in range(iterations):
    
    #= periodically calculate total loss and output
    if iteration == 0 or iteration == iterations - 1:
        total_loss = 0.0
        for i in indices:
            rating = new_user[i]
            pred = global_bias + new_user_bias + np.dot(new_user_P, qi[i])
            error = rating - pred
            total_loss += pow(error, 2)
        print("Loss at Iteration {}: {}".format(iteration, total_loss))

    # run single SGD iteration
    i = random.choice(indices)
    
    # calculate loss on random item
    rating = new_user[i]
    pred = global_bias + new_user_bias + np.dot(new_user_P, qi[i])
    error = rating - pred

    # update P
    for f in range(n_factors):
        p_update = learning_rate * (error * qi[i][f] - P_reg * new_user_P[f])
        new_user_P[f] += p_update

    # update user bias
    ub_update = learning_rate * (error - user_bias_reg * new_user_bias)
    new_user_bias += ub_update


Loss at Iteration 0: 633.5371021034617
Loss at Iteration 5049: 194.445806435055


In [62]:
recs = get_top_n_recs(get_predictions(new_user_P, qi, new_user_bias, global_bias), books, 25, new_user)
for r in recs:
    print(r)

The Drunkard's Walk: How Randomness Rules Our Lives
Betrayal in Death (In Death, #12)
Here's to Us
Hogwarts: An Incomplete and Unreliable Guide (Pottermore Presents, #3)
Life Together: The Classic Exploration of Christian Community
Wool (Wool, #1)
The Amateur Marriage
The Work of Art in the Age of Its Technological Reproducibility, and Other Writings on Media
Bad Monkey
Blood Memory
The Hundred Thousand Kingdoms (Inheritance Trilogy, #1)
How the Light Gets In (Chief Inspector Armand Gamache, #9)
Herzog
Crash
ظل الأفعى
The Handmaid's Tale
Native Son
Breaking Night: A Memoir of Forgiveness, Survival, and My Journey from Homeless to Harvard
MacRieve (Immortals After Dark, #14)
13 Gifts (Willow Falls, #3)
Suzanne's Diary for Nicholas
The Casual Vacancy
The Third Wheel (Diary of a Wimpy Kid, #7)
If You Could See Me Now
The Marvels
