In [1]:
import os
import sys
import numpy as np
import pandas as pd
import scipy
import math
import random
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.decomposition import TruncatedSVD
from surprise import Reader, Dataset, SVD, evaluate, dump, accuracy
from collections import defaultdict

# Custom libraries
sys.path.append('../Util')
from loader import get_books, get_book_dataframe, get_book_features
from joiner import get_ratings, get_joint, load_amazon, load_goodreads
from reduction import reduce_matrix, get_sparse

In [2]:
def get_predictions(user_to_concept, V, user_bias):
    pred_ratings = np.zeros(len(V))
    for i in range(len(V)):
        pred = new_user_bias + np.dot(new_user_P, qi[i])
        pred_ratings[i] = pred
    return pred_ratings

In [3]:
def get_top_n_recs(result, books, n, q):
    recs = []
    for i in range(len(result)):
        if q[i] == 0: # book user hasn't already rated
            recs.append((i, result[i]))
        else:
            recs.append((i, float('-inf'))) 
            # recs.append((i, result[i])) #leave this to verify things actually working
    recs = sorted(recs, key=lambda tup: tup[1], reverse=True)

    top_titles = []
    for i in range(n):
        book_id = recs[i][0]
        title = books.iloc[book_id]['title']
        top_titles.append(title)
    return top_titles

In [4]:
# Set this to where you save and load all data
data_path = '../../goodbooks-10k/'

In [5]:
# Get dataframe from books
books = get_book_dataframe(data_path)

found books_dataframe in file...


In [6]:
filename = '../.tmp/svd_100_300.npy'
qi = np.load(filename)

In [7]:
qi.shape

(10000, 300)

In [8]:
# user from goodreads
sparse_q = scipy.sparse.load_npz('../.tmp/cached_users/user_nickgreenquist.npz')
q = sparse_q.toarray()
q = np.array(q[0].tolist())
new_user = np.copy(q)
new_user

array([3, 3, 0, ..., 0, 0, 0])

In [9]:
# create array of indices of books this user has actually rated
indices = []
for i in range(len(new_user)):
    if new_user[i] != 0:
        indices.append(i)

In [46]:
# partial fit a new P and user_bias using trained Q and item_bias
learning_rate = 0.01
user_bias_reg = 0.1
P_reg = 0.01
iterations = 3000

# TODO: figure out real golab bias, for now let's say 4.0
global_bias = 4.0

# TODO: set n_factors using input
n_factors = 300

# TODO: set cols using input
cols = 10000

# TODO: save item biases after training with Surprise
item_bias = np.full(10000, 4.0);

In [47]:
# 1. get the user_bias for this user
new_user_bias = np.mean(new_user) - global_bias
new_user_bias

-3.9645

In [48]:
# 2. set up new random P
mu, sigma = 0, 0.1
new_user_P = np.random.normal(mu, sigma, n_factors)

In [49]:
# 3. computer small number of iterations of SGD
for iteration in range(iterations):
    # 3.1 calculate loss with current components
    errors = np.zeros(shape=(cols), dtype=np.float)
    for i in indices:
        rating = new_user[i]
        # pred = global_bias + new_user_bias + item_bias[i] + np.dot(new_user_P, qi[i])
        pred = new_user_bias + np.dot(new_user_P, qi[i])
        errors[i] = rating - pred

    # 3.2 periodically calculate total loss and output
    if iteration % (iterations / 10) == 0:
        total_loss = 0.0
        for j in indices:
            total_loss += pow(errors[j], 2)
        print("Loss at Iteration {}: {}".format(iteration, total_loss))

    # 3.3 run single SGD iteration
    new_user_P_target = np.copy(new_user_P)
    new_user_bias_target = new_user_bias
    
    # pick random book user has rated
    i = random.choice(indices)
    
    # update P
    for f in range(n_factors):
        p_update = learning_rate * (errors[i] * qi[i][f] - P_reg * new_user_P[f])
        new_user_P_target[f] += p_update

    # update user bias
    ub_update = learning_rate * (errors[i] - user_bias_reg * new_user_bias)
    new_user_bias_target += ub_update
    
    # 3.4 copy updated components back to original
    new_user_P = np.copy(new_user_P_target)
    new_user_bias = new_user_bias_target

Loss at Iteration 0: 6955.073276941169
Loss at Iteration 300: 241.07787923746454
Loss at Iteration 600: 213.50249897059186
Loss at Iteration 900: 203.1392873652598
Loss at Iteration 1200: 176.36352832644877
Loss at Iteration 1500: 165.50753391571743
Loss at Iteration 1800: 154.49330916410025
Loss at Iteration 2100: 153.50912696578317
Loss at Iteration 2400: 139.98865029371441
Loss at Iteration 2700: 128.27372118446223


In [50]:
recs = get_top_n_recs(get_predictions(new_user_P, qi, new_user_bias), books, 25, new_user)
for r in recs:
    print(r)

The Drunkard's Walk: How Randomness Rules Our Lives
Herzog
The Handmaid's Tale
Crash
How the Light Gets In (Chief Inspector Armand Gamache, #9)
Bad Monkey
Hogwarts: An Incomplete and Unreliable Guide (Pottermore Presents, #3)
And Then There Were None
The Next Always (Inn BoonsBoro, #1)
If You Could See Me Now
The Work of Art in the Age of Its Technological Reproducibility, and Other Writings on Media
Betrayal in Death (In Death, #12)
The Structure of Scientific Revolutions
The Hundred Thousand Kingdoms (Inheritance Trilogy, #1)
Gemina (The Illuminae Files, #2)
Mastering the Art of French Cooking
Until I Die (Revenants, #2)
ظل الأفعى
Frigid (Frigid, #1)
Breaking Night: A Memoir of Forgiveness, Survival, and My Journey from Homeless to Harvard
The One (The Selection, #3)
Clear and Present Danger (Jack Ryan Universe, #6)
Villette
Life Together: The Classic Exploration of Christian Community
Children of God (The Sparrow, #2)
