In [None]:
import os
import sys
import numpy as np
from numpy import genfromtxt
import pandas as pd
import scipy
import math
import random
from scipy import sparse
from collections import defaultdict

# Custom libraries
sys.path.append('../Util')
from loader import get_books, get_book_dataframe, get_book_features
from joiner import get_ratings, get_joint, load_amazon, load_goodreads
from reduction import reduce_matrix, get_sparse

In [None]:
def map_user_to_features(p, features):
    """Score every item for a user by projecting their ratings through the item features.

    Args:
        p: the user's ratings vector (one entry per item); it is converted to
            a sparse row before multiplying.
        features: item-by-feature matrix. It must support ``.T`` and yield a
            sparse product (e.g. a scipy sparse matrix), because
            ``.todense()`` is called on the result.

    Returns:
        The dense result of ``p * features * features.T``, transposed so that
        items run along the first axis.
    """
    ratings_row = scipy.sparse.csr_matrix(p)
    # user -> concept space: ratings * features
    concept_scores = ratings_row.dot(features)
    # concept space -> item space: (ratings * features) * features^T
    item_scores = concept_scores.dot(features.T)
    return item_scores.todense().T

In [None]:
def get_predictions(p, q, user_bias, item_bias, global_bias):
    """Predict this user's rating for every item.

    Each prediction is ``global_bias + user_bias + item_bias[i] + p . q[i]``.

    Args:
        p: the user's latent factor vector, length ``n_factors``.
        q: item factor matrix, one row of length ``n_factors`` per item.
        user_bias: scalar bias for this user.
        item_bias: per-item biases; entry ``i`` belongs to row ``i`` of ``q``.
        global_bias: scalar bias shared by every prediction.

    Returns:
        1-D float numpy array holding one predicted rating per row of ``q``.
    """
    q = np.asarray(q, dtype=float)
    if q.shape[0] == 0:
        # No items: nothing to predict.
        return np.zeros(0)
    # Only the first len(q) biases are ever used, exactly as in a per-item loop.
    item_bias = np.asarray(item_bias, dtype=float)[:q.shape[0]]
    # One matrix-vector product replaces a Python-level loop over every item.
    return global_bias + user_bias + item_bias + q.dot(np.asarray(p, dtype=float))

In [None]:
def get_top_n_recs(result, books, n, q):
    """Return the titles of the ``n`` highest-scoring books the user has not rated.

    Args:
        result: predicted score per book, indexed by book position.
        books: table of books; row ``i`` (positional, via ``.iloc``) must
            expose a ``'title'`` entry.
        n: maximum number of titles to return.
        q: the user's rating per book position; ``0`` marks a book the user
            has not rated. (Despite the name, this is the ratings vector, not
            an item-factor matrix.)

    Returns:
        List of at most ``n`` titles ordered from highest to lowest score.
        Fewer than ``n`` titles are returned when the user has fewer than
        ``n`` unrated books; already-rated books are never returned.
    """
    # Only unrated books are candidates. Dropping rated books here (instead of
    # scoring them -inf) keeps them out of the output even when n is large.
    candidates = [(i, result[i]) for i in range(len(result)) if q[i] == 0]
    # Stable sort, best score first.
    candidates.sort(key=lambda tup: tup[1], reverse=True)

    # Clamp n so asking for more titles than exist cannot raise IndexError.
    limit = max(0, min(n, len(candidates)))
    return [books.iloc[book_id]['title'] for book_id, _score in candidates[:limit]]

In [None]:
# Root directory of the dataset; it is passed to get_book_dataframe to load the
# books table. Set this to wherever you save and load all data.
data_path = '../../goodbooks-10k/'

In [None]:
# Load the books table from data_path. Later cells look rows up by position
# (books.iloc[i]) and read each row's 'title'.
books = get_book_dataframe(data_path)

In [None]:
# cu2rec components: the item factors (q) and the per-item biases are stored as
# comma-separated CSV files that share one filename prefix.
filename = '../.tmp/goodbooks_sorted_f300'
q_csv = '{}_q.csv'.format(filename)
item_bias_csv = '{}_item_bias.csv'.format(filename)
q = np.genfromtxt(q_csv, delimiter=',')
item_bias = np.genfromtxt(item_bias_csv, delimiter=',')


# surprise components (alternative source for q / item_bias, kept for reference)
# filename = '../.tmp/svd_100_300.npy'
# q = np.load(filename)
# filename = '../.tmp/Q_300.npy'
# q = np.load(filename)
# filename = '../.tmp/item_bias_300.npy'
# item_bias = np.load(filename)

In [None]:
# Global bias term added to every prediction. Hard-coded as a float: take this
# value from whatever dataset the factors were trained on (presumably that
# dataset's mean rating - TODO confirm).
global_bias = 3.919866

In [None]:
# Ratings of one Goodreads user, cached on disk as a sparse matrix.
# Swap the active line to try a different cached user:
# sparse_new_user_scaled = scipy.sparse.load_npz('../.tmp/cached_users/user_likes_mystery_scifi_hates_fantasy.npz')
# sparse_new_user_scaled = scipy.sparse.load_npz('../.tmp/cached_users/user_likes_fantasy.npz')
sparse_new_user_scaled = scipy.sparse.load_npz('../.tmp/cached_users/user_nickgreenquist.npz')

# Densify, then keep only the first row as a flat array.
new_user_ratings_scaled = np.array(sparse_new_user_scaled.toarray()[0].tolist())

# Independent copy, so the scaled ratings stay intact when this one is overwritten.
new_user_ratings = new_user_ratings_scaled.copy()

In [None]:
# Undo the rating mapping that is usually applied to user ratings.

# Forward mapping: 1-5 star scale -> negative/positive scale
# (0 means "not rated" on both scales).
ratings_mapper_forward = {0: 0, 1: -2, 2: -1, 3: 1, 4: 2, 5: 3}

# Derive the inverse from the forward table instead of typing it by hand, so
# the two can never disagree: -2 -> 1 star and -1 -> 2 stars. A hand-written
# inverse that maps them to -1 / -2 would hand negative "star" targets to SGD.
ratings_mapper = {scaled: stars for stars, scaled in ratings_mapper_forward.items()}

# Map every entry of the user's vector (not just the first len(q) entries) and
# keep the dtype of the scaled ratings.
new_user_ratings = np.array(
    [ratings_mapper[scaled] for scaled in new_user_ratings_scaled],
    dtype=new_user_ratings_scaled.dtype,
)
new_user_ratings

In [None]:
# Positions of the books this user has actually rated (non-zero rating)
indices = [book_idx for book_idx, rating in enumerate(new_user_ratings) if rating != 0]
len(indices)

In [None]:
# Hyperparams for fitting the new user's factors and bias
# step size of each SGD update
learning_rate = 0.07
# regularization weight used in the user-bias update
user_bias_reg = 0.002
# regularization weight used in each user-factor (P) update
P_reg = 0.002

# total number of SGD updates: 20 per rated book
iterations = len(indices) * 20

# Recompute and print the total loss every this-many iterations. With
# float('inf') the modulo check in the SGD loop is only true at iteration 0, so
# the loss is printed just for the first and last iteration - remove in webapp!
calculate_total_loss = float('inf')

# number of latent factors and number of items, read from the shape of q
n_factors = q.shape[1]
cols = q.shape[0]

In [None]:
# 1. Start this user's bias at zero; the SGD loop updates it on every iteration
new_user_bias = 0

In [None]:
# 2. Draw a fresh random factor vector P for the new user: one Gaussian sample
#    per latent factor, with the standard deviation divided by the factor count
mu, sigma = 0, 0.1
p = np.random.normal(loc=mu, scale=sigma / n_factors, size=n_factors)

In [None]:
# 3. Run a small number of SGD iterations to fit this user's factors and bias
for iteration in range(iterations):

    # Periodically compute the RMSE over all of the user's rated books and print it
    if iteration == 0 or iteration == iterations - 1 or iteration % calculate_total_loss == 0:
        rated_preds = global_bias + new_user_bias + item_bias[indices] + q[indices].dot(p)
        rated_errors = new_user_ratings[indices] - rated_preds
        total_loss = float(np.sum(np.square(rated_errors)))

        rmse = math.sqrt(total_loss / len(indices))
        print("RMSE at Iteration {}: {}".format(iteration, rmse))

    # Stochastic step: sample one rated book uniformly at random.
    # (For full gradient descent, loop over every i in indices instead.)
    i = random.choice(indices)

    # Prediction error on the sampled book
    rating = new_user_ratings[i]
    pred = global_bias + new_user_bias + item_bias[i] + np.dot(p, q[i])
    error = rating - pred

    # Update P: every factor at once. Each element gets the same arithmetic as
    # a per-factor loop (error is fixed before any factor changes), without
    # n_factors Python-level iterations per step.
    p_update = learning_rate * (error * q[i] - P_reg * p)
    p += p_update

    # Update user bias
    ub_update = learning_rate * (error - user_bias_reg * new_user_bias)
    new_user_bias += ub_update


In [None]:
# Predict a rating for every book from the partially fitted user factors (p)
# and user bias, combined with the item factors and biases
predictions_partial_fit = get_predictions(p, q, new_user_bias, item_bias, global_bias)

In [None]:
# Print the top 25 titles using just the partial-fit predictions. Passing
# new_user_ratings lets get_top_n_recs keep already-rated books out of the top
# of the list.
recs_partial_fit = get_top_n_recs(predictions_partial_fit, books, 25, new_user_ratings)
for rec in recs_partial_fit:
    print(rec)

In [None]:
'''

Combine recs from partial fit with recs from mapping to feature matrix using log_rank

'''

In [None]:
# Build the book feature matrix from the books table; it is used to score
# books through map_user_to_features
feature_matrix = get_book_features(books)

In [None]:
# Score every book by projecting the user's ratings into feature space and back.
# NOTE: map_user_to_features returns a dense, transposed matrix (one row per
# book), not a flat 1-D array.
predictions_features = map_user_to_features(new_user_ratings, feature_matrix)

In [None]:
'''
Log Ranking
'''

In [None]:
# Pair every book position with its score under each method, then order each
# list from the highest score to the lowest.
partial_fit_ratings = [
    (book_idx, predictions_partial_fit[book_idx]) for book_idx in range(len(books))
]
feature_ratings = [
    (book_idx, predictions_features[book_idx]) for book_idx in range(len(books))
]

partial_fit_ratings.sort(key=lambda pair: pair[1], reverse=True)
feature_ratings.sort(key=lambda pair: pair[1], reverse=True)

In [None]:
# Map each book position to the natural log of its 1-based rank under each method
id_to_rank_partial_fit = {
    partial_fit_ratings[position][0]: math.log(position + 1)
    for position in range(len(books))
}
id_to_rank_features = {
    feature_ratings[position][0]: math.log(position + 1)
    for position in range(len(books))
}

In [None]:
# Weight given to the feature-based log-rank; the partial-fit log-rank gets the rest
weight_feature = 0.5

# Blend the two log-ranks (lower is better) for every book the user has not rated
rankings = [
    (
        weight_feature * id_to_rank_features[book_idx]
        + (1.0 - weight_feature) * id_to_rank_partial_fit[book_idx],
        book_idx,
    )
    for book_idx in range(len(books))
    if new_user_ratings[book_idx] == 0
]
rankings.sort(key=lambda entry: entry[0])
print(len(rankings))

In [None]:
# Collect the best-ranked books (at most 100), tagging each with its 1-based rank
top_books = []
# Clamp to the number of ranked books so a user with fewer than 100 unrated
# books does not raise IndexError.
for i in range(min(100, len(rankings))):
    book_id = rankings[i][1]
    # .iloc is positional, so book_id is used directly as the row position.
    # Copy the row so adding 'rank' never writes into (or warns about) books.
    book = books.iloc[book_id].copy()
    book['rank'] = i + 1
    top_books.append(book)

In [None]:
# Print the titles of the final blended recommendations, best rank first
for book in top_books:
    print(book['title'])