In [1]:
import os
import sys
import numpy as np
from numpy import genfromtxt
import pandas as pd
import scipy
import math
import random
from scipy import sparse
from collections import defaultdict

# Custom libraries
sys.path.append('../Util')
from loader import get_books, get_book_dataframe, get_book_features
from joiner import get_ratings, get_joint, load_amazon, load_goodreads
from reduction import reduce_matrix, get_sparse

In [2]:
def map_user_to_features(p, features):
    p_sparse = scipy.sparse.csr_matrix(p)
    # map new user to concept space by p*features
    user_to_concept = p_sparse.dot(features)
    # map user back to itme space with user_to_concept * featuresT
    result = user_to_concept.dot(features.T).todense()
    return result.T

In [3]:
def get_predictions(p, q, user_bias, item_bias, global_bias):
    pred_ratings = np.zeros(len(q))
    for i in range(len(q)):
        pred = global_bias + user_bias + item_bias[i] + np.dot(p, q[i])
        # pred = global_bias + user_bias + np.dot(p, q[i])
        pred_ratings[i] = pred
    return pred_ratings

In [4]:
def get_top_n_recs(result, books, n, q):
    recs = []
    for i in range(len(result)):
        if q[i] == 0: # book user hasn't already rated
            recs.append((i, result[i]))
        else:
            recs.append((i, float('-inf'))) 
            # recs.append((i, result[i])) #leave this to verify things actually working
    recs = sorted(recs, key=lambda tup: tup[1], reverse=True)

    top_titles = []
    for i in range(n):
        book_id = recs[i][0]
        title = books.iloc[book_id]['title']
        top_titles.append(title)
    return top_titles

In [5]:
# Set this to where you save and load all data
data_path = '../../goodbooks-10k/'

In [6]:
# Get dataframe from books
books = get_book_dataframe(data_path)

found books_dataframe in file...


In [7]:
# cu2rec components
filename = '../.tmp/goodbooks_sorted_f300'
q = genfromtxt('{}_q.csv'.format(filename), delimiter=',')
item_bias = genfromtxt('{}_item_bias.csv'.format(filename), delimiter=',')


# surprise components
# filename = '../.tmp/svd_100_300.npy'
# q = np.load(filename)
# filename = '../.tmp/Q_300.npy'
# q = np.load(filename)
# filename = '../.tmp/item_bias_300.npy'
# item_bias = np.load(filename)

In [8]:
# convert global bias to float - get from whatever dataset you used
global_bias = 3.919866

In [84]:
# user from goodreads
# sparse_new_user_scaled = scipy.sparse.load_npz('../.tmp/cached_users/user_likes_mystery_scifi_hates_fantasy.npz')
# sparse_new_user_scaled = scipy.sparse.load_npz('../.tmp/cached_users/user_likes_fantasy.npz')
sparse_new_user_scaled = scipy.sparse.load_npz('../.tmp/cached_users/user_nickgreenquist.npz')
new_user_ratings_scaled = sparse_new_user_scaled.toarray()
new_user_ratings_scaled = np.array(new_user_ratings_scaled[0].tolist())
new_user_ratings = np.copy(new_user_ratings_scaled)

In [101]:
# undo the rating mapping we usually do

# Turn 1-5 rating scale into negative - positive scale
# original mapper: ratings_mapper = {0:0, 1:-2, 2:-1, 3:1, 4:2, 5:3}
ratings_mapper = {0:0, -2:-1, -1:-2, 1:3, 2:4, 3:5}
for i in range(len(q)):
    new_user_ratings[i] = ratings_mapper[new_user_ratings_scaled[i]]
new_user_ratings

array([5, 5, 0, ..., 0, 0, 0])

In [102]:
# create array of indices of books this user has actually rated
indices = []
for i in range(len(new_user_ratings)):
    if new_user_ratings[i] != 0:
        indices.append(i)
len(indices)

202

In [103]:
# Hyperparams
learning_rate = 0.07
user_bias_reg = 0.002
P_reg = 0.0002

# updates per rating
iterations = 25

 # how many iterations to see the total loss at this step - remove in webapp!
calculate_total_loss = float('inf')

n_factors = q.shape[1]
cols = q.shape[0]

In [104]:
# 1. set the user_bias for this user
new_user_bias = 0

In [105]:
# 2. set up new random P
mu, sigma = 0, 0.1
p = np.random.normal(mu, (sigma / n_factors), n_factors)

In [106]:
%%time
# 3. computer small number of iterations of SGD
for iteration in range(iterations):
    
    #= periodically calculate total loss and output
    if iteration == 0 or iteration == iterations - 1 or iteration % calculate_total_loss == 0:
        total_loss = 0.0
        for i in indices:
            rating = new_user_ratings[i]
            pred = global_bias + new_user_bias + item_bias[i] + np.dot(p, q[i])
            # pred = global_bias + new_user_bias + np.dot(p, q[i])
            error = rating - pred
            total_loss += pow(error, 2)
            
        rmse = math.sqrt(total_loss / len(indices))
        print("RMSE at Iteration {}: {}".format(iteration, rmse))

    # i = random.choice(indices) # SGD
    for i in indices: # Gradient Descent using every book per iteration
    
        # calculate loss on random item
        rating = new_user_ratings[i]
        pred = global_bias + new_user_bias + item_bias[i] + np.dot(p, q[i])
        # pred = global_bias + new_user_bias + np.dot(p, q[i])
        error = rating - pred

        # update P
        for f in range(n_factors):
            p_update = learning_rate * (error * q[i][f] - P_reg * p[f])
            p[f] += p_update

        # update user bias
        ub_update = learning_rate * (error - user_bias_reg * new_user_bias)
        new_user_bias += ub_update


RMSE at Iteration 0: 1.7627478538334462
RMSE at Iteration 24: 0.3261305490339041
Wall time: 1.69 s


In [107]:
# get predictions using partial fit
predictions_partial_fit = get_predictions(p, q, new_user_bias, item_bias, global_bias)

In [108]:
# print out top results using just partial fit predictions
recs_partial_fit = get_top_n_recs(predictions_partial_fit, books, 25, new_user_ratings)
for rec in recs_partial_fit:
    print(rec)

The School of Essential Ingredients
The Count of Monte Cristo
Gabriel's Inferno (Gabriel's Inferno, #1)
You Can Heal Your Life
Your Inner Fish: A Journey into the 3.5-Billion-Year History of the Human Body
Someday, Someday, Maybe
Ahab's Wife, or The Star-Gazer
American Pastoral (The American Trilogy, #1)
West with the Night
These Is My Words: The Diary of Sarah Agnes Prine, 1881-1901, Arizona Territories (Sarah Agnes Prine, #1)
Small Gods (Discworld, #13)
The Black Stallion (The Black Stallion, #1)
Thirteen Reasons Why
Where the Mountain Meets the Moon
The 4-Hour Workweek
Stranger in a Strange Land
Rubyfruit Jungle
The Book of Mormon: Another Testament of Jesus Christ
The End of Eternity
Not Without My Daughter
Definitely Dead (Sookie Stackhouse, #6)
Crown Duel (Crown & Court #1-2)
The Year of Living Biblically: One Man's Humble Quest to Follow the Bible as Literally as Possible
Comfort Me with Apples: More Adventures at the Table
My Booky Wook


In [92]:
'''

Combine recs from partial fit with recs from mapping to feature matrix using log_rank

'''

'\n\nCombine recs from partial fit with recs from mapping to feature matrix using log_rank\n\n'

In [93]:
# produce feature matrix
feature_matrix = get_book_features(books)

feature_matrix exists in file...


In [94]:
# get predictions using feature matrix
predictions_features = map_user_to_features(new_user_ratings, feature_matrix)

In [95]:
'''
Log Ranking
'''

'\nLog Ranking\n'

In [96]:
# create tuple of book_id and rating for each method, then sort
partial_fit_ratings = []
feature_ratings = []
for i in range(len(books)):
    partial_fit_ratings.append((i, predictions_partial_fit[i]))
    feature_ratings.append((i, predictions_features[i]))

partial_fit_ratings = sorted(partial_fit_ratings, key=lambda x: x[1], reverse=True)
feature_ratings = sorted(feature_ratings, key=lambda x: x[1], reverse=True)

In [97]:
# map book_id to the rank for each method
id_to_rank_partial_fit = {}
id_to_rank_features = {}
for i in range(len(books)):
    book_id = partial_fit_ratings[i][0]
    id_to_rank_partial_fit[book_id] = math.log(i+1)

    book_id = feature_ratings[i][0]
    id_to_rank_features[book_id] = math.log(i+1)

In [98]:
weight_feature = 0.5

rankings = []
for i in range(len(books)):
    if new_user_ratings[i] == 0:
        rank = weight_feature*id_to_rank_features[i] + (1.0-weight_feature)*id_to_rank_partial_fit[i]
        rankings.append((rank, i))
rankings = sorted(rankings, key=lambda x: x[0])
print(len(rankings))

9798


In [99]:
top_books = []
for i in range(100):
    book_id = rankings[i][1]
    book = books.iloc[book_id]
    book['rank'] = i + 1
    top_books.append(book)

In [100]:
for book in top_books:
    print(book['title'])

The Count of Monte Cristo
The Sword in the Stone (The Once and Future King, #1)
Your Inner Fish: A Journey into the 3.5-Billion-Year History of the Human Body
Grendel
The Amulet of Samarkand (Bartimaeus, #1)
The Neverending Story
Dandelion Wine (Green Town, #1)
The Lost Gate (Mither Mages, #1)
The Earthsea Trilogy
Someday, Someday, Maybe
The End of Eternity
Flowers for Algernon
A Man in Full
A Wrinkle in Time (A Wrinkle in Time Quintet, #1)
The Alchemyst (The Secrets of the Immortal Nicholas Flamel, #1)
Stranger in a Strange Land
A Wizard of Earthsea (Earthsea Cycle, #1)
The Magician's Land (The Magicians, #3)
J.R.R. Tolkien 4-Book Boxed Set: The Hobbit and The Lord of the Rings
Crown Duel (Crown & Court #1-2)
The Queen of Attolia (The Queen's Thief, #2)
An Acceptable Time (A Wrinkle in Time Quintet, #5)
An Ember in the Ashes (An Ember in the Ashes, #1)
Where the Mountain Meets the Moon
Alcatraz Versus the Evil Librarians (Alcatraz, #1)
The Blade Itself (The First Law, #1)
West with th