In [4]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
%matplotlib inline
%load_ext watermark
%load_ext autoreload 
%autoreload 2

from subprocess import call

%watermark -a 'Ethen' -d -t -v -p numpy,pandas,matplotlib

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Ethen 2017-03-18 20:47:47 

CPython 3.5.2
IPython 4.2.0

numpy 1.12.0
pandas 0.19.2
matplotlib 2.0.0
scipy 0.18.1


In [2]:
file_dir = 'ml-100k'
file_path = os.path.join(file_dir, 'u.data')
if not os.path.isdir(file_dir):
    call(['curl', '-O', 'http://files.grouplens.org/datasets/movielens/' + file_dir + '.zip'])
    call(['unzip', file_dir + '.zip'])

names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(file_path, sep = '\t', names = names)
print('data dimension: \n', df.shape)

# we'll later see why we're printing the memory usage information
print('\nmemory usage: \n', df.memory_usage(deep = True))
df.head()

data dimension: 
 (100000, 4)

memory usage: 
 Index            80
user_id      800000
item_id      800000
rating       800000
timestamp    800000
dtype: int64


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [34]:
from recsys.utils import create_matrix

# pass the DataFrame and names of the user, item 
# and ratings columns
users_col = 'user_id'
items_col = 'item_id'
ratings_col = 'rating'
X, df = create_matrix(df, users_col, items_col, ratings_col)
X

Raw data
Number of users: 943
Number of items: 1682
Sparsity: 6.305%


<943x1682 sparse matrix of type '<class 'numpy.float32'>'
	with 100000 stored elements in Compressed Sparse Row format>

In [69]:
from recsys.model_selection import create_train_test

seed = 1234
test_size = 0.2
X_train, X_test = create_train_test(X, test_size, seed)
X_train

<943x1682 sparse matrix of type '<class 'numpy.float32'>'
	with 79619 stored elements in Compressed Sparse Row format>

In [79]:
from tqdm import trange

In [90]:
class BPR:
    
    def __init__(self, learning_rate, n_iters, n_factors, reg, seed):
        self.reg = reg
        self.seed = seed
        self.n_iters = n_iters
        self.n_factors = n_factors
        self.learning_rate = learning_rate
        
    def fit(self, X):
        n_users, n_items = X.shape
        rstate = np.random.RandomState(self.seed)
        self.user_factors = rstate.normal(size = (n_users, self.n_factors))
        self.item_factors = rstate.normal(size = (n_items, self.n_factors))

        for i in trange(self.n_iters):
            for user in range(n_users):
                self._update(X, n_users, n_items)

        # to avoid re-computation at predict
        self._predicted = False
        return self
                
    def _update(self, X, n_users, n_items):
        # select the triplet uniformly at random
        u = np.random.choice(n_users, 1)
        indptr = X[u].indptr
        indices = X[u].indices

        all_indices = np.arange(n_items)
        positive_indices = indices[indptr[0]:indptr[1]]
        negative_indices = np.where( ~np.in1d(all_indices, positive_indices) )[0]
        i = np.random.choice(positive_indices, 1)
        j = np.random.choice(negative_indices, 1)

        # decompose the estimator
        x_ui = self.user_factors[u].dot(self.item_factors[i].T) 
        x_uj = self.user_factors[u].dot(self.item_factors[j].T)
        x_uij = np.ravel(x_ui - x_uj)

        # update
        sigmoid = 1.0 / (1.0 + np.exp(-x_uij))
        gradient_u = sigmoid * (self.item_factors[i] - self.item_factors[j]) - self.reg * self.user_factors[u]
        gradient_i = sigmoid * self.user_factors[u] - self.reg * self.item_factors[i]
        gradient_j = sigmoid * -self.user_factors[u] - self.reg * self.item_factors[j]
        self.user_factors[u] += self.learning_rate * gradient_u
        self.item_factors[i] += self.learning_rate * gradient_i
        self.item_factors[j] += self.learning_rate * gradient_j
        return self

    def predict(self):
        """
        Obtain the predicted ratings for every users and items
        by doing a dot product of the learnt user and item vectors.
        The result will be cached to avoid re-computing 
        it every time we call predict, thus there will
        only be an overhead the first time we call it.
        Note, ideally you probably don't need to compute 
        this as it returns a dense matrix and may take
        up huge amounts of memory for large datasets
        """
        if not self._predicted:
            self._get_prediction()
            self._predicted = True

        return self._pred

    def _get_prediction(self):
        """Predicted ratings (dot product of user and item vectors)"""
        self._pred = self.user_factors.dot(self.item_factors.T)
        return self

    def _predict_user(self, user):
        """
        returns the predicted ratings for the specified user,
        this is mainly used in computing evaluation metric,
        where we avoid computing the whole predicted rating matrix
        TODO : do we even need this in the class?
        """
        user_pred = self.user_factors[user].dot(self.item_factors.T)
        return user_pred

In [91]:
reg = 0.1
seed = 1234
n_factors = 10
n_iterations = 10
learning_rate = 0.1
bpr = BPR(learning_rate, n_iterations, n_factors, reg, seed)
bpr.fit(X_train)

100%|██████████| 10/10 [00:07<00:00,  1.39it/s]


<__main__.BPR at 0x11a1242b0>

In [92]:
from recsys.metrics import mapk_score

k = 5
mapk_train = mapk_score(bpr, X_train, k)
mapk_test = mapk_score(bpr, X_test, k)
print('training:', mapk_train)
print('testing:', mapk_test)

training: 0.05007776599505132
testing: 0.007592788971367972


In [71]:
from sklearn.metrics import mean_squared_error

def compute_mse(ratings):
    mask = ratings.nonzero()
    y_true = ratings.data
    prediction = user_factors.dot(self.item_factors.T)
    y_pred = prediction[mask]
    mse = mean_squared_error(y_true, y_pred)
    return mse

In [75]:
X_train.toarray()

array([[ 5.,  3.,  4., ...,  0.,  0.,  0.],
       [ 4.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 5.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32)

In [73]:
def _compute_apk(y_true, y_pred, k):
    """
    average precision at k, y_pred is assumed 
    to be truncated to length k prior to feeding
    it to the function
    """
    # convert to set since membership 
    # testing in a set is vastly faster
    actual = set(y_true)
    
    # precision at i is a percentage of correct 
    # items among first i recommendations; the
    # correct count will be summed up by n_hit
    n_hit = 0
    precision = 0
    for i, p in enumerate(y_pred, 1):
        if p in actual:
            n_hit += 1
            precision += n_hit / i

    # divide by recall at the very end
    avg_precision = precision / min(len(actual), k)
    return avg_precision


def mapk_score(estimator, ratings, k):
    """
    mean average precision at k for the ALS model;
    TODO: parallelize the computation
    """
    # compare the top k predictions' index to the actual index,
    # the estimator is assumed to have the _predict_user method
    mapk = 0
    n_users = ratings.shape[0]
    for u in range(n_users):
        y_true = ratings[u].indices
        u_pred = estimator._predict_user(u)
        y_pred = np.argsort(u_pred)[::-1][:k]
        mapk += _compute_apk(y_true, y_pred, k)

    mapk /= n_users
    return mapk

array([[-0.09914463, -5.02095942, -4.03751762, ..., -2.19622796,
        -1.4127792 ,  0.20080199],
       [-2.80288287, -1.62467518, -1.23378673, ..., -9.28956822,
        -2.40644468, -4.567055  ],
       [-1.80605017, -3.98887743,  1.31625954, ..., -0.30670508,
         1.9203558 ,  4.10763957],
       ..., 
       [ 4.21659749, -2.19927371, -2.57470449, ..., -1.66798734,
        -9.33224649,  6.97416757],
       [ 3.32952222,  2.22922845,  1.25110668, ..., -3.82371835,
        -3.41696562,  6.26238452],
       [ 0.30419376,  0.42761336,  0.95184887, ..., -6.03760303,
         2.9253063 , -4.27824216]])

In [72]:
train_mse = compute_mse(X_train)
test_mse = compute_mse(X_test)
print( 'training mse {:.1f}, testing mse {:.1f}'.format(train_mse, test_mse) )

training mse 29.2, testing mse 29.1
