In [1]:
from polara import RecommenderData
from polara import SVDModel
from polara import get_movielens_data
from polara.tools.preprocessing import filter_sessions_by_length
from polara.evaluation import evaluation_engine as ee
import numpy as np
import scipy.sparse as SP
from io import BytesIO
import pandas as pd

import numpy as np, scipy.stats as st
import numpy as np
import scipy as sp
import scipy.stats



In [None]:
ml_train_items = np.load("ml_train_items.npy")
ml_train_mask = np.load("ml_train_mask.npy")
ml_train_users = np.load("ml_train_users.npy")
ml_val_items = np.load("ml_val_items.npy")
ml_val_mask = np.load("ml_val_mask.npy")
ml_val_users = np.load("ml_val_users.npy")
ml_test_items = np.load("ml_test_items.npy")
ml_test_mask = np.load("ml_test_mask.npy")
ml_test_users = np.load("ml_test_users.npy")
ml_train_user_idx = np.load('ml_train_user_idx.npy')
ml_train_item_idx = np.load('ml_train_item_idx.npy')
ml_train_feedback = np.load('ml_train_feedback.npy')

In [2]:
lf_train_items = np.load("lf_train_items.npy")
lf_train_mask = np.load("lf_train_mask.npy")
lf_train_users = np.load("lf_train_users.npy")
lf_val_items = np.load("lf_val_items.npy")
lf_val_mask = np.load("lf_val_mask.npy")
lf_val_users = np.load("lf_val_users.npy")
lf_test_items = np.load("lf_test_items.npy")
lf_test_mask = np.load("lf_test_mask.npy")
lf_test_users = np.load("lf_test_users.npy")
lf_train_user_idx = np.load('lf_train_user_idx.npy')
lf_train_item_idx = np.load('lf_train_item_idx.npy')
lf_train_feedback = np.load('lf_train_feedback.npy')

In [5]:
!pip3 install --upgrade git+https://github.com/Evfro/polara.git@develop#egg=polara

Collecting git+https://github.com/Evfro/polara.git@develop
  Cloning https://github.com/Evfro/polara.git (to revision develop) to /tmp/pip-req-build-0a3rx0t8
Building wheels for collected packages: polara
  Running setup.py bdist_wheel for polara ... [?25ldone
[?25h  Stored in directory: /tmp/pip-ephem-wheel-cache-x7hk2c0d/wheels/95/b2/f8/18e769bc21d1fc5323b933f0ab7261b9521a589243f7549bf4
Successfully built polara
Installing collected packages: polara
  Found existing installation: polara 0.5.3
    Uninstalling polara-0.5.3:
      Successfully uninstalled polara-0.5.3
Successfully installed polara-0.5.3


In [2]:
def remove_gaps(data):
    data['movieid'] = ml_data.groupby('movieid', sort=False).grouper.group_info[0]
    data['userid'] = ml_data.groupby('userid', sort=False).grouper.group_info[0]
    return data

In [3]:
def normalize_timestamp(x):
    x["timestamp"] = np.argsort(list(x["timestamp"]))
    return x

def length_col(x):
    x['timestamp'] = len(x)
    return x

def train_test_val_split(data):
    
    data = data.groupby("userid").apply(normalize_timestamp)
    lc = data.groupby("userid").apply(length_col)
    max_time_stamp = lc['timestamp']
    timestamp = data['timestamp']
    data_train = data[timestamp<max_time_stamp*0.9]\
                    .groupby("userid").apply(normalize_timestamp)
    data_val = data[(0.9*max_time_stamp<=timestamp)&(timestamp<0.95*max_time_stamp)]\
                    .groupby("userid").apply(normalize_timestamp)
    data_test = data[0.95*max_time_stamp<=timestamp]\
                    .groupby("userid").apply(normalize_timestamp)
    return data_train, data_val,data_test

In [4]:
def to_matrices(data):
    data = split_by_groups(data)
    
    data_max_order = data['timestamp'].max()
    data = data.groupby("index").apply(move_timestamps_to_end,data_max_order)

    data_shape = data[['index', 'timestamp']].max()+1
    data_matrix = sp.sparse.csr_matrix((data['itemid'],
                                   (data['index'], data['timestamp'])),
                                    shape=data_shape, dtype=np.float64).todense()
    mask_matrix = sp.sparse.csr_matrix((np.ones(len(data)),
                                   (data['index'], data['timestamp'])),
                                    shape=data_shape, dtype=np.float64).todense()
    
    data_users = data.drop_duplicates(['index'])
    user_data_shape = data_users['index'].max()+1
    user_vector = sp.sparse.csr_matrix((data_users['userid'],
                                   (data_users['index'],np.zeros(user_data_shape))),
                                    shape=(user_data_shape,1), dtype=np.float64).todense()
    user_matrix = np.tile(user_vector,(1,data_shape[1]))
    return data_matrix, mask_matrix, user_matrix

In [6]:
ml_data = get_movielens_data("ml-10m.zip",include_time=True)
ml_data = remove_gaps(ml_data)

In [7]:
ml_data_train, ml_data_val,ml_data_test = train_test_val_split(ml_data)

In [67]:
um_ml_data_train, um_ml_data_val,um_ml_data_test =\
                    ml_data_train[["userid","movieid","rating"]],\
                    ml_data_val[["userid","movieid","rating"]],\
                    ml_data_test[["userid","movieid","rating"]]

In [68]:
ml_time_train, ml_time_val,ml_time_test =\
                    ml_data_train[["timestamp"]],\
                    ml_data_val[["timestamp"]],\
                    ml_data_test[["timestamp"]]

In [77]:
data_model = RecommenderData(None, 'userid', 'movieid', 'rating', seed=0)
data_model.holdout_size = 1
data_model.random_holdout = False
data_model.warm_start = False
data_model.permute_tops = False

In [75]:
data_model.prepare_training_only()

Preparing data...
Done.


In [None]:
class RecurentModel(RecommenderModel):
    def __init__(self, train ,*args, **kwargs):
        super(RecurentModel, self).__init__(None, 'userid', 'movieid', 'rating', *args, **kwargs)
        
        self.method = 'RecommenderModel' # pick some meaningful name

    def build(self):
        # build model - calculate item-to-item matrix
        user_item_matrix = self.get_training_matrix()
        # rating matrix product  R^T R  gives cooccurrences count
        i2i_matrix = user_item_matrix.T.dot(user_item_matrix) # gives CSC format
        # exclude "self-links" and ensure only non-zero elements are stored
        i2i_matrix.setdiag(0)
        i2i_matrix.eliminate_zeros()
        # store matrix for generating recommendations
        self.i2i_matrix = i2i_matrix

    def get_recommendations(self):
        # get test users information and generate top-k recommendations
        test_matrix, test_data = self.get_test_matrix()
        # calculate predicted scores
        i2i_scores = test_matrix.dot(self.i2i_matrix)
        # prevent seen items from appearing in recommendations
        if self.filter_seen:
            self.downvote_seen_items(i2i_scores, test_data)
        # generate top-k recommendations for every test user
        top_recs = self.get_topk_elements(i2i_scores)
        return top_recs

In [3]:
from polara.lib.optimize import sgd_step, sgd_step_biased

def basic_matrix_factorization(user_idx, item_idx, feedback,
                               rank=10, lrate=0.01, reg=0.1,
                               num_epochs=30, tol=1e-4, biased=False,
                               seed=None, verbose=True):
    n_users = user_idx.max() + 1
    n_items = item_idx.max() + 1
   
    random_state = np.random.RandomState(seed) if seed else np.random
    P = random_state.normal(scale=0.1, size=(n_users, rank))
    Q = random_state.normal(scale=0.1, size=(n_items, rank))
   
    if biased:
        t = random_state.normal(scale=0.1, size=n_users)
        f = random_state.normal(scale=0.1, size=n_items)
        m = np.mean(feedback[np.where(feedback != 0)])
        biases = [t, f, m]
    else:
        biases = []
       
    basic_sgd_step = sgd_step_biased if biases else sgd_step
 
    last_err = np.finfo(np.float64).max
    for epoch in range(num_epochs):
        new_err = basic_sgd_step(user_idx, item_idx, feedback, P, Q, *biases, lrate, reg)       
        err_delta = abs(last_err - new_err) / last_err
       
        if verbose:
            rmse = np.sqrt(new_err / len(feedback))
            print('Epoch {} RMSE: {}'.format(epoch+1, rmse))
       
        last_err = new_err
        if err_delta < tol: break
    return P, Q, biases

In [4]:
from numba import jit

@jit(nopython=True, nogil=True)
def sgd_step_update(users_idx, items_idx, feedbacks, P, Q, eta, lambd):
    cum_error = 0
    for k, a in enumerate(feedbacks):
        i = users_idx[k]
        j = items_idx[k]

        pi = P[i, :]
        qj = Q[j, :]

        e = a - np.dot(pi, qj)

        new_pi = pi + eta * (e*qj - lambd*pi)
        #new_qj = qj + eta * (e*pi - lambd*qj)

        P[i, :] = new_pi
        #Q[j, :] = new_qj

        cum_error += e*e
    return cum_error

def basic_matrix_factorization_folding_in(user_idx, item_idx, feedback,P,Q,
                               lrate=0.01, reg=0.1,
                               num_epochs=30, tol=1e-4, verbose=True):
    last_err = np.finfo(np.float64).max
    for epoch in range(num_epochs):
        new_err = sgd_step_update(user_idx, item_idx, feedback, P, Q, lrate, reg)       
        err_delta = abs(last_err - new_err) / last_err
       
        if verbose:
            rmse = np.sqrt(new_err / len(feedback))
            print('Epoch {} RMSE: {}'.format(epoch+1, rmse))
       
        last_err = new_err
        if err_delta < tol: break
    return P, Q

            
    

In [5]:
def prepare_data(items, mask, users):
    targets = items[:,-1]
    items = [items[i,:-1][mask[i,:-1]==1] for i in range(items.shape[0])]
    users = [users[i][mask[i]==1] for i in range(users.shape[0])]
    return items,users,targets

def mean_confidence_interval(data, confidence=0.95,num_parts = 5):
    part_len = len(data)//num_parts
    estimations = []
    for i in range(num_parts):
        est = np.mean(data[part_len*i:part_len*(i+1)])
        estimations.append(est)
    a = 1.0*np.array(estimations)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * sp.stats.t._ppf((1+confidence)/2., n-1)
    return m, h

#@jit(nopython=True, nogil=True)
def estimate_model(P,Q,items,masks,users,reg = 0.1):
    items,users,targets  = prepare_data(items, masks, users)
    mrrs = []
    recalls = []
   
    for row_inx in range(len(items)):
        new_P = P.copy()
        
        feedback = np.ones(items[row_inx].shape[0])
        if len(feedback)!=0:
            basic_matrix_factorization_folding_in(users[row_inx][:-1].astype(np.int32), items[row_inx].astype(np.int32),feedback,new_P,Q.copy(),
                                   lrate=0.01, reg=reg,
                                   num_epochs=30, tol=1e-4, verbose=False)
        user = int(users[row_inx][-1])
        consumed_items = np.argwhere(train_matrix[user]>=1).T[0]
        recommendation = new_P[user].dot(Q.T)
        recommendation[consumed_items] = -np.inf
 
        true_consumption = targets[row_inx]
        mrrs.append(mrr_at_k(recommendation,true_consumption,k=20))
        recalls.append(recall_at_k(recommendation,true_consumption,k=20))
        
    mrr, h_mrr = mean_confidence_interval(mrrs)
    recall, h_recall = mean_confidence_interval(recalls)
    
    return (mrr, h_mrr),(recall, h_recall)
    
@jit(nopython=True, nogil=True)
def recall_at_k(recommendation,true_consumption,k=20):
    topk_inds = recommendation.argsort()[-k:][::-1]
    reccommnded_topk_items = np.zeros(recommendation.shape)
    reccommnded_topk_items[topk_inds] = 1
    
    recall = reccommnded_topk_items[int(true_consumption)]
    return recall

@jit(nopython=True, nogil=True)
def mrr_at_k(recommendation,true_consumption,k=20):
    topk_inds = recommendation.argsort()[-k:][::-1]
    rr = np.zeros(recommendation.shape)
    rr[topk_inds] = 1/np.arange(1,k+1)    
    current_rr = rr[int(true_consumption)]
    return current_rr



    

### MovieLens

In [15]:
import scipy as sp
train_matrix = np.array(sp.sparse.csr_matrix((np.ones(len(ml_train_user_idx)),
                               (ml_train_user_idx, ml_train_item_idx)),
                                shape=(max(ml_train_user_idx)+1,max(ml_train_item_idx)+1), dtype=np.float64).todense())


In [16]:
# train_feedback[:] = 1
P, Q, _ = basic_matrix_factorization(ml_train_user_idx, ml_train_item_idx, ml_train_feedback\
                                     ,rank=20,reg = 0.01,num_epochs=30)

Epoch 1 RMSE: 1.1889509722951297
Epoch 2 RMSE: 0.9005314957384513
Epoch 3 RMSE: 0.8463680763042712
Epoch 4 RMSE: 0.8188051422034335
Epoch 5 RMSE: 0.8004663158387065
Epoch 6 RMSE: 0.7855627864243409
Epoch 7 RMSE: 0.7732885934551574
Epoch 8 RMSE: 0.7632023403587169
Epoch 9 RMSE: 0.7548784181251089
Epoch 10 RMSE: 0.7479567842743502
Epoch 11 RMSE: 0.742152630887336
Epoch 12 RMSE: 0.7372427424804076
Epoch 13 RMSE: 0.7330523912264771
Epoch 14 RMSE: 0.7294447595737159
Epoch 15 RMSE: 0.7263125637673405
Epoch 16 RMSE: 0.7235714286511026
Epoch 17 RMSE: 0.7211546768303492
Epoch 18 RMSE: 0.7190092797572853
Epoch 19 RMSE: 0.7170927493456211
Epoch 20 RMSE: 0.7153707702341925
Epoch 21 RMSE: 0.7138154014210378
Epoch 22 RMSE: 0.7124037087144436
Epoch 23 RMSE: 0.7111167205435434
Epoch 24 RMSE: 0.7099386259592084
Epoch 25 RMSE: 0.708856154402729
Epoch 26 RMSE: 0.7078580925338287
Epoch 27 RMSE: 0.706934905051986
Epoch 28 RMSE: 0.7060784349627898
Epoch 29 RMSE: 0.7052816649704758
Epoch 30 RMSE: 0.704538526

In [12]:
(mrr, h_mrr),(recall, h_recall), = estimate_model(P,Q,ml_test_items, ml_test_mask, ml_test_users,reg = 0.01)
ds_name = "MovieLens"
print("MRR@20 score for MF on ", ds_name,": ",mrr,"±",h_mrr)
print("Recall@20 score for MF on",ds_name,": ",recall,"±",h_recall)

MRR@20 score for MF on  MovieLens :  0.004567202376613807 ± 0.001339522261853232
Recall@20 score for MF on MovieLens :  0.019013581129378128 ± 0.005759417598196718


### LastFM

In [6]:
import scipy as sp
train_matrix = np.array(sp.sparse.csr_matrix((np.ones(len(lf_train_user_idx)),
                               (lf_train_user_idx, lf_train_item_idx)),
                                shape=(max(lf_train_user_idx)+1,max(lf_train_item_idx)+1), dtype=np.float64).todense())



In [7]:
P, Q, _ = basic_matrix_factorization(lf_train_user_idx, lf_train_item_idx, lf_train_feedback\
                                     ,rank=20,reg = 0.01,num_epochs=30)

Epoch 1 RMSE: 0.3340596669149536
Epoch 2 RMSE: 0.13568696368576577
Epoch 3 RMSE: 0.1049218383223705
Epoch 4 RMSE: 0.09057622314985833
Epoch 5 RMSE: 0.08137660911293546
Epoch 6 RMSE: 0.07470498836488523
Epoch 7 RMSE: 0.06951899132109102
Epoch 8 RMSE: 0.06530498687693317
Epoch 9 RMSE: 0.06177382659572118
Epoch 10 RMSE: 0.0587471278538926
Epoch 11 RMSE: 0.05610737578875198
Epoch 12 RMSE: 0.053773248417684535
Epoch 13 RMSE: 0.051686261604353495
Epoch 14 RMSE: 0.049803024791237875
Epoch 15 RMSE: 0.04809050290396788
Epoch 16 RMSE: 0.04652299033682407
Epoch 17 RMSE: 0.045080108428995055
Epoch 18 RMSE: 0.04374543945472753
Epoch 19 RMSE: 0.0425055698978244
Epoch 20 RMSE: 0.041349404662183384
Epoch 21 RMSE: 0.040267665327025186
Epoch 22 RMSE: 0.039252516348822174
Epoch 23 RMSE: 0.03829728207687136
Epoch 24 RMSE: 0.0373962294371177
Epoch 25 RMSE: 0.03654439889919639
Epoch 26 RMSE: 0.035737471478104015
Epoch 27 RMSE: 0.03497166299194902
Epoch 28 RMSE: 0.03424363918652938
Epoch 29 RMSE: 0.033550447

In [8]:
(mrr, h_mrr),(recall, h_recall), = estimate_model(P,Q,lf_test_items, lf_test_mask, lf_test_users,reg = 0.01)
ds_name = "LastFM"
print("MRR@20 score for MF on ", ds_name,": ",mrr,"±",h_mrr)
print("Recall@20 score for MF on",ds_name,": ",recall,"±",h_recall)

MRR@20 score for MF on  LastFM :  1.7397007492385337e-05 ± 2.091678324531542e-05
Recall@20 score for MF on LastFM :  0.00018341892883345562 ± 0.00027892912945554624


In [3]:
ml_data = get_movielens_data("ml-10m.zip", include_time=True)
ml_data.head()

Unnamed: 0,userid,movieid,rating,timestamp
0,1,122,5.0,838985046
1,1,185,5.0,838983525
2,1,231,5.0,838983392
3,1,292,5.0,838983421
4,1,316,5.0,838983392


In [10]:
data = (filter_sessions_by_length(ml_data, min_session_length=20)
        #.query('rating >= 4')
        #.assign(rating=1)
       )

In [11]:
data_model = RecommenderData(data, 'userid', 'movieid', 'rating', custom_order='timestamp', seed=0)
data_model.holdout_size = 1
data_model.random_holdout = False
data_model.warm_start = False
data_model.permute_tops = False

In [12]:
data_model.prepare()

Preparing data...
Done.


In [13]:
idx, val, shp = data_model.to_coo()

In [14]:
P, Q, biases = basic_matrix_factorization(*idx.T, val,rank=20,num_epochs=10)

Epoch 1 RMSE: 1.3085293049789994
Epoch 2 RMSE: 0.9151132113832263
Epoch 3 RMSE: 0.8808651366877939
Epoch 4 RMSE: 0.8676692047213754
Epoch 5 RMSE: 0.8587222170703273
Epoch 6 RMSE: 0.8525469762850404
Epoch 7 RMSE: 0.8479598409933002
Epoch 8 RMSE: 0.8442665480920708
Epoch 9 RMSE: 0.8411850669634959
Epoch 10 RMSE: 0.8385955015689102


In [20]:
R = Q.dot(P.T).T

In [50]:
topk = np.argsort(R,axis = 1)[:,-20:]

(69878, 20)

In [60]:
user_idx, item_idx, fdbk_val = data_model.test_to_coo()

In [75]:
SP.coo_matrix(fdbk_val,(user_idx,item_idx),shape =(user_idx.max()+1,item_idx.max()+1) ).todense().shape

TypeError: __init__() got multiple values for argument 'shape'

In [73]:
item_idx

array([ 108, 1557, 1564, ..., 1365,  775, 2300])

In [None]:
from polara import RecommenderData
from polara import SVDModel
from polara import get_movielens_data
from polara.tools.preprocessing import filter_sessions_by_length
from polara.evaluation import evaluation_engine as ee
import numpy as np
import scipy.sparse as SP
from io import BytesIO
import pandas as pd

def train_MF():
    ml_data = get_movielens_data("ml-10m.zip", include_time=True)
    data = (filter_sessions_by_length(ml_data, min_session_length=20)
        #.query('rating >= 4')
        #.assign(rating=1)
       )
    data_model = RecommenderData(data, 'userid', 'movieid', 'rating', custom_order='timestamp', seed=0)
    #data_model.holdout_size = 1
    data_model.random_holdout = False
    data_model.warm_start = False
    data_model.permute_tops = False
