In [2]:
import sys
import time
import heapq
import math
import gc
import numpy as np
import pandas as pd
from collections import defaultdict
import surprise as sp

In [3]:
#Evaluation of metrics
def getHitRatio(ranklist, gtItem):
    for item in ranklist:
        if item == gtItem:
            return 1
    return 0

def getNDCG(ranklist, gtItem):
    for i in range(len(ranklist)):
        item = ranklist[i]
        if item == gtItem:
            return math.log(2) / math.log(i+2)
    return 0
#dataset
class DataSet_1M(object):
    def __init__(self, negNum=2):
        self.trainList, self.shape = self._getTrainData()
        self.trainDict = self._getTrainDict()
        self.trainMat = self._getTrainMatrix()
        self.trainset = self._getInstances(negNum)#sample negative samples
        self.testset = self._getTest()
        
    def _getTrainData(self):
        data = []
        filePath = '/data/fjsdata/BNMF/ml-1m.train.rating'
        u = 0
        i = 0
        maxr = 0.0
        with open(filePath, 'r') as f:
            for line in f:
                if line:
                    lines = line[:-1].split("\t")
                    user = int(lines[0])
                    movie = int(lines[1])
                    score = float(lines[2])
                    data.append((user, movie, score))
                    if user > u:u = user
                    if movie > i:i = movie
                    if score > maxr:maxr = score
        self.maxRate = maxr
        print("Loading Success!\n"
                  "Data Info:\n"
                  "\tUser Num: {}\n"
                  "\tItem Num: {}\n"
                  "\tData Size: {}\n"
                  "\tSparsity: {}".format(u+1, i+1, len(data), len(data)/((u+1)*(i+1))))
        return data, [u+1, i+1]

    def _getTrainDict(self):
        dataDict = {}
        for i in self.trainList:
            dataDict[(i[0], i[1])] = i[2]
        return dataDict

    def _getTrainMatrix(self):
        train_matrix = np.zeros([self.shape[0], self.shape[1]], dtype=np.float32)
        for i in self.trainList:
            user = i[0]
            movie = i[1]
            rating = i[2]
            train_matrix[user][movie] = rating
        return np.array(train_matrix)

    def _getInstances(self, negNum):
        trainset = []
        for i in self.trainList:
            trainset.append([i[0],i[1],i[2]])
            for t in range(negNum):
                j = np.random.randint(self.shape[1])
                while (i[0], j) in self.trainDict:
                    j = np.random.randint(self.shape[1])
                trainset.append([i[0],j,0.0])
        print ('The length of Trainset: %d'%(len(trainset)))
        return trainset

    def _getTest(self):
        #loading data
        testset = []
        filePath = '/data/fjsdata/BNMF/ml-1m.test.negative'
        with open(filePath, 'r') as fd:
            line = fd.readline()
            while line != None and line != '':
                arr = line.split('\t')
                u = eval(arr[0])[0]
                testset.append([u, eval(arr[0])[1], 1.0])#first is one postive item
                for i in arr[1:]:
                    testset.append([u, int(i), 0.0]) #99 negative items
                line = fd.readline()
        print ('The length of Testset: %d'%(len(testset)))
        return testset

In [6]:
for nn in [0,1,2,3,4,5]:
    ds1m = DataSet_1M(negNum=nn)
    trainset = ds1m.trainset
    testset = ds1m.testset
    reader = sp.Reader(rating_scale=(0, 5))
    spdata = sp.Dataset.load_from_df(pd.DataFrame(trainset),reader)
    trainset_svd = spdata.build_full_trainset()
    for k in [5,10,15,20]:#iterations epoches
        algo = sp.SVD(n_factors=k, n_epochs=20, lr_all=0.001, reg_all=0.01 )#
        tstart = time.time()
        algo.fit(trainset_svd)
        elapsed = time.time() - tstart    
        print('Completed training the SVD model in %d seconds' % int(elapsed))
        predictions = algo.test(testset)#testset include one positive and 99 negtive sample of every user.
        user_iid_true_est = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            user_iid_true_est[uid].append((iid, true_r, est))
        hits = []
        ndcgs = []
        for uid, iid_ratings in user_iid_true_est.items():
            # Sort user ratings by estimated value
            #iid_ratings.sort(key=lambda x: x[2], reverse=True) #sorted by est
            scorelist = []
            positem = -1
            for iid, ture_r, est in iid_ratings:
                if positem == -1: positem=iid #one positive item in first
                scorelist.append([iid,est])
            map_item_score = {}
            for item, rate in scorelist: #turn dict
                map_item_score[item] = rate
            ranklist = heapq.nlargest(10, map_item_score, key=map_item_score.get)#default Topn=10
            hr = getHitRatio(ranklist, positem)
            hits.append(hr)
            ndcg = getNDCG(ranklist, positem)
            ndcgs.append(ndcg)
        hitratio,ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
        print ("HR@%d@%d=%.6f, NDCG@%d@%d=%.6f" % (k, nn, hitratio, k, nn, ndcg))

Loading Success!
Data Info:
	User Num: 6040
	Item Num: 3706
	Data Size: 994169
	Sparsity: 0.04441379291858915
The length of Trainset: 994169
The length of Testset: 604000
Completed training the SVD model in 21 seconds
HR@5@0=0.258444, NDCG@5@0=0.126514
Completed training the SVD model in 24 seconds
HR@10@0=0.258278, NDCG@10@0=0.126331
Completed training the SVD model in 28 seconds
HR@15@0=0.259106, NDCG@15@0=0.127861
Completed training the SVD model in 31 seconds
HR@20@0=0.256291, NDCG@20@0=0.126698
Loading Success!
Data Info:
	User Num: 6040
	Item Num: 3706
	Data Size: 994169
	Sparsity: 0.04441379291858915
The length of Trainset: 1988338
The length of Testset: 604000
Completed training the SVD model in 42 seconds
HR@5@1=0.450662, NDCG@5@1=0.247204
Completed training the SVD model in 49 seconds
HR@10@1=0.455629, NDCG@10@1=0.252744
Completed training the SVD model in 57 seconds
HR@15@1=0.461589, NDCG@15@1=0.253892
Completed training the SVD model in 63 seconds
HR@20@1=0.467550, NDCG@20@

In [8]:
class BayesianMatrixFactorization():
    """
    Bayesian Matrix Factorization model
    R = PxQ
    p ~ N(p|0, alpha^(-1)I)
    q ~ N(q|0, alpha^(-1)I)
    r = p @ q
    t ~ N(r|p @ q, beta^(-1))
    """

    def __init__(self, alpha_p:float=1., alpha_q:float=1., beta:float=1.):
        """
        ----------
        n_u, n_i: the number of users and items, respectively.
        k : the number of latent factors
        """
        self.alpha_p = alpha_p
        self.alpha_q = alpha_q
        self.beta = beta
        #posterior of p,q 
        self.pos_mean_p = None
        self.pos_precision_p = None
        self.pos_mean_q = None
        self.pos_precision_q = None

    def fit(self, R:np.ndarray, k:int=5):
        """
        bayesian update of parameters given training dataset
        Parameters
        ----------
        R : (u,i) np.ndarray
            training data independent variable, u is the number of users, i is the number of items.
        k : int, the number of latent factors.
        """
        #1. generate matrices P, Q
        P = np.random.normal(0,self.alpha_p,(R.shape[0],k))#uxk
        Q = np.random.normal(0,self.alpha_q,(R.shape[1],k))#ixk
        #2.calculate the posterior with analytical solution
        self.pos_precision_p = self.alpha_p + self.beta * Q @ Q.T # ixi
        self.pos_mean_p = self.beta * R @ np.linalg.inv(self.pos_precision_p) @ Q # uxi,ixi,ixk -> uxk
        self.pos_precision_q = self.alpha_q + self.beta * P @ P.T # uxu
        self.pos_mean_q = self.beta * R.T @ np.linalg.inv(self.pos_precision_q) @ P # ixu,uxu,uxk -> ixk

    def predict(self, sample_size:int=None):
        """
        return mean  of predictive distribution
        Parameters
        ----------
        sample_size : int, optional
            number of samples to draw from the predictive distribution
            (the default is None, no sampling from the distribution)
        Returns
        -------
        R_pred : (u,i) np.ndarray
            mean of the predictive distribution
        R_pred_sample : (u,i,sample_size) np.ndarray
            samples from the predictive distribution
        """
        if sample_size is not None:
            p_sample = np.random.multivariate_normal(self.pos_mean_p, np.linalg.inv(self.pos_precision_q), size=sample_size)
            q_sample = np.random.multivariate_normal(self.pos_mean_q, np.linalg.inv(self.pos_precision_p), size=sample_size)
            R_pred_sample_list=[]
            for i in range(sample_size): 
                R_pred_sample_list.append( np.dot(p_sample, q_sample.T) )
            R_pred_sample = np.mean(np.array(R_pred_sample_list), axis=0)
            return  R_pred_sample #uxi
        
        R_pred = self.pos_mean_p @ self.pos_mean_q.T #R = PxQ
        return R_pred #uxi
    
for nn in [0,1,2,3,4,5]:
    ds1m = DataSet_1M(negNum=nn)
    trainMat = ds1m.trainMat
    testset = ds1m.testset
    shape = ds1m.shape
    for k in [5,10,15,20]:#iterations epoches
        bmf = BayesianMatrixFactorization()
        tstart = time.time()
        bmf.fit(R=trainMat, k=k)
        elapsed = time.time() - tstart    
        print('Completed training the BMF model in %d seconds' % int(elapsed))
        R_pred = bmf.predict()
        hits = []
        ndcgs = []
        for c in range(0,shape[0]):#6040
            scorelist = []
            gtItem = -1
            for u,i,r in testset[c*100:(c+1)*100]:#604000
                if r == 1.0: gtItem = i
                est = R_pred[int(u)][int(i)]
                scorelist.append([i,est])
            map_item_score = {}
            for item, rate in scorelist: #turn dict
                map_item_score[item] = rate
            ranklist = heapq.nlargest(10, map_item_score, key=map_item_score.get)#topn=10
            hr = getHitRatio(ranklist, gtItem)
            hits.append(hr)
            ndcg = getNDCG(ranklist, gtItem)
            ndcgs.append(ndcg)
        hitratio,ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
        print ("HR@%d@%d=%.6f, NDCG@%d@%d=%.6f" % (k, nn, hitratio, k, nn, ndcg))

Loading Success!
Data Info:
	User Num: 6040
	Item Num: 3706
	Data Size: 994169
	Sparsity: 0.04441379291858915
The length of Trainset: 994169
The length of Testset: 604000
Completed training the BMF model in 2 seconds
HR@5@0=0.217881, NDCG@5@0=0.115555
Completed training the BMF model in 2 seconds
HR@10@0=0.201821, NDCG@10@0=0.105868
Completed training the BMF model in 2 seconds
HR@15@0=0.242715, NDCG@15@0=0.130422
Completed training the BMF model in 2 seconds
HR@20@0=0.252152, NDCG@20@0=0.131339
Loading Success!
Data Info:
	User Num: 6040
	Item Num: 3706
	Data Size: 994169
	Sparsity: 0.04441379291858915
The length of Trainset: 1988338
The length of Testset: 604000
Completed training the BMF model in 2 seconds
HR@5@1=0.237748, NDCG@5@1=0.125895
Completed training the BMF model in 2 seconds
HR@10@1=0.247351, NDCG@10@1=0.135720
Completed training the BMF model in 2 seconds
HR@15@1=0.233609, NDCG@15@1=0.122714
Completed training the BMF model in 2 seconds
HR@20@1=0.224172, NDCG@20@1=0.1192

In [7]:
import sys
import time
import heapq
import math
import gc
import numpy as np
import pandas as pd
from collections import defaultdict
import surprise as sp

In [24]:
#trainset
filePath = "/data/fjsdata/BNMF/ml-1m.train.rating" 
data = pd.read_csv(filePath, sep='\t', header=None, names=['user', 'item', 'rating'], \
                                 usecols=[0, 1, 2], dtype={0: np.int32, 1: np.int32, 2: np.float})
#data['rating']=data['rating'].apply(lambda x: 1.0 if float(x)>0.0 else 0.0)
maxu, maxi = data['user'].max()+1, data['item'].max()+1
print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % \
                  (data.shape[0], maxu, maxi, data.shape[0]/(maxu*maxi)))
trainset = data.values.tolist()
trainMat = np.zeros([maxu, maxi], dtype=np.float32)
for u,i,r in trainset:
    trainMat[int(u)][int(i)] = float(r)
#testset
testset = []
filePath = "/data/fjsdata/BNMF/ml-1m.test.negative" 
with open(filePath, 'r') as fd:
    line = fd.readline()
    while line != None and line != '':
        arr = line.split('\t')
        u = eval(arr[0])[0]
        testset.append([u, eval(arr[0])[1], 1.0])#first is one postive item
        for i in arr[1:]:
            testset.append([u, int(i), 0.0]) #99 negative items
        line = fd.readline()
print ('The length of Testset: %d'%(len(testset)))

Dataset Statistics: Interaction = 994169, User = 6040, Item = 3706, Sparsity = 0.0444
The length of Testset: 604000


In [25]:
def getHitRatio(ranklist, gtItem):
    for item in ranklist:
        if item == gtItem:
            return 1
    return 0

def getNDCG(ranklist, gtItem):
    for i in range(len(ranklist)):
        item = ranklist[i]
        if item == gtItem:
            return math.log(2) / math.log(i+2)
    return 0

In [27]:
reader = sp.Reader(rating_scale=(0, 5))
spdata = sp.Dataset.load_from_df(data,reader)
trainset_svd = spdata.build_full_trainset()
for K in [5,10,15,20]:#iterations epoches
    algo = sp.SVD(n_factors=K, n_epochs=20, lr_all=0.001, reg_all=0.01 )#
    algo.fit(trainset_svd)
    predictions = algo.test(testset)#testset include one positive and 99 negtive sample of every user.
    user_iid_true_est = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        user_iid_true_est[uid].append((iid, true_r, est))
    hits = []
    ndcgs = []
    for uid, iid_ratings in user_iid_true_est.items():
        # Sort user ratings by estimated value
        #iid_ratings.sort(key=lambda x: x[2], reverse=True) #sorted by est
        scorelist = []
        positem = -1
        for iid, ture_r, est in iid_ratings:
            if positem == -1: positem=iid #one positive item in first
            scorelist.append([iid,est])
        map_item_score = {}
        for item, rate in scorelist: #turn dict
            map_item_score[item] = rate
        ranklist = heapq.nlargest(10, map_item_score, key=map_item_score.get)#default Topn=10
        hr = getHitRatio(ranklist, positem)
        hits.append(hr)
        ndcg = getNDCG(ranklist, positem)
        ndcgs.append(ndcg)
    hitratio,ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
    print ("HR@%d=%.6f, NDCG@%d=%.6f" % (K, hitratio, K, ndcg))

HR@5=0.259272, NDCG@5=0.127297
HR@10=0.258609, NDCG@10=0.125978
HR@15=0.259106, NDCG@15=0.126195
HR@20=0.258775, NDCG@20=0.126813


In [31]:
class BayesianMatrixFactorization():
    """
    Bayesian Matrix Factorization model
    R = PxQ
    p ~ N(p|0, alpha^(-1)I)
    q ~ N(q|0, alpha^(-1)I)
    r = p @ q
    t ~ N(r|p @ q, beta^(-1))
    """

    def __init__(self, alpha_p:float=1., alpha_q:float=1., beta:float=1.):
        """
        ----------
        n_u, n_i: the number of users and items, respectively.
        k : the number of latent factors
        """
        self.alpha_p = alpha_p
        self.alpha_q = alpha_q
        self.beta = beta
        #posterior of p,q 
        self.pos_mean_p = None
        self.pos_precision_p = None
        self.pos_mean_q = None
        self.pos_precision_q = None

    def fit(self, R:np.ndarray, k:int=5):
        """
        bayesian update of parameters given training dataset
        Parameters
        ----------
        R : (u,i) np.ndarray
            training data independent variable, u is the number of users, i is the number of items.
        k : int, the number of latent factors.
        """
        #1. generate matrices P, Q
        P = np.random.normal(0,self.alpha_p,(R.shape[0],k))#uxk
        Q = np.random.normal(0,self.alpha_q,(R.shape[1],k))#ixk
        #2.calculate the posterior with analytical solution
        self.pos_precision_p = self.alpha_p + self.beta * Q @ Q.T # ixi
        self.pos_mean_p = self.beta * R @ np.linalg.inv(self.pos_precision_p) @ Q # uxi,ixi,ixk -> uxk
        self.pos_precision_q = self.alpha_q + self.beta * P @ P.T # uxu
        self.pos_mean_q = self.beta * R.T @ np.linalg.inv(self.pos_precision_q) @ P # ixu,uxu,uxk -> ixk

    def predict(self, sample_size:int=None):
        """
        return mean  of predictive distribution
        Parameters
        ----------
        sample_size : int, optional
            number of samples to draw from the predictive distribution
            (the default is None, no sampling from the distribution)
        Returns
        -------
        R_pred : (u,i) np.ndarray
            mean of the predictive distribution
        R_pred_sample : (u,i,sample_size) np.ndarray
            samples from the predictive distribution
        """
        if sample_size is not None:
            p_sample = np.random.multivariate_normal(self.pos_mean_p, np.linalg.inv(self.pos_precision_q), size=sample_size)
            q_sample = np.random.multivariate_normal(self.pos_mean_q, np.linalg.inv(self.pos_precision_p), size=sample_size)
            R_pred_sample_list=[]
            for i in range(sample_size): 
                R_pred_sample_list.append( np.dot(p_sample, q_sample.T) )
            R_pred_sample = np.mean(np.array(R_pred_sample_list), axis=0)
            return  R_pred_sample #uxi
        
        R_pred = self.pos_mean_p @ self.pos_mean_q.T #R = PxQ
        return R_pred #uxi


for K in [5,10,15,20]:#iterations epoches
    bmf = BayesianMatrixFactorization()
    bmf.fit(R=trainMat, k=K)
    R_pred = bmf.predict()
    hits = []
    ndcgs = []
    for c in range(0,maxu):#6040
        scorelist = []
        gtItem = -1
        for u,i,r in testset[c*100:(c+1)*100]:#604000
            if r == 1.0: gtItem = i
            est = R_pred[int(u)][int(i)]
            scorelist.append([i,est])
        map_item_score = {}
        for item, rate in scorelist: #turn dict
            map_item_score[item] = rate
        ranklist = heapq.nlargest(30, map_item_score, key=map_item_score.get)#topn=20
        hr = getHitRatio(ranklist, gtItem)
        hits.append(hr)
        ndcg = getNDCG(ranklist, gtItem)
        ndcgs.append(ndcg)
    hitratio,ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
    print ("HR@%d=%.6f, NDCG@%d=%.6f" % (K, hitratio, K, ndcg))

HR@5=0.424338, NDCG@5=0.166753
HR@10=0.409934, NDCG@10=0.160615
HR@15=0.424172, NDCG@15=0.173028
HR@20=0.413576, NDCG@20=0.162718
