In [1]:
# coding:utf-8  
'''
@author: Jason.F
@date: 2019.08.02
@function: baseline: SVDBias
           paper: https://dl.acm.org/citation.cfm?id=1401944
           Dataset: MovieLens-1m: https://grouplens.org/datasets/movielens/
           Evaluation: HR@10 NDCG@10
'''
import sys
import time
import logging
import random
import heapq
import math
from collections import defaultdict
import pymc3 as pm
import numpy as np
import pandas as pd
import theano
import theano.tensor as tt
import tensorflow as tf

class DataSet:
    """Load the train/test rating files and provide HR/NDCG helpers.

    Attributes set by the constructor:
        trainset: list of [user, item, rating] rows, ratings binarised to
                  0.0 / 1.0.
        testset:  list of [user, item, label] rows; for every line of the
                  test file the first row is the positive item (label 1.0)
                  and the remaining rows are negative items (label 0.0).
        maxu:     largest user id found in the training file, plus one.
        maxi:     largest item id found in the training file, plus one.
    """

    # Default file locations, used when no explicit path is supplied.
    TRAIN_PATH = "/data/fjsdata/BNMF/ml-1m.train.rating"
    TEST_PATH = "/data/fjsdata/BNMF/ml-1m.test.negative"

    def __init__(self, train_path=None, test_path=None):
        """Read both files; ``None`` selects the class-level default path."""
        self.trainset, self.testset, self.maxu, self.maxi = \
            self._getDataset_as_list(train_path, test_path)

    @staticmethod
    def _parse_pair(text):
        """Parse a ``"(user,item)"`` field into a tuple of two ints."""
        # Explicit parsing instead of eval(): the field comes from a data
        # file and must never be executed as Python code.
        user, item = text.strip().strip('()').split(',')
        return int(user), int(item)

    def _getDataset_as_list(self, train_path=None, test_path=None):
        """Load the training and test files.

        Arguments
        - train_path (str) : tab-separated file whose first three columns
                             are user, item, rating
        - test_path (str)  : tab-separated file; each line starts with a
                             "(user,item)" pair followed by negative item ids
        Returns
        - (trainset, testset, maxu, maxi) as described on the class
        """
        train_path = self.TRAIN_PATH if train_path is None else train_path
        test_path = self.TEST_PATH if test_path is None else test_path

        # trainset
        # np.float was removed from NumPy; np.float64 is the same type.
        data = pd.read_csv(train_path, sep='\t', header=None,
                           names=['user', 'item', 'rating'], usecols=[0, 1, 2],
                           dtype={'user': np.int32, 'item': np.int32,
                                  'rating': np.float64})
        # Implicit feedback: every rating greater than zero becomes 1.0.
        data['rating'] = (data['rating'] > 0.0).astype(np.float64)
        # Plain Python ints so that maxu * maxi cannot overflow int32.
        maxu = int(data['user'].max()) + 1
        maxi = int(data['item'].max()) + 1
        print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % \
                  (data.shape[0], maxu, maxi, data.shape[0]/(maxu*maxi)))
        trainset = data.values.tolist()

        # testset
        testset = []
        with open(test_path, 'r') as fd:
            for line in fd:
                fields = line.strip().split('\t')
                if not fields[0]:
                    continue  # tolerate blank lines
                user, pos_item = self._parse_pair(fields[0])
                testset.append([user, pos_item, 1.0])  # the positive item comes first
                for neg_item in fields[1:]:
                    testset.append([user, int(neg_item), 0.0])  # negative items
        return trainset, testset, maxu, maxi

    def list_to_matrix(self, dataset, maxu, maxi):
        """Scatter [user, item, rating] rows into a dense (maxu, maxi) float32 matrix."""
        dataMat = np.zeros([maxu, maxi], dtype=np.float32)
        for u, i, r in dataset:
            dataMat[int(u), int(i)] = float(r)
        return dataMat

    def list_to_dict(self, dataset):
        """Map (user, item) -> rating for every row; later duplicates win."""
        return {(int(u), int(i)): float(r) for u, i, r in dataset}

    def getHitRatio(self, ranklist, targetItem):
        """Return 1 if targetItem appears in ranklist, otherwise 0."""
        return 1 if targetItem in ranklist else 0

    def getNDCG(self, ranklist, targetItem):
        """Return log(2)/log(rank+2) for the 0-based rank of targetItem, or 0 if absent."""
        for rank, item in enumerate(ranklist):
            if item == targetItem:
                return math.log(2) / math.log(rank + 2)
        return 0

class SVDBias():
    """Biased matrix factorisation trained with stochastic gradient descent.

    A rating is predicted as b + b_u[i] + b_i[j] + P[i] . Q[j], where b is a
    global bias, b_u / b_i are per-user / per-item biases and P / Q are the
    latent factor matrices.
    """

    def __init__(self, R, num_ng):
        """
        Build the list of training samples from the rating matrix.
        Arguments
        - R (ndarray)   : user-item rating matrix
        - num_ng (int)  : number of negative items sampled per positive item
        """
        self.R = R
        self.num_users, self.num_items = R.shape
        self.num_ng = num_ng

        # Positive samples: every cell with a rating above zero (row-major order).
        pos_samples = [(int(i), int(j), R[i, j]) for i, j in np.argwhere(R > 0)]

        # Negative samples: draw from the flat indices of the zero cells rather
        # than materialising one Python tuple per zero cell of the matrix.
        neg_flat = np.flatnonzero(R == 0)
        # Never ask for more negatives than exist; random.sample would raise.
        wanted = min(len(pos_samples) * num_ng, neg_flat.size)
        neg_samples = []
        for k in random.sample(range(neg_flat.size), wanted):
            i, j = divmod(int(neg_flat[k]), self.num_items)
            neg_samples.append((i, j, R[i, j]))

        self.samples = pos_samples + neg_samples

    def train(self, K, alpha=0.001, beta=0.01, epochs=20):
        '''
        Re-initialise all parameters, run SGD and return the predicted matrix.
        - K (int)       : number of latent dimensions
        - alpha (float) : learning rate
        - beta (float)  : regularization parameter
        - epochs (int)  : number of passes over the training samples
        Returns the (num_users, num_items) matrix from full_matrix().
        '''
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.epochs = epochs
        # Initialize user and item latent feature matrices
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Initialize the biases
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        # Global bias = mean of the non-zero ratings; 0.0 when there are none,
        # so an all-zero matrix does not turn every prediction into NaN.
        observed = self.R[self.R != 0]
        self.b = np.mean(observed) if observed.size else 0.0

        # Perform stochastic gradient descent for the requested number of epochs
        for epoch in range(1, self.epochs + 1):
            np.random.shuffle(self.samples)
            self.sgd()
            if epoch % 10 == 0:
                print("Iteration: %d ; error = %.4f" % (epoch, self.mse()))

        return self.full_matrix()

    def mse(self):
        """
        Return the training error over the non-zero entries of R.

        Despite the method name, the value is the square root of the *summed*
        squared error (it is not divided by the number of entries). The value
        is kept as is so that logged numbers stay comparable between runs.
        """
        mask = self.R != 0
        diff = self.R[mask] - self.full_matrix()[mask]
        return np.sqrt(np.sum(diff ** 2))

    def sgd(self):
        """
        Perform one pass of stochastic gradient descent over self.samples
        """
        for i, j, r in self.samples:
            # Compute prediction and error
            e = r - self.get_rating(i, j)

            # Update biases
            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            # Real copy of the user row: basic slicing only returns a view, so
            # without .copy() the Q update below would see the updated P row.
            P_i = self.P[i, :].copy()

            # Update user and item latent feature matrices
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i, :])
            self.Q[j, :] += self.alpha * (e * P_i - self.beta * self.Q[j, :])

    def get_rating(self, i, j):
        """
        Get the predicted rating of user i and item j
        """
        return self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :])

    def full_matrix(self):
        """
        Compute the full prediction matrix from the biases, P and Q
        """
        # b_u is broadcast down the columns and b_i across the rows.
        return self.b + self.b_u[:, np.newaxis] + self.b_i[np.newaxis, :] + self.P.dot(self.Q.T)
    
if __name__ == "__main__":
    # Load the train/test split once; every run below shares it.
    ds = DataSet()
    # Dense training matrix, with num_ng=2 negatives requested per positive.
    svdb = SVDBias(R=ds.list_to_matrix(ds.trainset, ds.maxu, ds.maxi), num_ng=2)
    for K in (8, 16, 32, 64):  # number of latent factors
        nR = svdb.train(K=K, alpha=0.001, beta=0.01, epochs=50)
        hrs, ndcgs = [], []
        # The test list is consumed in consecutive groups of 100 rows; the
        # first row of each group holds the positive item.
        usable_rows = (len(ds.testset) // 100) * 100
        for start in range(0, usable_rows, 100):
            group = ds.testset[start:start + 100]
            targetItem = int(group[0][1])
            # Predicted score of every candidate item for this user.
            map_item_score = {}
            for row in group:
                tu, ti = int(row[0]), int(row[1])
                map_item_score[ti] = nR[tu, ti]
            # Ten highest-scoring candidates.
            ranklist = heapq.nlargest(10, map_item_score, key=map_item_score.get)
            hrs.append(ds.getHitRatio(ranklist, targetItem))
            ndcgs.append(ds.getNDCG(ranklist, targetItem))
        hr = np.mean(hrs)
        ndcg = np.mean(ndcgs)
        print("HR@10:{}, NDCG@10:{}, K:{}".format(hr, ndcg, K))



Dataset Statistics: Interaction = 994169, User = 6040, Item = 3706, Sparsity = 0.0444
Iteration: 10 ; error = 437.6707
Iteration: 20 ; error = 436.3256
Iteration: 30 ; error = 435.6824
Iteration: 40 ; error = 434.7616
Iteration: 50 ; error = 433.1408
HR@10:0.46655629139072846, NDCG@10:0.26113775091566355, K:8
Iteration: 10 ; error = 437.1201
Iteration: 20 ; error = 436.3696
Iteration: 30 ; error = 435.8847
Iteration: 40 ; error = 435.1103
Iteration: 50 ; error = 433.1086
HR@10:0.46572847682119206, NDCG@10:0.2616682762344062, K:16
Iteration: 10 ; error = 437.1657
Iteration: 20 ; error = 436.7812
Iteration: 30 ; error = 436.3101
Iteration: 40 ; error = 435.7479
Iteration: 50 ; error = 434.3493
HR@10:0.46572847682119206, NDCG@10:0.2624086717011347, K:32
Iteration: 10 ; error = 437.0290
Iteration: 20 ; error = 436.8687
Iteration: 30 ; error = 436.7618
Iteration: 40 ; error = 436.5100
Iteration: 50 ; error = 435.9069
HR@10:0.46208609271523177, NDCG@10:0.26079258349727474, K:64


In [14]:
# coding:utf-8  
'''
@author: Jason.F
@date: 2019.08.02
@function: baseline: SVDBias
           paper: https://dl.acm.org/citation.cfm?id=1401944
           Dataset: MovieLens-1m: https://grouplens.org/datasets/movielens/
           Evaluation: HR@10 NDCG@10
'''
import sys
import time
import logging
import random
import heapq
import math
from collections import defaultdict
import pymc3 as pm
import numpy as np
import pandas as pd
import theano
import theano.tensor as tt
import tensorflow as tf

class DataSet:
    """Load the train/test rating files and provide HR/NDCG helpers.

    Attributes set by the constructor:
        trainset: list of [user, item, rating] rows, ratings binarised to
                  0.0 / 1.0.
        testset:  list of [user, item, label] rows; for every line of the
                  test file the first row is the positive item (label 1.0)
                  and the remaining rows are negative items (label 0.0).
        maxu:     largest user id found in the training file, plus one.
        maxi:     largest item id found in the training file, plus one.
    """

    # Default file locations, used when no explicit path is supplied.
    TRAIN_PATH = "/data/fjsdata/BNMF/ml-1m.train.rating"
    TEST_PATH = "/data/fjsdata/BNMF/ml-1m.test.negative"

    def __init__(self, train_path=None, test_path=None):
        """Read both files; ``None`` selects the class-level default path."""
        self.trainset, self.testset, self.maxu, self.maxi = \
            self._getDataset_as_list(train_path, test_path)

    @staticmethod
    def _parse_pair(text):
        """Parse a ``"(user,item)"`` field into a tuple of two ints."""
        # Explicit parsing instead of eval(): the field comes from a data
        # file and must never be executed as Python code.
        user, item = text.strip().strip('()').split(',')
        return int(user), int(item)

    def _getDataset_as_list(self, train_path=None, test_path=None):
        """Load the training and test files.

        Arguments
        - train_path (str) : tab-separated file whose first three columns
                             are user, item, rating
        - test_path (str)  : tab-separated file; each line starts with a
                             "(user,item)" pair followed by negative item ids
        Returns
        - (trainset, testset, maxu, maxi) as described on the class
        """
        train_path = self.TRAIN_PATH if train_path is None else train_path
        test_path = self.TEST_PATH if test_path is None else test_path

        # trainset
        # np.float was removed from NumPy; np.float64 is the same type.
        data = pd.read_csv(train_path, sep='\t', header=None,
                           names=['user', 'item', 'rating'], usecols=[0, 1, 2],
                           dtype={'user': np.int32, 'item': np.int32,
                                  'rating': np.float64})
        # Implicit feedback: every rating greater than zero becomes 1.0.
        data['rating'] = (data['rating'] > 0.0).astype(np.float64)
        # Plain Python ints so that maxu * maxi cannot overflow int32.
        maxu = int(data['user'].max()) + 1
        maxi = int(data['item'].max()) + 1
        print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % \
                  (data.shape[0], maxu, maxi, data.shape[0]/(maxu*maxi)))
        trainset = data.values.tolist()

        # testset
        testset = []
        with open(test_path, 'r') as fd:
            for line in fd:
                fields = line.strip().split('\t')
                if not fields[0]:
                    continue  # tolerate blank lines
                user, pos_item = self._parse_pair(fields[0])
                testset.append([user, pos_item, 1.0])  # the positive item comes first
                for neg_item in fields[1:]:
                    testset.append([user, int(neg_item), 0.0])  # negative items
        return trainset, testset, maxu, maxi

    def list_to_matrix(self, dataset, maxu, maxi):
        """Scatter [user, item, rating] rows into a dense (maxu, maxi) float32 matrix."""
        dataMat = np.zeros([maxu, maxi], dtype=np.float32)
        for u, i, r in dataset:
            dataMat[int(u), int(i)] = float(r)
        return dataMat

    def list_to_dict(self, dataset):
        """Map (user, item) -> rating for every row; later duplicates win."""
        return {(int(u), int(i)): float(r) for u, i, r in dataset}

    def getHitRatio(self, ranklist, targetItem):
        """Return 1 if targetItem appears in ranklist, otherwise 0."""
        return 1 if targetItem in ranklist else 0

    def getNDCG(self, ranklist, targetItem):
        """Return log(2)/log(rank+2) for the 0-based rank of targetItem, or 0 if absent."""
        for rank, item in enumerate(ranklist):
            if item == targetItem:
                return math.log(2) / math.log(rank + 2)
        return 0

class SVDBias():
    """Biased matrix factorisation trained with stochastic gradient descent.

    A rating is predicted as b + b_u[i] + b_i[j] + P[i] . Q[j], where b is a
    global bias, b_u / b_i are per-user / per-item biases and P / Q are the
    latent factor matrices.
    """

    def __init__(self, R, num_ng):
        """
        Build the list of training samples from the rating matrix.
        Arguments
        - R (ndarray)   : user-item rating matrix
        - num_ng (int)  : number of negative items sampled per positive item
        """
        self.R = R
        self.num_users, self.num_items = R.shape
        self.num_ng = num_ng

        # Positive samples: every cell with a rating above zero (row-major order).
        pos_samples = [(int(i), int(j), R[i, j]) for i, j in np.argwhere(R > 0)]

        # Negative samples: draw from the flat indices of the zero cells rather
        # than materialising one Python tuple per zero cell of the matrix.
        neg_flat = np.flatnonzero(R == 0)
        # Never ask for more negatives than exist; random.sample would raise.
        wanted = min(len(pos_samples) * num_ng, neg_flat.size)
        neg_samples = []
        for k in random.sample(range(neg_flat.size), wanted):
            i, j = divmod(int(neg_flat[k]), self.num_items)
            neg_samples.append((i, j, R[i, j]))

        self.samples = pos_samples + neg_samples

    def train(self, K, alpha=0.001, beta=0.01, epochs=20):
        '''
        Re-initialise all parameters, run SGD and return the predicted matrix.
        - K (int)       : number of latent dimensions
        - alpha (float) : learning rate
        - beta (float)  : regularization parameter
        - epochs (int)  : number of passes over the training samples
        Returns the (num_users, num_items) matrix from full_matrix().
        '''
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.epochs = epochs
        # Initialize user and item latent feature matrices
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Initialize the biases
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        # Global bias = mean of the non-zero ratings; 0.0 when there are none,
        # so an all-zero matrix does not turn every prediction into NaN.
        observed = self.R[self.R != 0]
        self.b = np.mean(observed) if observed.size else 0.0

        # Perform stochastic gradient descent for the requested number of epochs
        for epoch in range(1, self.epochs + 1):
            np.random.shuffle(self.samples)
            self.sgd()
            if epoch % 10 == 0:
                print("Iteration: %d ; error = %.4f" % (epoch, self.mse()))

        return self.full_matrix()

    def mse(self):
        """
        Return the training error over the non-zero entries of R.

        Despite the method name, the value is the square root of the *summed*
        squared error (it is not divided by the number of entries). The value
        is kept as is so that logged numbers stay comparable between runs.
        """
        mask = self.R != 0
        diff = self.R[mask] - self.full_matrix()[mask]
        return np.sqrt(np.sum(diff ** 2))

    def sgd(self):
        """
        Perform one pass of stochastic gradient descent over self.samples
        """
        for i, j, r in self.samples:
            # Compute prediction and error
            e = r - self.get_rating(i, j)

            # Update biases
            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            # Real copy of the user row: basic slicing only returns a view, so
            # without .copy() the Q update below would see the updated P row.
            P_i = self.P[i, :].copy()

            # Update user and item latent feature matrices
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i, :])
            self.Q[j, :] += self.alpha * (e * P_i - self.beta * self.Q[j, :])

    def get_rating(self, i, j):
        """
        Get the predicted rating of user i and item j
        """
        return self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :])

    def full_matrix(self):
        """
        Compute the full prediction matrix from the biases, P and Q
        """
        # b_u is broadcast down the columns and b_i across the rows.
        return self.b + self.b_u[:, np.newaxis] + self.b_i[np.newaxis, :] + self.P.dot(self.Q.T)
    
if __name__ == "__main__":
    # Load the train/test split once; every run below shares it.
    ds = DataSet()
    # Dense training matrix, with num_ng=2 negatives requested per positive.
    svdb = SVDBias(R=ds.list_to_matrix(ds.trainset, ds.maxu, ds.maxi), num_ng=2)
    for K in (8, 16, 32, 64):  # number of latent factors
        nR = svdb.train(K=K, alpha=0.001, beta=0.01, epochs=50)
        hrs, ndcgs = [], []
        # The test list is consumed in consecutive groups of 100 rows; the
        # first row of each group holds the positive item.
        usable_rows = (len(ds.testset) // 100) * 100
        for start in range(0, usable_rows, 100):
            group = ds.testset[start:start + 100]
            targetItem = int(group[0][1])
            # Predicted score of every candidate item for this user.
            map_item_score = {}
            for row in group:
                tu, ti = int(row[0]), int(row[1])
                map_item_score[ti] = nR[tu, ti]
            # Ten highest-scoring candidates.
            ranklist = heapq.nlargest(10, map_item_score, key=map_item_score.get)
            hrs.append(ds.getHitRatio(ranklist, targetItem))
            ndcgs.append(ds.getNDCG(ranklist, targetItem))
        hr = np.mean(hrs)
        ndcg = np.mean(ndcgs)
        print("HR@10:{}, NDCG@10:{}, K:{}".format(hr, ndcg, K))

Dataset Statistics: Interaction = 994169, User = 6040, Item = 3706, Sparsity = 0.0444
HR@10:0.08874172185430464, NDCG@10:0.0408303303215734, K:8
HR@10:0.0945364238410596, NDCG@10:0.042871776899174326, K:16
HR@10:0.10066225165562914, NDCG@10:0.044435626680108996, K:32
HR@10:0.10546357615894039, NDCG@10:0.047462935540000334, K:64
