In [5]:
# -*- Encoding:UTF-8 -*-
'''
@author: Jason.F
@data: 2019.07.18
@function: Implementing PMF
           Dataset: Knowledage-CC
           Evaluating: hitradio,ndcg
           https://papers.nips.cc/paper/3208-probabilistic-matrix-factorization.pdf
           Matlab: http://www.utstat.toronto.edu/~rsalakhu/BPMF.html 
@reference: https://github.com/adamzjw/Probabilistic-matrix-factorization-in-Python
'''
import numpy as np
import pandas as pd
from numpy.random import RandomState
import copy
import heapq
import math
from numpy import linalg as LA
import random
#define class PMF
class PMF:
    def __init__(self, num_feat=8, epsilon=1, _lambda=0.1, momentum=0.8, maxepoch=20, num_batches=10, batch_size=1000):
        self.num_feat = num_feat
        self.epsilon = epsilon
        self._lambda = _lambda
        self.momentum = momentum
        self.maxepoch = maxepoch
        self.num_batches = num_batches
        self.batch_size = batch_size
        
        self.w_C = None
        self.w_I = None

        self.err_train = []
        self.err_val = []
        
    def fit(self, train_vec, val_vec):   
        # mean subtraction
        self.mean_inv = np.mean(train_vec[:,2])
        
        pairs_tr = train_vec.shape[0]
        pairs_va = val_vec.shape[0]
        
        # 1-p-i, 2-m-c
        num_inv = int(max(np.amax(train_vec[:,0]), np.amax(val_vec[:,0]))) + 1
        num_com = int(max(np.amax(train_vec[:,1]), np.amax(val_vec[:,1]))) + 1

        incremental = False
        if ((not incremental) or (self.w_C is None)):
            # initialize
            self.epoch = 0
            self.w_C = 0.1 * np.random.randn(num_com, self.num_feat)
            self.w_I = 0.1 * np.random.randn(num_inv, self.num_feat)
            
            self.w_C_inc = np.zeros((num_com, self.num_feat))
            self.w_I_inc = np.zeros((num_inv, self.num_feat))
        
        
        while self.epoch < self.maxepoch:
            self.epoch += 1

            # Shuffle training truples
            shuffled_order = np.arange(train_vec.shape[0])
            np.random.shuffle(shuffled_order)

            # Batch update
            for batch in range(self.num_batches):
                # print "epoch %d batch %d" % (self.epoch, batch+1)

                batch_idx = np.mod(np.arange(self.batch_size * batch,
                                             self.batch_size * (batch+1)),
                                   shuffled_order.shape[0])

                batch_invID = np.array(train_vec[shuffled_order[batch_idx], 0], dtype='int32')
                batch_comID = np.array(train_vec[shuffled_order[batch_idx], 1], dtype='int32')

                # Compute Objective Function
                pred_out = np.sum(np.multiply(self.w_I[batch_invID,:], 
                                                self.w_C[batch_comID,:]),
                                axis=1) # mean_inv subtracted

                rawErr = pred_out - train_vec[shuffled_order[batch_idx], 2] + self.mean_inv

                # Compute gradients
                Ix_C = 2 * np.multiply(rawErr[:, np.newaxis], self.w_I[batch_invID,:]) \
                        + self._lambda * self.w_C[batch_comID,:]
                Ix_I = 2 * np.multiply(rawErr[:, np.newaxis], self.w_C[batch_comID,:]) \
                        + self._lambda * self.w_I[batch_invID,:]
            
                dw_C = np.zeros((num_com, self.num_feat))
                dw_I = np.zeros((num_inv, self.num_feat))

                # loop to aggreate the gradients of the same element
                for i in range(self.batch_size):
                    dw_C[batch_comID[i],:] += Ix_C[i,:]
                    dw_I[batch_invID[i],:] += Ix_I[i,:]


                # Update with momentum
                self.w_C_inc = self.momentum * self.w_C_inc + self.epsilon * dw_C / self.batch_size
                self.w_I_inc = self.momentum * self.w_I_inc + self.epsilon * dw_I / self.batch_size


                self.w_C = self.w_C - self.w_C_inc
                self.w_I = self.w_I - self.w_I_inc

                # Compute Objective Function after
                if batch == self.num_batches - 1:
                    pred_out = np.sum(np.multiply(self.w_I[np.array(train_vec[:,0], dtype='int32'),:],
                                                    self.w_C[np.array(train_vec[:,1], dtype='int32'),:]),
                                        axis=1) # mean_inv subtracted
                    rawErr = pred_out - train_vec[:, 2] + self.mean_inv
                    obj = LA.norm(rawErr) ** 2 \
                            + 0.5*self._lambda*(LA.norm(self.w_I) ** 2 + LA.norm(self.w_C) ** 2)

                    self.err_train.append(np.sqrt(obj/pairs_tr))

                # Compute validation error
                if batch == self.num_batches - 1:
                    pred_out = np.sum(np.multiply(self.w_I[np.array(val_vec[:,0], dtype='int32'),:],
                                                    self.w_C[np.array(val_vec[:,1], dtype='int32'),:]),
                                        axis=1) # mean_inv subtracted
                    rawErr = pred_out - val_vec[:, 2] + self.mean_inv
                    self.err_val.append(LA.norm(rawErr)/np.sqrt(pairs_va))

                # Print info
                if batch == self.num_batches - 1:
                    print ('Training RMSE: %f, Val RMSE %f' % (self.err_train[-1], self.err_val[-1]))
    
    def predict(self, invID, comID): 
        return np.dot(self.w_C[comID,:], self.w_I[invID,:]) + self.mean_inv
               
    def evaluate(self, test_vec, k=10):
        def getHitRatio(ranklist, gtItem):
            for item in ranklist:
                if item == gtItem:
                    return 1
            return 0

        def getNDCG(ranklist, gtItem):
            for i in range(len(ranklist)):
                item = ranklist[i]
                if item == gtItem:
                    return math.log(2) / math.log(i+2)
            return 0
        
        testset = pd.DataFrame(test_vec, columns=['u','i'])
        hits = []
        ndcgs = []
        list_csr = list(set(np.array(testset['u']).tolist()))
        for u in list_csr:
            csrset = np.array(testset[testset['u']==u]).tolist()
            scorelist = []
            positem = csrset[0][1]#first item is positive
            for _, i in csrset:
                scorelist.append([i, self.predict(u,i)])
            #get topk 
            map_item_score = {}
            for item, rate in scorelist: #turn dict
                map_item_score[item] = rate
            ranklist = heapq.nlargest(10, map_item_score, key=map_item_score.get)#default Topn=10
            hr = getHitRatio(ranklist, positem)
            hits.append(hr)
            ndcg = getNDCG(ranklist, positem)
            ndcgs.append(ndcg)
        hitratio,ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
        return hitratio,ndcg
        
#loading dataset
def getTrainset(filePath):
    kbdata = pd.read_csv("/data/fjsdata/ctKngBase/kbcc_trainset.csv", sep='|', low_memory=False, dtype={'csr':int, 'ke':int, 'num':float})
    kbdata['num']=kbdata['num'].apply(lambda x: 1 if float(x)>0.0 else 0)
    trainset = np.array(kbdata).tolist()
    maxu = kbdata['csr'].max()
    maxi = kbdata['ke'].max()
    maxr = kbdata['num'].max()
    return trainset, maxr, maxu, maxi

def getTestset(filePath):
    kbdata = pd.read_csv("/data/fjsdata/ctKngBase/kbcc_testset.csv", sep='|', low_memory=False, dtype={'csr':int, 'ke':int, 'num':float})
    #testset['num']=testset['num'].apply(lambda x: 1 if float(x)>0.0 else 0)
    testset = np.array(kbdata[['csr','ke']]).tolist()
    return testset    

def getTrainDict(data):
        dataDict = {}
        for i in data:
            dataDict[(i[0], i[1])] = i[2]
        return dataDict
    
def getNegTrain(data, maxi, negNum=4):
        datadict = getTrainDict(data)
        trainneg = []
        for i in data:
            trainneg.append([i[0],i[1],i[2]])
            for t in range(negNum):
                j = np.random.randint(maxi)
                while (i[0], j) in datadict:
                    j = np.random.randint(maxi)
                trainneg.append([i[0], j, 0.0])
        return trainneg
    
if __name__ == '__main__':
    trainset, maxr, maxu, maxi = getTrainset("/data/fjsdata/ctKngBase/kbcc_trainset.csv")
    trainneg = getNegTrain(trainset, maxi)
    testset = getTestset("/data/fjsdata/ctKngBase/kbcc_testset.csv")
    print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % (len(trainset), maxu+1, maxi+1, len(trainset)/(maxu*maxr)))

    print ("%3s%20s%20s" % ('K','HR@10', 'NDCG@10'))
    for K in [8, 16, 32, 64]:
        pmf = PMF(num_feat=K)
        valtest = random.sample(trainset,int(0.2*len(trainset)))
        pmf.fit(np.array(trainneg), np.array(valtest))
        hit, ndcg = pmf.evaluate(testset)
        print ("%3d%20.6f%20.6f" % (K, hit, ndcg))

Dataset Statistics: Interaction = 2547452, User = 10216, Item = 96324, Sparsity = 249.3835
  K               HR@10             NDCG@10
Training RMSE: 0.401038, Val RMSE 0.800511
Training RMSE: 0.401035, Val RMSE 0.800509
Training RMSE: 0.401031, Val RMSE 0.800506
Training RMSE: 0.401027, Val RMSE 0.800503
Training RMSE: 0.401024, Val RMSE 0.800501
Training RMSE: 0.401020, Val RMSE 0.800498
Training RMSE: 0.401017, Val RMSE 0.800496
Training RMSE: 0.401013, Val RMSE 0.800493
Training RMSE: 0.401010, Val RMSE 0.800491
Training RMSE: 0.401007, Val RMSE 0.800489
Training RMSE: 0.401003, Val RMSE 0.800486
Training RMSE: 0.401000, Val RMSE 0.800484
Training RMSE: 0.400997, Val RMSE 0.800482
Training RMSE: 0.400993, Val RMSE 0.800479
Training RMSE: 0.400990, Val RMSE 0.800477
Training RMSE: 0.400987, Val RMSE 0.800474
Training RMSE: 0.400983, Val RMSE 0.800472
Training RMSE: 0.400980, Val RMSE 0.800469
Training RMSE: 0.400977, Val RMSE 0.800467
Training RMSE: 0.400973, Val RMSE 0.800464
  8  

In [3]:
# -*- Encoding:UTF-8 -*-
'''
@author: Jason.F
@data: 2019.07.18
@function: Implementing PMF
           Dataset: Pinterest-20
           Evaluating: hitradio,ndcg
           https://papers.nips.cc/paper/3208-probabilistic-matrix-factorization.pdf
           Matlab: http://www.utstat.toronto.edu/~rsalakhu/BPMF.html 
@reference: https://github.com/adamzjw/Probabilistic-matrix-factorization-in-Python
'''
import numpy as np
import pandas as pd
from numpy.random import RandomState
import copy
import heapq
import math
from numpy import linalg as LA
import random
#define class PMF
class PMF:
    def __init__(self, num_feat=8, epsilon=1, _lambda=0.1, momentum=0.8, maxepoch=20, num_batches=10, batch_size=1000):
        self.num_feat = num_feat
        self.epsilon = epsilon
        self._lambda = _lambda
        self.momentum = momentum
        self.maxepoch = maxepoch
        self.num_batches = num_batches
        self.batch_size = batch_size
        
        self.w_C = None
        self.w_I = None

        self.err_train = []
        self.err_val = []
        
    def fit(self, train_vec, val_vec):   
        # mean subtraction
        self.mean_inv = np.mean(train_vec[:,2])
        
        pairs_tr = train_vec.shape[0]
        pairs_va = val_vec.shape[0]
        
        # 1-p-i, 2-m-c
        num_inv = int(max(np.amax(train_vec[:,0]), np.amax(val_vec[:,0]))) + 1
        num_com = int(max(np.amax(train_vec[:,1]), np.amax(val_vec[:,1]))) + 1

        incremental = False
        if ((not incremental) or (self.w_C is None)):
            # initialize
            self.epoch = 0
            self.w_C = 0.1 * np.random.randn(num_com, self.num_feat)
            self.w_I = 0.1 * np.random.randn(num_inv, self.num_feat)
            
            self.w_C_inc = np.zeros((num_com, self.num_feat))
            self.w_I_inc = np.zeros((num_inv, self.num_feat))
        
        
        while self.epoch < self.maxepoch:
            self.epoch += 1

            # Shuffle training truples
            shuffled_order = np.arange(train_vec.shape[0])
            np.random.shuffle(shuffled_order)

            # Batch update
            for batch in range(self.num_batches):
                # print "epoch %d batch %d" % (self.epoch, batch+1)

                batch_idx = np.mod(np.arange(self.batch_size * batch,
                                             self.batch_size * (batch+1)),
                                   shuffled_order.shape[0])

                batch_invID = np.array(train_vec[shuffled_order[batch_idx], 0], dtype='int32')
                batch_comID = np.array(train_vec[shuffled_order[batch_idx], 1], dtype='int32')

                # Compute Objective Function
                pred_out = np.sum(np.multiply(self.w_I[batch_invID,:], 
                                                self.w_C[batch_comID,:]),
                                axis=1) # mean_inv subtracted

                rawErr = pred_out - train_vec[shuffled_order[batch_idx], 2] + self.mean_inv

                # Compute gradients
                Ix_C = 2 * np.multiply(rawErr[:, np.newaxis], self.w_I[batch_invID,:]) \
                        + self._lambda * self.w_C[batch_comID,:]
                Ix_I = 2 * np.multiply(rawErr[:, np.newaxis], self.w_C[batch_comID,:]) \
                        + self._lambda * self.w_I[batch_invID,:]
            
                dw_C = np.zeros((num_com, self.num_feat))
                dw_I = np.zeros((num_inv, self.num_feat))

                # loop to aggreate the gradients of the same element
                for i in range(self.batch_size):
                    dw_C[batch_comID[i],:] += Ix_C[i,:]
                    dw_I[batch_invID[i],:] += Ix_I[i,:]


                # Update with momentum
                self.w_C_inc = self.momentum * self.w_C_inc + self.epsilon * dw_C / self.batch_size
                self.w_I_inc = self.momentum * self.w_I_inc + self.epsilon * dw_I / self.batch_size


                self.w_C = self.w_C - self.w_C_inc
                self.w_I = self.w_I - self.w_I_inc

                # Compute Objective Function after
                if batch == self.num_batches - 1:
                    pred_out = np.sum(np.multiply(self.w_I[np.array(train_vec[:,0], dtype='int32'),:],
                                                    self.w_C[np.array(train_vec[:,1], dtype='int32'),:]),
                                        axis=1) # mean_inv subtracted
                    rawErr = pred_out - train_vec[:, 2] + self.mean_inv
                    obj = LA.norm(rawErr) ** 2 \
                            + 0.5*self._lambda*(LA.norm(self.w_I) ** 2 + LA.norm(self.w_C) ** 2)

                    self.err_train.append(np.sqrt(obj/pairs_tr))

                # Compute validation error
                if batch == self.num_batches - 1:
                    pred_out = np.sum(np.multiply(self.w_I[np.array(val_vec[:,0], dtype='int32'),:],
                                                    self.w_C[np.array(val_vec[:,1], dtype='int32'),:]),
                                        axis=1) # mean_inv subtracted
                    rawErr = pred_out - val_vec[:, 2] + self.mean_inv
                    self.err_val.append(LA.norm(rawErr)/np.sqrt(pairs_va))

                # Print info
                if batch == self.num_batches - 1:
                    print ('Training RMSE: %f, Val RMSE %f' % (self.err_train[-1], self.err_val[-1]))
    
    def predict(self, invID, comID): 
        return np.dot(self.w_C[comID,:], self.w_I[invID,:]) + self.mean_inv
               
    def evaluate(self, test_vec, k=10):
        def getHitRatio(ranklist, gtItem):
            for item in ranklist:
                if item == gtItem:
                    return 1
            return 0

        def getNDCG(ranklist, gtItem):
            for i in range(len(ranklist)):
                item = ranklist[i]
                if item == gtItem:
                    return math.log(2) / math.log(i+2)
            return 0
        
        testset = pd.DataFrame(test_vec, columns=['u','i'])
        hits = []
        ndcgs = []
        list_csr = list(set(np.array(testset['u']).tolist()))
        for u in list_csr:
            csrset = np.array(testset[testset['u']==u]).tolist()
            scorelist = []
            positem = csrset[0][1]#first item is positive
            for _, i in csrset:
                scorelist.append([i, self.predict(u,i)])
            #get topk 
            map_item_score = {}
            for item, rate in scorelist: #turn dict
                map_item_score[item] = rate
            ranklist = heapq.nlargest(10, map_item_score, key=map_item_score.get)#default Topn=10
            hr = getHitRatio(ranklist, positem)
            hits.append(hr)
            ndcg = getNDCG(ranklist, positem)
            ndcgs.append(ndcg)
        hitratio,ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
        return hitratio,ndcg
        
#loading dataset
def getTrainset(filePath):
    trainset = []
    maxu = 0 
    maxi = 0 
    maxr = 0.0
    with open(filePath, 'r') as fd:
        line = fd.readline()
        while line != None and line != "":
            arr = line.split("\t")
            u, i, rating = int(arr[0]), int(arr[1]), float(arr[2])
            trainset.append([int(arr[0]), int(arr[1]), float(arr[2])])
            if rating > maxr: maxr = rating
            if u > maxu: maxu = u
            if i > maxi: maxi = i
            line = fd.readline()
        return trainset, maxr, maxu, maxi

def getTestset(filePath):
    testset = []
    with open(filePath, 'r') as fd:
        line = fd.readline()
        while line != None and line != '':
            arr = line.split('\t')
            u = eval(arr[0])[0]
            testset.append([u, eval(arr[0])[1]])#one postive item
            for i in arr[1:]:
                testset.append([u, int(i)]) #99 negative items
            line = fd.readline()
    return testset    

def getTrainDict(data):
        dataDict = {}
        for i in data:
            dataDict[(i[0], i[1])] = i[2]
        return dataDict
    
def getNegTrain(data, maxi, negNum=4):
        datadict = getTrainDict(data)
        trainneg = []
        for i in data:
            trainneg.append([i[0],i[1],i[2]])
            for t in range(negNum):
                j = np.random.randint(maxi)
                while (i[0], j) in datadict:
                    j = np.random.randint(maxi)
                trainneg.append([i[0], j, 0.0])
        return trainneg
    
if __name__ == '__main__':
    trainset, maxr, maxu, maxi = getTrainset("/data/fjsdata/ctKngBase/ml/pinterest-20.train.rating")
    trainneg = getNegTrain(trainset, maxi)
    testset = getTestset("/data/fjsdata/ctKngBase/ml/pinterest-20.test.negative")
    print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % (len(trainset), maxu+1, maxi+1, len(trainset)/(maxu*maxr)))

    print ("%3s%20s%20s" % ('K','HR@10', 'NDCG@10'))
    for K in [8, 16, 32, 64]:
        pmf = PMF(num_feat=K)
        valtest = random.sample(trainset,int(0.2*len(trainset)))
        pmf.fit(np.array(trainneg), np.array(valtest))
        hit, ndcg = pmf.evaluate(testset)
        print ("%3d%20.6f%20.6f" % (K, hit, ndcg))

Dataset Statistics: Interaction = 1445622, User = 55187, Item = 9916, Sparsity = 26.1954
  K               HR@10             NDCG@10
Training RMSE: 0.401051, Val RMSE 0.800415
Training RMSE: 0.401049, Val RMSE 0.800412
Training RMSE: 0.401047, Val RMSE 0.800410
Training RMSE: 0.401044, Val RMSE 0.800407
Training RMSE: 0.401042, Val RMSE 0.800404
Training RMSE: 0.401040, Val RMSE 0.800401
Training RMSE: 0.401037, Val RMSE 0.800399
Training RMSE: 0.401035, Val RMSE 0.800396
Training RMSE: 0.401032, Val RMSE 0.800393
Training RMSE: 0.401030, Val RMSE 0.800391
Training RMSE: 0.401028, Val RMSE 0.800388
Training RMSE: 0.401025, Val RMSE 0.800386
Training RMSE: 0.401023, Val RMSE 0.800383
Training RMSE: 0.401020, Val RMSE 0.800380
Training RMSE: 0.401018, Val RMSE 0.800378
Training RMSE: 0.401015, Val RMSE 0.800376
Training RMSE: 0.401013, Val RMSE 0.800373
Training RMSE: 0.401011, Val RMSE 0.800371
Training RMSE: 0.401008, Val RMSE 0.800368
Training RMSE: 0.401006, Val RMSE 0.800366
  8    

In [2]:
# -*- Encoding:UTF-8 -*-
'''
@author: Jason.F
@data: 2019.07.18
@function: Implementing PMF
           Dataset: Movielen Dataset(ml-1m) 
           Evaluating: hitradio,ndcg
           https://papers.nips.cc/paper/3208-probabilistic-matrix-factorization.pdf
           Matlab: http://www.utstat.toronto.edu/~rsalakhu/BPMF.html 
@reference: https://github.com/adamzjw/Probabilistic-matrix-factorization-in-Python
'''
import numpy as np
import pandas as pd
from numpy.random import RandomState
import copy
import heapq
import math
from numpy import linalg as LA
import random
#define class PMF
class PMF:
    def __init__(self, num_feat=8, epsilon=1, _lambda=0.1, momentum=0.8, maxepoch=20, num_batches=10, batch_size=1000):
        self.num_feat = num_feat
        self.epsilon = epsilon
        self._lambda = _lambda
        self.momentum = momentum
        self.maxepoch = maxepoch
        self.num_batches = num_batches
        self.batch_size = batch_size
        
        self.w_C = None
        self.w_I = None

        self.err_train = []
        self.err_val = []
        
    def fit(self, train_vec, val_vec):   
        # mean subtraction
        self.mean_inv = np.mean(train_vec[:,2])
        
        pairs_tr = train_vec.shape[0]
        pairs_va = val_vec.shape[0]
        
        # 1-p-i, 2-m-c
        num_inv = int(max(np.amax(train_vec[:,0]), np.amax(val_vec[:,0]))) + 1
        num_com = int(max(np.amax(train_vec[:,1]), np.amax(val_vec[:,1]))) + 1

        incremental = False
        if ((not incremental) or (self.w_C is None)):
            # initialize
            self.epoch = 0
            self.w_C = 0.1 * np.random.randn(num_com, self.num_feat)
            self.w_I = 0.1 * np.random.randn(num_inv, self.num_feat)
            
            self.w_C_inc = np.zeros((num_com, self.num_feat))
            self.w_I_inc = np.zeros((num_inv, self.num_feat))
        
        
        while self.epoch < self.maxepoch:
            self.epoch += 1

            # Shuffle training truples
            shuffled_order = np.arange(train_vec.shape[0])
            np.random.shuffle(shuffled_order)

            # Batch update
            for batch in range(self.num_batches):
                # print "epoch %d batch %d" % (self.epoch, batch+1)

                batch_idx = np.mod(np.arange(self.batch_size * batch,
                                             self.batch_size * (batch+1)),
                                   shuffled_order.shape[0])

                batch_invID = np.array(train_vec[shuffled_order[batch_idx], 0], dtype='int32')
                batch_comID = np.array(train_vec[shuffled_order[batch_idx], 1], dtype='int32')

                # Compute Objective Function
                pred_out = np.sum(np.multiply(self.w_I[batch_invID,:], 
                                                self.w_C[batch_comID,:]),
                                axis=1) # mean_inv subtracted

                rawErr = pred_out - train_vec[shuffled_order[batch_idx], 2] + self.mean_inv

                # Compute gradients
                Ix_C = 2 * np.multiply(rawErr[:, np.newaxis], self.w_I[batch_invID,:]) \
                        + self._lambda * self.w_C[batch_comID,:]
                Ix_I = 2 * np.multiply(rawErr[:, np.newaxis], self.w_C[batch_comID,:]) \
                        + self._lambda * self.w_I[batch_invID,:]
            
                dw_C = np.zeros((num_com, self.num_feat))
                dw_I = np.zeros((num_inv, self.num_feat))

                # loop to aggreate the gradients of the same element
                for i in range(self.batch_size):
                    dw_C[batch_comID[i],:] += Ix_C[i,:]
                    dw_I[batch_invID[i],:] += Ix_I[i,:]


                # Update with momentum
                self.w_C_inc = self.momentum * self.w_C_inc + self.epsilon * dw_C / self.batch_size
                self.w_I_inc = self.momentum * self.w_I_inc + self.epsilon * dw_I / self.batch_size


                self.w_C = self.w_C - self.w_C_inc
                self.w_I = self.w_I - self.w_I_inc

                # Compute Objective Function after
                if batch == self.num_batches - 1:
                    pred_out = np.sum(np.multiply(self.w_I[np.array(train_vec[:,0], dtype='int32'),:],
                                                    self.w_C[np.array(train_vec[:,1], dtype='int32'),:]),
                                        axis=1) # mean_inv subtracted
                    rawErr = pred_out - train_vec[:, 2] + self.mean_inv
                    obj = LA.norm(rawErr) ** 2 \
                            + 0.5*self._lambda*(LA.norm(self.w_I) ** 2 + LA.norm(self.w_C) ** 2)

                    self.err_train.append(np.sqrt(obj/pairs_tr))

                # Compute validation error
                if batch == self.num_batches - 1:
                    pred_out = np.sum(np.multiply(self.w_I[np.array(val_vec[:,0], dtype='int32'),:],
                                                    self.w_C[np.array(val_vec[:,1], dtype='int32'),:]),
                                        axis=1) # mean_inv subtracted
                    rawErr = pred_out - val_vec[:, 2] + self.mean_inv
                    self.err_val.append(LA.norm(rawErr)/np.sqrt(pairs_va))

                # Print info
                if batch == self.num_batches - 1:
                    print ('Training RMSE: %f, Val RMSE %f' % (self.err_train[-1], self.err_val[-1]))
    
    def predict(self, invID, comID): 
        return np.dot(self.w_C[comID,:], self.w_I[invID,:]) + self.mean_inv
               
    def evaluate(self, test_vec, k=10):
        def getHitRatio(ranklist, gtItem):
            for item in ranklist:
                if item == gtItem:
                    return 1
            return 0

        def getNDCG(ranklist, gtItem):
            for i in range(len(ranklist)):
                item = ranklist[i]
                if item == gtItem:
                    return math.log(2) / math.log(i+2)
            return 0
        
        testset = pd.DataFrame(test_vec, columns=['u','i'])
        hits = []
        ndcgs = []
        list_csr = list(set(np.array(testset['u']).tolist()))
        for u in list_csr:
            csrset = np.array(testset[testset['u']==u]).tolist()
            scorelist = []
            positem = csrset[0][1]#first item is positive
            for _, i in csrset:
                scorelist.append([i, self.predict(u,i)])
            #get topk 
            map_item_score = {}
            for item, rate in scorelist: #turn dict
                map_item_score[item] = rate
            ranklist = heapq.nlargest(10, map_item_score, key=map_item_score.get)#default Topn=10
            hr = getHitRatio(ranklist, positem)
            hits.append(hr)
            ndcg = getNDCG(ranklist, positem)
            ndcgs.append(ndcg)
        hitratio,ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
        return hitratio,ndcg
        
#loading dataset
def getTrainset(filePath):
    trainset = []
    maxu = 0 
    maxi = 0 
    maxr = 0.0
    with open(filePath, 'r') as fd:
        line = fd.readline()
        while line != None and line != "":
            arr = line.split("\t")
            u, i, rating = int(arr[0]), int(arr[1]), float(arr[2])
            trainset.append([int(arr[0]), int(arr[1]), float(arr[2])])
            if rating > maxr: maxr = rating
            if u > maxu: maxu = u
            if i > maxi: maxi = i
            line = fd.readline()
        return trainset, maxr, maxu, maxi

def getTestset(filePath):
    testset = []
    with open(filePath, 'r') as fd:
        line = fd.readline()
        while line != None and line != '':
            arr = line.split('\t')
            u = eval(arr[0])[0]
            testset.append([u, eval(arr[0])[1]])#one postive item
            for i in arr[1:]:
                testset.append([u, int(i)]) #99 negative items
            line = fd.readline()
    return testset    

def getTrainDict(data):
        dataDict = {}
        for i in data:
            dataDict[(i[0], i[1])] = i[2]
        return dataDict
    
def getNegTrain(data, maxi, negNum=4):
        datadict = getTrainDict(data)
        trainneg = []
        for i in data:
            trainneg.append([i[0],i[1],i[2]])
            for t in range(negNum):
                j = np.random.randint(maxi)
                while (i[0], j) in datadict:
                    j = np.random.randint(maxi)
                trainneg.append([i[0], j, 0.0])
        return trainneg
    
if __name__ == '__main__':
    trainset, maxr, maxu, maxi = getTrainset("/data/fjsdata/ctKngBase/ml/ml-1m.train.rating")
    trainneg = getNegTrain(trainset, maxi)
    testset = getTestset("/data/fjsdata/ctKngBase/ml/ml-1m.test.negative")
    print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % (len(trainset), maxu+1, maxi+1, len(trainset)/(maxu*maxr)))

    print ("%3s%20s%20s" % ('K','HR@10', 'NDCG@10'))
    for K in [8, 16, 32, 64]:
        pmf = PMF(num_feat=K)
        valtest = random.sample(trainset,int(0.2*len(trainset)))
        pmf.fit(np.array(trainneg), np.array(valtest))
        hit, ndcg = pmf.evaluate(testset)
        print ("%3d%20.6f%20.6f" % (K, hit, ndcg))

Dataset Statistics: Interaction = 994169, User = 6040, Item = 3706, Sparsity = 32.9250
  K               HR@10             NDCG@10
Training RMSE: 1.517382, Val RMSE 3.076740
Training RMSE: 1.517375, Val RMSE 3.076729
Training RMSE: 1.517368, Val RMSE 3.076716
Training RMSE: 1.517360, Val RMSE 3.076704
Training RMSE: 1.517354, Val RMSE 3.076693
Training RMSE: 1.517348, Val RMSE 3.076689
Training RMSE: 1.517342, Val RMSE 3.076680
Training RMSE: 1.517335, Val RMSE 3.076671
Training RMSE: 1.517328, Val RMSE 3.076662
Training RMSE: 1.517321, Val RMSE 3.076653
Training RMSE: 1.517314, Val RMSE 3.076642
Training RMSE: 1.517307, Val RMSE 3.076633
Training RMSE: 1.517301, Val RMSE 3.076626
Training RMSE: 1.517295, Val RMSE 3.076620
Training RMSE: 1.517289, Val RMSE 3.076613
Training RMSE: 1.517282, Val RMSE 3.076606
Training RMSE: 1.517275, Val RMSE 3.076599
Training RMSE: 1.517269, Val RMSE 3.076591
Training RMSE: 1.517263, Val RMSE 3.076583
Training RMSE: 1.517255, Val RMSE 3.076572
  8      

In [6]:
# -*- Encoding:UTF-8 -*-
'''
@author: Jason.F
@data: 2019.07.18
@function: Implementing PMF
           Dataset: Movielen Dataset(ml-1m) 
           Evaluating: hitradio,ndcg
           https://papers.nips.cc/paper/3208-probabilistic-matrix-factorization.pdf
           Matlab: http://www.utstat.toronto.edu/~rsalakhu/BPMF.html 
@reference: https://github.com/xuChenSJTU/PMF
'''
import numpy as np
import pandas as pd
from numpy.random import RandomState
import copy
import heapq
import math
#define class PMF
class PMF():
    
    def __init__(self, R, K=8):
        # initialize some parameters
        self.lambda_alpha = 1e-2
        self.lambda_beta = 1e-2
        self.momuntum = 0.9
        self.lr = 0.001
        self.iterations = 20
        #handle data
        self.R = R
        self.random_state = RandomState(None)
        self.I = copy.deepcopy(self.R)
        self.I[self.I != 0] = 1
        self.U = 0.1*self.random_state.rand(np.size(R, 0), K)# K is number of latent factors
        self.V = 0.1*self.random_state.rand(np.size(R, 1), K)

    def loss(self):
        # the loss function of the model
        loss = np.sum(self.I*(self.R-np.dot(self.U, self.V.T))**2) + self.lambda_alpha*np.sum(np.square(self.U)) + self.lambda_beta*np.sum(np.square(self.V))
        return loss

    def train(self):
        # monemtum
        momuntum_u = np.zeros(self.U.shape)
        momuntum_v = np.zeros(self.V.shape)

        for it in range(self.iterations):
            # derivate of Vi
            grads_u = np.dot(self.I*(self.R-np.dot(self.U, self.V.T)), -self.V) + self.lambda_alpha*self.U

            # derivate of Tj
            grads_v = np.dot((self.I*(self.R-np.dot(self.U, self.V.T))).T, -self.U) + self.lambda_beta*self.V

            # update the parameters
            momuntum_u = (self.momuntum * momuntum_u) + self.lr * grads_u
            momuntum_v = (self.momuntum * momuntum_v) + self.lr * grads_v
            self.U = self.U - momuntum_u
            self.V = self.V - momuntum_v

            # training evaluation
            #train_loss = self.loss()
            #print('traning iteration:{: d} ,loss:{: f}'.format(it, train_loss))

        return self.U, self.V
    
    def predict(self, data):
        index_data = np.array([[int(ele[0]), int(ele[1])] for ele in data], dtype=int)
        u_features = self.U.take(index_data.take(0, axis=1), axis=0)
        v_features = self.V.take(index_data.take(1, axis=1), axis=0)
        preds_value_array = np.sum(u_features*v_features, 1)
        return preds_value_array.tolist()
    
    
    def evaluate(self, testset):
        def getHitRatio(ranklist, gtItem):
            for item in ranklist:
                if item == gtItem:
                    return 1
            return 0

        def getNDCG(ranklist, gtItem):
            for i in range(len(ranklist)):
                item = ranklist[i]
                if item == gtItem:
                    return math.log(2) / math.log(i+2)
            return 0
        testset = pd.DataFrame(testset, columns=['u','i'])
        hits = []
        ndcgs = []
        list_csr = list(set(np.array(testset['u']).tolist()))
        for u in list_csr:
            csrset = np.array(testset[testset['u']==u]).tolist()
            csrpred = self.predict(csrset)
            scorelist = []
            positem = csrset[0][1]#first item is positive
            for j in range(len(csrset)):  
                scorelist.append([csrset[j][1], csrpred[j]])
            #get topk 
            map_item_score = {}
            for item, rate in scorelist: #turn dict
                map_item_score[item] = rate
            ranklist = heapq.nlargest(10, map_item_score, key=map_item_score.get)#default Topn=10
            hr = getHitRatio(ranklist, positem)
            hits.append(hr)
            ndcg = getNDCG(ranklist, positem)
            ndcgs.append(ndcg)
        hitratio,ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
        return hitratio,ndcg
        
        
#loading dataset
def getTrainset(filePath):
    trainset = []
    maxu = 0 
    maxi = 0 
    maxr = 0.0
    with open(filePath, 'r') as fd:
        line = fd.readline()
        while line != None and line != "":
            arr = line.split("\t")
            u, i, rating = int(arr[0]), int(arr[1]), float(arr[2])
            trainset.append([int(arr[0]), int(arr[1]), float(arr[2])])
            if rating > maxr: maxr = rating
            if u > maxu: maxu = u
            if i > maxi: maxi = i
            line = fd.readline()
        return trainset, maxr, maxu, maxi

def getTestset(filePath):
    testset = []
    with open(filePath, 'r') as fd:
        line = fd.readline()
        while line != None and line != '':
            arr = line.split('\t')
            u = eval(arr[0])[0]
            testset.append([u, eval(arr[0])[1]])#one postive item
            for i in arr[1:]:
                testset.append([u, int(i)]) #99 negative items
            line = fd.readline()
    return testset    

if __name__ == '__main__':
    trainset, maxr, maxu, maxi = getTrainset("/data/fjsdata/ctKngBase/ml/ml-1m.train.rating")
    testset = getTestset("/data/fjsdata/ctKngBase/ml/ml-1m.test.negative")
    print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % (len(trainset), maxu+1, maxi+1, len(trainset)/(maxu*maxr)))
    R = np.zeros([maxu+1, maxi+1], dtype=np.float32)
    for i in trainset:
        user = int(i[0])
        item = int(i[1])
        rating = float(i[2])
        R[user][item] = rating
    print ("%3s%20s%20s" % ('K','HR@10', 'NDCG@10'))
    for K in [8, 16, 32, 64]:
        pmf = PMF(R, K)
        U, V = pmf.train()
        hit, ndcg = pmf.evaluate(testset)
        print ("%3d%20.6f%20.6f" % (K, hit, ndcg))

Dataset Statistics: Interaction = 994169, User = 6040, Item = 3706, Sparsity = 32.9250
  K               HR@10             NDCG@10




  8            1.000000            0.333333
 16            1.000000            0.333333
 32            1.000000            0.333333
 64            1.000000            0.333333
