In [None]:
'''
@author: Jason.F
@data: 2019.07.13
@function: Implementing IRT-MF 
           Setting KnowledgeBase-CC 
           Evaluating by hitradio,ndcg
'''
import numpy as np
import pandas as pd
import math
from collections import defaultdict
import heapq

#1. loading the KnowledgeBase dataset.
trainset = pd.read_csv("/data/fjsdata/ctKngBase/kbcc_trainset.csv", sep='|', low_memory=False, dtype={'csr':int, 'ke':int, 'num':float})
trainset['num']=trainset['num'].apply(lambda x: 1 if float(x)>0.0 else 0)
print ('Trainset shape is:%d rows and %d columns'%(trainset.shape[0],trainset.shape[1]))
#testset includes 100 items for every user, one item is positive and other 99 is negtive items.
testset = pd.read_csv("/data/fjsdata/ctKngBase/kbcc_testset.csv", sep='|', low_memory=False, dtype={'csr':int, 'ke':int, 'num':float})
testset['num']=testset['num'].apply(lambda x: 1 if float(x)>0.0 else 0)
print ('Testset shape is:%d rows and %d columns'%(testset.shape[0],testset.shape[1]))
csrNum = trainset['csr'].max()+1
keNum = trainset['ke'].max()+1
print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % 
      (trainset.shape[0], csrNum, keNum, trainset.shape[0]/(csrNum*keNum)) )

#2. IRT-MF class
class IRTMF():
    
    def __init__(self, R, num_ng=4):
        """
        Perform matrix factorization to predict empty entries in a matrix.     
        Arguments
        - R (ndarray)   : user-item rating matrix
        - num_ng (int)  : number of negative items
        """
        self.R = R
        self.num_users, self.num_items = R.shape
        self.num_ng = num_ng
        
        # Create a list of training samples
        self.samples = [
            (i, j, self.R[i, j])
            for i in range(self.num_users)
            for j in range(self.num_items)
            if self.R[i, j] > 0
        ]

        #smapling the negative items
        for x in self.samples:
            u = x[0]
            for t in range(self.num_ng):
                j = np.random.randint(self.num_items)
                #while (u, j) in self.R:
                while self.R[u, j] > 0:
                    j = np.random.randint(self.num_items)
                self.samples.append([u, j, 0])

    def train(self, K, alpha=0.001, beta=0.01, epochs=20):
        '''
        - alpha (float) : learning rate
        - beta (float)  : regularization parameter
        - K (int)       : number of latent dimensions
        -epochs(int)    : number of iterations
        '''
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.epochs = epochs
        # Initialize user and item latent feature matrice
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))
        
        # Initialize the biases
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])
               
        # Perform stochastic gradient descent for number of iterations
        training_process = []
        for i in range(self.epochs):
            np.random.shuffle(self.samples)
            self.sgd()
            #if (i+1) % 10 == 0:
            #    mse = self.mse()
            #    print("Iteration: %d ; error = %.4f" % (i+1, mse))
        
        return self.full_matrix()

    def mse(self):
        """
        A function to compute the total mean square error
        """
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x, y], 2)
        return np.sqrt(error)

    def sgd(self):
        """
        Perform stochastic graident descent
        """
        for i, j, r in self.samples:
            # Computer prediction and error
            prediction = self.get_rating(i, j)
            prob = self.IRT_Rasch(self.a_u[i], self.d_i[j])
            e = (r - prediction-prob)
            
            # Update biases
            
            self.a_u[i] += self.alpha * (e * prob*(prob-1) - self.beta * self.a_u[i])
            self.d_i[j] += self.alpha * (e * prob*(1-prob) - self.beta * self.d_i[j])
            
            # Create copy of row of P since we need to update it but use older values for update on Q
            P_i = self.P[i, :][:]
            
            # Update user and item latent feature matrices
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * P_i - self.beta * self.Q[j,:])

    def get_rating(self, i, j):
        """
        Get the predicted rating of user i and item j
        """
        prediction = self.IRT_Rasch(self.a_u[i], self.d_i[j]) + self.P[i, :].dot(self.Q[j, :].T)
        return prediction
    
    def full_matrix(self):
        """
        Computer the full matrix using the resultant biases, P and Q
        """
        nR = np.zeros((self.num_users, self.num_items))
        for i in range(self.num_users):
            for j in range(self.num_items):
                nR[i,j] = self.IRT_Rasch(self.a_u[i], self.d_i[j]) + self.P[i, :].dot(self.Q[j, :].T)
        return nR
    
    def IRT_Rasch(self, a, d):
        """
        Compute the probability of respons
        """
        x = a-d
        if x < 0:
            return 1 - 1/(1 + math.exp(x))
        else:
            return 1/(1 + math.exp(-x))
        
#3. training and evaluating 
def getHitRatio(ranklist, gtItem):
    for item in ranklist:
        if item == gtItem:
            return 1
    return 0

def getNDCG(ranklist, gtItem):
    for i in range(len(ranklist)):
        item = ranklist[i]
        if item == gtItem:
            return math.log(2) / math.log(i+2)
    return 0

R = np.zeros((csrNum, keNum))
for index, row in trainset.iterrows(): 
    R[int(row['csr'])][int(row['ke'])] = float(row['num'])
print ("%3s%20s%20s" % ('K','HR@10', 'NDCG@10'))
mdl = IRTMF(R=R, num_ng=4)# K is latent factors
for K in [8,16,32,64]:#iterations epoches
    nR = mdl.train(K=K, alpha=0.001, beta=0.01, epochs=20)
    hits = []
    ndcgs = []
    list_csr = list(set(np.array(testset['csr']).tolist()))
    for csr in list_csr:
        csrset = testset[testset['csr']==csr]
        scorelist = []
        positem = 0
        for u, i, r in np.array(csrset).tolist():   
            if float(r)>0.0:#one positive item
                scorelist.append([int(i),nR[int(u),int(i)]])
                positem = int(i) 
            else:# 99 negative items
                scorelist.append([int(i),nR[int(u),int(i)]])
        map_item_score = {}
        for item, rate in scorelist: #turn dict
            map_item_score[item] = rate
        ranklist = heapq.nlargest(10, map_item_score, key=map_item_score.get)#default Topn=10
        hr = getHitRatio(ranklist, positem)
        hits.append(hr)
        ndcg = getNDCG(ranklist, positem)
        ndcgs.append(ndcg)
    hitratio,ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
    print ("%3d%20.6f%20.6f" % (K, hitratio, ndcg))

Trainset shape is:2547452 rows and 3 columns
Testset shape is:1021600 rows and 3 columns
Dataset Statistics: Interaction = 2547452, User = 10216, Item = 96324, Sparsity = 0.0026
  K               HR@10             NDCG@10


In [2]:
'''
@author: Jason.F
@data: 2019.07.12
@function: Implementing SVD(surprise) 
           Setting KnowledgeBase-cc 
           Evaluating by hitradio,ndcg
'''
import numpy as np
import pandas as pd
import math
from collections import defaultdict
import heapq
import surprise as sp

#1. loading the KnowledgeBase dataset.
trainset = pd.read_csv("/data/fjsdata/ctKngBase/trainset.csv", sep='|', low_memory=False)
trainset['num']=trainset['num'].apply(lambda x: 1 if float(x)>0.0 else 0)
print ('Trainset shape is:%d rows and %d columns'%(trainset.shape[0],trainset.shape[1]))
#testset includes 100 items for every user, one item is positive and other 99 is negtive items.
testset = pd.read_csv("/data/fjsdata/ctKngBase/testset.csv", sep='|', low_memory=False)
testset['num']=testset['num'].apply(lambda x: 1 if float(x)>0.0 else 0)
print ('Testset shape is:%d rows and %d columns'%(testset.shape[0],testset.shape[1]))
csrNum = trainset['csr'].max()+1
keNum = trainset['ke'].max()+1
print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % 
      (trainset.shape[0], csrNum, keNum, trainset.shape[0]/(csrNum*keNum)) )

#2. Transforming into data format of surprise and spliting the train-set and test-set
# The columns must correspond to user id, item id and ratings (in that order).
reader = sp.Reader(rating_scale=(0, 1))
spdata = sp.Dataset.load_from_df(trainset,reader)
trainset = spdata.build_full_trainset()
testset = np.array(testset).tolist()

#3.training and evaluating 
def getMSE(TRating,PRating):
    error = TRating-PRating
    return error*error

def getHitRatio(ranklist, gtItem):
    for item in ranklist:
        if item == gtItem:
            return 1
    return 0

def getNDCG(ranklist, gtItem):
    for i in range(len(ranklist)):
        item = ranklist[i]
        if item == gtItem:
            return math.log(2) / math.log(i+2)
    return 0
print ("%3s%20s%20s" % ('K','HitRatio', 'NDCG'))
for K in [8,16,32,64]:#iterations epoches
    algo = sp.SVD(n_factors=K, n_epochs=20, lr_all=0.001, reg_all=0.01 )#NMF,SVDpp
    algo.fit(trainset)
    #print (algo.predict(str(1),str(1), r_ui=0, verbose=True)) 
    predictions = algo.test(testset)#testset include one positive and 99 negtive sample of every user.
    user_iid_true_est = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        user_iid_true_est[uid].append((iid, true_r, est))
    hits = []
    ndcgs = []
    for uid, iid_ratings in user_iid_true_est.items():
        # Sort user ratings by estimated value
        #iid_ratings.sort(key=lambda x: x[2], reverse=True) #sorted by est
        scorelist = []
        positem = 0
        for iid, ture_r, est in iid_ratings:
            if (ture_r+1)>0.0:#one positive item
                scorelist.append([iid,est])
                positem = iid 
            else:# 99 negative items
                scorelist.append([iid,est])
        map_item_score = {}
        for item, rate in scorelist: #turn dict
            map_item_score[item] = rate
        ranklist = heapq.nlargest(10, map_item_score, key=map_item_score.get)#default Topn=10
        hr = getHitRatio(ranklist, positem)
        hits.append(hr)
        ndcg = getNDCG(ranklist, positem)
        ndcgs.append(ndcg)
    hitratio,ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
    print ("%3d%20.6f%20.6f" % (K, hitratio, ndcg))

Trainset shape is:2547452 rows and 3 columns
Testset shape is:1021600 rows and 3 columns
Dataset Statistics: Interaction = 2547452, User = 10216, Item = 96324, Sparsity = 0.0026
  K            HitRatio                NDCG
  8            0.502643            0.502643
 16            0.507048            0.507048
 32            0.499315            0.499315
 64            0.503622            0.503622


In [None]:
'''
@author: Jason.F
@data: 2019.07.09
@function: Implementing IRT-MF and Setting KnowledgeBase as baseline by rmse,hitradio,ndcg
'''
import numpy as np
import pandas as pd
import math
from collections import defaultdict
import heapq

#1. loading the KnowledgeBase dataset.
trainset = pd.read_csv("/data/fjsdata/ctKngBase/trainset.csv", sep='|', low_memory=False)
print ('Trainset shape is:%d rows and %d columns'%(trainset.shape[0],trainset.shape[1]))
#testset includes 100 items for every user, one item is positive and other 99 is negtive items.
testset = pd.read_csv("/data/fjsdata/ctKngBase/testset.csv", sep='|', low_memory=False)
print ('Testset shape is:%d rows and %d columns'%(testset.shape[0],testset.shape[1]))
csrNum = trainset['csr'].max()+1
keNum = trainset['ke'].max()+1
print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % 
      (trainset.shape[0], csrNum, keNum, trainset.shape[0]/(csrNum*keNum)) )

#2. IRT-MF class
class IRTMF():
    
    def __init__(self, R, K, alpha, beta, iterations):
        """
        Perform matrix factorization to predict empty entries in a matrix.     
        Arguments
        - R (ndarray)   : user-item rating matrix
        - K (int)       : number of latent dimensions
        - alpha (float) : learning rate
        - beta (float)  : regularization parameter
        """
        
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    def train(self):
        # Initialize user and item latent feature matrice
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))
        
        # Initialize the biases
        self.a_u = np.zeros(self.num_users)#users' ability
        self.d_i = np.zeros(self.num_items)#items' difficulty
        
        # Create a list of training samples
        self.samples = [
            (i, j, self.R[i, j])
            for i in range(self.num_users)
            for j in range(self.num_items)
            if self.R[i, j] > 0
        ]
        
        # Perform stochastic gradient descent for number of iterations
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            #if (i+1) % 10 == 0:
            #    mse = self.mse()
            #    print("Iteration: %d ; error = %.4f" % (i+1, mse))
        
        return self.full_matrix()

    def mse(self):
        """
        A function to compute the total mean square error
        """
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x, y], 2)
        return np.sqrt(error)

    def sgd(self):
        """
        Perform stochastic graident descent
        """
        for i, j, r in self.samples:
            # Computer prediction and error
            prediction = self.get_rating(i, j)
            e = (r - prediction)
            
            # Update biases
            prob = self.IRT_Rasch(self.a_u[i], self.d_i[j])
            self.a_u[i] += self.alpha * (e * prob*(prob-1) - self.beta * self.a_u[i])
            self.d_i[j] += self.alpha * (e * prob*(1-prob) - self.beta * self.d_i[j])
            
            # Create copy of row of P since we need to update it but use older values for update on Q
            P_i = self.P[i, :][:]
            
            # Update user and item latent feature matrices
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * P_i - self.beta * self.Q[j,:])

    def get_rating(self, i, j):
        """
        Get the predicted rating of user i and item j
        """
        prediction = self.IRT_Rasch(self.a_u[i], self.d_i[j]) + self.P[i, :].dot(self.Q[j, :].T)
        return prediction
    
    def full_matrix(self):
        """
        Computer the full matrix using the resultant biases, P and Q
        """
        nR = np.zeros((self.num_users, self.num_items))
        for i in range(self.num_users):
            for j in range(self.num_items):
                nR[i,j] = self.IRT_Rasch(self.a_u[i], self.d_i[j]) + self.P[i, :].dot(self.Q[j, :].T)
        return nR
    
    def IRT_Rasch(self, a, d):
        """
        Compute the probability of respons
        """
        x = a-d
        if x < 0:
            return 1 - 1/(1 + math.exp(x))
        else:
            return 1/(1 + math.exp(-x))
        
#3. training and evaluating 
def getMSE(TRating,PRating):
    error = TRating-PRating
    return error*error

def getHitRatio(ranklist, gtItem):
    for item in ranklist:
        if item == gtItem:
            return 1
    return 0

def getNDCG(ranklist, gtItem):
    for i in range(len(ranklist)):
        item = ranklist[i]
        if item == gtItem:
            return math.log(2) / math.log(i+2)
    return 0

R = np.zeros((csrNum, keNum))
for index, row in trainset.iterrows(): 
    R[int(row['csr'])][int(row['ke'])] = float(row['num'])
print ("%3s%20s%20s%20s%20s" % ('K','Iterations','RMSE', 'HitRatio', 'NDCG'))
for K in [2,50,100]:#latent factors
    for iterations in [2,20,50,100]:#iterations epoches
        mdl = IRTMF(R=R, K=K, alpha=0.001, beta=0.01, iterations=iterations)# K is latent factors
        nR = mdl.train()
        mses = []
        hits = []
        ndcgs = []
        list_csr = list(set(np.array(testset['csr']).tolist()))
        for csr in list_csr:
            csrset = testset[testset['csr']==csr]
            scorelist = []
            positem = 0
            for u, i, r in np.array(csrset).tolist():   
                if float(r)>0.0:#one positive item
                    mses.append(getMSE(float(r),nR[int(u),int(i)]))
                    scorelist.append([int(i),nR[int(u),int(i)]])
                    positem = int(i) 
                else:# 99 negative items
                    scorelist.append([int(i),nR[int(u),int(i)]])
            map_item_score = {}
            for item, rate in scorelist: #turn dict
                map_item_score[item] = rate
            ranklist = heapq.nlargest(10, map_item_score, key=map_item_score.get)#default Topn=10
            hr = getHitRatio(ranklist, positem)
            hits.append(hr)
            ndcg = getNDCG(ranklist, positem)
            ndcgs.append(ndcg)
        rmse,hitratio,ndcg = math.sqrt(sum(mses) / len(mses)) ,np.array(hits).mean(), np.array(ndcgs).mean()
        print ("%3d%20d%20.6f%20.6f%20.6f" % (K, iterations, rmse, hitratio, ndcg))

Trainset shape is:2547452 rows and 3 columns
Testset shape is:1021600 rows and 3 columns
Dataset Statistics: Interaction = 2547452, User = 10216, Item = 96324, Sparsity = 0.0026
  K          Iterations                RMSE            HitRatio                NDCG
  2                   2            0.592674            0.092991            0.041560
  2                  20            0.608159            0.133516            0.074288
  2                  50            0.469229            0.066269            0.038255
  2                 100            0.343965            0.033379            0.018839
 50                   2            0.529410            0.713293            0.515024
 50                  20            0.587791            0.412490            0.245202
 50                  50            0.381476            0.155247            0.097560
 50                 100            0.283121            0.092306            0.056403
100                   2            0.529443            0.771926   

In [60]:
'''
@author: Jason.F
@data: 2019.07.09
@function: Implementing SVD(surprise) and Setting KnowledgeBase as baseline by rmse,hitradio,ndcg
'''
import numpy as np
import pandas as pd
import math
from collections import defaultdict
import heapq
import surprise as sp

#1. loading the KnowledgeBase dataset.
trainset = pd.read_csv("/data/fjsdata/ctKngBase/trainset.csv", sep='|', low_memory=False)
print ('Trainset shape is:%d rows and %d columns'%(trainset.shape[0],trainset.shape[1]))
#testset includes 100 items for every user, one item is positive and other 99 is negtive items.
testset = pd.read_csv("/data/fjsdata/ctKngBase/testset.csv", sep='|', low_memory=False)
print ('Testset shape is:%d rows and %d columns'%(testset.shape[0],testset.shape[1]))
csrNum = trainset['csr'].max()+1
keNum = trainset['ke'].max()+1
print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % 
      (trainset.shape[0], csrNum, keNum, trainset.shape[0]/(csrNum*keNum)) )

#2. Transforming into data format of surprise and spliting the train-set and test-set
# The columns must correspond to user id, item id and ratings (in that order).
reader = sp.Reader(rating_scale=(0, 1))
spdata = sp.Dataset.load_from_df(trainset,reader)
trainset = spdata.build_full_trainset()
testset = np.array(testset).tolist()

#3.training and evaluating 
def getMSE(TRating,PRating):
    error = TRating-PRating
    return error*error

def getHitRatio(ranklist, gtItem):
    for item in ranklist:
        if item == gtItem:
            return 1
    return 0

def getNDCG(ranklist, gtItem):
    for i in range(len(ranklist)):
        item = ranklist[i]
        if item == gtItem:
            return math.log(2) / math.log(i+2)
    return 0
print ("%3s%20s%20s%20s%20s" % ('K','Iterations','RMSE', 'HitRatio', 'NDCG'))
for K in [2,50,100,200]:#latent factors
    for iterations in [2,20,50,100]:#iterations epoches
        algo = sp.SVD(n_factors=K, n_epochs=iterations, lr_all=0.001, reg_all=0.01 )#NMF,SVDpp
        algo.fit(trainset)
        #print (algo.predict(str(1),str(1), r_ui=0, verbose=True)) 
        predictions = algo.test(testset)#testset include one positive and 99 negtive sample of every user.
        user_iid_true_est = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            user_iid_true_est[uid].append((iid, true_r, est))
        mses = []
        hits = []
        ndcgs = []
        for uid, iid_ratings in user_iid_true_est.items():
            # Sort user ratings by estimated value
            #iid_ratings.sort(key=lambda x: x[2], reverse=True) #sorted by est
            scorelist = []
            positem = 0
            for iid, ture_r, est in iid_ratings:
                if (ture_r+1)>0.0:#one positive item
                    mses.append(getMSE((ture_r+1),est))
                    scorelist.append([iid,est])
                    positem = iid 
                else:# 99 negative items
                    scorelist.append([iid,est])
            map_item_score = {}
            for item, rate in scorelist: #turn dict
                map_item_score[item] = rate
            ranklist = heapq.nlargest(10, map_item_score, key=map_item_score.get)#default Topn=10
            hr = getHitRatio(ranklist, positem)
            hits.append(hr)
            ndcg = getNDCG(ranklist, positem)
            ndcgs.append(ndcg)
        rmse,hitratio,ndcg = math.sqrt(sum(mses) / len(mses)) ,np.array(hits).mean(), np.array(ndcgs).mean()
        print ("%3d%20d%20.6f%20.6f%20.6f" % (K, iterations, rmse, hitratio, ndcg))

Trainset shape is:2547452 rows and 3 columns
Testset shape is:1021600 rows and 3 columns
Dataset Statistics: Interaction = 2547452, User = 10216, Item = 96324, Sparsity = 0.0026
  K          Iterations                RMSE            HitRatio                NDCG
  2                   2            0.054436            0.235611            0.173626
  2                  20            0.054858            0.313528            0.229820
  2                  50            0.053164            0.388312            0.270784
  2                 100            0.051611            0.450764            0.307749
 50                   2            0.075428            0.137138            0.071148
 50                  20            0.068491            0.149569            0.080760
 50                  50            0.059963            0.139879            0.079384
 50                 100            0.054146            0.139781            0.082243
100                   2            0.091649            0.125587   

In [None]:
'''
@author: Jason.F
@data: 2019.07.11
@function: Implementing IRT-MF and Setting KnowledgeBase as baseline by rmse,hitradio,ndcg
'''
import numpy as np
import pandas as pd
import math
from collections import defaultdict
import heapq
import scipy.stats

#1. loading the KnowledgeBase dataset.
trainset = pd.read_csv("/data/fjsdata/ctKngBase/trainset.csv", sep='|', low_memory=False)
print ('Trainset shape is:%d rows and %d columns'%(trainset.shape[0],trainset.shape[1]))
#testset includes 100 items for every user, one item is positive and other 99 is negtive items.
testset = pd.read_csv("/data/fjsdata/ctKngBase/testset.csv", sep='|', low_memory=False)
print ('Testset shape is:%d rows and %d columns'%(testset.shape[0],testset.shape[1]))
csrNum = trainset['csr'].max()+1
keNum = trainset['ke'].max()+1
print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % 
      (trainset.shape[0], csrNum, keNum, trainset.shape[0]/(csrNum*keNum)) )

#2. IRT-MF class
class IRTMF():
    
    def __init__(self, R, K, alpha, beta, iterations):
        """
        Perform matrix factorization to predict empty entries in a matrix.     
        Arguments
        - R (ndarray)   : user-item rating matrix
        - K (int)       : number of latent dimensions
        - alpha (float) : learning rate
        - beta (float)  : regularization parameter
        """
        
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    def train(self):
        # Initialize user and item latent feature matrice
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))
        
        # Initialize the biases
        self.a_u = np.ones(self.num_users)#users' ability
        self.d_i = np.ones(self.num_items)#items' difficulty
        
        # Create a list of training samples
        self.samples = [
            (i, j, self.R[i, j])
            for i in range(self.num_users)
            for j in range(self.num_items)
            if self.R[i, j] > 0
        ]
        
        # Perform stochastic gradient descent for number of iterations
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            #if (i+1) % 10 == 0:
            #    mse = self.mse()
            #    print("Iteration: %d ; error = %.4f" % (i+1, mse))
        
        return self.P.dot(self.Q.T), self.a_u, self.d_i

    def mse(self):
        """
        A function to compute the total mean square error
        """
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x, y], 2)
        return np.sqrt(error)

    def sgd(self):
        """
        Perform stochastic graident descent
        """
        for i, j, r in self.samples:
            # Computer prediction and error
            prediction = self.get_rating(i, j)
            pdf = self.IRT_Normal(r,self.a_u[i],self.d_i[j])#calculate the probability of response.
            e = (pdf - prediction)
            
            # Update biases
            dev_au = pdf * ((r-self.d_i[j])**2/self.a_u[i]-math.sqrt(2*math.pi))*self.a_u[i]
            dev_di = pdf * (r-self.d_i[j])/self.a_u[i]
            self.a_u[i] += self.alpha * (e * dev_au - self.beta * self.a_u[i])
            self.d_i[j] += self.alpha * (e * dev_di - self.beta * self.d_i[j])
            
            # Create copy of row of P since we need to update it but use older values for update on Q
            P_i = self.P[i, :][:]
            
            # Update user and item latent feature matrices
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * P_i - self.beta * self.Q[j,:])

    def get_rating(self, i, j):
        """
        Get the predicted rating of user i and item j
        """
        prediction = self.P[i, :].dot(self.Q[j, :].T)
        return prediction
    
    def full_matrix(self):
        """
        Computer the full matrix using the resultant biases, P and Q
        """
        nR = self.P.dot(self.Q.T)
        return nR
    
    def IRT_Normal(self, r, a, d):
        """
        Compute the probability of respons,Normal
        """
        return scipy.stats.norm(d, a).pdf(r)
        
#3. training and evaluating 
def getMSE(TRating,PRating):
    error = TRating-PRating
    return error*error

def getHitRatio(ranklist, gtItem):
    for item in ranklist:
        if item == gtItem:
            return 1
    return 0

def getNDCG(ranklist, gtItem):
    for i in range(len(ranklist)):
        item = ranklist[i]
        if item == gtItem:
            return math.log(2) / math.log(i+2)
    return 0

R = np.zeros((csrNum, keNum))
for index, row in trainset.iterrows(): 
    R[int(row['csr'])][int(row['ke'])] = float(row['num'])
print ("%3s%20s%20s%20s%20s" % ('K','Iterations','RMSE', 'HitRatio', 'NDCG'))
for K in [2,50,100]:#latent factors
    for iterations in [1,20,50,100]:#iterations epoches
        mdl = IRTMF(R=R, K=K, alpha=0.001, beta=0.01, iterations=iterations)# K is latent factors
        nR,au,di = mdl.train()
        mses = []
        hits = []
        ndcgs = []
        list_csr = list(set(np.array(testset['csr']).tolist()))
        for csr in list_csr:
            csrset = testset[testset['csr']==csr]
            scorelist = []
            positem = 0
            for u, i, r in np.array(csrset).tolist():   
                if float(r)>0.0:#one positive item
                    true_r = mdl.IRT_Normal(float(r),au[int(u)],di[int(i)])
                    mses.append(getMSE(true_r,nR[int(u),int(i)]))
                    scorelist.append([int(i),nR[int(u),int(i)]])
                    positem = int(i) 
                else:# 99 negative items
                    scorelist.append([int(i),nR[int(u),int(i)]])
            map_item_score = {}
            for item, rate in scorelist: #turn dict
                map_item_score[item] = rate
            ranklist = heapq.nlargest(10, map_item_score, key=map_item_score.get)#default Topn=10
            hr = getHitRatio(ranklist, positem)
            hits.append(hr)
            ndcg = getNDCG(ranklist, positem)
            ndcgs.append(ndcg)
        rmse,hitratio,ndcg = math.sqrt(sum(mses) / len(mses)) ,np.array(hits).mean(), np.array(ndcgs).mean()
        print ("%3d%20d%20.6f%20.6f%20.6f" % (K, iterations, rmse, hitratio, ndcg))

Trainset shape is:2547452 rows and 3 columns
Testset shape is:1021600 rows and 3 columns
Dataset Statistics: Interaction = 2547452, User = 10216, Item = 96324, Sparsity = 0.0026
  K          Iterations                RMSE            HitRatio                NDCG
  2                   1            0.395768            0.082028            0.035729
  2                  20                 nan            1.000000            0.333333


In [39]:
import numpy as np
import pandas as pd
#Loading the dataset and Excluding the outliers
kbdata = pd.read_csv("/data/fjsdata/ctKngBase/kb.csv", sep='|', low_memory=False)
kbdata = kbdata.loc[(kbdata['num']<200)]#seven months, one per day
num_max=kbdata['num'].max()
num_min=kbdata['num'].min()
kbdata['num']=kbdata['num'].apply(lambda x: (x-num_min+1)*1.0/(num_max-num_min+1))
trainset = kbdata[['csr','ke','num']]
print ('Trainset shape is:%d rows and %d columns'%(trainset.shape[0],trainset.shape[1]))
testset = kbdata.sample(frac=0.1)
print ('Testset shape is:%d rows and %d columns'%(testset.shape[0],testset.shape[1]))

Trainset shape is:2547452 rows and 3 columns
Testset shape is:254745 rows and 3 columns


In [47]:
#IRT-MF for knowledgeBase dataset 
import numpy as np
import pandas as pd
import math
class IRTMF():
    
    def __init__(self, R, K, alpha, beta, iterations):
        """
        Perform matrix factorization to predict empty entries in a matrix.     
        Arguments
        - R (ndarray)   : user-item rating matrix
        - K (int)       : number of latent dimensions
        - alpha (float) : learning rate
        - beta (float)  : regularization parameter
        """
        
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    def train(self):
        # Initialize user and item latent feature matrice
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))
        
        # Initialize the biases
        self.a_u = np.zeros(self.num_users)#users' ability
        self.d_i = np.zeros(self.num_items)#items' difficulty
        
        # Create a list of training samples
        self.samples = [
            (i, j, self.R[i, j])
            for i in range(self.num_users)
            for j in range(self.num_items)
            if self.R[i, j] > 0
        ]
        
        # Perform stochastic gradient descent for number of iterations
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            if (i+1) % 10 == 0:
                mse = self.mse()
                print("Iteration: %d ; error = %.4f" % (i+1, mse))
        
        return self.full_matrix()

    def mse(self):
        """
        A function to compute the total mean square error
        """
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x, y], 2)
        return np.sqrt(error)

    def sgd(self):
        """
        Perform stochastic graident descent
        """
        for i, j, r in self.samples:
            # Computer prediction and error
            prediction = self.get_rating(i, j)
            e = (r - prediction)
            
            # Update biases
            prob = self.IRT_Rasch(self.a_u[i], self.d_i[j])
            self.a_u[i] += self.alpha * (e * prob*(prob-1) - self.beta * self.a_u[i])
            self.d_i[j] += self.alpha * (e * prob*(1-prob) - self.beta * self.d_i[j])
            
            # Create copy of row of P since we need to update it but use older values for update on Q
            P_i = self.P[i, :][:]
            
            # Update user and item latent feature matrices
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * P_i - self.beta * self.Q[j,:])

    def get_rating(self, i, j):
        """
        Get the predicted rating of user i and item j
        """
        prediction = self.IRT_Rasch(self.a_u[i], self.d_i[j]) + self.P[i, :].dot(self.Q[j, :].T)
        return prediction
    
    def full_matrix(self):
        """
        Computer the full matrix using the resultant biases, P and Q
        """
        nR = np.zeros((self.num_users, self.num_items))
        for i in range(self.num_users):
            for j in range(self.num_items):
                nR[i,j] = self.IRT_Rasch(self.a_u[i], self.d_i[j]) + self.P[i, :].dot(self.Q[j, :].T)
        return nR
    
    def IRT_Rasch(self, a, d):
        """
        Compute the probability of respons
        """
        x = a-d
        if x < 0:
            return 1 - 1/(1 + math.exp(x))
        else:
            return 1/(1 + math.exp(-x))

# Construct matrix and train model
N =len(trainset['csr'].unique())+1
M =len(trainset['ke'].unique())+1
R = np.zeros((N, M))
for index, row in trainset.iterrows(): 
    R[int(row['csr'])][int(row['ke'])] = row['num']
mf = IRTMF(R, K=100, alpha=0.001, beta=0.01, iterations=100)
nR = mf.train() #return the predicted matrix
#evaluate model
squaredError = []
for _, row in testset.iterrows(): 
    error=float(row['num'])-float(nR[int(row['csr'])][int(row['ke'])])
    squaredError.append(error * error)
RMSE =math.sqrt(sum(squaredError) / len(squaredError))
print(RMSE)

Iteration: 10 ; error = 1172.5074
Iteration: 20 ; error = 958.0748
Iteration: 30 ; error = 800.2448
Iteration: 40 ; error = 575.6074
Iteration: 50 ; error = 437.3844
Iteration: 60 ; error = 366.4029
Iteration: 70 ; error = 321.0410
Iteration: 80 ; error = 288.5642
Iteration: 90 ; error = 263.7157
Iteration: 100 ; error = 243.8924
0.15255096782684383


In [40]:
import surprise as sp
import pandas as pd
import numpy as np
import math
from collections import defaultdict
#1.Transforming into data format of surprise and spliting the train-set and test-set
# The columns must correspond to user id, item id and ratings (in that order).
reader = sp.Reader(rating_scale=(0, 1))
spdata = sp.Dataset.load_from_df(trainset,reader)
# sampling random trainset and testset, and test set is made of 10% of the ratings.
#trainset, testset = sp.model_selection.train_test_split(spdata, test_size=.1)
trainset_sp = spdata.build_full_trainset()
testset_sp = np.array(testset).tolist()

#2.Training the model and predicting ratings for the testset
algo = sp.SVD()#NMF,SVDpp,n_factors=100
algo.fit(trainset_sp)
predictions = algo.test(testset_sp)#testset include positive and negtive sample.

#3.measuring the performance of SVD by precision, recall and  NDCG
#print ('RMSE of testset is:%.8f'%(sp.accuracy.rmse(predictions)))
RMSE = sp.accuracy.rmse(predictions)

RMSE: 0.9982


In [41]:
#MF for knowledgeBase dataset 
import numpy as np
import pandas as pd
import math
class IRTMF():
    
    def __init__(self, R, K, alpha, beta, iterations):
        """
        Perform matrix factorization to predict empty entries in a matrix.     
        Arguments
        - R (ndarray)   : user-item rating matrix
        - K (int)       : number of latent dimensions
        - alpha (float) : learning rate
        - beta (float)  : regularization parameter
        """
        
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    def train(self):
        # Initialize user and item latent feature matrice
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))
        
        # Initialize the biases
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])
        
        # Create a list of training samples
        self.samples = [
            (i, j, self.R[i, j])
            for i in range(self.num_users)
            for j in range(self.num_items)
            if self.R[i, j] > 0
        ]
        
        # Perform stochastic gradient descent for number of iterations
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            if (i+1) % 10 == 0:
                mse = self.mse()
                print("Iteration: %d ; error = %.4f" % (i+1, mse))
        
        return self.full_matrix()

    def mse(self):
        """
        A function to compute the total mean square error
        """
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x, y], 2)
        return np.sqrt(error)

    def sgd(self):
        """
        Perform stochastic graident descent
        """
        for i, j, r in self.samples:
            # Computer prediction and error
            prediction = self.get_rating(i, j)
            e = (r - prediction)
            
            # Update biases
            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])
            
            # Create copy of row of P since we need to update it but use older values for update on Q
            P_i = self.P[i, :][:]
            
            # Update user and item latent feature matrices
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * P_i - self.beta * self.Q[j,:])

    def get_rating(self, i, j):
        """
        Get the predicted rating of user i and item j
        """
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction
    
    def full_matrix(self):
        """
        Computer the full matrix using the resultant biases, P and Q
        """
        return self.b + self.b_u[:,np.newaxis] + self.b_i[np.newaxis:,] + self.P.dot(self.Q.T)

# Construct matrix and train model
N =len(trainset['csr'].unique())+1
M =len(trainset['ke'].unique())+1
R = np.zeros((N, M))
for index, row in trainset.iterrows(): 
    R[int(row['csr'])][int(row['ke'])] = row['num']
mf = IRTMF(R, K=100, alpha=0.001, beta=0.01, iterations=100)
nR = mf.train() #return the predicted matrix
#evaluate model
squaredError = []
for _, row in testset.iterrows(): 
    error=float(row['num'])-float(nR[int(row['csr'])][int(row['ke'])])
    squaredError.append(error * error)
RMSE =math.sqrt(sum(squaredError) / len(squaredError))
print(RMSE)

Iteration: 10 ; error = 82.9026
Iteration: 20 ; error = 82.1781
Iteration: 30 ; error = 81.9153
Iteration: 40 ; error = 81.7769
Iteration: 50 ; error = 81.6866
Iteration: 60 ; error = 81.6230
Iteration: 70 ; error = 81.5738
Iteration: 80 ; error = 81.5325
Iteration: 90 ; error = 81.4958
Iteration: 100 ; error = 81.4597
0.05180268517188995


In [11]:
#Baseline for knowledgeBase dataset 
import surprise as sp
import pandas as pd
import numpy as np
import math
from collections import defaultdict

#1.Loading the dataset and Excluding the outliers
kbdata = pd.read_csv("/data/fjsdata/ctKngBase/kb.csv", sep='|', low_memory=False)
kbdata = kbdata.loc[(kbdata['num']<200)]#seven months, one per day
num_max=kbdata['num'].max()
num_min=kbdata['num'].min()
kbdata['num']=kbdata['num'].apply(lambda x: (x-num_min+1)*1.0/(num_max-num_min+1) )
print ('Dataset shape is:%d rows and %d columns'%(kbdata.shape[0],kbdata.shape[1]))
#2.Transforming into data format of surprise and spliting the train-set and test-set
# The columns must correspond to user id, item id and ratings (in that order).
reader = sp.Reader(rating_scale=(0, 1))
spdata = sp.Dataset.load_from_df(kbdata[['csr', 'ke', 'num']],reader)
# sampling random trainset and testset, and test set is made of 10% of the ratings.
#trainset, testset = sp.model_selection.train_test_split(spdata, test_size=.1)
trainset = spdata.build_full_trainset()
testset = trainset.build_testset()

#3.Training the model and predicting ratings for the testset
algo = sp.SVD()#NMF,SVDpp,n_factors=100
algo.fit(trainset)
predictions = algo.test(testset)#testset include positive and negtive sample.

#4.measuring the performance of SVD by precision, recall and  NDCG
#print ('RMSE of testset is:%.8f'%(sp.accuracy.rmse(predictions)))
def calc_dcg(items):
    dcg = 0
    i = 0
    for item in items:
        i += 1
        dcg += (math.pow(2, item) - 1)/ math.log(1 + i, 2)
    return dcg
def index_at_k(predictions, k, threshold=0.1):
   #Return precision and recall at k metrics for each user.
    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    ndcgs =dict()
    for uid, user_ratings in user_est_true.items():
        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        # Number of relevant items
        n_rel = sum((true_r > threshold) for (_, true_r) in user_ratings)
        # Number of recommended items in top k
        n_rec_k = sum((est > threshold) for (est, _) in user_ratings[:k])
        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r > threshold) and (est > threshold)) for (est, true_r) in user_ratings[:k])
        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1
        #true ratings of recommended items in top k
        l_rec_k = [true_r for (_,true_r) in user_ratings[:k]]
        dcg = calc_dcg(l_rec_k)
        #l_rec_k.sort(reverse=True)
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        l_rel_k = [true_r for (_,true_r) in user_ratings[:k]]
        idcg = calc_dcg(l_rel_k)
        ndcgs[uid]=dcg*1.0/idcg 
    return precisions, recalls, ndcgs

print ("%3s%20s%20s%20s" % ('K','Precisions','Recalls','NDCG'))
for k in [5,10,15,20]:#latent factor
    precisions, recalls, ndcgs = index_at_k(predictions, k=k)
    # Precision and recall can then be averaged over all users
    precision = sum(prec for prec in precisions.values()) / len(precisions)
    recall = sum(rec for rec in recalls.values()) / len(recalls)
    ndcg = sum(ndcg for ndcg in ndcgs.values()) / len(ndcgs)
    print ("%3s%20.8f%20.8f%20.8f" % (k, precision, recall, ndcg))
RMSE = sp.accuracy.rmse(pred)

Dataset shape is:2547452 rows and 3 columns
  K          Precisions             Recalls                NDCG
  5          0.46459149          0.55998973          0.60620035
 10          0.44782239          0.61048597          0.65218110
 15          0.44397245          0.62228409          0.67561003
 20          0.44310821          0.62424625          0.69222003
RMSE: 1.2989
