In [1]:
# coding:utf-8  
'''
@author: Jason.F
@date: 2019.08.09
@function: baseline: SVD 
           paper: https://dl.acm.org/citation.cfm?id=1401944
           Dataset: KnowledgeBase-CC
           Evaluation: RMSE
'''
import sys
import time
import logging
import random
import heapq
import math
from collections import defaultdict
import pymc3 as pm
import numpy as np
import pandas as pd
import theano
import theano.tensor as tt
import tensorflow as tf

class DataSet:
    """Load the train/test rating files and expose them as lists or a dense matrix.

    Attributes set by the constructor:
    - trainset / testset : lists of [user, item, rating] rows (all floats, since
                           ``DataFrame.values`` upcasts mixed int/float columns)
    - maxu / maxi        : 1 + largest user / item id found in the TRAINING file
    - maxr               : largest rating found in the training file
    """

    def __init__(self, train_path="/data/fjsdata/BMF/kbcc_trainset.csv",
                 test_path="/data/fjsdata/BMF/kbcc_testset.csv"):
        """
        Arguments
        - train_path (str) : '|'-separated file with header columns csr, ke, num
        - test_path (str)  : file in the same format holding the held-out ratings
        """
        self.train_path = train_path
        self.test_path = test_path
        self.trainset, self.testset, self.maxu, self.maxi, self.maxr = self._getDataset_as_list()

    def _read_ratings(self, path):
        """Read one '|'-separated rating file into a DataFrame with fixed dtypes."""
        return pd.read_csv(path, sep='|', low_memory=False, dtype={'csr':int, 'ke':int, 'num':float})

    def _getDataset_as_list(self):
        """
        Read both files and return (trainset, testset, maxu, maxi, maxr).
        """
        #trainset
        data = self._read_ratings(self.train_path)
        # Plain Python ints so maxu*maxi below cannot overflow a fixed-width integer.
        maxu, maxi, maxr = int(data['csr'].max())+1, int(data['ke'].max())+1, data['num'].max()
        print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % \
                  (data.shape[0], maxu, maxi, data.shape[0]/(maxu*maxi)))
        trainset = data.values.tolist()
        #testset
        # NOTE(review): maxu/maxi come from the training file only; a test row with a
        # larger id would index outside the matrix built from them - confirm the split.
        testset = self._read_ratings(self.test_path).values.tolist()
        return trainset, testset, maxu, maxi, maxr

    def list_to_matrix(self, dataset, maxu, maxi):
        """
        Scatter [user, item, rating] rows into a dense (maxu, maxi) float32 matrix.
        Cells without a row stay 0; if a (user, item) pair repeats, the last row wins.
        """
        dataMat = np.zeros([maxu, maxi], dtype=np.float32)
        for u,i,r in dataset:
            dataMat[int(u), int(i)] = float(r)
        # Return the matrix itself: wrapping it in np.array() would copy the whole thing.
        return dataMat
    
class SVDBias():
    """
    Biased matrix factorization trained with stochastic gradient descent.
    The rating of user i for item j is predicted as
        b + b_u[i] + b_i[j] + P[i] . Q[j]
    """

    def __init__(self, R):
        """
        Perform matrix factorization to predict empty entries in a matrix.
        Arguments
        - R (ndarray)   : user-item rating matrix; entries > 0 are the training ratings
        """
        self.R = R
        self.num_users, self.num_items = R.shape
        # Create a list of training samples (user, item, rating).
        # np.nonzero walks the matrix in row-major order, i.e. the same order as a
        # nested "for user / for item" scan, without a Python loop over every cell.
        users, items = np.nonzero(self.R > 0)
        self.samples = list(zip(users.tolist(), items.tolist(), self.R[users, items]))

    def train(self, K, alpha=0.001, beta=0.01, epochs=20):
        '''
        Re-initialise all parameters, run SGD and return the dense prediction matrix.
        - alpha (float) : learning rate
        - beta (float)  : regularization parameter
        - K (int)       : number of latent dimensions
        - epochs (int)  : number of iterations
        '''
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.epochs = epochs
        # Initialize user and item latent feature matrices
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Initialize the biases; the global bias is the mean of the non-zero entries
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])

        # Perform stochastic gradient descent for number of iterations
        for epoch in range(self.epochs):
            np.random.shuffle(self.samples)
            self.sgd()
            if (epoch+1) % 10 == 0:
                mse = self.mse()
                print("Iteration: %d ; error = %.4f" % (epoch+1, mse))

        return self.full_matrix()

    def mse(self):
        """
        Training error over the non-zero entries of R.
        Despite the name, this is the square root of the SUMMED squared error
        (not divided by the number of entries).
        """
        xs, ys = self.R.nonzero()
        # Predict only the observed cells instead of materialising the full matrix.
        predicted = self.b + self.b_u[xs] + self.b_i[ys] + (self.P[xs] * self.Q[ys]).sum(axis=1)
        residual = self.R[xs, ys] - predicted
        return np.sqrt(np.sum(residual * residual))

    def sgd(self):
        """
        Perform one pass of stochastic gradient descent over self.samples
        """
        for i, j, r in self.samples:
            # Compute prediction and error
            prediction = self.get_rating(i, j)
            e = (r - prediction)

            # Update biases
            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            # Snapshot the user row before it changes: the update of Q must use the
            # old values. A slice ([:]) is only a view of P, so a real copy is needed.
            P_i = self.P[i, :].copy()

            # Update user and item latent feature matrices
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * P_i - self.beta * self.Q[j,:])

    def get_rating(self, i, j):
        """
        Get the predicted rating of user i and item j
        """
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :])
        return prediction

    def full_matrix(self):
        """
        Compute the full (num_users, num_items) matrix using the resultant biases, P and Q
        """
        # b_u is broadcast down the columns, b_i along the rows.
        return self.b + self.b_u[:, np.newaxis] + self.b_i[np.newaxis, :] + self.P.dot(self.Q.T)
    
if __name__ == "__main__":
    # Load both splits and turn the training rows into the dense rating matrix.
    ds = DataSet()
    svdb = SVDBias(R=ds.list_to_matrix(ds.trainset, ds.maxu, ds.maxi))
    # Retrain from scratch for every latent dimension and report the test RMSE.
    for K in [8, 16, 32, 64]:
        nR = svdb.train(K=K, alpha=0.001, beta=0.01, epochs=50)
        # Rows come back as floats, hence the int() casts before indexing.
        residuals = [r - nR[int(u)][int(i)] for u, i, r in ds.testset]
        squaredError = [res * res for res in residuals]
        rmse = math.sqrt(sum(squaredError) / len(squaredError))
        print("RMSE@{}:{}".format(K, rmse))



Dataset Statistics: Interaction = 2313189, User = 10216, Item = 96324, Sparsity = 0.0024
Iteration: 10 ; error = 2659.9317
Iteration: 20 ; error = 2552.4188
Iteration: 30 ; error = 2446.2029
Iteration: 40 ; error = 2367.4359
Iteration: 50 ; error = 2311.8930
RMSE@8:1.3264332508400583
Iteration: 10 ; error = 2663.1418
Iteration: 20 ; error = 2560.8436
Iteration: 30 ; error = 2440.7683
Iteration: 40 ; error = 2330.8938
Iteration: 50 ; error = 2245.5394
RMSE@16:1.3233861091236074
Iteration: 10 ; error = 2665.2654
Iteration: 20 ; error = 2570.0794
Iteration: 30 ; error = 2453.1469
Iteration: 40 ; error = 2325.4712
Iteration: 50 ; error = 2199.9681
RMSE@32:1.31561269883943
Iteration: 10 ; error = 2667.0549
Iteration: 20 ; error = 2591.4615
Iteration: 30 ; error = 2482.4752
Iteration: 40 ; error = 2359.7267
Iteration: 50 ; error = 2213.7223
RMSE@64:1.3211459785247723


In [1]:
# coding:utf-8  
'''
@author: Jason.F
@date: 2019.07.31
@function: baseline: SVDBias 
           paper: https://dl.acm.org/citation.cfm?id=1401944
           Dataset: MovieLens-1m:https://grouplens.org/datasets/movielens/  
           Evaluation: RMSE
'''
import sys
import time
import logging
import random
import heapq
import math
from collections import defaultdict
import pymc3 as pm
import numpy as np
import pandas as pd
import theano
import theano.tensor as tt
import tensorflow as tf

class DataSet:
    """Load the train/test rating files and expose them as lists or a dense matrix.

    Attributes set by the constructor:
    - trainset / testset : lists of [user, item, rating] rows (all floats, since
                           ``DataFrame.values`` upcasts mixed int/float columns)
    - maxu / maxi        : 1 + largest user / item id found in the TRAINING file
    - maxr               : largest rating found in the training file
    """

    def __init__(self, train_path="/data/fjsdata/BMF/ml-1m.train.rating",
                 test_path="/data/fjsdata/BMF/ml-1m.test.rating"):
        """
        Arguments
        - train_path (str) : tab-separated, header-less file; the first three
                             columns are read as user, item, rating
        - test_path (str)  : file in the same format holding the held-out ratings
        """
        self.train_path = train_path
        self.test_path = test_path
        self.trainset, self.testset, self.maxu, self.maxi, self.maxr = self._getDataset_as_list()

    def _read_ratings(self, path):
        """Read the first three columns of one tab-separated rating file."""
        # np.float was removed in NumPy 1.24; np.float64 is the type it aliased.
        return pd.read_csv(path, sep='\t', header=None, names=['user', 'item', 'rating'],
                           usecols=[0, 1, 2],
                           dtype={'user': np.int32, 'item': np.int32, 'rating': np.float64})

    def _getDataset_as_list(self):
        """
        Read both files and return (trainset, testset, maxu, maxi, maxr).
        """
        #trainset
        data = self._read_ratings(self.train_path)
        # Plain Python ints so maxu*maxi below cannot overflow int32.
        maxu, maxi, maxr = int(data['user'].max())+1, int(data['item'].max())+1, data['rating'].max()
        print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % \
                  (data.shape[0], maxu, maxi, data.shape[0]/(maxu*maxi)))
        trainset = data.values.tolist()
        #testset
        # NOTE(review): maxu/maxi come from the training file only; a test row with a
        # larger id would index outside the matrix built from them - confirm the split.
        testset = self._read_ratings(self.test_path).values.tolist()
        return trainset, testset, maxu, maxi, maxr

    def list_to_matrix(self, dataset, maxu, maxi):
        """
        Scatter [user, item, rating] rows into a dense (maxu, maxi) float32 matrix.
        Cells without a row stay 0; if a (user, item) pair repeats, the last row wins.
        """
        dataMat = np.zeros([maxu, maxi], dtype=np.float32)
        for u,i,r in dataset:
            dataMat[int(u), int(i)] = float(r)
        # Return the matrix itself: wrapping it in np.array() would copy the whole thing.
        return dataMat
    
class SVDBias():
    """
    Biased matrix factorization trained with stochastic gradient descent.
    The rating of user i for item j is predicted as
        b + b_u[i] + b_i[j] + P[i] . Q[j]
    """

    def __init__(self, R):
        """
        Perform matrix factorization to predict empty entries in a matrix.
        Arguments
        - R (ndarray)   : user-item rating matrix; entries > 0 are the training ratings
        """
        self.R = R
        self.num_users, self.num_items = R.shape
        # Create a list of training samples (user, item, rating).
        # np.nonzero walks the matrix in row-major order, i.e. the same order as a
        # nested "for user / for item" scan, without a Python loop over every cell.
        users, items = np.nonzero(self.R > 0)
        self.samples = list(zip(users.tolist(), items.tolist(), self.R[users, items]))

    def train(self, K, alpha=0.001, beta=0.01, epochs=20):
        '''
        Re-initialise all parameters, run SGD and return the dense prediction matrix.
        - alpha (float) : learning rate
        - beta (float)  : regularization parameter
        - K (int)       : number of latent dimensions
        - epochs (int)  : number of iterations
        '''
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.epochs = epochs
        # Initialize user and item latent feature matrices
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Initialize the biases; the global bias is the mean of the non-zero entries
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])

        # Perform stochastic gradient descent for number of iterations
        for epoch in range(self.epochs):
            np.random.shuffle(self.samples)
            self.sgd()
            if (epoch+1) % 10 == 0:
                mse = self.mse()
                print("Iteration: %d ; error = %.4f" % (epoch+1, mse))

        return self.full_matrix()

    def mse(self):
        """
        Training error over the non-zero entries of R.
        Despite the name, this is the square root of the SUMMED squared error
        (not divided by the number of entries).
        """
        xs, ys = self.R.nonzero()
        # Predict only the observed cells instead of materialising the full matrix.
        predicted = self.b + self.b_u[xs] + self.b_i[ys] + (self.P[xs] * self.Q[ys]).sum(axis=1)
        residual = self.R[xs, ys] - predicted
        return np.sqrt(np.sum(residual * residual))

    def sgd(self):
        """
        Perform one pass of stochastic gradient descent over self.samples
        """
        for i, j, r in self.samples:
            # Compute prediction and error
            prediction = self.get_rating(i, j)
            e = (r - prediction)

            # Update biases
            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            # Snapshot the user row before it changes: the update of Q must use the
            # old values. A slice ([:]) is only a view of P, so a real copy is needed.
            P_i = self.P[i, :].copy()

            # Update user and item latent feature matrices
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * P_i - self.beta * self.Q[j,:])

    def get_rating(self, i, j):
        """
        Get the predicted rating of user i and item j
        """
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :])
        return prediction

    def full_matrix(self):
        """
        Compute the full (num_users, num_items) matrix using the resultant biases, P and Q
        """
        # b_u is broadcast down the columns, b_i along the rows.
        return self.b + self.b_u[:, np.newaxis] + self.b_i[np.newaxis, :] + self.P.dot(self.Q.T)
    
if __name__ == "__main__":
    # Load both splits and turn the training rows into the dense rating matrix.
    ds = DataSet()
    svdb = SVDBias(R=ds.list_to_matrix(ds.trainset, ds.maxu, ds.maxi))
    # Retrain from scratch for every latent dimension and report the test RMSE.
    for K in [8, 16, 32, 64]:
        nR = svdb.train(K=K, alpha=0.001, beta=0.01, epochs=20)
        # Rows come back as floats, hence the int() casts before indexing.
        residuals = [r - nR[int(u)][int(i)] for u, i, r in ds.testset]
        squaredError = [res * res for res in residuals]
        rmse = math.sqrt(sum(squaredError) / len(squaredError))
        print("RMSE@{}:{}".format(K, rmse))



Dataset Statistics: Interaction = 994169, User = 6040, Item = 3706, Sparsity = 0.0444
Iteration: 10 ; error = 907.3777
Iteration: 20 ; error = 896.9508
RMSE@8:0.9581820978396207
Iteration: 10 ; error = 907.6063
Iteration: 20 ; error = 897.8403
RMSE@16:0.9570481457791289
Iteration: 10 ; error = 908.0286
Iteration: 20 ; error = 898.8743
RMSE@32:0.9573051491949647
Iteration: 10 ; error = 908.2941
Iteration: 20 ; error = 899.4916
RMSE@64:0.9572796642754959


In [38]:
# coding:utf-8  
'''
@author: Jason.F
@date: 2019.07.29
@function: baseline SVDBias 
           Dataset: MovieLens-1m:https://grouplens.org/datasets/movielens/  
           Evaluation: RMSE
'''
import sys
import time
import logging
import random
import heapq
import math
from collections import defaultdict
import pymc3 as pm
import numpy as np
import pandas as pd
import theano
import theano.tensor as tt
import tensorflow as tf

class DataSet:
    """Load the train/test rating files and expose them as lists or a dense matrix.

    Attributes set by the constructor:
    - trainset / testset : lists of [user, item, rating] rows (all floats, since
                           ``DataFrame.values`` upcasts mixed int/float columns)
    - maxu / maxi        : 1 + largest user / item id found in the TRAINING file
    - maxr               : largest rating found in the training file
    """

    def __init__(self, train_path="/data/fjsdata/BMF/ml-1m.train.rating",
                 test_path="/data/fjsdata/BMF/ml-1m.test.rating"):
        """
        Arguments
        - train_path (str) : tab-separated, header-less file; the first three
                             columns are read as user, item, rating
        - test_path (str)  : file in the same format holding the held-out ratings
        """
        self.train_path = train_path
        self.test_path = test_path
        self.trainset, self.testset, self.maxu, self.maxi, self.maxr = self._getDataset_as_list()

    def _read_ratings(self, path):
        """Read the first three columns of one tab-separated rating file."""
        # np.float was removed in NumPy 1.24; np.float64 is the type it aliased.
        return pd.read_csv(path, sep='\t', header=None, names=['user', 'item', 'rating'],
                           usecols=[0, 1, 2],
                           dtype={'user': np.int32, 'item': np.int32, 'rating': np.float64})

    def _getDataset_as_list(self):
        """
        Read both files and return (trainset, testset, maxu, maxi, maxr).
        """
        #trainset
        data = self._read_ratings(self.train_path)
        # Plain Python ints so maxu*maxi below cannot overflow int32.
        maxu, maxi, maxr = int(data['user'].max())+1, int(data['item'].max())+1, data['rating'].max()
        print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % \
                  (data.shape[0], maxu, maxi, data.shape[0]/(maxu*maxi)))
        trainset = data.values.tolist()
        #testset
        # NOTE(review): maxu/maxi come from the training file only; a test row with a
        # larger id would index outside the matrix built from them - confirm the split.
        testset = self._read_ratings(self.test_path).values.tolist()
        return trainset, testset, maxu, maxi, maxr

    def list_to_matrix(self, dataset, maxu, maxi):
        """
        Scatter [user, item, rating] rows into a dense (maxu, maxi) float32 matrix.
        Cells without a row stay 0; if a (user, item) pair repeats, the last row wins.
        """
        dataMat = np.zeros([maxu, maxi], dtype=np.float32)
        for u,i,r in dataset:
            dataMat[int(u), int(i)] = float(r)
        # Return the matrix itself: wrapping it in np.array() would copy the whole thing.
        return dataMat
    
class SVDBias():
    """
    Biased matrix factorization trained with stochastic gradient descent on the
    observed ratings plus randomly sampled zero ("negative") cells.
    The rating of user i for item j is predicted as
        b + b_u[i] + b_i[j] + P[i] . Q[j]
    """

    def __init__(self, R, num_ng=2):
        """
        Perform matrix factorization to predict empty entries in a matrix.
        Arguments
        - R (ndarray)   : user-item rating matrix; entries > 0 are the training ratings
        - num_ng (int)  : number of negative items sampled per positive rating
        """
        self.R = R
        self.num_users, self.num_items = R.shape
        self.num_ng = num_ng

        # Create a list of training samples (user, item, rating).
        # np.nonzero walks the matrix in row-major order, i.e. the same order as a
        # nested "for user / for item" scan, without a Python loop over every cell.
        pos_users, pos_items = np.nonzero(self.R > 0)
        pos_samples = list(zip(pos_users.tolist(), pos_items.tolist(), self.R[pos_users, pos_items]))

        # Sampling the negative items: draw POSITIONS among the zero cells instead of
        # building a Python tuple for every zero cell. random.sample over a range picks
        # the same positions it would pick from a list of that length for a given seed.
        zero_users, zero_items = np.nonzero(self.R == 0)
        # Never ask for more negatives than there are zero cells (random.sample would
        # raise ValueError); in that case every zero cell is used.
        n_neg = min(len(pos_samples) * num_ng, zero_users.size)
        picked = np.asarray(random.sample(range(zero_users.size), n_neg), dtype=np.intp)
        neg_users, neg_items = zero_users[picked], zero_items[picked]
        neg_samples = list(zip(neg_users.tolist(), neg_items.tolist(), self.R[neg_users, neg_items]))

        self.samples = pos_samples + neg_samples

    def train(self, K, alpha=0.001, beta=0.01, epochs=20):
        '''
        Re-initialise all parameters, run SGD and return the dense prediction matrix.
        - alpha (float) : learning rate
        - beta (float)  : regularization parameter
        - K (int)       : number of latent dimensions
        - epochs (int)  : number of iterations
        '''
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.epochs = epochs
        # Initialize user and item latent feature matrices
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Initialize the biases; the global bias is the mean of the non-zero entries
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])

        # Perform stochastic gradient descent for number of iterations
        for epoch in range(self.epochs):
            np.random.shuffle(self.samples)
            self.sgd()
            if (epoch+1) % 10 == 0:
                mse = self.mse()
                print("Iteration: %d ; error = %.4f" % (epoch+1, mse))

        return self.full_matrix()

    def mse(self):
        """
        Training error over the non-zero entries of R (sampled negatives are not
        included). Despite the name, this is the square root of the SUMMED squared
        error (not divided by the number of entries).
        """
        xs, ys = self.R.nonzero()
        # Predict only the observed cells instead of materialising the full matrix.
        predicted = self.b + self.b_u[xs] + self.b_i[ys] + (self.P[xs] * self.Q[ys]).sum(axis=1)
        residual = self.R[xs, ys] - predicted
        return np.sqrt(np.sum(residual * residual))

    def sgd(self):
        """
        Perform one pass of stochastic gradient descent over self.samples
        """
        for i, j, r in self.samples:
            # Compute prediction and error
            prediction = self.get_rating(i, j)
            e = (r - prediction)

            # Update biases
            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            # Snapshot the user row before it changes: the update of Q must use the
            # old values. A slice ([:]) is only a view of P, so a real copy is needed.
            P_i = self.P[i, :].copy()

            # Update user and item latent feature matrices
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * P_i - self.beta * self.Q[j,:])

    def get_rating(self, i, j):
        """
        Get the predicted rating of user i and item j
        """
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :])
        return prediction

    def full_matrix(self):
        """
        Compute the full (num_users, num_items) matrix using the resultant biases, P and Q
        """
        # b_u is broadcast down the columns, b_i along the rows.
        return self.b + self.b_u[:, np.newaxis] + self.b_i[np.newaxis, :] + self.P.dot(self.Q.T)
    
if __name__ == "__main__":
    # Load both splits and turn the training rows into the dense rating matrix.
    ds = DataSet()
    # num_ng is the negative sample ratio (negatives drawn per observed rating).
    svdb = SVDBias(R=ds.list_to_matrix(ds.trainset, ds.maxu, ds.maxi), num_ng=2)
    # Retrain from scratch for every latent dimension and report the test RMSE.
    for K in [8, 16, 32, 64]:
        nR = svdb.train(K=K, alpha=0.001, beta=0.01, epochs=20)
        # Rows come back as floats, hence the int() casts before indexing.
        residuals = [r - nR[int(u)][int(i)] for u, i, r in ds.testset]
        squaredError = [res * res for res in residuals]
        rmse = math.sqrt(sum(squaredError) / len(squaredError))
        print("RMSE@{}:{}".format(K, rmse))

Dataset Statistics: Interaction = 994169, User = 6040, Item = 3706, Sparsity = 0.0444
Iteration: 10 ; error = 1771.9060
Iteration: 20 ; error = 1661.8415
RMSE@8:2.2972489581281974
Iteration: 10 ; error = 1773.6658
Iteration: 20 ; error = 1657.6516
RMSE@16:2.2911449633731853
Iteration: 10 ; error = 1776.2692
Iteration: 20 ; error = 1683.9245
RMSE@32:2.3021983437205464
Iteration: 10 ; error = 1777.9752
Iteration: 20 ; error = 1700.0494
RMSE@64:2.30657219728279


In [28]:
#dataset:Amazon Product-dm(digital music):http://jmcauley.ucsd.edu/data/amazon/index.html
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
# Read the raw ratings file: four header-less, comma-separated columns.
filePath = '/data/tmpexec/ratings_Baby.csv'
# np.str / np.float were removed in NumPy 1.24; the builtins str / float are the
# types those aliases pointed to. User, item and time are kept as strings.
data = pd.read_csv(filePath, sep=',', header=None, names=['user', 'item', 'rating','time'],
                   usecols=[0, 1, 2, 3],
                   dtype={'user': str, 'item': str, 'rating': float, 'time': str})
print (data.head())
ui=data[['user','item']]
le = LabelEncoder()
# DataFrame.apply calls fit_transform once per column, so the encoder is refitted
# for each column and users and items each get their own 0-based integer ids.
ui = ui.apply(le.fit_transform)#label the user and item no
# Largest encoded id per column (number of distinct values minus one).
print (ui['user'].max())
print (ui['item'].max())

             user        item  rating        time
0  A28O3NP6WR5517  0188399313     5.0  1369612800
1   AX0M1Z6ZWO52J  0188399399     5.0  1365465600
2  A1KD7N84L7NIUT  0188399518     4.0  1392336000
3  A29CUDEIF4X1UO  0188399518     3.0  1373241600
4  A32592TYN6C9EM  0316967297     4.0  1378425600
531889
64425
