In [18]:
# coding:utf-8  
'''
@author: Jason.F
@data: 2019.07.29
@function: baseline SVDBias 
           Datatset: Movielen-1m, Pinterest-20, KnowledgeBase-CC 
           Evaluation: RMSE
'''
import sys
import time
import logging
import random
import heapq
import math
from collections import defaultdict
import pymc3 as pm
import numpy as np
import pandas as pd
import theano
import theano.tensor as tt
import tensorflow as tf

class DataSet:
    def __init__(self, fileName):
        self.dataList, self.maxu, self.maxi = self.getDataset_as_list(fileName)
        
    def getDataset_as_list(self, fileName):
        if (fileName == 'ml-1m') or (fileName == 'pinterest-20'):
            filePath = "/data/fjsdata/ctKngBase/ml/"+fileName+".train.rating" 
            data = pd.read_csv(filePath, sep='\t', header=None, names=['user', 'item', 'rating'], \
                                 usecols=[0, 1, 2], dtype={0: np.int32, 1: np.int32, 2: np.float})
            #data['rating']=data['rating'].apply(lambda x: 1.0 if float(x)>0.0 else 0.0)
            maxu, maxi = data['user'].max()+1, data['item'].max()+1
            print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % \
                  (data.shape[0], maxu, maxi, data.shape[0]/(maxu*maxi)))
            dataList = data.values.tolist()
            return dataList, maxu, maxi
        if (fileName == 'kb-cc'):
            filePath = "/data/fjsdata/ctKngBase/kbcc_trainset.csv"
            data = pd.read_csv(filePath, sep='|', low_memory=False, dtype={'csr':int, 'ke':int, 'num':float})
            data['num']=data['num'].apply(lambda x: 1.0 if float(x)>0.0 else 0.0)
            maxu, maxi = data['user'].max()+1, data['item'].max()+1
            print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % \
                  (data.shape[0], maxu, maxi, data.shape[0]/(maxu*maxi)))
            dataList = data.values.tolist()
            return dataList, maxu, maxi
    
    def list_to_matrix(self):              
        dataMat = np.zeros([self.maxu, self.maxi], dtype=np.float32)
        for u,i,r in self.dataList:
            dataMat[int(u)][int(i)] = float(r)
        return np.array(dataMat)

In [19]:
class SVDBias():
    
    def __init__(self, R, num_ng=2):
        """
        Perform matrix factorization to predict empty entries in a matrix.     
        Arguments
        - R (ndarray)   : user-item rating matrix
        - num_ng (int)  : number of negative items
        """
        self.R = R
        self.num_users, self.num_items = R.shape
        self.num_ng = num_ng
        
        # Create a list of training samples
        pos_samples = [
            (i, j, self.R[i, j])
            for i in range(self.num_users)
            for j in range(self.num_items)
            if self.R[i, j] > 0
        ]
        #smapling the negative items
        neg_samples = random.sample([
            (i, j, self.R[i, j])
            for i in range(self.num_users)
            for j in range(self.num_items)
            if self.R[i, j] == 0
        ], len(pos_samples)*num_ng)
        
        self.samples = pos_samples + neg_samples
        
    def train(self, K, alpha=0.001, beta=0.01, epochs=20):
        '''
        - alpha (float) : learning rate
        - beta (float)  : regularization parameter
        - K (int)       : number of latent dimensions
        -epochs(int)    : number of iterations
        '''
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.epochs = epochs
        # Initialize user and item latent feature matrice
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))
        
        # Initialize the biases
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])
               
        # Perform stochastic gradient descent for number of iterations
        training_process = []
        for i in range(self.epochs):
            np.random.shuffle(self.samples)
            self.sgd()
            if (i+1) % 10 == 0:
                mse = self.mse()
                print("Iteration: %d ; error = %.4f" % (i+1, mse))
        
        return self.full_matrix()

    def mse(self):
        """
        A function to compute the total mean square error
        """
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x, y], 2)
        return np.sqrt(error)

    def sgd(self):
        """
        Perform stochastic graident descent
        """
        for i, j, r in self.samples:
            # Computer prediction and error
            prediction = self.get_rating(i, j)
            e = (r - prediction)
            
            # Update biases
            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])
            
            # Create copy of row of P since we need to update it but use older values for update on Q
            P_i = self.P[i, :][:]
            
            # Update user and item latent feature matrices
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * P_i - self.beta * self.Q[j,:])

    def get_rating(self, i, j):
        """
        Get the predicted rating of user i and item j
        """
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction
    
    def full_matrix(self):
        """
        Computer the full matrix using the resultant biases, P and Q
        """
        return self.b + self.b_u[:,np.newaxis] + self.b_i[np.newaxis:,] + self.P.dot(self.Q.T)

In [20]:
if __name__ == "__main__":
    for fileName in ['ml-1m', 'pinterest-20', 'kb-cc']:
        dataset = DataSet(fileName=fileName)#loading dataset
        testset = random.sample(dataset.dataList, len(dataset.dataList)//10)#10% for test
        svdb = SVDBias(R=dataset.list_to_matrix(), num_ng=2)#negative sample ratio
        for K in [8, 16, 32, 64]:
            nR = svdb.train(K=K, alpha=0.001, beta=0.01, epochs=20)
            squaredError = []
            for u,i,r in testset:
                error=r - nR[int(u)][int(i)]
                squaredError.append(error * error)
            rmse =sqrt(sum(squaredError) / len(squaredError))
            print("RMSE@{}:{} at Dataset {}".format(K, rmse, fileName))

Dataset Statistics: Interaction = 994169, User = 6040, Item = 3706, Sparsity = 0.0444
Iteration: 10 ; error = 1770.5244
Iteration: 20 ; error = 1660.7947


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [22]:
squaredError = []
for u,i,r in testset:
    error=r - nR[int(u)][int(i)]
    squaredError.append(error * error)
rmse =math.sqrt(sum(squaredError) / len(squaredError))
print("RMSE@{}:{} at Dataset {}".format(K, rmse, fileName))

RMSE@8:1.6624953850640736 at Datasetml-1m
