In [1]:
import sys
import time
import heapq
import math
import gc
import numpy as np
import pandas as pd
from collections import defaultdict
import surprise as sp
import copy
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim import Optimizer
from sklearn.model_selection import KFold
from torchvision import datasets, transforms
from math import sqrt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, r2_score, explained_variance_score,mean_squared_error
print (torch.cuda.is_available())
print (torch.version.cuda)
print (torch.cuda.get_device_name(torch.cuda.current_device()))

True
9.0.176
GeForce RTX 2080 Ti


In [2]:
#Evaluation of metrics
def getHitRatio(ranklist, gtItem):
    for item in ranklist:
        if item == gtItem:
            return 1
    return 0

def getNDCG(ranklist, gtItem):
    for i in range(len(ranklist)):
        item = ranklist[i]
        if item == gtItem:
            return math.log(2) / math.log(i+2)
    return 0
#dataset
class DataSet_1M(object):
    def __init__(self, negNum=1):
        self.trainList, self.shape = self._getTrainData()
        self.trainDict = self._getTrainDict()
        self.trainMat = self._getTrainMatrix()
        self.trainset = self._getInstances(negNum)#sample negative samples
        self.testset = self._getTest()
        
    def _getTrainData(self):
        data = []
        filePath = '/data/fjsdata/ResData2019/BayesianMF/ml-1m.train.rating'
        u = 0
        i = 0
        maxr = 0.0
        with open(filePath, 'r') as f:
            for line in f:
                if line:
                    lines = line[:-1].split("\t")
                    user = int(lines[0])
                    movie = int(lines[1])
                    score = float(lines[2])
                    data.append((user, movie, score))
                    if user > u:u = user
                    if movie > i:i = movie
                    if score > maxr:maxr = score
        self.maxRate = maxr
        print("Loading Success!\n"
                  "Data Info:\n"
                  "\tUser Num: {}\n"
                  "\tItem Num: {}\n"
                  "\tData Size: {}\n"
                  "\tSparsity: {}".format(u+1, i+1, len(data), len(data)/((u+1)*(i+1))))
        return data, [u+1, i+1]

    def _getTrainDict(self):
        dataDict = {}
        for i in self.trainList:
            dataDict[(i[0], i[1])] = i[2]
        return dataDict

    def _getTrainMatrix(self):
        train_matrix = np.zeros([self.shape[0], self.shape[1]], dtype=np.float32)
        for i in self.trainList:
            user = i[0]
            movie = i[1]
            rating = i[2]
            train_matrix[user][movie] = rating
        return np.array(train_matrix)

    def _getInstances(self, negNum):
        trainset = []
        for i in self.trainList:
            #trainset.append([i[0],i[1],i[2]])
            trainset.append([i[0],i[1],1.0])
            for t in range(negNum):
                j = np.random.randint(self.shape[1])
                while (i[0], j) in self.trainDict:
                    j = np.random.randint(self.shape[1])
                trainset.append([i[0],j,0.0])
        print ('The length of Trainset: %d'%(len(trainset)))
        return trainset

    def _getTest(self):
        #loading data
        testset = []
        filePath = '/data/fjsdata/ResData2019/BayesianMF/ml-1m.test.negative'
        with open(filePath, 'r') as fd:
            line = fd.readline()
            while line != None and line != '':
                arr = line.split('\t')
                u = eval(arr[0])[0]
                testset.append([u, eval(arr[0])[1], 1.0])#first is one postive item
                for i in arr[1:]:
                    testset.append([u, int(i), 0.0]) #99 negative items
                line = fd.readline()
        print ('The length of Testset: %d'%(len(testset)))
        return testset

In [24]:
class BayesianMatrixFactorization():
    """
    Bayesian Matrix Factorization model
    R = PxQ
    p ~ N(p|0, alpha^(-1)I)
    q ~ N(q|0, alpha^(-1)I)
    r = p @ q
    t ~ N(r|p @ q, beta^(-1))
    """

    def __init__(self, alpha_p:float=1., alpha_q:float=1., beta:float=1.):
        """
        ----------
        n_u, n_i: the number of users and items, respectively.
        k : the number of latent factors
        """
        self.alpha_p = alpha_p
        self.alpha_q = alpha_q
        self.beta = beta
        #posterior of p,q 
        self.pos_mean_p = None
        self.pos_precision_p = None
        self.pos_mean_q = None
        self.pos_precision_q = None

    def fit(self, R:np.ndarray, k:int=5):
        """
        bayesian update of parameters given training dataset
        Parameters
        ----------
        R : (u,i) np.ndarray
            training data independent variable, u is the number of users, i is the number of items.
        k : int, the number of latent factors.
        """
        #1. generate matrices P, Q
        P = np.random.normal(0,self.alpha_p,(R.shape[0],k))#uxk
        Q = np.random.normal(0,self.alpha_q,(R.shape[1],k))#ixk
        #2.calculate the posterior with analytical solution
        self.pos_precision_p = self.alpha_p + self.beta * Q @ Q.T # ixi
        self.pos_mean_p = self.beta * R @ np.linalg.inv(self.pos_precision_p) @ Q # uxi,ixi,ixk -> uxk
        self.pos_precision_q = self.alpha_q + self.beta * P @ P.T # uxu
        self.pos_mean_q = self.beta * R.T @ np.linalg.inv(self.pos_precision_q) @ P # ixu,uxu,uxk -> ixk

    def predict(self, sample_size:int=None):
        """
        return mean  of predictive distribution
        Parameters
        ----------
        sample_size : int, optional
            number of samples to draw from the predictive distribution
            (the default is None, no sampling from the distribution)
        Returns
        -------
        R_pred : (u,i) np.ndarray
            mean of the predictive distribution
        R_pred_sample : (u,i,sample_size) np.ndarray
            samples from the predictive distribution
        """
        if sample_size is not None:
            R_sample = []
            for i in range(sample_size):
                p_sample, q_sample = [], []
                for k in range(self.pos_mean_p.shape[1]):#latent factors    
                    mean_p = self.pos_mean_p[:,k]
                    mean_q = self.pos_mean_q[:,k]
                    p_sample_k = np.random.multivariate_normal(mean_p, np.linalg.inv(self.pos_precision_q), size=1)
                    q_sample_k = np.random.multivariate_normal(mean_q, np.linalg.inv(self.pos_precision_p), size=1)
                    p_sample.append(p_sample_k.flatten())
                    q_sample.append(q_sample_k.flatten())
                R_sample.append(np.dot(np.array(p_sample).T, np.array(q_sample)))
            return  R_sample #uxi
        
        R_pred = self.pos_mean_p @ self.pos_mean_q.T #R = PxQ
        return R_pred #uxi
    
class NeuralMatrixFactorization(nn.Module):
    def __init__(self, input_dim_u, input_dim_i, factors_dim_k, num_units=[512]):
        super(NeuralMatrixFactorization, self).__init__()
        
        self.input_dim_u = input_dim_u #user vector
        self.input_dim_i = input_dim_i #item vector
        self.factors_dim_k = factors_dim_k #latent factors vector
        
        # network with three hidden and k output layer
        self.layer1_u = nn.Linear(input_dim_u, num_units[0])
        self.layer2_u = nn.Linear(num_units[0], factors_dim_k)
        
        self.layer1_i = nn.Linear(input_dim_i, num_units[0])
        self.layer2_i = nn.Linear(num_units[0], factors_dim_k)
        
        # activation to be used between hidden layers
        self.activation = nn.ReLU(inplace = True)
    
    def forward(self, x_u, x_i):
    
        x_u = x_u.view(-1, self.input_dim_u)
        x_i = x_i.view(-1, self.input_dim_i)
        #layer1
        x_u = self.layer1_u(x_u)
        x_u = self.activation(x_u)
        x_i = self.layer1_i(x_i)
        x_i = self.activation(x_i)
        #layer2
        x_u = self.layer2_u(x_u)
        x_u = self.activation(x_u)
        x_i = self.layer2_i(x_i)
        x_i = self.activation(x_i)
        
        output = torch.sum(torch.mul(x_u,x_i),1)#pxq
        #output = torch.sigmoid(output)
        return output
    
class NMF_Model_Wrapper:
    def __init__(self, network, learn_rate=1e-2):
        
        self.learn_rate = learn_rate
        self.network = network
        self.network.cuda()
        
        self.optimizer = torch.optim.Adam(self.network.parameters(), lr = self.learn_rate)
        self.loss_func = nn.BCELoss()#nn.BCEWithLogitsLoss()#nn.MSELoss()
    
    def fit(self, x_u, x_i, y):
        x_u = torch.from_numpy(np.array(x_u)).type(torch.FloatTensor).cuda()
        x_i = torch.from_numpy(np.array(x_i)).type(torch.FloatTensor).cuda()
        y = torch.from_numpy(np.array(y)).type(torch.FloatTensor).cuda()
        # reset gradient and total loss
        self.optimizer.zero_grad()
        output = self.network(x_u, x_i)
        output = torch.sigmoid(output)
        fit_loss = self.loss_func(output, y)
        
        fit_loss.backward()
        self.optimizer.step()

        return fit_loss

In [25]:
#dataset
ds_1m = DataSet_1M(negNum=1)
trainset = ds_1m.trainset
trainMat = ds_1m.trainMat
testset = ds_1m.testset
shape = ds_1m.shape
#training
sample_size = 1 # samples 
num_epochs = 1
batchSize = 2000
for k in [5,10,15,20]:  
    #get approximate matrice
    bmf = BayesianMatrixFactorization()
    bmf.fit(R=trainMat, k=k)
    R_sample = bmf.predict(sample_size=sample_size)
    #NMF training
    torch.cuda.empty_cache()
    torch.cuda.synchronize()     
    net = NMF_Model_Wrapper(network=NeuralMatrixFactorization(input_dim_u=shape[1], input_dim_i = shape[0],factors_dim_k=k))
    best_net, best_loss = None, float('inf')
    for iSpl, iR in enumerate(R_sample):
        shuffled_idx = np.random.permutation(np.arange(len(trainset)))
        trainset = np.array(trainset)[shuffled_idx].tolist()
        num_batches = len(trainset) // batchSize + 1 
        total_loss = []
        for i in range(num_batches):#batch
            min_idx = i * batchSize
            max_idx = np.min([len(trainset), (i+1)*batchSize])
            train_batch = trainset[min_idx: max_idx]
            x_u, x_i, y = [], [], []
            for uu,ii,rr in train_batch:
                x_u.append(iR[int(uu),:])
                x_i.append(iR[:,int(ii)])
                y.append(float(rr))
            _loss = net.fit(np.array(x_u), np.array(x_i), np.array(y))
            sys.stdout.write('\r {} / {} : loss = {}'.format(i, num_batches, float('%0.6f'%_loss.item())))
            sys.stdout.flush()
            total_loss.append(_loss.item())
        print("Sample: %5d total_loss = %.6f" % (iSpl + 1, np.mean(total_loss)))
        if np.mean(total_loss) < best_loss:
            best_loss = np.mean(total_loss)
            best_net = copy.deepcopy(net.network)
    #torch.save(best_net, "/data/tmpexec/BDMF_torch")
    #best_net = torch.load("/data/tmpexec/BDMF_torch").eval()
    #best_net = torch.load("/data/tmpexec/BDMF_torch").to('cuda:0')
    hits = []
    ndcgs = []
    for c in range(0,shape[0]):#6040
        scorelist = []
        gtItem = -1
        x_u, x_i, y_i = [], [], []
        for uu,ii,rr in testset[c*100:(c+1)*100]:#604000
            if rr == 1.0: 
                gtItem = ii
            x_u.append(np.array(trainMat[int(uu),:]))
            x_i.append(np.array(trainMat[:,int(ii)]))
            y_i.append(ii)
        x_u = torch.from_numpy(np.array(x_u)).type(torch.FloatTensor).cuda()
        x_i = torch.from_numpy(np.array(x_i)).type(torch.FloatTensor).cuda()
        output = best_net(x_u, x_i)
        output = output.cpu().data.numpy().tolist()
        for j in range(len(y_i)):
            scorelist.append([y_i[j],output[j]])
        map_item_score = {}
        for item, rate in scorelist: #turn dict
            map_item_score[item] = rate
        ranklist = heapq.nlargest(10, map_item_score, key=map_item_score.get)#topn=10
        hr = getHitRatio(ranklist, gtItem)
        hits.append(hr)
        ndcg = getNDCG(ranklist, gtItem)
        ndcgs.append(ndcg)
    hitratio,ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
    print ("HR@%d=%.6f, NDCG@%d=%.6f" % (k, hitratio, k, ndcg))

Loading Success!
Data Info:
	User Num: 6040
	Item Num: 3706
	Data Size: 994169
	Sparsity: 0.04441379291858915
The length of Trainset: 1988338
The length of Testset: 604000




 994 / 995 : loss = 12.135984Sample:     1 total_loss = 11.701012
HR@5=0.314901, NDCG@5=0.164339
 972 / 995 : loss = 13.375147HR@10=0.385596, NDCG@10=0.201704
 994 / 995 : loss = 12.922428Sample:     1 total_loss = 13.643918
HR@15=0.349834, NDCG@15=0.183995
 994 / 995 : loss = 13.423174Sample:     1 total_loss = 13.715060
HR@20=0.370033, NDCG@20=0.194803
