In [None]:
# coding:utf-8  
'''
@author: Jason.F
@date: 2019.08.10
@function: BMFBias(Bayesian Matrix Factorization with bias) 
           Dataset: KnowledgeBase-CC
           Evaluation: RMSE
'''
import sys
import time
import logging
import random
import heapq
import math
from collections import defaultdict
import pymc3 as pm
import numpy as np
import pandas as pd
import theano
import theano.tensor as tt
import tensorflow as tf

class DataSet:
    """Load the KnowledgeBase-CC train/test files as (user, item, rating) rows.

    Attributes set by ``__init__``:
        train_path, test_path: the files that were read.
        trainset, testset: lists of ``[csr, ke, num]`` rows.
        maxu, maxi: one more than the largest user / item id found in either
            file, so they can be used directly as embedding-table sizes.
        maxr: largest rating found in the training file.
    """

    # Default file locations; either can be overridden per instance.
    TRAIN_PATH = "/data/fjsdata/BMF/kbcc_trainset.csv"
    TEST_PATH = "/data/fjsdata/BMF/kbcc_testset.csv"
    # Columns are selected by name, in this order, so a row is always
    # (user, item, rating) regardless of the column order in the file.
    COLUMNS = ['csr', 'ke', 'num']

    def __init__(self, train_path=None, test_path=None):
        """Read both files.

        Args:
            train_path: '|'-separated training file; defaults to ``TRAIN_PATH``.
            test_path: '|'-separated test file; defaults to ``TEST_PATH``.
        """
        self.train_path = self.TRAIN_PATH if train_path is None else train_path
        self.test_path = self.TEST_PATH if test_path is None else test_path
        self.trainset, self.testset, self.maxu, self.maxi, self.maxr = self._getDataset_as_list()

    def _read_triples(self, filePath):
        """Return the csr/ke/num columns of one '|'-separated file as a DataFrame."""
        data = pd.read_csv(filePath, sep='|', low_memory=False, dtype={'csr':int, 'ke':int, 'num':float})
        return data[self.COLUMNS]

    def _getDataset_as_list(self):
        """Load both files and derive the id ranges.

        Returns:
            ``(trainset, testset, maxu, maxi, maxr)`` as described on the class.
        """
        train = self._read_triples(self.train_path)
        test = self._read_triples(self.test_path)
        # Size the id ranges from BOTH files: an id that only occurs in the
        # test file must still be a valid index into tables sized by maxu/maxi.
        maxu = int(pd.concat([train['csr'], test['csr']]).max()) + 1
        maxi = int(pd.concat([train['ke'], test['ke']]).max()) + 1
        maxr = train['num'].max()
        print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % \
                  (train.shape[0], maxu, maxi, train.shape[0]/(maxu*maxi)))
        return train.values.tolist(), test.values.tolist(), maxu, maxi, maxr

    def list_to_matrix(self, dataset, maxu, maxi):
        """Return a dense ``(maxu, maxi)`` float32 matrix holding the ratings.

        Cells without an interaction stay 0; if a (user, item) pair occurs
        more than once, the last occurrence wins.
        """
        dataMat = np.zeros([maxu, maxi], dtype=np.float32)
        for u, i, r in dataset:
            dataMat[int(u), int(i)] = float(r)
        return dataMat

    def list_to_dict(self, dataset):
        """Return ``{(user, item): rating}``; later duplicates overwrite earlier ones."""
        return {(int(u), int(i)): float(r) for u, i, r in dataset}

    def getInstances(self, dataset):
        """Split rows into parallel ``(users, items, ratings)`` numpy arrays.

        The id arrays are always int64 and the rating array float64, also for
        an empty ``dataset``, so the ids remain usable as indices.
        """
        user = np.array([int(u) for u, _, _ in dataset], dtype=np.int64)
        item = np.array([int(i) for _, i, _ in dataset], dtype=np.int64)
        rate = np.array([float(r) for _, _, r in dataset], dtype=np.float64)
        return user, item, rate
    
class BMF():
    """Bayesian matrix factorization with user, item and global bias terms.

    The model is built with PyMC3 and fitted with ADVI. ``train_BMF`` must be
    called before ``eval_BMF``: it creates the shared input variables and the
    model that ``eval_BMF`` reuses.
    """

    def __init__(self, ds):
        """Copy the id/rating ranges from ``ds`` and prepare the data arrays.

        Args:
            ds: object exposing ``maxr``, ``maxu``, ``maxi``, ``trainset``,
                ``testset`` and ``getInstances(dataset)``.
        """
        self.maxr = ds.maxr
        self.maxu = ds.maxu
        self.maxi = ds.maxi
        # Shuffle the training triples with ONE permutation so that user,
        # item and rating stay aligned.
        train_u, train_i, train_r = ds.getInstances(ds.trainset)
        shuffled_idx = np.random.permutation(np.arange(len(train_u)))
        self.train_u = train_u[shuffled_idx]
        self.train_i = train_i[shuffled_idx]
        self.train_r = train_r[shuffled_idx]
        assert(len(self.train_u) == len(self.train_i) and len(self.train_i) == len(self.train_r))
        self.test_u, self.test_i, self.test_r = ds.getInstances(ds.testset)
        assert(len(self.test_u) == len(self.test_i) and len(self.test_i) == len(self.test_r))

    def train_BMF(self, K=8, n_iter=10000, draws=500):
        """Build the model for ``K`` latent factors and fit it with ADVI.

        Args:
            K: number of latent factors per user / item.
            n_iter: number of ADVI optimisation steps passed to ``pm.fit``.
            draws: number of samples drawn from the fitted approximation.

        Returns:
            The trace sampled from the fitted approximation.
        """
        # Shared variables let eval_BMF swap in the test data later without
        # rebuilding the model.
        self.x_u = theano.shared(self.train_u)
        self.x_i = theano.shared(self.train_i)
        self.y_r = theano.shared(self.train_r)
        with pm.Model() as self.bmf:  # build the probabilistic model
            # Half of the largest rating is used as the standard deviation of
            # every prior and of the likelihood.
            prior_sd = self.maxr/2
            # Latent factor matrices
            P = pm.Normal('P', mu=0, sd=prior_sd, shape=(self.maxu, K))
            Q = pm.Normal('Q', mu=0, sd=prior_sd, shape=(self.maxi, K))
            # Bias terms: per user, per item, and one global offset
            b_u = pm.Normal('b_u', mu=0, sd=prior_sd, shape=self.maxu)
            b_i = pm.Normal('b_i', mu=0, sd=prior_sd, shape=self.maxi)
            u = pm.Normal('u', mu=0, sd=prior_sd)
            # Predicted rating per instance: b_u + b_i + <P_u, Q_i> + u.
            # NOTE(review): 'tY' has one entry per instance; if the sampled
            # trace stores deterministics this costs draws * n_instances
            # floats of memory -- confirm before raising `draws`.
            tY = pm.Deterministic(
                'tY',
                b_u[self.x_u] + b_i[self.x_i]
                + tt.sum(P[self.x_u, :] * Q[self.x_i, :], axis=1) + u)
            # Likelihood
            Y = pm.Normal('Y', mu=tY, sd=prior_sd, observed=self.y_r)

        with self.bmf:  # train the probabilistic model by Bayesian inference
            tstart = time.time()
            logging.info('Start training BMF')
            approx = pm.fit(n=n_iter, method=pm.ADVI())
            trace = approx.sample(draws=draws)
            elapsed = time.time() - tstart
            logging.info('Complete BMF training in %d seconds', int(elapsed))
        return trace

    def eval_BMF(self, trace):
        """Return the test RMSE of the posterior-predictive mean.

        Replaces the contents of the shared variables with the test data, so
        after this call the model no longer holds the training data.

        Args:
            trace: trace returned by ``train_BMF`` for the current model.

        Raises:
            ValueError: if the prediction shape does not match the test
                ratings, or the test set is empty.
        """
        self.x_u.set_value(self.test_u)
        self.x_i.set_value(self.test_i)
        self.y_r.set_value(self.test_r)
        with self.bmf:
            ppc = pm.sample_posterior_predictive(trace, progressbar=True)
        # Average over the sample axis to get one prediction per instance.
        pY = ppc['Y'].mean(axis=0)
        return self._rmse(self.test_r, pY)

    @staticmethod
    def _rmse(y_true, y_pred):
        """Return the root-mean-squared error between two equal-shaped arrays."""
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)
        # Explicit check: a silent broadcast would yield a meaningless score.
        if y_true.shape != y_pred.shape:
            raise ValueError('shape mismatch: %s vs %s' % (y_true.shape, y_pred.shape))
        if y_true.size == 0:
            raise ValueError('cannot compute RMSE of an empty set')
        return float(np.sqrt(np.mean(np.square(y_true - y_pred))))
    
if __name__ == "__main__":
    # Load the train/test data once and reuse it for every model size.
    ds = DataSet()
    bmf = BMF(ds)
    # Fit and score one model per number of latent factors.
    latent_sizes = (16, 32, 64)
    for K in latent_sizes:
        trace = bmf.train_BMF(K=K)
        rmse = bmf.eval_BMF(trace)
        print("RMSE@{}:{}".format(K, rmse))



Dataset Statistics: Interaction = 2313189, User = 10216, Item = 96324, Sparsity = 0.0024


Average Loss = 7.0614e+06: 100%|██████████| 10000/10000 [4:38:47<00:00,  1.60s/it] 
Finished [100%]: Average Loss = 7.0607e+06
I0811 04:27:39.656300 140377736296192 inference.py:248] Finished [100%]: Average Loss = 7.0607e+06
100%|██████████| 500/500 [00:05<00:00, 95.35it/s]


RMSE@16:1.3327729249836142


Average Loss = 8.0402e+06: 100%|██████████| 10000/10000 [9:31:11<00:00,  3.36s/it]  
Finished [100%]: Average Loss = 8.0388e+06
I0811 14:03:40.021590 140377736296192 inference.py:248] Finished [100%]: Average Loss = 8.0388e+06
100%|██████████| 500/500 [00:06<00:00, 77.45it/s]


RMSE@32:1.3463370430432309


Average Loss = 1.4368e+07:  29%|██▊       | 2861/10000 [9:13:00<14:24:07,  7.26s/it] 

In [1]:
# coding:utf-8  
'''
@author: Jason.F
@date: 2019.08.01
@function: BMFBias(Bayesian Matrix Factorization with bias) 
           Dataset: MovieLens-1m:https://grouplens.org/datasets/movielens/  
           Evaluation: RMSE
'''
import sys
import time
import logging
import random
import heapq
import math
from collections import defaultdict
import pymc3 as pm
import numpy as np
import pandas as pd
import theano
import theano.tensor as tt
import tensorflow as tf

class DataSet:
    """Load the MovieLens-1m train/test rating files as (user, item, rating) rows.

    Attributes set by ``__init__``:
        train_path, test_path: the files that were read.
        trainset, testset: lists of ``[user, item, rating]`` rows.
        maxu, maxi: one more than the largest user / item id found in either
            file, so they can be used directly as embedding-table sizes.
        maxr: largest rating found in the training file.
    """

    # Default file locations; either can be overridden per instance.
    TRAIN_PATH = "/data/fjsdata/BMF/ml-1m.train.rating"
    TEST_PATH = "/data/fjsdata/BMF/ml-1m.test.rating"

    def __init__(self, train_path=None, test_path=None):
        """Read both files.

        Args:
            train_path: tab-separated training file; defaults to ``TRAIN_PATH``.
            test_path: tab-separated test file; defaults to ``TEST_PATH``.
        """
        self.train_path = self.TRAIN_PATH if train_path is None else train_path
        self.test_path = self.TEST_PATH if test_path is None else test_path
        self.trainset, self.testset, self.maxu, self.maxi, self.maxr = self._getDataset_as_list()

    def _read_triples(self, filePath):
        """Return the first three columns of one header-less, tab-separated file."""
        # np.float64 replaces the `np.float` alias, which NumPy removed in 1.24.
        return pd.read_csv(filePath, sep='\t', header=None, names=['user', 'item', 'rating'], \
                                 usecols=[0, 1, 2], dtype={0: np.int32, 1: np.int32, 2: np.float64})

    def _getDataset_as_list(self):
        """Load both files and derive the id ranges.

        Returns:
            ``(trainset, testset, maxu, maxi, maxr)`` as described on the class.
        """
        train = self._read_triples(self.train_path)
        test = self._read_triples(self.test_path)
        # Size the id ranges from BOTH files: an id that only occurs in the
        # test file must still be a valid index into tables sized by maxu/maxi.
        maxu = int(pd.concat([train['user'], test['user']]).max()) + 1
        maxi = int(pd.concat([train['item'], test['item']]).max()) + 1
        maxr = train['rating'].max()
        print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % \
                  (train.shape[0], maxu, maxi, train.shape[0]/(maxu*maxi)))
        return train.values.tolist(), test.values.tolist(), maxu, maxi, maxr

    def list_to_matrix(self, dataset, maxu, maxi):
        """Return a dense ``(maxu, maxi)`` float32 matrix holding the ratings.

        Cells without an interaction stay 0; if a (user, item) pair occurs
        more than once, the last occurrence wins.
        """
        dataMat = np.zeros([maxu, maxi], dtype=np.float32)
        for u, i, r in dataset:
            dataMat[int(u), int(i)] = float(r)
        return dataMat

    def list_to_dict(self, dataset):
        """Return ``{(user, item): rating}``; later duplicates overwrite earlier ones."""
        return {(int(u), int(i)): float(r) for u, i, r in dataset}

    def getInstances(self, dataset):
        """Split rows into parallel ``(users, items, ratings)`` numpy arrays.

        The id arrays are always int64 and the rating array float64, also for
        an empty ``dataset``, so the ids remain usable as indices.
        """
        user = np.array([int(u) for u, _, _ in dataset], dtype=np.int64)
        item = np.array([int(i) for _, i, _ in dataset], dtype=np.int64)
        rate = np.array([float(r) for _, _, r in dataset], dtype=np.float64)
        return user, item, rate
    
class BMF():
    """Bayesian matrix factorization with user, item and global bias terms.

    The model is built with PyMC3 and fitted with ADVI. ``train_BMF`` must be
    called before ``eval_BMF``: it creates the shared input variables and the
    model that ``eval_BMF`` reuses.
    """

    def __init__(self, ds):
        """Copy the id/rating ranges from ``ds`` and prepare the data arrays.

        Args:
            ds: object exposing ``maxr``, ``maxu``, ``maxi``, ``trainset``,
                ``testset`` and ``getInstances(dataset)``.
        """
        self.maxr = ds.maxr
        self.maxu = ds.maxu
        self.maxi = ds.maxi
        # Shuffle the training triples with ONE permutation so that user,
        # item and rating stay aligned.
        train_u, train_i, train_r = ds.getInstances(ds.trainset)
        shuffled_idx = np.random.permutation(np.arange(len(train_u)))
        self.train_u = train_u[shuffled_idx]
        self.train_i = train_i[shuffled_idx]
        self.train_r = train_r[shuffled_idx]
        assert(len(self.train_u) == len(self.train_i) and len(self.train_i) == len(self.train_r))
        self.test_u, self.test_i, self.test_r = ds.getInstances(ds.testset)
        assert(len(self.test_u) == len(self.test_i) and len(self.test_i) == len(self.test_r))

    def train_BMF(self, K=8, n_iter=20000, draws=2000):
        """Build the model for ``K`` latent factors and fit it with ADVI.

        Args:
            K: number of latent factors per user / item.
            n_iter: number of ADVI optimisation steps passed to ``pm.fit``.
            draws: number of samples drawn from the fitted approximation.

        Returns:
            The trace sampled from the fitted approximation.
        """
        # Shared variables let eval_BMF swap in the test data later without
        # rebuilding the model.
        self.x_u = theano.shared(self.train_u)
        self.x_i = theano.shared(self.train_i)
        self.y_r = theano.shared(self.train_r)
        with pm.Model() as self.bmf:  # build the probabilistic model
            # Half of the largest rating is used as the standard deviation of
            # every prior and of the likelihood.
            prior_sd = self.maxr/2
            # Latent factor matrices
            P = pm.Normal('P', mu=0, sd=prior_sd, shape=(self.maxu, K))
            Q = pm.Normal('Q', mu=0, sd=prior_sd, shape=(self.maxi, K))
            # Bias terms: per user, per item, and one global offset
            b_u = pm.Normal('b_u', mu=0, sd=prior_sd, shape=self.maxu)
            b_i = pm.Normal('b_i', mu=0, sd=prior_sd, shape=self.maxi)
            u = pm.Normal('u', mu=0, sd=prior_sd)
            # Predicted rating per instance: b_u + b_i + <P_u, Q_i> + u.
            # NOTE(review): 'tY' has one entry per instance; if the sampled
            # trace stores deterministics this costs draws * n_instances
            # floats of memory -- confirm before raising `draws`.
            tY = pm.Deterministic(
                'tY',
                b_u[self.x_u] + b_i[self.x_i]
                + tt.sum(P[self.x_u, :] * Q[self.x_i, :], axis=1) + u)
            # Likelihood
            Y = pm.Normal('Y', mu=tY, sd=prior_sd, observed=self.y_r)

        with self.bmf:  # train the probabilistic model by Bayesian inference
            tstart = time.time()
            logging.info('Start training BMF')
            approx = pm.fit(n=n_iter, method=pm.ADVI())
            trace = approx.sample(draws=draws)
            elapsed = time.time() - tstart
            logging.info('Complete BMF training in %d seconds', int(elapsed))
        return trace

    def eval_BMF(self, trace):
        """Return the test RMSE of the posterior-predictive mean.

        Replaces the contents of the shared variables with the test data, so
        after this call the model no longer holds the training data.

        Args:
            trace: trace returned by ``train_BMF`` for the current model.

        Raises:
            ValueError: if the prediction shape does not match the test
                ratings, or the test set is empty.
        """
        self.x_u.set_value(self.test_u)
        self.x_i.set_value(self.test_i)
        self.y_r.set_value(self.test_r)
        with self.bmf:
            ppc = pm.sample_posterior_predictive(trace, progressbar=True)
        # Average over the sample axis to get one prediction per instance.
        pY = ppc['Y'].mean(axis=0)
        return self._rmse(self.test_r, pY)

    @staticmethod
    def _rmse(y_true, y_pred):
        """Return the root-mean-squared error between two equal-shaped arrays."""
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)
        # Explicit check: a silent broadcast would yield a meaningless score.
        if y_true.shape != y_pred.shape:
            raise ValueError('shape mismatch: %s vs %s' % (y_true.shape, y_pred.shape))
        if y_true.size == 0:
            raise ValueError('cannot compute RMSE of an empty set')
        return float(np.sqrt(np.mean(np.square(y_true - y_pred))))
    
if __name__ == "__main__":
    # Load the train/test data once and reuse it for every model size.
    ds = DataSet()
    bmf = BMF(ds)
    # Fit and score one model per number of latent factors.
    latent_sizes = (8, 16, 32)
    for K in latent_sizes:
        trace = bmf.train_BMF(K=K)
        rmse = bmf.eval_BMF(trace)
        print("RMSE@{}:{}".format(K, rmse))



Dataset Statistics: Interaction = 994169, User = 6040, Item = 3706, Sparsity = 0.0444


Average Loss = 2.5768e+06: 100%|██████████| 20000/20000 [12:39:36<00:00,  2.08s/it]  
Finished [100%]: Average Loss = 2.5767e+06
I0803 13:33:45.902980 140106723780352 inference.py:248] Finished [100%]: Average Loss = 2.5767e+06
100%|██████████| 2000/2000 [00:33<00:00, 59.35it/s]

RMSE@64:0.9552209868260392



