In [None]:
# -*- Encoding:UTF-8 -*-
'''
@author: Jason.F
@data: 2019.07.29
@function: BNMF(Bayesian Neural Matrix Factorization) 
           Dataset: Movielen Dataset(ml-1m) 
           Evaluating: hitradio,ndcg
'''
import sys
import time
import heapq
import math
import gc
import numpy as np
import pandas as pd

import pymc3 as pm
import theano
import tensorflow as tf
import theano.tensor as tt

class DataSet:
    def __init__(self, fileName, negNum):
        self.negNum = negNum #negative sample ratio
        self.trainList, self.maxu, self.maxi = self.getTrainset_as_list(fileName)
        self.testList = self.getTestset_as_list(fileName)
        
    def getTrainset_as_list(self, fileName):
        if (fileName == 'ml-1m') or (fileName == 'pinterest-20'):
            filePath = "/data/fjsdata/ctKngBase/ml/"+fileName+".train.rating" 
            data = pd.read_csv(filePath, sep='\t', header=None, names=['user', 'item', 'rating'], \
                                 usecols=[0, 1, 2], dtype={0: np.int32, 1: np.int32, 2: np.float})
            data['rating']=data['rating'].apply(lambda x: 1.0 if float(x)>0.0 else 0.0)
            maxu, maxi = data['user'].max()+1, data['item'].max()+1
            print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % \
                  (data.shape[0], maxu, maxi, data.shape[0]/(maxu*maxi)))
            dataList = data.values.tolist()
            return dataList, maxu, maxi
        if (fileName == 'kb-cc'):
            filePath = "/data/fjsdata/ctKngBase/kbcc_trainset.csv"
            data = pd.read_csv(filePath, sep='|', low_memory=False, dtype={'csr':int, 'ke':int, 'num':float})
            data['num']=data['num'].apply(lambda x: 1.0 if float(x)>0.0 else 0.0)
            maxu, maxi = data['user'].max()+1, data['item'].max()+1
            print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % \
                  (data.shape[0], maxu, maxi, data.shape[0]/(maxu*maxi)))
            dataList = data.values.tolist()
            return dataList, maxu, maxi
    
    def getTestset_as_list(self, fileName):
        if (fileName == 'ml-1m') or (fileName == 'pinterest-20'):
            filePath = "/data/fjsdata/ctKngBase/ml/"+fileName+".test.negative" 
            dataList = []
            with open(filePath, 'r') as fd:
                line = fd.readline()
                while line != None and line != '':
                    arr = line.split('\t')
                    u = eval(arr[0])[0]
                    dataList.append([u, eval(arr[0])[1], 1.0])#first is one postive item
                    for i in arr[1:]:
                        dataList.append([u, int(i), 0.0]) #99 negative items
                    line = fd.readline()
            return dataList
        if (fileName == 'kb-cc'):
            filePath = "/data/fjsdata/ctKngBase/kbcc_testset.csv"
            data = pd.read_csv(filePath, sep='|', low_memory=False, dtype={'csr':int, 'ke':int, 'num':float})
            data['num']=data['num'].apply(lambda x: 1.0 if float(x)>0.0 else 0.0)
            dataList = data.values.tolist()
            return dataList
        
    def list_to_matrix(self):              
        dataMat = np.zeros([self.maxu, self.maxi], dtype=np.float32)
        for u,i,r in self.trainList:
            dataMat[int(u)][int(i)] = float(r)
        return np.array(dataMat)
    
    def list_to_dict(self):
        dataDict = {}
        for u,i,r in self.trainList:
            dataDict[int(u), int(i)] = float(r)
        return dataDict
    
    def getInstances(self, isTest=False):
        user = []
        item = []
        rate = []
        if isTest==True: #test
            for u, i, r in self.testList:
                user.append(int(u))
                item.append(int(i))
                rate.append(float(r))
        else:#train
            for u, i, r in self.trainList:
                user.append(int(u))
                item.append(int(i))
                rate.append(float(r))
            #negative samples
            dataDict = self.list_to_dict()
            for j in range(len(self.trainList)*self.negNum):
                u = np.random.randint(self.maxu)
                i = np.random.randint(self.maxi)
                while (u, i) in dataDict:
                    u = np.random.randint(self.maxu)
                    i = np.random.randint(self.maxi)
                user.append(int(u))
                item.append(int(i))
                rate.append(float(0.0)) 
        return np.array(user), np.array(item), np.array(rate)
    
class BNMF:
    def __init__(self, dataset, K=8):
        self.shape = [dataset.maxu, dataset.maxi]
        #get the trainset and testset
        self.train_u, self.train_i, self.train_r = dataset.getInstances(isTest=False)
        assert(len(self.train_u) == len(self.train_i) and len(self.train_i) == len(self.train_r))
        shuffled_idx = np.random.permutation(np.arange(len(self.train_u)))
        self.train_u = self.train_u[shuffled_idx]
        self.train_i = self.train_i[shuffled_idx]
        self.train_r = self.train_r[shuffled_idx]
        self.test_u, self.test_i, self.test_r = dataset.getInstances(isTest=True)
        assert(len(self.test_u) == len(self.test_i) and len(self.test_i) == len(self.test_r))
        
        #initialize
        #K is number of latent factors
        self.userLayer = [512, K]
        self.itemLayer = [512, K]
        self.batchSize = 1024
        self.lr = 0.001
        tf.reset_default_graph()
        self._build_MLP()
        self._init_sess()
        
    def _init_sess(self):
        #define seesion
        self.config = tf.ConfigProto()
        self.config.gpu_options.allow_growth = True
        self.config.allow_soft_placement = True
        self.sess = tf.Session(config=self.config)
        self.sess.run(tf.global_variables_initializer(), feed_dict={self.R: dataset.list_to_matrix()})
        
    def _build_MLP(self):
        print('start building the Multi-layer non-linear projection')
        # add placeholder
        self.user = tf.placeholder(tf.int32)
        self.item = tf.placeholder(tf.int32)
        self.rate = tf.placeholder(tf.float32)
        self.R = tf.placeholder(tf.float32, shape=(self.shape[0], self.shape[1]))
        user_item_embedding = tf.convert_to_tensor(tf.Variable(self.R))
        item_user_embedding = tf.transpose(user_item_embedding)
        user_input = tf.nn.embedding_lookup(user_item_embedding, self.user)
        item_input = tf.nn.embedding_lookup(item_user_embedding, self.item)
        
        def init_variable(shape, name):
            return tf.Variable(tf.truncated_normal(shape=shape, dtype=tf.float32, stddev=0.01), name=name)

        with tf.name_scope("User_Layer"):
            user_W1 = init_variable([self.shape[1], self.userLayer[0]], "user_W1")
            user_out = tf.matmul(user_input, user_W1)
            for i in range(0, len(self.userLayer)-1):
                W = init_variable([self.userLayer[i], self.userLayer[i+1]], "user_W"+str(i+2))
                b = init_variable([self.userLayer[i+1]], "user_b"+str(i+2))
                user_out = tf.nn.relu(tf.add(tf.matmul(user_out, W), b))

        with tf.name_scope("Item_Layer"):
            item_W1 = init_variable([self.shape[0], self.itemLayer[0]], "item_W1")
            item_out = tf.matmul(item_input, item_W1)
            for i in range(0, len(self.itemLayer)-1):
                W = init_variable([self.itemLayer[i], self.itemLayer[i+1]], "item_W"+str(i+2))
                b = init_variable([self.itemLayer[i+1]], "item_b"+str(i+2))
                item_out = tf.nn.relu(tf.add(tf.matmul(item_out, W), b))
           
        self.r_ui = tf.reduce_sum(tf.multiply(user_out, item_out), axis=1, keepdims=False)
        self.loss = tf.reduce_sum(tf.losses.mean_squared_error(labels = self.rate, predictions=self.r_ui))
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_step = optimizer.minimize(self.loss)
        print('done building the the Multi-layer non-linear projection')
        
    def _build_BPF(self):
        print('start building the Bayesian probabilistic model')  
        self.x_u = theano.shared(self.train_u)
        self.x_i = theano.shared(self.train_i)
        self.y_r = theano.shared(self.train_r)
        self.y_r_ui = theano.shared(np.array(self.nn_r_ui))
        assert(len(self.y_r.get_value())==len(self.y_r_ui.get_value()))
        with pm.Model() as self.bncf:#define the prior and likelihood
            b_u = pm.Normal('b_u', 0, sd=1, shape=self.shape[0])
            b_i = pm.Normal('b_i', 0, sd=1, shape=self.shape[1])
            u = pm.Normal('u', 0, sd=1)
            tY = pm.Deterministic('tY', tt.add(tt.add(tt.add(b_u[self.x_u],b_i[self.x_i]),self.y_r_ui),u))
            #tY = pm.Deterministic('tY', ((b_u[self.x_u]+b_i[self.x_i])+self.y_r_ui)+u)#b_u+b_i+u+nn_r_ui
            nY = pm.Deterministic('nY', pm.math.sigmoid(tY))
            # likelihood of observed data
            Y = pm.Bernoulli('Y', nY, observed=self.y_r)#total_size=self.y_r.get_value().shape[0]
        with self.bncf:#inference
            approx = pm.fit(n=1000, method=pm.ADVI())
            self.trace = approx.sample(draws=500)
        with self.bncf: #posterior prediction
            ppc = pm.sample_posterior_predictive(self.trace, progressbar=True)
            self.by_r_ui = ppc['Y'].mean(axis=0)
        print('done building the Bayesian probabilistic model')
        
    def train_BNMF(self, verbose=10):       
        print('start training the BNCF model')
        tstart = time.time()
        
        num_batches = len(self.train_u) // self.batchSize + 1
        #1.traing r_ui in neural network 
        self.nn_r_ui=[]
        for i in range(num_batches):
            min_idx = i * self.batchSize
            max_idx = np.min([len(self.train_u), (i+1)*self.batchSize])
            train_u_batch = self.train_u[min_idx: max_idx]
            train_i_batch = self.train_i[min_idx: max_idx]
            #train_r_batch = self.train_r[min_idx: max_idx]
            pre_r_ui_batch = self.sess.run(self.r_ui, feed_dict={self.user: train_u_batch, \
                                                                 self.item: train_i_batch})
            self.nn_r_ui.extend(pre_r_ui_batch)
            if verbose and i % verbose == 0:
                sys.stdout.write('\r{} / {} : shape = {} '.format(i, num_batches, len(self.nn_r_ui)))
                sys.stdout.flush()
        #2.training bias in Bayesian inference
        self._build_BPF()
        #3.training self.loss in neural network
        losses = []
        for i in range(num_batches):
            min_idx = i * self.batchSize
            max_idx = np.min([len(self.train_u), (i+1)*self.batchSize])
            train_u_batch = self.train_u[min_idx: max_idx]
            train_i_batch = self.train_i[min_idx: max_idx]
            train_r_batch = self.by_r_ui[min_idx: max_idx]
            _, tmp_loss = self.sess.run([self.train_step, self.loss], feed_dict={self.user: train_u_batch, \
                                                                                 self.item: train_i_batch, \
                                                                                 self.rate: train_r_batch})
            losses.append(tmp_loss)
            if verbose and i % verbose == 0:
                sys.stdout.write('\r{} / {} : loss = {}'.format(i, num_batches, np.mean(losses[-verbose:])))
                sys.stdout.flush()
        loss = np.mean(losses)  
        elapsed = time.time() - tstart    
        print('Completed training the BNCF model in %d seconds' % int(elapsed))
        return loss
           
    def eval_BNMF(self, verbose=10):
        def getHitRatio(ranklist, targetItem):
            for item in ranklist:
                if item == targetItem:
                    return 1
            return 0
    
        def getNDCG(ranklist, targetItem):
            for i in range(len(ranklist)):
                item = ranklist[i]
                if item == targetItem:
                    return math.log(2) / math.log(i+2)
            return 0
    
        #1.get r_ui in neural network
        num_batches = len(self.test_u) // self.batchSize + 1
        #1.traing r_ui in neural network 
        self.nn_r_ui=[]
        for i in range(num_batches):
            min_idx = i * self.batchSize
            max_idx = np.min([len(self.test_u), (i+1)*self.batchSize])
            test_u_batch = self.test_u[min_idx: max_idx]
            test_i_batch = self.test_i[min_idx: max_idx]
            pre_r_ui_batch = self.sess.run(self.r_ui, feed_dict={self.user: test_u_batch, \
                                                                 self.item: test_i_batch})
            self.nn_r_ui.extend(pre_r_ui_batch)
            if verbose and i % verbose == 0:
                sys.stdout.write('\r{} / {} : shape = {} '.format(i, num_batches, len(self.nn_r_ui)))
                sys.stdout.flush()
                
        #2. get biais in Bayesian inference
        self.x_u.set_value(self.test_u)
        self.x_i.set_value(self.test_i)
        self.y_r.set_value(self.test_r)
        self.y_r_ui.set_value(self.nn_r_ui)
        with self.bncf:#evaluation
            ppc = pm.sample_posterior_predictive(self.trace, progressbar=True) 
            pre_r = ppc['Y'].mean(axis=0)
        assert(pre_r.shape[0]==self.test_i.shape[0])
        #every user have one positive item and 99 negative items
        num_batches = len(self.test_r) // 100
        hits = []
        ndcgs = []
        for i in range(num_batches):
            test_i_batch = self.test_i[i*100: (i+1)*100]
            pre_r_batch = pre_r[i*100: (i+1)*100]
            map_item_score = {}
            for j in range(100):
                map_item_score[test_i_batch[j]] = pre_r_batch[j]
            ranklist = heapq.nlargest(10, map_item_score, key=map_item_score.get)#default Topn=10
            hits.append(getHitRatio(ranklist, test_i_batch[0]))
            ndcgs.append(getNDCG(ranklist, test_i_batch[0]))
        hit, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
        return hit, ndcg
    
if __name__ == "__main__":
    for fileName in ['pinterest-20', 'kb-cc']:#['ml-1m', 'pinterest-20', 'kb-cc']:
        dataset = DataSet(fileName=fileName, negNum=4)#loading dataset
        for K in [8, 16, 32, 64]:
            model = BNMF(dataset, K)
            best_hr = 0.0
            best_ndcg = 0.0
            for epoch in range(2):
                loss = model.train_BNMF()
                print("\nMean loss in this epoch is: {}".format(loss))
                hit, ndcg = model.eval_BNMF()
                print("HR@10: {}, NDCG@10: {}, At K {} and Dataset {}".format(hit, ndcg, K, fileName ))
                if hit>best_hr: best_hr=hit
                if ndcg>best_ndcg: best_ndcg=ndcg
            print("Best HR@10: {}, Best NDCG@10: {}, At K {} and Dataset {}".format(best_hr, best_ndcg, K, fileName ))

Dataset Statistics: Interaction = 1445622, User = 55187, Item = 9916, Sparsity = 0.0026
start building the Multi-layer non-linear projection


  num_elements)


done building the the Multi-layer non-linear projection
start training the BNCF model
7050 / 7059 : shape = 7220224 start building the Bayesian probabilistic model


Average Loss = 5.3427e+06: 100%|██████████| 1000/1000 [16:50<00:00,  1.01s/it]
Finished [100%]: Average Loss = 5.3331e+06
I0729 12:23:47.493609 140210647013120 inference.py:248] Finished [100%]: Average Loss = 5.3331e+06
100%|██████████| 500/500 [21:07<00:00,  3.17s/it]


done building the Bayesian probabilistic model
7050 / 7059 : loss = 0.1283096522092819228Completed training the BNCF model in 67204 seconds

Mean loss in this epoch is: 5.747372627258301
5380 / 5390 : shape = 5510144 

100%|██████████| 500/500 [13:30<00:00,  1.72s/it]


HR@10: 0.2364143729501513, NDCG@10: 0.12314803893484755, At K 8 and Dataset pinterest-20
start training the BNCF model
7050 / 7059 : shape = 7220224 start building the Bayesian probabilistic model


Average Loss = 5.3131e+06: 100%|██████████| 1000/1000 [20:00<00:00,  1.21s/it]
Finished [100%]: Average Loss = 5.3073e+06
I0730 07:34:00.587919 140210647013120 inference.py:248] Finished [100%]: Average Loss = 5.3073e+06
100%|██████████| 500/500 [17:11<00:00,  2.69s/it]


done building the Bayesian probabilistic model
3790 / 7059 : loss = 0.13391141593456268

In [None]:
# -*- Encoding:UTF-8 -*-
'''
@author: Jason.F
@data: 2019.07.29
@function: BNMF(Bayesian Neural Matrix Factorization) 
           Dataset: Movielen Dataset(ml-1m) 
           Evaluating: hitradio,ndcg
'''
import sys
import time
import heapq
import math
import gc
import numpy as np
import pandas as pd

import pymc3 as pm
import theano
import tensorflow as tf
import theano.tensor as tt

class DataSet:
    def __init__(self, fileName, negNum):
        self.negNum = negNum #negative sample ratio
        self.trainList, self.maxu, self.maxi = self.getTrainset_as_list(fileName)
        self.testList = self.getTestset_as_list(fileName)
        
    def getTrainset_as_list(self, fileName):
        if (fileName == 'ml-1m') or (fileName == 'pinterest-20'):
            filePath = "/data/fjsdata/ctKngBase/ml/"+fileName+".train.rating" 
            data = pd.read_csv(filePath, sep='\t', header=None, names=['user', 'item', 'rating'], \
                                 usecols=[0, 1, 2], dtype={0: np.int32, 1: np.int32, 2: np.float})
            data['rating']=data['rating'].apply(lambda x: 1.0 if float(x)>0.0 else 0.0)
            maxu, maxi = data['user'].max()+1, data['item'].max()+1
            print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % \
                  (data.shape[0], maxu, maxi, data.shape[0]/(maxu*maxi)))
            dataList = data.values.tolist()
            return dataList, maxu, maxi
        if (fileName == 'kb-cc'):
            filePath = "/data/fjsdata/ctKngBase/kbcc_trainset.csv"
            data = pd.read_csv(filePath, sep='|', low_memory=False, dtype={'csr':int, 'ke':int, 'num':float})
            data['num']=data['num'].apply(lambda x: 1.0 if float(x)>0.0 else 0.0)
            maxu, maxi = data['user'].max()+1, data['item'].max()+1
            print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % \
                  (data.shape[0], maxu, maxi, data.shape[0]/(maxu*maxi)))
            dataList = data.values.tolist()
            return dataList, maxu, maxi
    
    def getTestset_as_list(self, fileName):
        if (fileName == 'ml-1m') or (fileName == 'pinterest-20'):
            filePath = "/data/fjsdata/ctKngBase/ml/"+fileName+".test.negative" 
            dataList = []
            with open(filePath, 'r') as fd:
                line = fd.readline()
                while line != None and line != '':
                    arr = line.split('\t')
                    u = eval(arr[0])[0]
                    dataList.append([u, eval(arr[0])[1], 1.0])#first is one postive item
                    for i in arr[1:]:
                        dataList.append([u, int(i), 0.0]) #99 negative items
                    line = fd.readline()
            return dataList
        if (fileName == 'kb-cc'):
            filePath = "/data/fjsdata/ctKngBase/kbcc_testset.csv"
            data = pd.read_csv(filePath, sep='|', low_memory=False, dtype={'csr':int, 'ke':int, 'num':float})
            data['num']=data['num'].apply(lambda x: 1.0 if float(x)>0.0 else 0.0)
            dataList = data.values.tolist()
            return dataList
        
    def list_to_matrix(self):              
        dataMat = np.zeros([self.maxu, self.maxi], dtype=np.float32)
        for u,i,r in self.trainList:
            dataMat[int(u)][int(i)] = float(r)
        return np.array(dataMat)
    
    def list_to_dict(self):
        dataDict = {}
        for u,i,r in self.trainList:
            dataDict[int(u), int(i)] = float(r)
        return dataDict
    
    def getInstances(self, isTest=False):
        user = []
        item = []
        rate = []
        if isTest==True: #test
            for u, i, r in self.testList:
                user.append(int(u))
                item.append(int(i))
                rate.append(float(r))
        else:#train
            for u, i, r in self.trainList:
                user.append(int(u))
                item.append(int(i))
                rate.append(float(r))
            #negative samples
            dataDict = self.list_to_dict()
            for j in range(len(self.trainList)*self.negNum):
                u = np.random.randint(self.maxu)
                i = np.random.randint(self.maxi)
                while (u, i) in dataDict:
                    u = np.random.randint(self.maxu)
                    i = np.random.randint(self.maxi)
                user.append(int(u))
                item.append(int(i))
                rate.append(float(0.0)) 
        return np.array(user), np.array(item), np.array(rate)
    
class BNMF:
    def __init__(self, dataset, K=8):
        self.shape = [dataset.maxu, dataset.maxi]
        #get the trainset and testset
        self.train_u, self.train_i, self.train_r = dataset.getInstances(isTest=False)
        assert(len(self.train_u) == len(self.train_i) and len(self.train_i) == len(self.train_r))
        shuffled_idx = np.random.permutation(np.arange(len(self.train_u)))
        self.train_u = self.train_u[shuffled_idx]
        self.train_i = self.train_i[shuffled_idx]
        self.train_r = self.train_r[shuffled_idx]
        self.test_u, self.test_i, self.test_r = dataset.getInstances(isTest=True)
        assert(len(self.test_u) == len(self.test_i) and len(self.test_i) == len(self.test_r))
        
        #initialize
        #K is number of latent factors
        self.userLayer = [512, K]
        self.itemLayer = [512, K]
        self.batchSize = 1024
        self.lr = 0.001
        tf.reset_default_graph()
        self._build_MLP(R = dataset.list_to_matrix())
        self._init_sess()
        
    def _init_sess(self):
        #define seesion
        self.config = tf.ConfigProto()
        self.config.gpu_options.allow_growth = True
        self.config.allow_soft_placement = True
        self.sess = tf.Session(config=self.config)
        self.sess.run(tf.global_variables_initializer())
        
    def _build_MLP(self, R):
        print('start building the Multi-layer non-linear projection')
        # add placeholder
        self.user = tf.placeholder(tf.int32)
        self.item = tf.placeholder(tf.int32)
        self.rate = tf.placeholder(tf.float32)
        user_item_embedding = tf.convert_to_tensor(R)
        item_user_embedding = tf.transpose(user_item_embedding)
        user_input = tf.nn.embedding_lookup(user_item_embedding, self.user)
        item_input = tf.nn.embedding_lookup(item_user_embedding, self.item)
        
        def init_variable(shape, name):
            return tf.Variable(tf.truncated_normal(shape=shape, dtype=tf.float32, stddev=0.01), name=name)

        with tf.name_scope("User_Layer"):
            user_W1 = init_variable([self.shape[1], self.userLayer[0]], "user_W1")
            user_out = tf.matmul(user_input, user_W1)
            for i in range(0, len(self.userLayer)-1):
                W = init_variable([self.userLayer[i], self.userLayer[i+1]], "user_W"+str(i+2))
                b = init_variable([self.userLayer[i+1]], "user_b"+str(i+2))
                user_out = tf.nn.relu(tf.add(tf.matmul(user_out, W), b))

        with tf.name_scope("Item_Layer"):
            item_W1 = init_variable([self.shape[0], self.itemLayer[0]], "item_W1")
            item_out = tf.matmul(item_input, item_W1)
            for i in range(0, len(self.itemLayer)-1):
                W = init_variable([self.itemLayer[i], self.itemLayer[i+1]], "item_W"+str(i+2))
                b = init_variable([self.itemLayer[i+1]], "item_b"+str(i+2))
                item_out = tf.nn.relu(tf.add(tf.matmul(item_out, W), b))
           
        self.r_ui = tf.reduce_sum(tf.multiply(user_out, item_out), axis=1, keepdims=False)
        self.loss = tf.reduce_sum(tf.losses.mean_squared_error(labels = self.rate, predictions=self.r_ui))
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_step = optimizer.minimize(self.loss)
        print('done building the the Multi-layer non-linear projection')
        
    def _build_BPF(self):
        print('start building the Bayesian probabilistic model')  
        self.x_u = theano.shared(self.train_u)
        self.x_i = theano.shared(self.train_i)
        self.y_r = theano.shared(self.train_r)
        self.y_r_ui = theano.shared(np.array(self.nn_r_ui))
        assert(len(self.y_r.get_value())==len(self.y_r_ui.get_value()))
        with pm.Model() as self.bncf:#define the prior and likelihood
            b_u = pm.Normal('b_u', 0, sd=1, shape=self.shape[0])
            b_i = pm.Normal('b_i', 0, sd=1, shape=self.shape[1])
            u = pm.Normal('u', 0, sd=1)
            tY = pm.Deterministic('tY', tt.add(tt.add(tt.add(b_u[self.x_u],b_i[self.x_i]),self.y_r_ui),u))
            #tY = pm.Deterministic('tY', ((b_u[self.x_u]+b_i[self.x_i])+self.y_r_ui)+u)#b_u+b_i+u+nn_r_ui
            nY = pm.Deterministic('nY', pm.math.sigmoid(tY))
            # likelihood of observed data
            Y = pm.Bernoulli('Y', nY, observed=self.y_r)#total_size=self.y_r.get_value().shape[0]
        with self.bncf:#inference
            approx = pm.fit(n=1000, method=pm.ADVI())
            self.trace = approx.sample(draws=500)
        with self.bncf: #posterior prediction
            ppc = pm.sample_posterior_predictive(self.trace, progressbar=True)
            self.by_r_ui = ppc['Y'].mean(axis=0)
        print('done building the Bayesian probabilistic model')
        
    def train_BNMF(self, verbose=10):       
        print('start training the BNCF model')
        tstart = time.time()
        
        num_batches = len(self.train_u) // self.batchSize + 1
        #1.traing r_ui in neural network 
        self.nn_r_ui=[]
        for i in range(num_batches):
            min_idx = i * self.batchSize
            max_idx = np.min([len(self.train_u), (i+1)*self.batchSize])
            train_u_batch = self.train_u[min_idx: max_idx]
            train_i_batch = self.train_i[min_idx: max_idx]
            #train_r_batch = self.train_r[min_idx: max_idx]
            pre_r_ui_batch = self.sess.run(self.r_ui, feed_dict={self.user: train_u_batch, \
                                                                 self.item: train_i_batch})
            self.nn_r_ui.extend(pre_r_ui_batch)
            if verbose and i % verbose == 0:
                sys.stdout.write('\r{} / {} : shape = {} '.format(i, num_batches, len(self.nn_r_ui)))
                sys.stdout.flush()
        #2.training bias in Bayesian inference
        self._build_BPF()
        #3.training self.loss in neural network
        losses = []
        for i in range(num_batches):
            min_idx = i * self.batchSize
            max_idx = np.min([len(self.train_u), (i+1)*self.batchSize])
            train_u_batch = self.train_u[min_idx: max_idx]
            train_i_batch = self.train_i[min_idx: max_idx]
            train_r_batch = self.by_r_ui[min_idx: max_idx]
            _, tmp_loss = self.sess.run([self.train_step, self.loss], feed_dict={self.user: train_u_batch, \
                                                                                 self.item: train_i_batch, \
                                                                                 self.rate: train_r_batch})
            losses.append(tmp_loss)
            if verbose and i % verbose == 0:
                sys.stdout.write('\r{} / {} : loss = {}'.format(i, num_batches, np.mean(losses[-verbose:])))
                sys.stdout.flush()
        loss = np.mean(losses)  
        elapsed = time.time() - tstart    
        print('Completed training the BNCF model in %d seconds' % int(elapsed))
        return loss
           
    def eval_BNMF(self, verbose=10):
        def getHitRatio(ranklist, targetItem):
            for item in ranklist:
                if item == targetItem:
                    return 1
            return 0
    
        def getNDCG(ranklist, targetItem):
            for i in range(len(ranklist)):
                item = ranklist[i]
                if item == targetItem:
                    return math.log(2) / math.log(i+2)
            return 0
    
        #1.get r_ui in neural network
        num_batches = len(self.test_u) // self.batchSize + 1
        #1.traing r_ui in neural network 
        self.nn_r_ui=[]
        for i in range(num_batches):
            min_idx = i * self.batchSize
            max_idx = np.min([len(self.test_u), (i+1)*self.batchSize])
            test_u_batch = self.test_u[min_idx: max_idx]
            test_i_batch = self.test_i[min_idx: max_idx]
            pre_r_ui_batch = self.sess.run(self.r_ui, feed_dict={self.user: test_u_batch, \
                                                                 self.item: test_i_batch})
            self.nn_r_ui.extend(pre_r_ui_batch)
            if verbose and i % verbose == 0:
                sys.stdout.write('\r{} / {} : shape = {} '.format(i, num_batches, len(self.nn_r_ui)))
                sys.stdout.flush()
                
        #2. get biais in Bayesian inference
        self.x_u.set_value(self.test_u)
        self.x_i.set_value(self.test_i)
        self.y_r.set_value(self.test_r)
        self.y_r_ui.set_value(self.nn_r_ui)
        with self.bncf:#evaluation
            ppc = pm.sample_posterior_predictive(self.trace, progressbar=True) 
            pre_r = ppc['Y'].mean(axis=0)
        assert(pre_r.shape[0]==self.test_i.shape[0])
        #every user have one positive item and 99 negative items
        num_batches = len(self.test_r) // 100
        hits = []
        ndcgs = []
        for i in range(num_batches):
            test_i_batch = self.test_i[i*100: (i+1)*100]
            pre_r_batch = pre_r[i*100: (i+1)*100]
            map_item_score = {}
            for j in range(100):
                map_item_score[test_i_batch[j]] = pre_r_batch[j]
            ranklist = heapq.nlargest(10, map_item_score, key=map_item_score.get)#default Topn=10
            hits.append(getHitRatio(ranklist, test_i_batch[0]))
            ndcgs.append(getNDCG(ranklist, test_i_batch[0]))
        hit, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
        return hit, ndcg
    
if __name__ == "__main__":
    for fileName in ['ml-1m', 'pinterest-20', 'kb-cc']:
        dataset = DataSet(fileName=fileName, negNum=4)#loading dataset
        for K in [8, 16, 32, 64]:
            model = BNMF(dataset, K)
            best_hr = 0.0
            best_ndcg = 0.0
            for epoch in range(2):
                loss = model.train_BNMF()
                print("\nMean loss in this epoch is: {}".format(loss))
                hit, ndcg = model.eval_BNMF()
                print("HR@10: {}, NDCG@10: {}, At K {} and Dataset {}".format(hit, ndcg, K, fileName ))
                if hit>best_hr: best_hr=hit
                if ndcg>best_ndcg: best_ndcg=ndcg
            print("Best HR@10: {}, Best NDCG@10: {}, At K {} and Dataset {}".format(best_hr, best_ndcg, K, fileName ))
            
'''
Best HR@10: 0.4197019867549669, Best NDCG@10: 0.22775834012977136, At K 8 and Dataset ml-1m
Best HR@10: 0.41721854304635764, Best NDCG@10: 0.2251975547330144, At K 16 and Dataset ml-1m
Best HR@10: 0.4197019867549669, Best NDCG@10: 0.22811742858950668, At K 32 and Dataset ml-1m
Best HR@10: 0.42566225165562915, Best NDCG@10: 0.2340599456026715, At K 64 and Dataset ml-1m
'''

In [18]:
# -*- Encoding:UTF-8 -*-
'''
@author: Jason.F
@data: 2019.07.25
@function: BNMF(Bayesian Neural Matrix Factorization) 
           Dataset: Movielen Dataset(ml-1m) 
           Evaluating: hitradio,ndcg
'''
import sys
import time
import heapq
import math
import gc
import numpy as np
import pandas as pd

import pymc3 as pm
import theano

class DataSet:
    def __init__(self, fileName, negNum):
        self.negNum = negNum #negative sample ratio
        self.trainList, self.maxu, self.maxi = self.getTrainset_as_list(fileName)
        self.testList = self.getTestset_as_list(fileName)
        
    def getTrainset_as_list(self, fileName):
        if (fileName == 'ml-1m') or (fileName == 'pinterest-20'):
            filePath = "/data/fjsdata/ctKngBase/ml/"+fileName+".train.rating" 
            data = pd.read_csv(filePath, sep='\t', header=None, names=['user', 'item', 'rating'], \
                                 usecols=[0, 1, 2], dtype={0: np.int32, 1: np.int32, 2: np.float})
            data['rating']=data['rating'].apply(lambda x: 1.0 if float(x)>0.0 else 0.0)
            maxu, maxi = data['user'].max()+1, data['item'].max()+1
            print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % \
                  (data.shape[0], maxu, maxi, data.shape[0]/(maxu*maxi)))
            dataList = data.values.tolist()
            return dataList, maxu, maxi
        if (fileName == 'kb-cc'):
            filePath = "/data/fjsdata/ctKngBase/kbcc_trainset.csv"
            data = pd.read_csv(filePath, sep='|', low_memory=False, dtype={'csr':int, 'ke':int, 'num':float})
            data['num']=data['num'].apply(lambda x: 1.0 if float(x)>0.0 else 0.0)
            maxu, maxi = data['user'].max()+1, data['item'].max()+1
            print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % \
                  (data.shape[0], maxu, maxi, data.shape[0]/(maxu*maxi)))
            dataList = data.values.tolist()
            return dataList, maxu, maxi
    
    def getTestset_as_list(self, fileName):
        if (fileName == 'ml-1m') or (fileName == 'pinterest-20'):
            filePath = "/data/fjsdata/ctKngBase/ml/"+fileName+".test.negative" 
            dataList = []
            with open(filePath, 'r') as fd:
                line = fd.readline()
                while line != None and line != '':
                    arr = line.split('\t')
                    u = eval(arr[0])[0]
                    dataList.append([u, eval(arr[0])[1], 1.0])#first is one postive item
                    for i in arr[1:]:
                        dataList.append([u, int(i), 0.0]) #99 negative items
                    line = fd.readline()
            return dataList
        if (fileName == 'kb-cc'):
            filePath = "/data/fjsdata/ctKngBase/kbcc_testset.csv"
            data = pd.read_csv(filePath, sep='|', low_memory=False, dtype={'csr':int, 'ke':int, 'num':float})
            data['num']=data['num'].apply(lambda x: 1.0 if float(x)>0.0 else 0.0)
            dataList = data.values.tolist()
            return dataList
        
    def list_to_matrix(self):              
        dataMat = np.zeros([self.maxu, self.maxi], dtype=np.float32)
        for u,i,r in self.trainList:
            dataMat[int(u)][int(i)] = float(r)
        return np.array(dataMat)
    
    def list_to_dict(self):
        dataDict = {}
        for u,i,r in self.trainList:
            dataDict[int(u), int(i)] = float(r)
        return dataDict
    
    def getInstances(self, isTest=False):
        user = []
        item = []
        rate = []
        if isTest==True: #test
            for u, i, r in self.testList:
                user.append(int(u))
                item.append(int(i))
                rate.append(float(r))
        else:#train
            for u, i, r in self.trainList:
                user.append(int(u))
                item.append(int(i))
                rate.append(float(r))
            #negative samples
            dataDict = self.list_to_dict()
            for j in range(len(self.trainList)*self.negNum):
                u = np.random.randint(self.maxu)
                i = np.random.randint(self.maxi)
                while (u, i) in dataDict:
                    u = np.random.randint(self.maxu)
                    i = np.random.randint(self.maxi)
                user.append(int(u))
                item.append(int(i))
                rate.append(float(0.0)) 
        return user, item, rate
    
    def getHitRatio(self, ranklist, targetItem):
        for item in ranklist:
            if item == targetItem:
                return 1
        return 0
    
    def getNDCG(self, ranklist, targetItem):
        for i in range(len(ranklist)):
            item = ranklist[i]
            if item == targetItem:
                return math.log(2) / math.log(i+2)
        return 0
    
class BNCF:
    def __init__(self, dataset):
        self.maxu = dataset.maxu
        self.maxi = dataset.maxi
        self.testList = dataset.testList
        
        #get the trainset and testset
        dataMat = dataset.list_to_matrix()
        #get the test data
        test_u, test_i, test_r = dataset.getInstances(isTest=True)
        self.test_u = dataMat[test_u,:]
        self.test_i = dataMat.T[test_i,:]
        self.test_r = np.array(test_r)
        assert(len(self.test_u) == len(self.test_i) and len(self.test_i) == len(self.test_r))
        #get the training data and setting the input 
        train_u, train_i, train_r = dataset.getInstances(isTest=False)
        self.train_u = dataMat[train_u,:]
        self.train_i = dataMat.T[train_i,:]
        self.train_r = np.array(train_r)
        assert(len(self.train_u) == len(self.train_i) and len(self.train_i) == len(self.train_r)) 
        #release memory
        del dataset, dataMat, train_u, train_i, train_r, test_u, test_i, test_r
        gc.collect()
        
    def build_BNCF(self, K = 8):
        layers = [1024, K] #number of latent factors
        print('start building the BNCF model')
        
        self.x_u = theano.shared(self.train_u)#self.x_u = pm.Minibatch(self.train_u, batch_size=8096)
        self.x_i = theano.shared(self.train_i)#self.x_i = pm.Minibatch(self.train_i, batch_size=8096)
        self.y_r = theano.shared(self.train_r)#self.y_r = pm.Minibatch(self.train_r, batch_size=8096)
        #release memory
        del self.train_u, self.train_i, self.train_r
        gc.collect()
        with pm.Model() as self.bncf:
            #user layer
            user_W1 = pm.Normal('user_W1', 0, sd=1, shape=[self.maxi, layers[0]] )
            user_O1 = pm.math.tanh(pm.math.dot(self.x_u, user_W1))
            user_W2 = pm.Normal('user_W2', 0, sd=1, shape=[layers[0],layers[1]] )
            user_O2 = pm.math.tanh(pm.math.dot(user_O1, user_W2))
            #item layer
            item_W1 = pm.Normal('item_W1', 0, sd=1, shape=[self.maxu, layers[0]] )
            item_O1 = pm.math.tanh(pm.math.dot(self.x_i, item_W1))
            item_W2 = pm.Normal('item_W2', 0, sd=1, shape=[layers[0],layers[1]] )
            item_O2 = pm.math.tanh(pm.math.dot(item_O1, item_W2))
            #output layer
            act_out = pm.math.sigmoid(np.multiply(user_O2, item_O2).sum(axis=1, keepdims=True))
            # Binary classification -> Bernoulli likelihood
            r = pm.Bernoulli('r', act_out, observed=self.y_r)                       
        print('done building BNCF model')
                
    def train_BNCF(self):
        print('start training the BNCF model')
        tstart = time.time()
        with self.bncf:
            inference = pm.ADVI()
            approx = pm.fit(n=1000, method=inference)
            self.trace = approx.sample(draws=500)       
        elapsed = time.time() - tstart    
        print('Completed training the BNCF model in %d seconds' % int(elapsed))
           
    def evaluate_BNCF(self):
        self.x_u.set_value(self.test_u)
        self.x_i.set_value(self.test_i)
        self.y_r.set_value(self.test_r)
        #release memory
        del self.test_u, self.test_i, self.test_r
        gc.collect()
        with self.bncf:#evaluation
            ppc = pm.sample_posterior_predictive(self.trace, progressbar=True)
            pre_r = ppc['r'].mean(axis=0)

            hits = []
            ndcgs = []
            prev_u = self.testList[0][0]
            pos_i = self.testList[0][1]
            scorelist = []
            iLen = 0
            for u, i in self.testList:
                if prev_u == u:
                    scorelist.append([i,pre_r[iLen]])
                else:
                    map_item_score = {}
                    for item, rate in scorelist: #turn dict
                        map_item_score[item] = rate
                    ranklist = heapq.nlargest(10, map_item_score, key=map_item_score.get)#default Topn=10
                    hits.append(self.dataset.getHitRatio(ranklist, pos_i))
                    ndcgs.append(self.dataset.getNDCG(ranklist, pos_i))
                    #next user
                    scorelist = []
                    prev_u = u
                    pos_i = i
                    scorelist.append([i,pre_r[iLen]])
                iLen = iLen + 1
            hit, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
            return hit, ndcg
if __name__ == "__main__":
    for fileName in ['ml-1m', 'pinterest-20', 'kb-cc']:
        dataset = DataSet(fileName=fileName, negNum=4)#loading dataset
        model = BNCF(dataset)
        for K in [8, 16, 32, 64]:
            model.build_BNCF(K)
            model.train_BNCF()
            hit, ndcg = model.evaluate_BNCF()
            print("HR@10: {}, NDCG@10: {}, At K {} and Dataset {}".format(hit, ndcg, K, fileName ))

Dataset Statistics: Interaction = 994169, User = 6040, Item = 3706, Sparsity = 0.0444


KeyboardInterrupt: 

In [26]:
# -*- Encoding:UTF-8 -*-
'''
@author: Jason.F
@data: 2019.07.23
@function: Implementing BMF(Bayesian Neural Collaborative Filtering) which is designed by Jason.F
           Dataset: Movielen Dataset(ml-1m) 
           Evaluating: hitradio,ndcg
'''
import sys
import time
import logging

import pymc3 as pm
import numpy as np
import pandas as pd
import theano
import theano.tensor as T
import tensorflow as tf
import heapq
import math

class DataSet:
    def __init__(self, fileName, negNum):
        self.negNum = negNum #negative sample ratio
        self.trainList, self.maxu, self.maxi = self.getTrainset_as_list(fileName)
        self.testList = self.getTestset_as_list(fileName)
        
    def getTrainset_as_list(self, fileName):
        if (fileName == 'ml-1m') or (fileName == 'pinterest-20'):
            filePath = "/data/fjsdata/ctKngBase/ml/"+fileName+".train.rating" 
            data = pd.read_csv(filePath, sep='\t', header=None, names=['user', 'item', 'rating'], \
                                 usecols=[0, 1, 2], dtype={0: np.int32, 1: np.int32, 2: np.float})
            maxu, maxi = data['user'].max()+1, data['item'].max()+1
            print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % \
                  (data.shape[0], maxu, maxi, data.shape[0]/(maxu*maxi)))
            dataList = data.values.tolist()
            return dataList, maxu, maxi
        if (fileName == 'kb-cc'):
            filePath = "/data/fjsdata/ctKngBase/kbcc_trainset.csv"
            data = pd.read_csv(filePath, sep='|', low_memory=False, dtype={'csr':int, 'ke':int, 'num':float})
            maxu, maxi = data['user'].max()+1, data['item'].max()+1
            print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % \
                  (data.shape[0], maxu, maxi, data.shape[0]/(maxu*maxi)))
            dataList = data.values.tolist()
            return dataList, maxu, maxi
    
    def getTestset_as_list(self, fileName):
        if (fileName == 'ml-1m') or (fileName == 'pinterest-20'):
            filePath = "/data/fjsdata/ctKngBase/ml/"+fileName+".test.negative" 
            dataList = []
            with open(filePath, 'r') as fd:
                line = fd.readline()
                while line != None and line != '':
                    arr = line.split('\t')
                    u = eval(arr[0])[0]
                    dataList.append([u, eval(arr[0])[1], 1.0])#first is one postive item
                    for i in arr[1:]:
                        dataList.append([u, int(i), 0.0]) #99 negative items
                    line = fd.readline()
            return dataList
        if (fileName == 'kb-cc'):
            filePath = "/data/fjsdata/ctKngBase/kbcc_testset.csv"
            data = pd.read_csv(filePath, sep='|', low_memory=False, dtype={'csr':int, 'ke':int, 'num':float})
            dataList = data.values.tolist()
            return dataList
        
    def list_to_matrix(self):              
        dataMat = np.zeros([self.maxu, self.maxi], dtype=np.float32)
        for u,i,r in self.trainList:
            dataMat[int(u)][int(i)] = float(1.0)#float(r)
        return np.array(dataMat)
    
    def list_to_dict(self):
        dataDict = {}
        for u,i,r in self.trainList:
            dataDict[int(u), int(i)] = float(1.0)#float(r)
        return dataDict
    
    def getInstances(self, isTest=False):
        #dataMat = self.list_to_matrix(self.trainList, self.maxu, self.maxi)
        user = []
        item = []
        rate = []
        if isTest==True: #test
            for u, i, r in self.testList:
                user.append(int(u))#user.append(dataMat[int(u),:].tolist())
                item.append(int(i))#item.append(dataMat[:,int(i)].tolist())
                rate.append(1.0)#rate.append(float(r))
        else:#train
            for u, i, r in self.trainList:
                user.append(int(u))#user.append(dataMat[int(u),:].tolist())
                item.append(int(i))#item.append(dataMat[:,int(i)].tolist())
                rate.append(1.0)#rate.append(float(r))
            #negative samples
            dataDict = self.list_to_dict()
            for j in range(len(self.trainList)*self.negNum):
                u = np.random.randint(self.maxu)
                i = np.random.randint(self.maxi)
                while (u, i) in dataDict:
                    u = np.random.randint(self.maxu)
                    i = np.random.randint(self.maxi)
                user.append(int(u))#user.append(dataMat[u,:].tolist())
                item.append(int(i))#item.append(dataMat[:,i].tolist())
                rate.append(0.0) 
        return np.array(user), np.array(item), np.array(rate)
    
    def getHitRatio(self, ranklist, targetItem):
        for item in ranklist:
            if item == targetItem:
                return 1
        return 0
    
    def getNDCG(self, ranklist, targetItem):
        for i in range(len(ranklist)):
            item = ranklist[i]
            if item == targetItem:
                return math.log(2) / math.log(i+2)
        return 0
class BNCF:
    def __init__(self, dataset):
        self.dataset = dataset
        #self.trainList = self.dataset.trainList
        #self.testList = self.dataset.testList
        self.maxu = self.dataset.maxu
        self.maxi = self.dataset.maxi
        
        #get the test data
        self.test_u, self.test_i, self.test_r = self.dataset.getInstances(isTest=True)
        #get the training data and setting the input 
        self.train_u, self.train_i, self.train_r = self.dataset.getInstances(isTest=False)
        assert(self.train_u.shape == self.train_i.shape and self.train_i.shape == self.train_r.shape)
        #initiate the seesion
        self.init_sess()
        #train data by mini-batch
        self.arrUser, self.arrItem = self.batchTrainset()
       
    def init_sess(self):
        tf.reset_default_graph()
        
        self.user = tf.placeholder(tf.int32)
        self.item = tf.placeholder(tf.int32)
        self.rate = tf.placeholder(tf.float32)
        self.R = tf.placeholder(tf.float32, shape=(self.maxu, self.maxi))
        #embedding layer
        self.user_item_embedding = tf.convert_to_tensor(self.R)
        self.item_user_embedding = tf.transpose(self.user_item_embedding)
        self.user_input = tf.nn.embedding_lookup(self.user_item_embedding, self.user)
        self.item_input = tf.nn.embedding_lookup(self.item_user_embedding, self.item)
        #define seesion
        self.config = tf.ConfigProto()
        self.config.gpu_options.allow_growth = True
        self.config.allow_soft_placement = True
        self.sess = tf.Session(config=self.config)
        self.sess.run(tf.global_variables_initializer())
    
    def batchTrainset(self):
        #get input in the way of mini-batch
        batchSize=8192
        num_batches = len(self.train_u) // batchSize + 1
        train_u_batch = self.train_u[0: batchSize]
        train_i_batch = self.train_i[0: batchSize]
        arrUser, arrItem = self.sess.run([self.user_input, self.item_input], \
                                             feed_dict={self.user: train_u_batch, \
                                                        self.item: train_i_batch, \
                                                        self.R: self.dataset.list_to_matrix()})
        '''
        for i in range(1, num_batches):
            min_idx = i * batchSize
            max_idx = np.min([len(self.train_u), (i+1)*batchSize])
            train_u_batch = self.train_u[min_idx: max_idx]
            train_i_batch = self.train_i[min_idx: max_idx]
            train_r_batch = self.train_r[min_idx: max_idx]
            batchUser, batchItem = self.sess.run([self.user_input, self.item_input], \
                                             feed_dict={self.user: train_u_batch, \
                                                        self.item: train_i_batch, \
                                                        self.R: self.dataset.list_to_matrix()})
            arrUser = np.concatenate((arrUser, batchUser),axis=0)
            arrItem = np.concatenate((arrItem, batchItem),axis=0)
        '''
        return arrUser, arrItem
    
    
    def build_BNCF(self, K = 8):
        layers = [1024, K] #number of latent factors
        logging.info('start building the BNCF model')
        
        self.x_u = theano.shared(self.arrUser)
        self.x_i = theano.shared(self.arrItem)
        self.y_r = theano.shared(self.train_r)       
        with pm.Model() as self.bncf:
            #user layer
            user_W1 = pm.Normal('user_W1', 0, sd=1, shape=[self.maxi, layers[0]] )
            user_O1 = pm.math.tanh(pm.math.dot(self.x_u, user_W1))
            user_W2 = pm.Normal('user_W2', 0, sd=1, shape=[layers[0],layers[1]] )
            user_O2 = pm.math.tanh(pm.math.dot(user_O1, user_W2))
            #item layer
            item_W1 = pm.Normal('item_W1', 0, sd=1, shape=[self.maxu, layers[0]] )
            item_O1 = pm.math.tanh(pm.math.dot(self.x_i, item_W1))
            item_W2 = pm.Normal('item_W2', 0, sd=1, shape=[layers[0],layers[1]] )
            item_O2 = pm.math.tanh(pm.math.dot(item_O1, item_W2))
            #output layer
            #act_out = pm.math.sigmoid(pm.math.dot(user_O2, item_O2.T))
            #act_out = pm.math.sigmoid(np.sum(np.multiply(user_O2, item_O2),axis=1, keepdims=True))
            act_out = pm.math.sigmoid(np.multiply(user_O2, item_O2).sum(axis=1, keepdims=True))
            #act_out = pm.math.sigmoid(tf.reduce_sum(tf.multiply(user_O2, item_O2),axis=1, keep_dims=True))
            # Binary classification -> Bernoulli likelihood
            r = pm.Bernoulli('r', act_out, observed=self.y_r, total_size=self.y_r.shape[0]) # IMPORTANT for minibatches                         
        logging.info('done building BNCF model')
                
    def train_BNCF(self):
        logging.info('start training the BNCF model')
        tstart = time.time()
        with self.bncf:
            inference = pm.ADVI()
            approx = pm.fit(n=1000, method=inference)
            self.trace = approx.sample(draws=500)       
        elapsed = time.time() - tstart    
        logging.info('Completed training the BNCF model in %d seconds' % int(elapsed))
           
    def evaluate_BNCF(self):
        arrUser, arrItem = self.sess.run([self.user_input, self.item_input], \
                               feed_dict={self.user: self.test_u, \
                                          self.item: self.test_i, \
                                          self.R: self.dataset.list_to_matrix()})
        self.x_u.set_value(arrUser)
        self.x_i.set_value(arrItem)
        self.y_r.set_value(self.test_r)
        with self.bncf:#evaluation
            ppc = pm.sample_posterior_predictive(self.trace, progressbar=True)
            pre_r = ppc['r'].mean(axis=0)

            hits = []
            ndcgs = []
            prev_u = self.dataset.testList[0][0]
            pos_i = self.dataset.testList[0][1]
            scorelist = []
            iLen = 0
            for u, i in self.dataset.testList:
                if prev_u == u:
                    scorelist.append([i,pre_r[iLen]])
                else:
                    map_item_score = {}
                    for item, rate in scorelist: #turn dict
                        map_item_score[item] = rate
                    ranklist = heapq.nlargest(10, map_item_score, key=map_item_score.get)#default Topn=10
                    hits.append(self.dataset.getHitRatio(ranklist, pos_i))
                    ndcgs.append(self.dataset.getNDCG(ranklist, pos_i))
                    #next user
                    scorelist = []
                    prev_u = u
                    pos_i = i
                    scorelist.append([i,pre_r[iLen]])
                iLen = iLen + 1
            hit, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
            return hit, ndcg
if __name__ == "__main__":
    for fileName in ['ml-1m', 'pinterest-20', 'kb-cc']:
        dataset = DataSet(fileName=fileName, negNum=4)#loading dataset
        model = BNCF(dataset)
        for K in [8, 16, 32, 64]:
            model.build_BNCF(K)
            model.train_BNCF()
            hit, ndcg = model.evaluate_BNCF()
            print("HR@10: {}, NDCG@10: {}, At K {} and Dataset {}".format(hit, ndcg, K, fileName ))

In [None]:
# -*- Encoding:UTF-8 -*-
'''
@author: Jason.F
@data: 2019.07.22
@function: Implementing BMF(Bayesian Neural Collaborative Filtering) which is designed by Jason.F
           Dataset: Movielen Dataset(ml-1m) 
           Evaluating: hitradio,ndcg
'''
import sys
import time
import logging

import pymc3 as pm
import numpy as np
import pandas as pd
import theano
import tensorflow as tf

def getTraindata():
    data = pd.read_csv("/data/fjsdata/ctKngBase/ml/ml-1m.train.rating", \
                             sep='\t', header=None, names=['user', 'item', 'rating'], \
                             usecols=[0, 1, 2], dtype={0: np.int32, 1: np.int32, 2: np.float})
    maxu, maxi = data['user'].max(), data['item'].max()
    data = data.values.tolist()
    print("Loading Success!\n"
                  "Data Info:\n"
                  "\tUser Num: {}\n"
                  "\tItem Num: {}\n"
                  "\tData Size: {}".format(maxu, maxi, len(data)))
    R = np.zeros([maxu+1, maxi+1], dtype=np.float32)
    for i in data:
        user = int(i[0])
        item = int(i[1])
        rating = float(i[2])
        R[user][item] = rating
    return R, data, maxu, maxi

def getTrainDict(data):
    dataDict = {}
    for i in data:
        dataDict[(i[0], i[1])] = i[2]
    return dataDict
    
def getInstances(R, data, maxu, maxi, negNum):
    user = []
    item = []
    rate = []
    '''
    for i in data:
        user.append(R[int(i[0]),:].tolist())
        item.append(R[:,int(i[1])].tolist())
        rate.append(1.0)
        for t in range(negNum):
            j = np.random.randint(maxi)
            while (i[0], j) in dataDict:
                j = np.random.randint(maxi)
            user.append(R[int(i[0]),:].tolist())
            item.append(R[:,j].tolist())
            rate.append(0.0)
    '''
    for u, i, _ in data:
        user.append(R[int(u),:].tolist())
        item.append(R[:,int(i)].tolist())
        rate.append(1.0)
    '''
    dataDict = getTrainDict(data)
    for j in range(len(data)*negNum):
        u = np.random.randint(maxu)
        i = np.random.randint(maxi)
        while (u, i) in dataDict:
            u = np.random.randint(maxu)
            i = np.random.randint(maxi)
        user.append(R[u,:].tolist())
        item.append(R[:,i].tolist())
        rate.append(0.0)
    '''
    return np.array(user), np.array(item), np.array(rate)

def getTestdata():
    testset = []
    filePath = '/data/fjsdata/ctKngBase/ml/ml-1m.test.negative'
    with open(filePath, 'r') as fd:
        line = fd.readline()
        while line != None and line != '':
            arr = line.split('\t')
            u = eval(arr[0])[0]
            testset.append([u, eval(arr[0])[1], 1.0])#one postive item
            for i in arr[1:]:
                testset.append([u, int(i), 0.0]) #99 negative items
            line = fd.readline()
    return testset


def getTestInstances(R, testset):
    for i in testset:
        user.append(R[int(i[0]),:].tolist())
        item.append(R[:,int(i[1])].tolist())
        rate.append(float(i[2]))
    return np.array(user), np.array(item), np.array(rate)
    
def build_BNCF(x_u, x_i, y_r, maxu, maxi, K=8):
    logging.info('building the BMF model')

    Layers = [1024, K]
    with pm.Model() as bncf:
        #user layer
        user_W1 = pm.Normal('user_W1', 0, sd=1, shape=[maxi+1, Layer[0]] )
        user_O1 = pm.math.tanh(pm.math.dot(x_u, user_W1))
        user_W2 = pm.Normal('user_W2', 0, sd=1, shape=[Layer[0],Layer[1]] )
        user_O2 = pm.math.tanh(pm.math.dot(user_O1, user_W2))
        #item layer
        item_W1 = pm.Normal('item_W1', 0, sd=1, shape=[maxu+1, Layer[0]] )
        item_O1 = pm.math.tanh(pm.math.dot(x_i, item_W1))
        item_W2 = pm.Normal('item_W2', 0, sd=1, shape=[Layer[0],Layer[1]] )
        item_O2 = pm.math.tanh(pm.math.dot(item_O1, item_W2))
        #output layer
        act_out = pm.math.sigmoid(pm.math.dot(user_O2, item_O2.T))
        # Binary classification -> Bernoulli likelihood
        r = pm.Bernoulli('r', act_out, observed=y_r, total_size=y_r.shape[0]) # IMPORTANT for minibatches
                                
    logging.info('done building BMF model')
    
    return bncf

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO,format='[%(asctime)s]: %(message)s')

    # Read data and build BMF model.
    R, data, maxu, maxi = getTraindata()
    train_u, train_i, train_r = getInstances(R, data, maxu, maxi, negNum=4)
    x_u = theano.shared(train_u)
    x_i = theano.shared(train_i)
    y_r = theano.shared(train_r)
    bncf = build_BNCF(x_u, x_i, y_r, maxu, maxi, K=8)#dim is the number of latent factors

    with bncf:# sample with BMF
        tstart = time.time()
        logging.info('Start BMF sampling')
        inference = pm.ADVI()
        approx = pm.fit(n=1000, method=inference)
        trace = approx.sample(draws=500)
        elapsed = time.time() - tstart    
        logging.info('Complete BMF sampling in %d seconds' % int(elapsed))
   

    testset = getTestdata()
    test_u, test_i, test_r = getTestInstances(R, testset)
    x_u.set_value(test_u)
    x_i.set_value(test_i)
    y_r.set_value(test_r)
    with bncf:#evaluation
        ppc = pm.sample_posterior_predictive(trace, progressbar=True)
        pre_r = ppc['r'].mean(axis=0)
        
        hits = []
        ndcgs = []
        prev_u = testset[0][0]
        pos_i = testset[0][1]
        scorelist = []
        iLen = 0
        for u, i in testset:
            if prev_u == u:
                scorelist.append([i,pre_r[iLen]])
            else:
                map_item_score = {}
                for item, rate in scorelist: #turn dict
                    map_item_score[item] = rate
                ranklist = heapq.nlargest(10, map_item_score, key=map_item_score.get)#default Topn=10
                hr = getHitRatio(ranklist, pos_i)
                hits.append(hr)
                ndcg = getNDCG(ranklist, pos_i)
                ndcgs.append(ndcg)
                #next user
                scorelist = []
                prev_u = u
                pos_i = i
                scorelist.append([i,pre_r[iLen]])
            iLen = iLen + 1
        hitratio,ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
        print("hr: {}, NDCG: {}, At K {}".format(hitratio, ndcg, 8))



Loading Success!
Data Info:
	User Num: 6039
	Item Num: 3705
	Data Size: 994169


In [1]:
#矩阵分解R=PQ，推荐概率模型MCMC采样-似然函数是正态
import theano
import pymc3 as pm
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import mean_squared_error
import theano.tensor as tt
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder

#1.数据集处理
#http://files.grouplens.org/datasets/movielens/ml-20m-README.html
#the following format of file ratings.csv: userId,movieId,rating,timestamp
#The lines within this file are ordered first by userId, then, within user, by movieId.
#Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).
#Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.
data = pd.read_csv("/data/fjsdata/BayesianRS/ml-20m/ratings.csv",sep=',',low_memory=False, iterator =True)
data = data.get_chunk(100)
#将userId和movieId全部标准编号
data_rating = data[['rating']]
le = LabelEncoder()
data = data[['userId','movieId']].apply(le.fit_transform)
data = pd.concat([data,data_rating],axis=1)
#抽样10%比例测试
test = data.sample(frac=0.1)
#2.构建概率模型
#概率模型参数设置
uNum = len(data['userId'].unique())#统计用户数
iNum = len(data['movieId'].unique())#统计电影数
mean= data['rating'].max()/2 #正态分布的均值和方差
k = 100 #隐因子数
X_input = theano.shared(data[['userId','movieId']].values)#转numpy array
Y_output = theano.shared(data['rating'].values)#转numpy array
with pm.Model() as BMF_model:
    # Creating the model
    P = pm.Normal('P', mu=mean, sd=mean, shape=(uNum,k))
    Q = pm.Normal('Q', mu=mean, sd=mean, shape=(k,iNum))
    R = pm.Deterministic('R', tt.dot(P,Q))
    rY = []
    for row in X_input.get_value(): # 获取每行的值
        rr = R[int(row[0])][int(row[1])]#userId是0列,movieId是1列
        rY.append(rr)
    Y = pm.Normal('Y',mu=rY, sd=mean, observed=Y_output.get_value())
#3.后验分布计算  
with BMF_model:        
    start=pm.find_MAP()  # 参数初猜
    #二值变量：指定 BinaryMetropolis  离散变量：指定 Metropolis  连续变量：指定 NUTS
    step = pm.Metropolis()
    trace = pm.sample(1000,start=start,step=step,chains=2,cores=8)

#后验分布采样观察
#pm.traceplot(trace, varnames=['P'])
#pm.summary(trace, varnames=['P'])
#print (trace['P'].shape)
#print (trace['Q'].shape)
#4.后验预测  
#X_input.set_value(test[['userId','movieId']].values)#转numpy array
#Y_output.set_value(test['rating'].values)
with BMF_model:
    ppc = pm.sample_posterior_predictive(trace)#vars=BMF_model.observed_RVs
    pred = ppc['Y'].mean(axis=0)
    
print ('RMSE：%f'% mean_squared_error(Y_output.get_value(),pred))

INFO (theano.gof.compilelock): Waiting for existing lock by process '15568' (I am process '16185')
INFO (theano.gof.compilelock): To manually release the lock, delete /root/.theano/compiledir_Linux-4.4--generic-x86_64-with-Ubuntu-16.04-xenial-x86_64-3.6.6-64/lock_dir
logp = -17,695, ||grad|| = 0.16015: 100%|██████████| 16/16 [00:00<00:00, 219.90it/s]  
Multiprocess sampling (2 chains in 8 jobs)
CompoundStep
>Metropolis: [Q]
>Metropolis: [P]
Sampling 2 chains: 100%|██████████| 3000/3000 [00:06<00:00, 444.96draws/s]
The gelman-rubin statistic is larger than 1.4 for some parameters. The sampler did not converge.
The estimated number of effective samples is smaller than 200 for some parameters.
100%|██████████| 2000/2000 [00:03<00:00, 527.49it/s]

RMSE：0.627957





In [2]:
#矩阵分解R=PQ，推荐概率模型ADVI变分推断-似然函数是正态
import theano
import pymc3 as pm
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import mean_squared_error
import theano.tensor as tt
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder

#1.数据集处理
#http://files.grouplens.org/datasets/movielens/ml-20m-README.html
#the following format of file ratings.csv: userId,movieId,rating,timestamp
#The lines within this file are ordered first by userId, then, within user, by movieId.
#Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).
#Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.
data = pd.read_csv("/data/fjsdata/BayesianRS/ml-20m/ratings.csv",sep=',',low_memory=False, iterator =True)
data = data.get_chunk(100)
#将userId和movieId全部标准编号
data_rating = data[['rating']]
le = LabelEncoder()
data = data[['userId','movieId']].apply(le.fit_transform)
data = pd.concat([data,data_rating],axis=1)
#抽样10%比例测试
test = data.sample(frac=0.1)
#2.构建概率模型
#概率模型参数设置
uNum = len(data['userId'].unique())#统计用户数
iNum = len(data['movieId'].unique())#统计电影数
mean= data['rating'].max()/2 #正态分布的均值和方差
k = 100 #隐因子数
X_input = theano.shared(data[['userId','movieId']].values)#转numpy array
Y_output = theano.shared(data['rating'].values)#转numpy array
with pm.Model() as BMF_model:
    # Creating the model
    P = pm.Normal('P', mu=mean, sd=mean, shape=(uNum,k))
    Q = pm.Normal('Q', mu=mean, sd=mean, shape=(k,iNum))
    R = tt.dot(P,Q)
    rY = []
    for row in X_input.get_value(): # 获取每行的值
        rr = R[int(row[0])][int(row[1])]#userId=0,movieId=1
        rY.append(rr)
    Y = pm.Normal('Y',mu=rY, sd=mean, observed=Y_output.get_value())
#3.后验分布计算  
with BMF_model:        
    inference = pm.ADVI()
    approx = pm.fit(n=10000, method=inference)
    trace = approx.sample(draws=5000)
    
#4.后验预测  
#X_input.set_value(test[['userId','movieId']].values)#转numpy array
#Y_output.set_value(test['rating'].values)
with BMF_model:
    ppc = pm.sample_posterior_predictive(trace)
    pred = ppc['Y'].mean(axis=0)
    
print ('RMSE：%f'% mean_squared_error(Y_output.get_value(),pred))

Average Loss = 5,929.2: 100%|██████████| 10000/10000 [00:39<00:00, 254.78it/s]  
Finished [100%]: Average Loss = 5,921.3
100%|██████████| 5000/5000 [00:10<00:00, 481.68it/s]

RMSE：0.150872



