In [None]:
# coding:utf-8  
'''
@author: Jason.F
@date: 2019.08.10
@function: BMF(Bayesian Matrix Factorization) 
           Dataset: MovieLens-1m:https://grouplens.org/datasets/movielens/  
           Evaluation: RMSE
'''
import sys
import time
import logging
import random
import heapq
import math
from collections import defaultdict
import pymc3 as pm
import numpy as np
import pandas as pd
import theano
import theano.tensor as tt
import tensorflow as tf

class DataSet:
    """Rating-data loader plus converters between list, matrix, dict and array forms.

    Reads two tab-separated rating files (train and test) whose first three
    columns are user id, item id and rating.

    After construction the instance exposes:
        trainset / testset -- lists of [user, item, rating] rows (all floats,
                              because the frame is flattened through ``.values``)
        maxu / maxi        -- largest training user / item id plus one
        maxr               -- largest training rating
    """

    # Default file locations (the paths this notebook was written against).
    TRAIN_PATH = "/data/fjsdata/BMF/ml-1m.train.rating"
    TEST_PATH = "/data/fjsdata/BMF/ml-1m.test.rating"

    def __init__(self, train_path=None, test_path=None):
        """Load both rating files.

        Args:
            train_path: training file; defaults to ``TRAIN_PATH``.
            test_path: test file; defaults to ``TEST_PATH``.
        """
        self.train_path = self.TRAIN_PATH if train_path is None else train_path
        self.test_path = self.TEST_PATH if test_path is None else test_path
        self.trainset, self.testset, self.maxu, self.maxi, self.maxr = self._getDataset_as_list()

    @staticmethod
    def _read_ratings(path):
        """Read the first three tab-separated columns of ``path`` as user/item/rating."""
        # np.float64 replaces the removed ``np.float`` alias; it is the same dtype.
        return pd.read_csv(path, sep='\t', header=None, names=['user', 'item', 'rating'],
                           usecols=[0, 1, 2],
                           dtype={'user': np.int32, 'item': np.int32, 'rating': np.float64})

    def _getDataset_as_list(self):
        """Return ``(trainset, testset, maxu, maxi, maxr)`` read from the two files.

        Raises:
            ValueError: if the training file contains no rows.
        """
        #trainset
        data = self._read_ratings(self.train_path)
        if data.empty:
            raise ValueError('training file %r contains no ratings' % (self.train_path,))
        # Plain Python ints: the product below cannot overflow the way int32 scalars can.
        # NOTE(review): the bounds come from the training file only -- confirm the
        # test file never contains a larger user or item id.
        maxu = int(data['user'].max()) + 1
        maxi = int(data['item'].max()) + 1
        maxr = data['rating'].max()
        # The figure printed as "Sparsity" is interactions / (users * items).
        print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % \
                  (data.shape[0], maxu, maxi, data.shape[0]/(maxu*maxi)))
        trainset = data.values.tolist()
        #testset
        testset = self._read_ratings(self.test_path).values.tolist()
        return trainset, testset, maxu, maxi, maxr

    def list_to_matrix(self, dataset, maxu, maxi):
        """Scatter ``(user, item, rating)`` rows into a dense ``maxu x maxi`` float32 matrix.

        Cells without a rating stay 0; a later duplicate (user, item) row overwrites
        an earlier one.
        """
        dataMat = np.zeros([maxu, maxi], dtype=np.float32)
        for u, i, r in dataset:
            dataMat[int(u), int(i)] = float(r)
        return dataMat

    def list_to_dict(self, dataset):
        """Map ``(user, item)`` tuples to ratings; later duplicates win."""
        return {(int(u), int(i)): float(r) for u, i, r in dataset}

    def getInstances(self, dataset):
        """Split rows into three parallel arrays ``(users, items, ratings)``.

        The dtypes are fixed (int64 ids, float64 ratings) so that an empty
        dataset still yields integer id arrays.
        """
        user = []
        item = []
        rate = []
        for u, i, r in dataset:
            user.append(int(u))
            item.append(int(i))
            rate.append(float(r))
        return (np.array(user, dtype=np.int64),
                np.array(item, dtype=np.int64),
                np.array(rate, dtype=np.float64))
    
class BMF():
    """Bayesian matrix factorization of user-item ratings, fitted with PyMC3.

    Model built by ``train_BMF``: user factors ``P`` (maxu x K) and item factors
    ``Q`` (maxi x K) get zero-mean Normal priors with standard deviation
    ``maxr / 2``; each observed rating is Normal around the dot product of the
    matching rows of ``P`` and ``Q`` with the same standard deviation.
    """

    def __init__(self, ds, seed=None):
        """Pull train/test triples out of ``ds`` and shuffle the training ones.

        Args:
            ds: object exposing ``maxr``, ``maxu``, ``maxi``, ``trainset``,
                ``testset`` and ``getInstances(dataset)``.
            seed: optional int seeding the training-set shuffle only (PyMC3's own
                random streams are not touched). ``None`` keeps the original
                behaviour of drawing from NumPy's global random state.
        """
        self.maxr = ds.maxr
        self.maxu = ds.maxu
        self.maxi = ds.maxi
        #get the trainset and testset
        train_u, train_i, train_r = ds.getInstances(ds.trainset)
        # Check alignment before indexing so a mismatch is reported as such.
        assert len(train_u) == len(train_i) == len(train_r), 'misaligned training arrays'
        if seed is None:
            shuffled_idx = np.random.permutation(np.arange(len(train_u)))
        else:
            shuffled_idx = np.random.RandomState(seed).permutation(np.arange(len(train_u)))
        # One shared permutation keeps (user, item, rating) triples together.
        self.train_u = train_u[shuffled_idx]
        self.train_i = train_i[shuffled_idx]
        self.train_r = train_r[shuffled_idx]
        self.test_u, self.test_i, self.test_r = ds.getInstances(ds.testset)
        assert len(self.test_u) == len(self.test_i) == len(self.test_r), 'misaligned test arrays'

    def train_BMF(self, K=8, n_iter=10000, draws=500):
        """Build the model, compute a MAP point and fit it with ADVI.

        Args:
            K: number of latent factors per user / item.
            n_iter: ADVI optimisation steps (previously fixed at 10000).
            draws: samples drawn from the fitted approximation (previously 500).

        Returns:
            ``(trace, mapst)``: the sampled trace and the ``pm.find_MAP`` result.
        """
        prior_sd = self.maxr/2  # half the maximum rating, used as every sd in the model
        # Shared variables let eval_BMF swap in the test data without rebuilding the model.
        self.x_u = theano.shared(self.train_u)
        self.x_i = theano.shared(self.train_i)
        self.y_r = theano.shared(self.train_r)
        with pm.Model() as self.bmf:#build probabilistic model
            # Creating the model
            P = pm.Normal('P', mu=0, sd=prior_sd, shape=(self.maxu, K))
            Q = pm.Normal('Q', mu=0, sd=prior_sd, shape=(self.maxi, K))
            tY = pm.Deterministic('tY', pm.math.sum(P[self.x_u,:]*Q[self.x_i,:], axis=1))
            Y = pm.Normal('Y', mu=tY, sd=prior_sd, observed=self.y_r)#pm.Categorical

        with self.bmf: #train the probabilistic model by Bayesian inference
            tstart = time.time()
            logging.info('Start training BMF')
            mapst = pm.find_MAP() #get the map point
            approx = pm.fit(n=n_iter, method=pm.ADVI())
            trace = approx.sample(draws=draws)
            elapsed = time.time() - tstart
            logging.info('Complete BMF training in %d seconds', int(elapsed))
        return trace, mapst

    @staticmethod
    def _rmse(actual, predicted):
        """Root-mean-square error between two equally shaped numeric sequences."""
        actual = np.asarray(actual, dtype=np.float64)
        predicted = np.asarray(predicted, dtype=np.float64)
        if actual.shape != predicted.shape:
            raise ValueError('shape mismatch: %s vs %s' % (actual.shape, predicted.shape))
        if actual.size == 0:
            raise ValueError('cannot compute RMSE of an empty set')
        return float(np.sqrt(np.mean(np.square(actual - predicted))))

    def eval_BMF(self, trace):
        """Return the test-set RMSE of the posterior-predictive mean rating.

        Args:
            trace: trace returned by ``train_BMF``.

        Raises:
            RuntimeError: if ``train_BMF`` has not been called on this instance.
        """
        if not hasattr(self, 'bmf'):
            raise RuntimeError('train_BMF must be called before eval_BMF')
        # Point the shared inputs at the held-out triples.
        self.x_u.set_value(self.test_u)
        self.x_i.set_value(self.test_i)
        self.y_r.set_value(self.test_r)
        with self.bmf:
            # NOTE(review): 'tY' is a named Deterministic -- confirm that the installed
            # PyMC3 recomputes it from the updated shared inputs rather than reusing
            # values stored in the trace; the shape assert below is the only guard.
            ppc = pm.sample_posterior_predictive(trace, progressbar=True)
            pY = ppc['Y'].mean(axis=0)  # average over posterior draws
        assert(pY.shape[0]==self.test_r.shape[0])
        return self._rmse(self.test_r, pY)
    
if __name__ == "__main__":
    # train_BMF reports progress through logging.info; the root logger defaults to
    # WARNING, so without this those messages are silently dropped.
    logging.basicConfig(level=logging.INFO)
    # basicConfig is a no-op when the root logger already has handlers, so the
    # level is set explicitly as well.
    logging.getLogger().setLevel(logging.INFO)
    ds = DataSet()#loading dataset
    bmf = BMF(ds)
    K=8  # number of latent factors
    trace,mapst = bmf.train_BMF(K)
    rmse = bmf.eval_BMF(trace)
    print("RMSE@{}:{}".format(K, rmse))

In [1]:
import numpy as np
import pandas as pd

# Summary statistics of the MovieLens-1M training ratings.
filePath = "/data/fjsdata/BMF/ml-1m.train.rating" 
# np.float64 replaces the ``np.float`` alias (deprecated in NumPy 1.20, removed in
# 1.24); it is the same dtype, so the loaded frame is unchanged.
data = pd.read_csv(filePath, sep='\t', header=None, names=['user', 'item', 'rating'], \
                                usecols=[0, 1, 2], dtype={'user': np.int32, 'item': np.int32, 'rating': np.float64})
maxu, maxi, maxr = data['user'].max()+1, data['item'].max()+1, data['rating'].max()
# maxu and maxi are int32 scalars; multiply as Python ints so the product cannot
# overflow. The figure printed as "Sparsity" is interactions / (users * items).
print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % \
                  (data.shape[0], maxu, maxi, data.shape[0]/(int(maxu)*int(maxi))))
# One row per statistic, one column per input column.
rs = pd.DataFrame([data.mean(), data.std(), data.var()], index=['Mean', 'Std. dev', 'Variance'])
print (rs)

Dataset Statistics: Interaction = 994169, User = 6040, Item = 3706, Sparsity = 0.0444
                  user           item    rating
Mean      3.023537e+03     872.860828  3.581378
Std. dev  1.728320e+03     738.213243  1.116791
Variance  2.987090e+06  544958.792605  1.247222


In [2]:
import numpy as np
import pandas as pd

# Summary statistics of the pipe-separated KBCC training interactions.
filePath = "/data/fjsdata/BMF/kbcc_trainset.csv" 
data = pd.read_csv(
    filePath,
    sep='|',
    low_memory=False,
    dtype={'csr': int, 'ke': int, 'num': float},
)

# Largest id + 1 is reported as the user / item count (presumably ids are
# zero-based -- verify against the data); maxr is the largest 'num' value.
maxu = data['csr'].max()+1
maxi = data['ke'].max()+1
maxr = data['num'].max()

# The figure printed as "Sparsity" is interactions / (users * items).
print(
    'Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f'
    % (data.shape[0], maxu, maxi, data.shape[0]/(maxu*maxi))
)

# One row per statistic, one column per input column.
rs = pd.DataFrame(
    [data.mean(), data.std(), data.var()],
    index=['Mean', 'Std. dev', 'Variance'],
)
print (rs)

Dataset Statistics: Interaction = 2313189, User = 10216, Item = 96324, Sparsity = 0.0024
                   csr            ke       num
Mean      4.788496e+03  5.249678e+04  2.208648
Std. dev  2.657450e+03  2.589617e+04  1.961752
Variance  7.062041e+06  6.706117e+08  3.848470
