In [11]:
# coding:utf-8  
'''
@author: Jason.F
@data: 2019.07.29
@function: BMF(Bayesian Matrix Factorization) 
           Datatset: MovieLens-1m:https://grouplens.org/datasets/movielens/  
           Evaluation: RMSE
'''
import sys
import time
import logging
import random
import heapq
import math
from collections import defaultdict
import pymc3 as pm
import numpy as np
import pandas as pd
import theano
import theano.tensor as tt
import tensorflow as tf

class DataSet:
    def __init__(self):
        self.trainset, self.testset, self.maxu, self.maxi, self.maxr = self._getDataset_as_list()
        
    def _getDataset_as_list(self):
        #trainset
        filePath = "/data/fjsdata/BMF/ml-1m.train.rating" 
        data = pd.read_csv(filePath, sep='\t', header=None, names=['user', 'item', 'rating'], \
                                 usecols=[0, 1, 2], dtype={0: np.int32, 1: np.int32, 2: np.float})
        maxu, maxi, maxr = data['user'].max()+1, data['item'].max()+1, data['rating'].max()
        print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % \
                  (data.shape[0], maxu, maxi, data.shape[0]/(maxu*maxi)))
        trainset = data.values.tolist()
        #testset
        filePath = "/data/fjsdata/BMF/ml-1m.test.rating" 
        data = pd.read_csv(filePath, sep='\t', header=None, names=['user', 'item', 'rating'], \
                                 usecols=[0, 1, 2], dtype={0: np.int32, 1: np.int32, 2: np.float})
        testset = data.values.tolist()
        return trainset, testset, maxu, maxi, maxr 
    
    def list_to_matrix(self, dataset, maxu, maxi):              
        dataMat = np.zeros([maxu, maxi], dtype=np.float32)
        for u,i,r in dataset:
            dataMat[int(u)][int(i)] = float(r)
        return np.array(dataMat)
    
    def list_to_dict(self, dataset):
        dataDict = {}
        for u,i,r in dataset:
            dataDict[int(u), int(i)] = float(r)
        return dataDict
    
    def getInstances(self, dataset, maxu=None, maxi=None, num_ng=None, isTest=True):
        user = []
        item = []
        rate = []
        if isTest==True: #test
            for u, i, r in dataset:
                user.append(int(u))
                item.append(int(i))
                rate.append(float(r))
        else:#train
            for u, i, r in dataset:
                user.append(int(u))
                item.append(int(i))
                rate.append(float(r))
            #negative samples
            dataDict = self.list_to_dict(dataset)
            for j in range(len(dataset)*num_ng):
                u = np.random.randint(maxu)
                i = np.random.randint(maxi)
                while (u, i) in dataDict:
                    u = np.random.randint(maxu)
                    i = np.random.randint(maxi)
                user.append(int(u))
                item.append(int(i))
                rate.append(float(0.0)) 
        return np.array(user), np.array(item), np.array(rate)
    
class BMF():
    def __init__(self, ds, num_ng=2):
        self.maxr = ds.maxr
        self.maxu = ds.maxu
        self.maxi = ds.maxi
        #get the trainset and testset
        train_u, train_i, train_r = ds.getInstances(ds.trainset, ds.maxu, ds.maxi, num_ng, isTest=False)
        shuffled_idx = np.random.permutation(np.arange(len(train_u)))
        self.train_u = train_u[shuffled_idx]
        self.train_i = train_i[shuffled_idx]
        self.train_r = train_r[shuffled_idx]
        assert(len(self.train_u) == len(self.train_i) and len(self.train_i) == len(self.train_r)) 
        self.test_u, self.test_i, self.test_r = ds.getInstances(ds.testset, isTest=True)
        assert(len(self.test_u) == len(self.test_i) and len(self.test_i) == len(self.test_r))
    
    def train_BMF(self, K=8):
        meanr = self.maxr/2
        self.x_u = theano.shared(self.train_u)
        self.x_i = theano.shared(self.train_i)
        self.y_r = theano.shared(self.train_r)
        with pm.Model() as self.bmf:#bulid probabilistic model
            # Creating the model
            P = pm.Normal('P', mu=0, sd=1, shape=(self.maxu, K))
            Q = pm.Normal('Q', mu=0, sd=1, shape=(self.maxi, K))
            tY = pm.Deterministic('tY', pm.math.sum(P[self.x_u,:]*Q[self.x_i,:], axis=1))
            Y = pm.Normal('Y', mu=tY, sd=meanr, observed=self.y_r)
            
        with self.bmf: #train the probabilistic model by Bayesian inference
            tstart = time.time()
            logging.info('Start training BMF')
            approx = pm.fit(n=1000, method=pm.ADVI())
            trace = approx.sample(draws=500)
            elapsed = time.time() - tstart 
            logging.info('Complete BMF training in %d seconds' % int(elapsed))
        return trace
            
    def eval_BMF(self, trace):
        self.x_u.set_value(self.test_u)
        self.x_i.set_value(self.test_i)
        self.y_r.set_value(self.test_r)
        with self.bmf:
            ppc = pm.sample_posterior_predictive(trace, progressbar=True)
            pY = ppc['Y'].mean(axis=0)
        assert(pY.shape[0]==self.test_r.shape[0])
        squaredError = []
        for i in range(pY.shape[0]):
            error=self.test_r[i] - pY[i]
            squaredError.append(error * error)
        rmse =math.sqrt(sum(squaredError) / len(squaredError))
        return rmse
    
if __name__ == "__main__":
    ds = DataSet()#loading dataset
    bmf = BMF(ds, num_ng=2)#negative sample ratio
    for K in [8, 16, 32, 64]:
        trace = bmf.train_BMF(K)
        rmse = bmf.eval_BMF(trace)
        print("RMSE@{}:{}".format(K, rmse))

Dataset Statistics: Interaction = 994169, User = 6040, Item = 3706, Sparsity = 0.0444


INFO (theano.gof.compilelock): Waiting for existing lock by process '25732' (I am process '25192')
I0731 03:32:59.981074 139912595539712 compilelock.py:267] Waiting for existing lock by process '25732' (I am process '25192')
INFO (theano.gof.compilelock): To manually release the lock, delete /root/.theano/compiledir_Linux-4.4--generic-x86_64-with-Ubuntu-16.04-xenial-x86_64-3.6.6-64/lock_dir
I0731 03:32:59.985890 139912595539712 compilelock.py:269] To manually release the lock, delete /root/.theano/compiledir_Linux-4.4--generic-x86_64-with-Ubuntu-16.04-xenial-x86_64-3.6.6-64/lock_dir
INFO (theano.gof.compilelock): Waiting for existing lock by process '27610' (I am process '25192')
I0731 03:33:55.332922 139912595539712 compilelock.py:267] Waiting for existing lock by process '27610' (I am process '25192')
INFO (theano.gof.compilelock): To manually release the lock, delete /root/.theano/compiledir_Linux-4.4--generic-x86_64-with-Ubuntu-16.04-xenial-x86_64-3.6.6-64/lock_dir
I0731 03:33:55.3

RMSE@8:34.563844699884385


Average Loss = 2.042e+09:  56%|█████▌    | 561/1000 [15:20<11:41,  1.60s/it] 
Interrupted at 561 [56%]: Average Loss = 1.9754e+09
I0731 04:09:20.107150 139912595539712 inference.py:240] Interrupted at 561 [56%]: Average Loss = 1.9754e+09
E0731 04:09:43.160345 139912595539712 ultratb.py:155] Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 3325, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-11-d89bba26075a>", line 137, in <module>
    trace = bmf.train_BMF(K)
  File "<ipython-input-11-d89bba26075a>", line 113, in train_BMF
    trace = approx.sample(draws=500)
  File "/usr/local/lib/python3.6/dist-packages/pymc3/variational/opvi.py", line 1597, in sample
    trace.record(point)
  File "/usr/local/lib/python3.6/dist-packages/pymc3/backends/ndarray.py", line 229, in record
    for varname, value in zip(self.varnames, self.fn(point)):
  File "/usr/local/lib/python3.6/dist-packages/pymc3/model.py", line 1173, in __call__
    return self.f(**state)
  File "/usr/local/lib/python3.6/dist-packages/theano/compile/function_module.py", line 903, in __call__
    self.fn() if output_subset is None else\
KeyboardInterrupt

During handling of the above exception, another exc

KeyboardInterrupt: 