In [5]:
# -*- Encoding:UTF-8 -*-
'''
@author: Jason.F
@data: 2019.07.22
@function: Implementing BMF(Bayesian Neural Collaborative Filtering) which is designed by Jason.F
           Dataset: Movielen Dataset(ml-1m) 
           Evaluating: hitradio,ndcg
'''
import sys
import time
import logging

import pymc3 as pm
import numpy as np
import pandas as pd
import theano

def getTraindata():
    data = pd.read_csv("/data/fjsdata/ctKngBase/ml/ml-1m.train.rating", \
                             sep='\t', header=None, names=['user', 'item', 'rating'], \
                             usecols=[0, 1, 2], dtype={0: np.int32, 1: np.int32, 2: np.float})
    maxu, maxi = data['user'].max(), data['item'].max()
    data = data.values.tolist()
    print("Loading Success!\n"
                  "Data Info:\n"
                  "\tUser Num: {}\n"
                  "\tItem Num: {}\n"
                  "\tData Size: {}".format(maxu, maxi, len(data)))
    R = np.zeros([maxu+1, maxi+1], dtype=np.float32)
    for i in data:
        user = int(i[0])
        item = int(i[1])
        rating = float(i[2])
        R[user][item] = rating
    return R, data, maxu, maxi

def getTrainDict(data):
    dataDict = {}
    for i in data:
        dataDict[(i[0], i[1])] = i[2]
    return dataDict
    
def getInstances(R, data, maxi, negNum):
    dataDict = getTrainDict(data)
    user = []
    item = []
    rate = []
    for i in data:
        user.append(R[int(i[0]),:].tolist())
        item.append(R[:,int(i[1])].tolist())
        rate.append(1.0)
        for t in range(negNum):
            j = np.random.randint(maxi)
            while (i[0], j) in dataDict:
                j = np.random.randint(maxi)
            user.append(R[int(i[0]),:].tolist())
            item.append(R[:,j].tolist())
            rate.append(0.0)
    return np.array(user), np.array(item), np.array(rate)

def getTestdata():
    testset = []
    filePath = '/data/fjsdata/ctKngBase/ml/ml-1m.test.negative'
    with open(filePath, 'r') as fd:
        line = fd.readline()
        while line != None and line != '':
            arr = line.split('\t')
            u = eval(arr[0])[0]
            testset.append([u, eval(arr[0])[1], 1.0])#one postive item
            for i in arr[1:]:
                testset.append([u, int(i), 0.0]) #99 negative items
            line = fd.readline()
    return testset


def getTestInstances(R, testset):
    for i in testset:
        user.append(R[int(i[0]),:].tolist())
        item.append(R[:,int(i[1])].tolist())
        rate.append(float(i[2]))
    return np.array(user), np.array(item), np.array(rate)
    
def build_BNCF(x_u, x_i, y_r, maxu, maxi, K=8):
    logging.info('building the BMF model')

    Layers = [1024, K]
    with pm.Model() as bncf:
        #user layer
        user_W1 = pm.Normal('user_W1', 0, sd=1, shape=[maxi+1, Layer[0]] )
        user_O1 = pm.math.tanh(pm.math.dot(x_u, user_W1))
        user_W2 = pm.Normal('user_W2', 0, sd=1, shape=[Layer[0],Layer[1]] )
        user_O2 = pm.math.tanh(pm.math.dot(user_O1, user_W2))
        #item layer
        item_W1 = pm.Normal('item_W1', 0, sd=1, shape=[maxu+1, Layer[0]] )
        item_O1 = pm.math.tanh(pm.math.dot(x_i, item_W1))
        item_W2 = pm.Normal('item_W2', 0, sd=1, shape=[Layer[0],Layer[1]] )
        item_O2 = pm.math.tanh(pm.math.dot(item_O1, item_W2))
        #output layer
        act_out = pm.math.sigmoid(pm.math.dot(user_O2, item_O2.T))
        # Binary classification -> Bernoulli likelihood
        r = pm.Bernoulli('r', act_out, observed=y_r, total_size=y_r.shape[0]) # IMPORTANT for minibatches
                                
    logging.info('done building BMF model')
    
    return bncf

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO,format='[%(asctime)s]: %(message)s')

    # Read data and build BMF model.
    R, data, maxu, maxi = getTraindata()
    train_u, train_i, train_r = getInstances(R, data, maxi, negNum=4)
    x_u = theano.shared(train_u)
    x_i = theano.shared(train_i)
    y_r = theano.shared(train_r)
    bncf = build_BNCF(x_u, x_i, y_r, maxu, maxi, K=8)#dim is the number of latent factors

    with bncf:# sample with BMF
        tstart = time.time()
        logging.info('Start BMF sampling')
        inference = pm.ADVI()
        approx = pm.fit(n=1000, method=inference)
        trace = approx.sample(draws=500)
        elapsed = time.time() - tstart    
        logging.info('Complete BMF sampling in %d seconds' % int(elapsed))
   

    testset = getTestdata()
    test_u, test_i, test_r = getTestInstances(R, testset)
    x_u.set_value(test_u)
    x_i.set_value(test_i)
    y_r.set_value(test_r)
    with bncf:#evaluation
        ppc = pm.sample_posterior_predictive(trace, progressbar=True)
        pre_r = ppc['r'].mean(axis=0)
        
        hits = []
        ndcgs = []
        prev_u = testset[0][0]
        pos_i = testset[0][1]
        scorelist = []
        iLen = 0
        for u, i in testset:
            if prev_u == u:
                scorelist.append([i,pre_r[iLen]])
            else:
                map_item_score = {}
                for item, rate in scorelist: #turn dict
                    map_item_score[item] = rate
                ranklist = heapq.nlargest(10, map_item_score, key=map_item_score.get)#default Topn=10
                hr = getHitRatio(ranklist, pos_i)
                hits.append(hr)
                ndcg = getNDCG(ranklist, pos_i)
                ndcgs.append(ndcg)
                #next user
                scorelist = []
                prev_u = u
                pos_i = i
                scorelist.append([i,pre_r[iLen]])
            iLen = iLen + 1
        hitratio,ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
        print("hr: {}, NDCG: {}, At K {}".format(hitratio, ndcg, 8))

Loading Success!
Data Info:
	User Num: 6039
	Item Num: 3705
	Data Size: 994169


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [1]:
#矩阵分解R=PQ，推荐概率模型MCMC采样-似然函数是正态
import theano
import pymc3 as pm
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import mean_squared_error
import theano.tensor as tt
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder

#1.数据集处理
#http://files.grouplens.org/datasets/movielens/ml-20m-README.html
#the following format of file ratings.csv: userId,movieId,rating,timestamp
#The lines within this file are ordered first by userId, then, within user, by movieId.
#Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).
#Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.
data = pd.read_csv("/data/fjsdata/BayesianRS/ml-20m/ratings.csv",sep=',',low_memory=False, iterator =True)
data = data.get_chunk(100)
#将userId和movieId全部标准编号
data_rating = data[['rating']]
le = LabelEncoder()
data = data[['userId','movieId']].apply(le.fit_transform)
data = pd.concat([data,data_rating],axis=1)
#抽样10%比例测试
test = data.sample(frac=0.1)
#2.构建概率模型
#概率模型参数设置
uNum = len(data['userId'].unique())#统计用户数
iNum = len(data['movieId'].unique())#统计电影数
mean= data['rating'].max()/2 #正态分布的均值和方差
k = 100 #隐因子数
X_input = theano.shared(data[['userId','movieId']].values)#转numpy array
Y_output = theano.shared(data['rating'].values)#转numpy array
with pm.Model() as BMF_model:
    # Creating the model
    P = pm.Normal('P', mu=mean, sd=mean, shape=(uNum,k))
    Q = pm.Normal('Q', mu=mean, sd=mean, shape=(k,iNum))
    R = pm.Deterministic('R', tt.dot(P,Q))
    rY = []
    for row in X_input.get_value(): # 获取每行的值
        rr = R[int(row[0])][int(row[1])]#userId是0列,movieId是1列
        rY.append(rr)
    Y = pm.Normal('Y',mu=rY, sd=mean, observed=Y_output.get_value())
#3.后验分布计算  
with BMF_model:        
    start=pm.find_MAP()  # 参数初猜
    #二值变量：指定 BinaryMetropolis  离散变量：指定 Metropolis  连续变量：指定 NUTS
    step = pm.Metropolis()
    trace = pm.sample(1000,start=start,step=step,chains=2,cores=8)

#后验分布采样观察
#pm.traceplot(trace, varnames=['P'])
#pm.summary(trace, varnames=['P'])
#print (trace['P'].shape)
#print (trace['Q'].shape)
#4.后验预测  
#X_input.set_value(test[['userId','movieId']].values)#转numpy array
#Y_output.set_value(test['rating'].values)
with BMF_model:
    ppc = pm.sample_posterior_predictive(trace)#vars=BMF_model.observed_RVs
    pred = ppc['Y'].mean(axis=0)
    
print ('RMSE：%f'% mean_squared_error(Y_output.get_value(),pred))

INFO (theano.gof.compilelock): Waiting for existing lock by process '15568' (I am process '16185')
INFO (theano.gof.compilelock): To manually release the lock, delete /root/.theano/compiledir_Linux-4.4--generic-x86_64-with-Ubuntu-16.04-xenial-x86_64-3.6.6-64/lock_dir
logp = -17,695, ||grad|| = 0.16015: 100%|██████████| 16/16 [00:00<00:00, 219.90it/s]  
Multiprocess sampling (2 chains in 8 jobs)
CompoundStep
>Metropolis: [Q]
>Metropolis: [P]
Sampling 2 chains: 100%|██████████| 3000/3000 [00:06<00:00, 444.96draws/s]
The gelman-rubin statistic is larger than 1.4 for some parameters. The sampler did not converge.
The estimated number of effective samples is smaller than 200 for some parameters.
100%|██████████| 2000/2000 [00:03<00:00, 527.49it/s]

RMSE：0.627957





In [2]:
#矩阵分解R=PQ，推荐概率模型ADVI变分推断-似然函数是正态
import theano
import pymc3 as pm
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import mean_squared_error
import theano.tensor as tt
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder

#1.数据集处理
#http://files.grouplens.org/datasets/movielens/ml-20m-README.html
#the following format of file ratings.csv: userId,movieId,rating,timestamp
#The lines within this file are ordered first by userId, then, within user, by movieId.
#Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).
#Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.
data = pd.read_csv("/data/fjsdata/BayesianRS/ml-20m/ratings.csv",sep=',',low_memory=False, iterator =True)
data = data.get_chunk(100)
#将userId和movieId全部标准编号
data_rating = data[['rating']]
le = LabelEncoder()
data = data[['userId','movieId']].apply(le.fit_transform)
data = pd.concat([data,data_rating],axis=1)
#抽样10%比例测试
test = data.sample(frac=0.1)
#2.构建概率模型
#概率模型参数设置
uNum = len(data['userId'].unique())#统计用户数
iNum = len(data['movieId'].unique())#统计电影数
mean= data['rating'].max()/2 #正态分布的均值和方差
k = 100 #隐因子数
X_input = theano.shared(data[['userId','movieId']].values)#转numpy array
Y_output = theano.shared(data['rating'].values)#转numpy array
with pm.Model() as BMF_model:
    # Creating the model
    P = pm.Normal('P', mu=mean, sd=mean, shape=(uNum,k))
    Q = pm.Normal('Q', mu=mean, sd=mean, shape=(k,iNum))
    R = tt.dot(P,Q)
    rY = []
    for row in X_input.get_value(): # 获取每行的值
        rr = R[int(row[0])][int(row[1])]#userId=0,movieId=1
        rY.append(rr)
    Y = pm.Normal('Y',mu=rY, sd=mean, observed=Y_output.get_value())
#3.后验分布计算  
with BMF_model:        
    inference = pm.ADVI()
    approx = pm.fit(n=10000, method=inference)
    trace = approx.sample(draws=5000)
    
#4.后验预测  
#X_input.set_value(test[['userId','movieId']].values)#转numpy array
#Y_output.set_value(test['rating'].values)
with BMF_model:
    ppc = pm.sample_posterior_predictive(trace)
    pred = ppc['Y'].mean(axis=0)
    
print ('RMSE：%f'% mean_squared_error(Y_output.get_value(),pred))

Average Loss = 5,929.2: 100%|██████████| 10000/10000 [00:39<00:00, 254.78it/s]  
Finished [100%]: Average Loss = 5,921.3
100%|██████████| 5000/5000 [00:10<00:00, 481.68it/s]

RMSE：0.150872



