In [7]:
#矩阵分解R=PQ，推荐概率模型MCMC采样-直接矩阵采样
import theano
import pymc3 as pm
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import mean_squared_error
import theano.tensor as tt
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder

#1.数据集处理
#http://files.grouplens.org/datasets/movielens/ml-20m-README.html
#the following format of file ratings.csv: userId,movieId,rating,timestamp
#The lines within this file are ordered first by userId, then, within user, by movieId.
#Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).
#Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.
data = pd.read_csv("/data/fjsdata/BayesianRS/ml-20m/ratings.csv",sep=',',low_memory=False,iterator =True)
data = data.get_chunk(1000)
#将userId和movieId全部标准编号
le = LabelEncoder()
data = data.apply(le.fit_transform)
#2.构建U-I矩阵
uNum = len(data['userId'].unique())#统计用户数
iNum = len(data['movieId'].unique())#统计电影数
UI = np.zeros((uNum, iNum))#转成R矩阵，非常稀疏
for index, row in data.iterrows(): # 获取每行的值
    UI[int(row['userId'])][int(row['movieId'])] = row['rating']
#2.构建概率模型
#概率模型参数设置
mean= data['rating'].max()/2 #正态分布的均值和方差
k = 100 #隐因子数
Y_output = theano.shared(UI)#转numpy array
with pm.Model() as BMF_model:
    # Creating the model
    P = pm.Normal('P', mu=mean, sd=mean, shape=(uNum,k))
    Q = pm.Normal('Q', mu=mean, sd=mean, shape=(k,iNum))
    R = pm.Deterministic('R', tt.dot(P,Q))
    Y = pm.Normal('Y',mu=R, sd=mean, observed=Y_output)
#3.后验分布计算  
with BMF_model:        
    start=pm.find_MAP()  # 参数初猜
    #二值变量：指定 BinaryMetropolis  离散变量：指定 Metropolis  连续变量：指定 NUTS
    step = pm.Metropolis()
    trace = pm.sample(100,start=start,step=step,chains=2,cores=8)

print (trace['R'].shape) #直接用于推荐
with BMF_model:
    ppc = pm.sample_posterior_predictive(trace, progressbar=True)
print (ppc['Y'].shape) #直接用于推荐

logp = -7.7694e+08, ||grad|| = 1.0495e+07: 100%|██████████| 9/9 [00:00<00:00, 41.16it/s]   
Only 100 samples in chain.
Multiprocess sampling (2 chains in 8 jobs)
CompoundStep
>Metropolis: [Q]
>Metropolis: [P]
Sampling 2 chains: 100%|██████████| 1200/1200 [00:04<00:00, 299.73draws/s]
The gelman-rubin statistic is larger than 1.4 for some parameters. The sampler did not converge.
The number of effective samples is smaller than 10% for some parameters.
  6%|▌         | 11/200 [00:00<00:01, 102.93it/s]

(200, 11, 698)


100%|██████████| 200/200 [00:02<00:00, 86.05it/s]

(200, 11, 698)





In [11]:
#矩阵分解R=PQ，推荐概率模型MCMC采样-似然函数是Bernoulli
import theano
import pymc3 as pm
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import mean_squared_error
import theano.tensor as tt
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder

#1.数据集处理
#http://files.grouplens.org/datasets/movielens/ml-20m-README.html
#the following format of file ratings.csv: userId,movieId,rating,timestamp
#The lines within this file are ordered first by userId, then, within user, by movieId.
#Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).
#Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.
data = pd.read_csv("/data/fjsdata/BayesianRS/ml-20m/ratings.csv",sep=',',low_memory=False,iterator =True)
data = data.get_chunk(100)
#将userId和movieId全部标准编号
le = LabelEncoder()
data = data.apply(le.fit_transform)
data['rating'] = 1
#抽样10%比例测试
test = data.sample(frac=0.1)
#2.构建概率模型
uNum = len(data['userId'].unique())#统计用户数
iNum = len(data['movieId'].unique())#统计电影数
mean= data['rating'].max()/2 #正态分布的均值和方差
k = 100 #隐因子数
X_input = theano.shared(data[['userId','movieId']].values)#转numpy array
Y_output = theano.shared(data['rating'].values)#转numpy array
with pm.Model() as BMF_model:
    # Creating the model
    P = pm.Normal('P', mu=mean, sd=mean, shape=(uNum,k))
    Q = pm.Normal('Q', mu=mean, sd=mean, shape=(k,iNum))
    R = tt.dot(P,Q)
    rY = []
    for row in X_input.get_value(): # 获取每行的值
        rr = R[int(row[0])][int(row[1])]#userId=0,movieId=1
        rY.append(rr)
    rY = pm.Deterministic('rY',pm.math.sigmoid(rY))
    Y = pm.Bernoulli('Y', rY, observed=Y_output.get_value())
#3.后验分布计算  
with BMF_model:        
    start=pm.find_MAP()  # 参数初猜
    #二值变量：指定 BinaryMetropolis  离散变量：指定 Metropolis  连续变量：指定 NUTS
    step = pm.Metropolis()
    trace = pm.sample(100,start=start,step=step,chains=2,cores=8)
    
#4.后验预测  
#X_input.set_value(test[['userId','movieId']].values)#转numpy array
#Y_output.set_value(test['rating'].values)
with BMF_model:
    ppc = pm.sample_posterior_predictive(trace)
    pred = ppc['Y'].mean(axis=0)
    print(pred)
    
print ('RMSE：%f'% mean_squared_error(Y_output.get_value(),pred))

logp = -2,280.5, ||grad|| = 0: 100%|██████████| 2/2 [00:00<00:00, 70.19it/s]
Only 100 samples in chain.
Multiprocess sampling (2 chains in 8 jobs)
CompoundStep
>Metropolis: [Q]
>Metropolis: [P]
Sampling 2 chains: 100%|██████████| 1200/1200 [00:03<00:00, 365.28draws/s]
The gelman-rubin statistic is larger than 1.4 for some parameters. The sampler did not converge.
The number of effective samples is smaller than 10% for some parameters.
100%|██████████| 200/200 [00:00<00:00, 2457.03it/s]

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1.]
RMSE：0.000000



