# Comparison between PMF and BPMF models

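This Jupyter notebook trains PMF (Probabilistic Matrix Factorization) and BPMF (Bayesian Probabilistic Matrix Factorization) models on the MovieLens 1M dataset and compares their results.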

In [1]:
from __future__ import print_function

import os
import logging
import zipfile
import numpy as np
from six.moves import urllib
from numpy.random import RandomState

from recommend.utils.datasets import load_movielens_1m_ratings
from recommend.pmf import PMF
from recommend.bpmf import BPMF
from recommend.utils.evaluation import RMSE

# Show INFO-level records (the models report per-iteration RMSE through logging).
logging.basicConfig(
    level=logging.INFO,
    format='%(levelname)s: %(message)s',
)

# Fixed-seed generator so the shuffle below is reproducible across runs.
rand_state = RandomState(seed=0)

### Download MovieLens 1M dataset

In [2]:
# Fetch the archive over HTTPS so it is not transferred unencrypted.
ML_1M_URL = "https://files.grouplens.org/datasets/movielens/ml-1m.zip"
# Folder (relative to the working directory) that holds ratings.dat.
ML_1M_FOLDER = "ml-1m"
# NOTE: despite the name, this value is used as the expected size in bytes of
# the extracted ratings.dat file (it is compared against os.stat of that file),
# not the size of the zip archive. The name is kept so existing references work.
ML_1M_ZIP_SIZE = 24594131

# download MovieLens 1M dataset if necessary
def ml_1m_download(folder, file_size, url=None):
    """Return the path to ``ratings.dat``, downloading it first if it is missing.

    The file is looked up at ``<cwd>/<folder>/ratings.dat``. When it does not
    exist, the zip archive at ``url`` is downloaded to ``ml-1m.zip`` in the
    working directory and the ``ml-1m/ratings.dat`` member is written to that
    location. The file size is then compared against ``file_size``.

    Parameters
    ----------
    folder : str
        Directory, relative to the current working directory, that holds (or
        will hold) ``ratings.dat``.
    file_size : int
        Expected size in bytes of ``ratings.dat``.
    url : str, optional
        Location of the zip archive. Defaults to the module-level
        ``ML_1M_URL`` (resolved only when a download is actually needed).

    Returns
    -------
    str
        Absolute path of the verified ``ratings.dat`` file.

    Raises
    ------
    Exception
        If the size of the file on disk differs from ``file_size``.
    """
    file_name = "ratings.dat"
    target_dir = os.path.join(os.getcwd(), folder)
    file_path = os.path.join(target_dir, file_name)

    if not os.path.exists(file_path):
        print("file %s not exists. downloading..." % file_path)
        if url is None:
            url = ML_1M_URL
        zip_name, _ = urllib.request.urlretrieve(url, "ml-1m.zip")

        # Write the member to the requested folder. ZipFile.extract() would
        # always recreate the archive's own "ml-1m/" directory under the cwd
        # and silently ignore ``folder``.
        with zipfile.ZipFile(zip_name, 'r') as zf:
            payload = zf.read('ml-1m/' + file_name)
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)
        with open(file_path, 'wb') as out:
            out.write(payload)

    # check file
    if os.stat(file_path).st_size != file_size:
        raise Exception('verify failed: %s' % file_path)
    print('verify success: %s' % file_path)
    return file_path

# Locate ratings.dat on disk, fetching the MovieLens 1M archive when it is absent.
rating_file = ml_1m_download(
    folder=ML_1M_FOLDER,
    file_size=ML_1M_ZIP_SIZE,
)

verify success: /Users/chyikweiyau/github/jup_notebook/ml-1m/ratings.dat


### Load ratings data, shuffle, and split it into training and validation sets

In [3]:
# load ratings data
ratings = load_movielens_1m_ratings(rating_file)

# The largest id in each column is used as the user / item count. This is taken
# before the shift below, while ids still start from 1.
# ndarray.max() reduces in NumPy instead of iterating the column in Python.
n_user = ratings[:, 0].max()
n_item = ratings[:, 1].max()

# shift user_id & movie_id by 1. let user_id & movie_id start from 0
ratings[:, (0, 1)] -= 1

print("num of user: %d" % n_user)
print("num of item: %d" % n_item)
print("first user id: %d" % ratings[:, 0].min())
print("first min item id: %d" % ratings[:, 1].min())

num of user: 6040
num of item: 3952
first user id: 0
first min item id: 0


In [4]:
# Shuffle the rows in place, then hold out the last 10% for validation.
train_pct = 0.9
rand_state.shuffle(ratings)

train_size = int(train_pct * len(ratings))
train, validation = ratings[:train_size], ratings[train_size:]

print("training size: %d" % len(train))
print("validation size: %d" % len(validation))

training size: 900188
validation size: 100021


### Training PMF and BPMF models

In [5]:
# Shared settings: latent feature count and number of training iterations.
n_feature = 10
eval_iters = 20

print("training PMF model...")
pmf = PMF(
    n_user=n_user,
    n_item=n_item,
    n_feature=n_feature,
    epsilon=25.,
    max_rating=5.,
    min_rating=1.,
    seed=0,
)
pmf.fit(train, n_iters=eval_iters)

print("training BPMF model...")
bpmf = BPMF(
    n_user=n_user,
    n_item=n_item,
    n_feature=n_feature,
    max_rating=5.,
    min_rating=1.,
    seed=0,
)
# Left as the final expression so the cell still displays the value of fit().
bpmf.fit(train, n_iters=eval_iters)

training PMF model...


INFO: iter: 0, train RMSE: 1.107667
INFO: iter: 1, train RMSE: 1.092015
INFO: iter: 2, train RMSE: 1.073484
INFO: iter: 3, train RMSE: 1.058847
INFO: iter: 4, train RMSE: 1.048042
INFO: iter: 5, train RMSE: 1.043992
INFO: iter: 6, train RMSE: 1.034290
INFO: iter: 7, train RMSE: 1.021994
INFO: iter: 8, train RMSE: 1.011770
INFO: iter: 9, train RMSE: 1.002162
INFO: iter: 10, train RMSE: 1.000962
INFO: iter: 11, train RMSE: 0.986822
INFO: iter: 12, train RMSE: 0.970561
INFO: iter: 13, train RMSE: 0.958428
INFO: iter: 14, train RMSE: 0.953011
INFO: iter: 15, train RMSE: 0.945132
INFO: iter: 16, train RMSE: 0.940690
INFO: iter: 17, train RMSE: 0.937240
INFO: iter: 18, train RMSE: 0.931112
INFO: iter: 19, train RMSE: 0.935550


training BPMF model...


INFO: iter: 0, train RMSE: 1.003013
INFO: iter: 1, train RMSE: 0.923810
INFO: iter: 2, train RMSE: 0.883347
INFO: iter: 3, train RMSE: 0.875187
INFO: iter: 4, train RMSE: 0.864099
INFO: iter: 5, train RMSE: 0.854995
INFO: iter: 6, train RMSE: 0.847408
INFO: iter: 7, train RMSE: 0.840626
INFO: iter: 8, train RMSE: 0.836754
INFO: iter: 9, train RMSE: 0.832600
INFO: iter: 10, train RMSE: 0.828685
INFO: iter: 11, train RMSE: 0.825440
INFO: iter: 12, train RMSE: 0.822011
INFO: iter: 13, train RMSE: 0.818412
INFO: iter: 14, train RMSE: 0.815403
INFO: iter: 15, train RMSE: 0.812771
INFO: iter: 16, train RMSE: 0.810059
INFO: iter: 17, train RMSE: 0.807694
INFO: iter: 18, train RMSE: 0.804987
INFO: iter: 19, train RMSE: 0.803278


<recommend.bpmf.BPMF at 0x109a75cd0>

### Training and validation result

In [6]:
# Column layout used here: the first two columns are passed to predict(),
# the third column is compared against the predictions.
train_pairs, train_truth = train[:, :2], train[:, 2]
val_pairs, val_truth = validation[:, :2], validation[:, 2]

# PMF: predictions and RMSE on both splits
pmf_train_preds = pmf.predict(train_pairs)
pmf_train_rmse = RMSE(pmf_train_preds, train_truth)
pmf_val_preds = pmf.predict(val_pairs)
pmf_val_rmse = RMSE(pmf_val_preds, val_truth)

# BPMF: predictions and RMSE on both splits
bpmf_train_preds = bpmf.predict(train_pairs)
bpmf_train_rmse = RMSE(bpmf_train_preds, train_truth)
bpmf_val_preds = bpmf.predict(val_pairs)
bpmf_val_rmse = RMSE(bpmf_val_preds, val_truth)

print("PMF training RMSE %.3f, validation RMSE %.3f"
      % (pmf_train_rmse, pmf_val_rmse))
print("BPMF training RMSE %.3f, validation RMSE %.3f"
      % (bpmf_train_rmse, bpmf_val_rmse))

PMF training RMSE 0.936, validation RMSE 0.956
BPMF training RMSE 0.803, validation RMSE 0.867


### Predict scores for user 0 and the first 10 items

In [7]:
user_id = 0
# Number of leading item ids to score; used for both the query and the printout.
n_pred_items = 10

# One (user_id, item_id) row per item. range() is used instead of xrange(),
# which does not exist on Python 3.
pred_items = np.array([[user_id, i] for i in range(n_pred_items)])

# Score the same pairs with both models.
pmf_pred = pmf.predict(pred_items)
bpmf_pred = bpmf.predict(pred_items)

print("PMF/BPMF prediction for user %d" % user_id)
for i in range(n_pred_items):
    print("item %d: pmf: %.3f, bpmf: %.3f" % (i, pmf_pred[i], bpmf_pred[i]))

PMF/BPMF prediction for user 0
item 0: pmf: 3.940, bpmf: 4.246
item 1: pmf: 3.462, bpmf: 3.666
item 2: pmf: 3.366, bpmf: 3.642
item 3: pmf: 3.263, bpmf: 3.549
item 4: pmf: 3.329, bpmf: 3.860
item 5: pmf: 3.957, bpmf: 4.401
item 6: pmf: 3.740, bpmf: 3.692
item 7: pmf: 3.444, bpmf: 4.379
item 8: pmf: 3.169, bpmf: 3.734
item 9: pmf: 3.666, bpmf: 3.999
