In [47]:
import pandas as pd
#REMOVE HEADER in data file from case study
cols = ["userId","movieId","rating","timestamp"]
df = pd.read_csv('data/movies/ratings.csv', header=None, names=cols)
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [41]:
from __future__ import print_function

import os
import logging
import zipfile
from six.moves import urllib
from numpy.random import RandomState
from recommend.bpmf import BPMF
from recommend.utils.evaluation import RMSE
from recommend.utils.datasets import load_movielens_1m_ratings
import numpy as np

logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

rand_state = RandomState(0)

# this program operates off np int array, so we have to round ratings
def load_movie_ratings(ratings_file, separator=','):
    with open(ratings_file) as f:
        ratings = []
        for line in f:
            line = line.split(separator)[:3]
            rate = int(round(float(line[2])+.5))
            if rate > 5 :
                rate = 5
            line = [int(line[0]),int(line[1]),rate] 
            ratings.append(line)
        ratings = np.array(ratings)
    return ratings


ratings = load_movie_ratings('/Users/d4/Dropbox/dsi/07_week/03_recommender/recommend/data/movies/ratings.csv')
n_user = max(ratings[:, 0])
n_item = max(ratings[:, 1])

ratings

array([[   1,   31,    3],
       [   1, 1029,    4],
       [   1, 1061,    4],
       ..., 
       [ 671, 6365,    4],
       [ 671, 6385,    3],
       [ 671, 6565,    4]])

In [42]:


# shift user_id & movie_id by 1. let user_id & movie_id start from 0
ratings[:, (0, 1)] -= 1

# split data to training & testing
train_pct = 0.9

rand_state.shuffle(ratings)
train_size = int(train_pct * ratings.shape[0])
train = ratings[:train_size]
validation = ratings[train_size:]

# models settings
n_feature = 10
eval_iters = 20
print("n_user: %d, n_item: %d, n_feature: %d, training size: %d, validation size: %d" % (
    n_user, n_item, n_feature, train.shape[0], validation.shape[0]))
bpmf = BPMF(n_user=n_user, n_item=n_item, n_feature=n_feature,
            max_rating=5., min_rating=1., seed=0)

bpmf.fit(train, n_iters=eval_iters)
train_preds = bpmf.predict(train[:, :2])
train_rmse = RMSE(train_preds, train[:, 2])
val_preds = bpmf.predict(validation[:, :2])
val_rmse = RMSE(val_preds, validation[:, 2])
print("after %d iteration, train RMSE: %.6f, validation RMSE: %.6f" %
      (eval_iters, train_rmse, val_rmse))


n_user: 671, n_item: 163949, n_feature: 10, training size: 90003, validation size: 10001


INFO: iter: 0, train RMSE: 0.873394
INFO: iter: 1, train RMSE: 0.852109
INFO: iter: 2, train RMSE: 0.845503
INFO: iter: 3, train RMSE: 0.841067
INFO: iter: 4, train RMSE: 0.834059
INFO: iter: 5, train RMSE: 0.818338
INFO: iter: 6, train RMSE: 0.796508
INFO: iter: 7, train RMSE: 0.779712
INFO: iter: 8, train RMSE: 0.769551
INFO: iter: 9, train RMSE: 0.762740
INFO: iter: 10, train RMSE: 0.757583
INFO: iter: 11, train RMSE: 0.755682
INFO: iter: 12, train RMSE: 0.753229
INFO: iter: 13, train RMSE: 0.751368
INFO: iter: 14, train RMSE: 0.749592
INFO: iter: 15, train RMSE: 0.748638
INFO: iter: 16, train RMSE: 0.748394
INFO: iter: 17, train RMSE: 0.746299
INFO: iter: 18, train RMSE: 0.745996
INFO: iter: 19, train RMSE: 0.743808


after 20 iteration, train RMSE: 0.743808, validation RMSE: 0.807491
