In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

from surprise import BaselineOnly
from surprise.model_selection import train_test_split
from surprise import Dataset, Reader
from surprise import accuracy

In [2]:
rating_dt = pd.read_csv('../data/ratings.csv', 
                        #parse_dates=['timestamp'], 
                        #date_parser=dateparse
                       )

In [3]:
def baseline(rating):
    # This is the baseline bias recommendation model.
    # For every user, the model will calculate the rating of every movie based on user's average rating and movie's average rating.    
    # The function uses surprise package, as our original basline model is extremely time-consuming .
    # input:  user rating dataset consist of userId, movieId and rating
    # output: the accuracy of the model using RMSE
 
    """   ORIGINAL MODEL
    # retrieve all users with their avg rating
    user_avg_rating = rating.groupby(['userId'])['rating'].mean()
    
    # retrieve all movies with their avg rating
    movie_avg_rating = rating.groupby(['movieId'])['rating'].mean()
    
    df = pd.DataFrame(user_avg_rating)
    
    df_T = df.T
    x = pd.concat([df_T]*len(movie_avg_rating))
    x['mId'] = movie_avg_rating.index
    df = x.set_index('mId').T + list(movie_avg_rating)
    
    return df"""

    # read the pandas dataframe as surprise dataframe
    reader = Reader(rating_scale=(0,5))
    data = Dataset.load_from_df(rating,reader)
    
    # set up the baseline model. 
    # since it is a baseline, we set regulation parameters to be 0
    bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 0,
               'reg_i': 0
               }
    algo = BaselineOnly(bsl_options=bsl_options)
    
    # split and train the data
    trainset, testset = train_test_split(data, test_size=0.25)
    predictions = algo.fit(trainset).test(testset)
    acc = accuracy.rmse(predictions)
    
    return acc

In [4]:
user_ids = rating_dt["userId"].unique().tolist() 
num_all_user = len(user_ids)
# randomly select 10% users from rating dataset 
np.random.seed(123)
rand_userid = np.random.choice(user_ids, size = int(num_all_user * 0.1), replace=False)
sample_df = rating_dt.loc[rating_dt['userId'].isin(rand_userid)]
sample_df = sample_df[['userId','movieId','rating']]

In [5]:
b_model = baseline(sample_df)

Estimating biases using als...
RMSE: 0.8750


In [6]:
reader = Reader(rating_scale=(0,5))
data = Dataset.load_from_df(sample_df,reader)

# set up the baseline model. 
# since it is a baseline, we set regulation parameters to be 0
bsl_options = {'method': 'als',
           'n_epochs': 5,
           'reg_u': 0,
           'reg_i': 0
           }
algo = BaselineOnly(bsl_options=bsl_options)

# split and train the data
trainset, testset = train_test_split(data, test_size=0.25)
predictions = algo.fit(trainset).test(testset)

Estimating biases using als...


In [49]:
# get dcg for baseline bias model -- 0.8759653479087667 
np.random.seed(123)
userid_random = np.random.choice(user_ids, 100, replace=False)
rel_list = []
for user in userid_random:
    recommended = []
    rate = []
    for i in predictions:
        if i[0] == user and i[3] >= 4.5:
            rate.append(i[3])
            recommended.append(i[1])
        if len(recommended) == 20:
            break
    indx_sorted = np.flip(np.argsort(rate))
    sorted_recommend = np.array(recommended)[indx_sorted]
    rated_movie = sample_df.loc[sample_df['userId'] == user].movieId.tolist()
    rel = []    
    for index, item in enumerate(recommended):
        if item in rated_movie:
            rel.append(1)
        else:
            rel.append(0)
    rel_list.append(rel)
dcg_sum = 0
for r in rel_list:
    for i, value in enumerate(r):
        if value == 1:
            dcg_sum += 1/np.log(i+2)


In [50]:
dcg_sum/100

0.8759653479087668