In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

from surprise import KNNWithMeans
from surprise.model_selection import train_test_split
from surprise import Dataset, Reader
from surprise import accuracy

In [2]:
rating_dt = pd.read_csv('D:/Columbia/fall 2020/IEOR 4571/HW2/ml-latest/ratings.csv')

In [3]:
def Item_basedCF(rating):
    ## To perform comparison using the same metric over every model, 
    ##    we implement the second CF method using surprise package
    # This is the Item-based CF model.
    # It finds the similar item to user's record and make recommendations base on that
    # input:  user rating dataset consist of userId, movieId and rating
    # output: the accuracy of the model using RMSE
    
    # read the pandas dataframe as surprise dataframe
    reader = Reader(rating_scale=(0,5))
    data = Dataset.load_from_df(rating,reader)
    
    # set up the kNN model
    my_k = 15
    my_min_k = 5
    my_sim_option = {'name':'pearson',
                     'user_based':False}
    algo = KNNWithMeans(k = my_k, min_k = my_min_k, sim_option = my_sim_option)
    
    # split and train the data
    trainset, testset = train_test_split(data, test_size=0.25)
    predictions = algo.fit(trainset).test(testset)
    acc = accuracy.rmse(predictions)
    return acc

In [4]:
user_ids = rating_dt["userId"].unique().tolist() 
num_all_user = len(user_ids)

# randomly select 20% users from rating dataset 
np.random.seed(123)
rand_userid = np.random.choice(user_ids, size = int(num_all_user * 0.1), replace=False)
sample_df = rating_dt.loc[rating_dt['userId'].isin(rand_userid)]
sample_df = sample_df[['userId','movieId','rating']]

In [14]:
kNN_model = Item_basedCF(sample_df)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9098


In [6]:
# read the pandas dataframe as surprise dataframe
reader = Reader(rating_scale=(0,5))
data = Dataset.load_from_df(sample_df,reader)

# set up the kNN model
my_k = 15
my_min_k = 5
my_sim_option = {'name':'pearson',
                 'user_based':False}
algo = KNNWithMeans(k = my_k, min_k = my_min_k, sim_option = my_sim_option)

# split and train the data
trainset, testset = train_test_split(data, test_size=0.25)
predictions = algo.fit(trainset).test(testset)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [7]:
# compute dcg for item-based CF  -- 0.9361632672396146
np.random.seed(123)
userid_random = np.random.choice(user_ids, 100, replace=False)
rel_list = []
for user in userid_random:
    recommended = []
    rate = []
    for i in predictions:
        if i[0] == user and i[3] >= 4.5:
            rate.append(i[3])
            recommended.append(i[1])
        if len(recommended) == 20:
            break
    indx_sorted = np.flip(np.argsort(rate))
    sorted_recommend = np.array(recommended)[indx_sorted]
    rated_movie = sample_df.loc[sample_df['userId'] == user].movieId.tolist()
    rel = []    
    for index, item in enumerate(recommended):
        if item in rated_movie:
            rel.append(1)
        else:
            rel.append(0)
    rel_list.append(rel)
dcg_sum = 0
for r in rel_list:
    for i, value in enumerate(r):
        if value == 1:
            dcg_sum += 1/np.log(i+2)

In [8]:
dcg_sum/100

0.9361632672396146