In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load data

In [2]:
# Split Movielens 100K data into train and test (80-20)
np.seed = 1
dataset = pd.read_csv("../data/u.data",sep='\t',names="user_id,item_id,rating,timestamp".split(","))
dataset = dataset.iloc[:,:3]
dataset.user_id = dataset.user_id.astype('category').cat.codes.values
dataset.item_id = dataset.item_id.astype('category').cat.codes.values
train, test = train_test_split(dataset, test_size=0.2)

In [3]:
# Check that we have ratings in the train set for all the users in the 
for test_user in test["user_id"].values:
    if test_user not in train["user_id"].values:
        print("User", test_user, "is in the test set but not in the train set")

In [4]:
test

Unnamed: 0,user_id,item_id,rating
63203,652,52,2
31050,502,560,5
20951,464,113,4
55276,436,185,3
11167,66,471,4
...,...,...,...
42865,584,1020,3
33324,12,232,4
64249,781,1587,3
84898,88,24,5


# Compute user-item similarities 

In [5]:
# First create the user-item matrix
unique_users = dataset.user_id.unique()
unique_items = dataset.item_id.unique()
data_matrix = np.zeros((unique_users.shape[0], unique_items.shape[0]))

# Use train data to build the similarity matrix
for train_row in train.itertuples():
    data_matrix[train_row.user_id - 1, train_row.item_id - 1] = train_row.rating

print(data_matrix.shape) # We should have a 943x1682 matrix

(943, 1682)


In [6]:
# Compute user-user similarity
user_user_sims = pairwise_distances(data_matrix, metric="cosine")
print(user_user_sims.shape) # We should have a 943x943 square matrix where the main diagonal has entries = 0

(943, 943)


# Rating prediction

In [1]:
# To perform the rating prediction we must first decide a K number of similar items which we wish to use
# for the calculation. In this demonstration we'll be using 80
def predict_rating(requested_user, requested_item, k=80):
    # Get k most similar users to the requested user
    most_sim_users_ind = np.argpartition(user_user_sims[requested_user-1], -k)[-k:]
    print(most_sim_users_ind)
    
    # Set up the variables for the rating prediction equation
    nominator = 0
    denominator = 0
    predicted_rating = 0
    mean_requested_user_rating = np.mean(data_matrix[requested_user-1])
    
    for sim_user in most_sim_users_ind:
        mean_sim_user_rating = np.mean(data_matrix[sim_user-1])
        
        nominator += user_user_sims[requested_user-1][sim_user]* \
        (data_matrix[sim_user][requested_item-1]-mean_sim_user_rating)
        
        denominator += user_user_sims[requested_user-1][sim_user]
    
    if(denominator != 0):     
        predicted_rating = mean_requested_user_rating + (nominator/denominator)
        
    return predicted_rating

In [8]:
# Run the prediction on all the items in the test set
user_cf_predictions = []
print("Predicting test set entries..")
for test_row in test.itertuples():
    user_cf_predictions.append(predict_rating(test_row.user_id,test_row.item_id))
print("Prediction done")

Predicting test set entries..
Prediction done


In [9]:
print("MAE on test data:",mean_absolute_error(test.rating.values, user_cf_predictions))
print("RMSE on test data:",np.sqrt(mean_squared_error(test.rating.values, user_cf_predictions)))

MAE on test data: 3.2741359720436556
RMSE on test data: 3.474269887044732
