In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load data

In [2]:
# Split Movielens 100K data into train and test (80-20)
np.seed = 1
dataset = pd.read_csv("../data/u.data",sep='\t',names="user_id,item_id,rating,timestamp".split(","))
dataset = dataset.iloc[:,:3]
dataset.user_id = dataset.user_id.astype('category').cat.codes.values
dataset.item_id = dataset.item_id.astype('category').cat.codes.values
train, test = train_test_split(dataset, test_size=0.2)

In [3]:
# Check that we have ratings in the train set for all the users in the 
for test_user in test["user_id"].values:
    if test_user not in train["user_id"].values:
        print("User", test_user, "is in the test set but not in the train set")

In [4]:
test

Unnamed: 0,user_id,item_id,rating
55150,601,456,3
21127,235,503,3
87166,565,384,3
39445,270,61,2
78864,828,732,2
...,...,...,...
97859,937,272,5
78548,880,553,1
96977,715,194,1
67726,667,288,2


# Compute item-item similarities 

In [5]:
# First create the item-user matrix
unique_users = dataset.user_id.unique()
unique_items = dataset.item_id.unique()
data_matrix = np.zeros((unique_items.shape[0], unique_users.shape[0]))

# Use train data to build the similarity matrix
for train_row in train.itertuples():
    data_matrix[train_row.item_id - 1, train_row.user_id - 1] = train_row.rating

print(data_matrix.shape) # We should have a 1682x943 matrix

(1682, 943)


In [6]:
# Compute item-item similarity
item_item_sims = pairwise_distances(data_matrix, metric="cosine")
print(item_item_sims.shape) # We should have a 1682x1682 square matrix where the main diagonal has entries = 0

(1682, 1682)


# Rating prediction

In [7]:
# To perform the rating prediction we must first decide a K number of similar items which we wish to use
# for the calculation. In this demonstration we'll be using 20, just like the implementation in Surprise
def predict_rating(requested_user, requested_item, k=20):
    # Get k most similar items to the requested item
    most_sim_items_ind = np.argpartition(item_item_sims[requested_item-1], -k)[-k:]
    
    # Set up the variables for the rating prediction equation
    nominator = 0
    denominator = 0
    predicted_rating = 0
    
    for sim_item in most_sim_items_ind:
        nominator += item_item_sims[requested_item-1][sim_item]*data_matrix[sim_item][requested_user-1]
        denominator += np.abs(item_item_sims[requested_item-1][sim_item])
    
    if(denominator != 0):     
        predicted_rating = nominator/denominator
        
    return predicted_rating

In [8]:
# E.g Predict the rating of user 238 on item 80
predict_rating(238, 80)

0.0

In [9]:
# Run the prediction on all the items in the test set
item_cf_predictions = []
print("Predicting test set entries..")
for test_row in test.itertuples():
    item_cf_predictions.append(predict_rating(test_row.user_id,test_row.item_id))
print("Prediction done")

Predicting test set entries..
Prediction done


In [10]:
print("MAE on test data:",mean_absolute_error(test.rating.values, item_cf_predictions))
print("RMSE on test data:",np.sqrt(mean_squared_error(test.rating.values, item_cf_predictions)))

MAE on test data: 3.50661
RMSE on test data: 3.6883605775466153
