### Memory-Based

Data Loading and Cleaning

In [1]:
import pandas as pd
import numpy as np
# import the ratings data and save as pandas data frame
data = [i.split("::") for i in open("./data/ml-1m/ratings.dat").readlines()]
data = pd.DataFrame(data, columns=["user", "item", "rating", "timestamp"]).drop(columns=['timestamp'])

user_ids, _ = pd.factorize(data['user'])
recipe_ids, _ = pd.factorize(data['item'])
data['user'] = user_ids
data['item'] = recipe_ids

# create the user item matrix by pivoting the table and normalize it
user_item_matrix = data.pivot_table(index='user', columns='item', values='rating', fill_value=2.5)
normalized_ui_matrix = user_item_matrix.subtract(user_item_matrix.mean(axis=1), axis=0)

Similarity Matrix

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
# cosine similarity matrix, user-based and item-based
cosine_user_sim = cosine_similarity(normalized_ui_matrix)
cosine_item_sim = cosine_similarity(normalized_ui_matrix.T)
# pearson similarity matrix, user-based and item-based
pearson_user_sim = normalized_ui_matrix.T.corr(method="pearson")
pearson_user_sim = pearson_user_sim.to_numpy() # convert to numpy array for compability
pearson_item_sim = normalized_ui_matrix.corr(method="pearson")
pearson_item_sim = pearson_item_sim.to_numpy() # convert to numpy array for compability

Prediction

In [77]:
def predict(type, idx, top_n, sim_matrix):
    """
    input: index of target user/item (int), top n number of users to use (int), similarity matrix of choice (array)
    output: return the predicted item row for target user (user-based) or user row for target item (item-based)
    """
    # filter out top n similar users/items
    similarities = sorted(sim_matrix[idx], reverse=True)[1:top_n+1]
    sim_dict = {}
    # loop through all users/items and append their index and similarity value to 'sim_dict'
    for similarity in similarities:
        sim_dict[np.where(sim_matrix[idx] == similarity)[0][0]] = similarity
    # create empty matrix where the prediction vector is going be saved
    if type == 'user':
        result = np.zeros(len(normalized_ui_matrix.columns))
        for user in sim_dict:
            # get scores from user item matrix using index of user
            score = normalized_ui_matrix.loc[user]
            # get weights of user
            weight = sim_dict[user]
            # weighted average for prediction (average is taken at last line)
            result += np.dot(score, weight)
    elif type == "item":
        result = np.zeros(len(normalized_ui_matrix))
        for item in sim_dict:
            # get scores from user item matrix using index of item
            score = normalized_ui_matrix[item]
            # get weights of item
            weight = sim_dict[item]
            # weighted average for prediction (average is taken at last line)
            result += np.dot(score, weight)
    else:
        return "Enter valid memory-based RS type"
    
    return result/sum(sim_dict.values())

In [86]:
def predicted_ui_matrix(type, top_n, sim_matrix):
    pred = []
    if type == 'user':
        for i in range(len(normalized_ui_matrix)):
            pred.append(predict(type, i, top_n, sim_matrix))
        pred = np.array(pred)
        return pred
    elif type == 'item':
        for i in range(len(normalized_ui_matrix.loc[0])):
            pred.append(predict(type, i, top_n, sim_matrix))
        pred = np.array(pred)
        return pred.T
    else:
        return "Enter 'user' for user-based and 'item' for item-based CF RS"

Evaluate

In [5]:
from scipy.stats import spearmanr

def evaluate(type, top_n, sim_matrix):
    pred = predicted_ui_matrix(type, top_n, sim_matrix)
    true = np.array(normalized_ui_matrix.values.tolist())
    # calculate RMSE, MAE, and Spearman correlation
    rmse = np.sqrt(np.mean((pred-true)**2))
    mae = np.mean(abs(pred-true))
    spearman_scores = []
    for i in range(len(true)):
        true_rank = np.argsort(true[i])
        predicted_rank = np.argsort(pred[i])
        spearman_scores.append(spearmanr(true_rank, predicted_rank).correlation)
    average_spearman_score = np.mean(spearman_scores)
    print("RMSE: ", rmse, "\nMAE: ", mae, "\nSpearman: ", average_spearman_score)

In [89]:
print("User-based approach with cosine similarity")
print(evaluate('user', 50, cosine_user_sim), "\n---------")

print("User-based approach with pearson similarity")
print(evaluate('user', 50, pearson_user_sim), "\n---------")

print("Item-based approach with cosine similarity")
print(evaluate('item', 50, cosine_item_sim), "\n---------")

print("Item-based approach with pearson similarity")
print(evaluate('item', 50, pearson_item_sim), "\n---------")

User-based approach with cosine similarity
RMSE:  0.26672632258637224 
MAE:  0.0943938468652809 
Spearman:  0.5938959359206174
None 
---------
User-based approach with pearson similarity
RMSE:  0.2667263225863723 
MAE:  0.09439384686528093 
Spearman:  0.5938959349277827
None 
---------
Item-based approach with cosine similarity
RMSE:  0.28283421661104957 
MAE:  0.09080784956365832 
Spearman:  0.23046860395364357
None 
---------
Item-based approach with pearson similarity
RMSE:  0.2734087079745447 
MAE:  0.08185400557506971 
Spearman:  0.24259847846933547
None 
---------


### Model-Based

In [90]:
from surprise import Reader, Dataset
reader = Reader(rating_scale=(0.5, 5.0))
df = Dataset.load_from_df(data[['user', 'item', 'rating']], reader)

In [91]:
from surprise.model_selection import cross_validate
from surprise import Dataset, Reader, SVD
# pick the SVD algorithm and run it with the data and evaluation metrics
algo = SVD()
cross_validate(algo, df, measures=["RMSE", "MAE"], cv=50, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 50 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Fold 11 Fold 12 Fold 13 Fold 14 Fold 15 Fold 16 Fold 17 Fold 18 Fold 19 Fold 20 Fold 21 Fold 22 Fold 23 Fold 24 Fold 25 Fold 26 Fold 27 Fold 28 Fold 29 Fold 30 Fold 31 Fold 32 Fold 33 Fold 34 Fold 35 Fold 36 Fold 37 Fold 38 Fold 39 Fold 40 Fold 41 Fold 42 Fold 43 Fold 44 Fold 45 Fold 46 Fold 47 Fold 48 Fold 49 Fold 50 Mean    Std     
RMSE (testset)    0.8661  0.8606  0.8575  0.8515  0.8585  0.8562  0.8626  0.8599  0.8547  0.8550  0.8565  0.8626  0.8583  0.8618  0.8564  0.8638  0.8671  0.8616  0.8598  0.8573  0.8655  0.8547  0.8572  0.8624  0.8622  0.8546  0.8635  0.8533  0.8632  0.8650  0.8551  0.8565  0.8602  0.8559  0.8493  0.8648  0.8596  0.8617  0.8678  0.8515  0.8583  0.8612  0.8636  0.8659  0.8599  0.8604  0.8564  0.8648  0.8601  0.8634  0.8597  0.0043  
MAE (testset)     0.6774  0.6733  0.6736  0.6688  0.6725  0.6701  0.6777  0

{'test_rmse': array([0.86609678, 0.8606188 , 0.85754558, 0.8515119 , 0.85846193,
        0.85621684, 0.86258708, 0.85987843, 0.85466953, 0.85504155,
        0.85652473, 0.86262736, 0.85833425, 0.86177467, 0.85643172,
        0.86376743, 0.86707588, 0.86163477, 0.85982011, 0.85728305,
        0.86552566, 0.85466266, 0.85716838, 0.8624132 , 0.86217122,
        0.85462665, 0.86352042, 0.85331442, 0.86315734, 0.86504144,
        0.85514875, 0.85648273, 0.86020175, 0.85592816, 0.84933388,
        0.86477834, 0.85962964, 0.86171372, 0.86781709, 0.85146781,
        0.85828187, 0.86119173, 0.86364406, 0.8659064 , 0.85985259,
        0.8603977 , 0.8564401 , 0.86483063, 0.86010419, 0.86340547]),
 'test_mae': array([0.6773777 , 0.67332215, 0.67360297, 0.66881312, 0.67251459,
        0.67007171, 0.6776946 , 0.67446851, 0.66931034, 0.6704911 ,
        0.67048474, 0.67760917, 0.67300305, 0.67365388, 0.67032957,
        0.67576913, 0.68060997, 0.67493684, 0.67512997, 0.67335841,
        0.6792223 , 0

In [92]:
from surprise import accuracy
from surprise.model_selection import train_test_split

# here we split the data into training and testing and fit the SVD model to the data
trainset, testset = train_test_split(df, test_size=0.3)
algo = SVD()
algo.fit(trainset)
prediction = algo.test(testset)
accuracy.rmse(prediction)
accuracy.mae(prediction)

RMSE: 0.8824
MAE:  0.6938


0.6938167481451086