### Memory-Based

Data Loading and Cleaning

In [1]:
import pandas as pd
import numpy as np
# import the ratings data and save as pandas data frame
data = [i.split("::") for i in open("./data/ml-1m/ratings.dat").readlines()]
data = pd.DataFrame(data, columns=["user", "item", "rating", "timestamp"]).drop(columns=['timestamp'])

user_ids, _ = pd.factorize(data['user'])
recipe_ids, _ = pd.factorize(data['item'])
data['user'] = user_ids
data['item'] = recipe_ids

# create the user item matrix by pivoting the table and normalize it
user_item_matrix = data.pivot_table(index='user', columns='item', values='rating', fill_value=2.5)
normalized_ui_matrix = user_item_matrix.subtract(user_item_matrix.mean(axis=1), axis=0)

Similarity Matrix

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
# cosine similarity matrix, user-based and item-based
cosine_user_sim = cosine_similarity(normalized_ui_matrix)
cosine_item_sim = cosine_similarity(normalized_ui_matrix.T)
# pearson similarity matrix, user-based and item-based
pearson_user_sim = normalized_ui_matrix.T.corr(method="pearson")
pearson_user_sim = pearson_user_sim.to_numpy() # convert to numpy array for compability
pearson_item_sim = normalized_ui_matrix.corr(method="pearson")
pearson_item_sim = pearson_item_sim.to_numpy() # convert to numpy array for compability

Prediction

In [77]:
def predict(type, idx, top_n, sim_matrix):
    """
    input: index of target user/item (int), top n number of users to use (int), similarity matrix of choice (array)
    output: return the predicted item row for target user (user-based) or user row for target item (item-based)
    """
    # filter out top n similar users/items
    similarities = sorted(sim_matrix[idx], reverse=True)[1:top_n+1]
    sim_dict = {}
    # loop through all users/items and append their index and similarity value to 'sim_dict'
    for similarity in similarities:
        sim_dict[np.where(sim_matrix[idx] == similarity)[0][0]] = similarity
    # create empty matrix where the prediction vector is going be saved
    if type == 'user':
        result = np.zeros(len(normalized_ui_matrix.columns))
        for user in sim_dict:
            # get scores from user item matrix using index of user
            score = normalized_ui_matrix.loc[user]
            # get weights of user
            weight = sim_dict[user]
            # weighted average for prediction (average is taken at last line)
            result += np.dot(score, weight)
    elif type == "item":
        result = np.zeros(len(normalized_ui_matrix))
        for item in sim_dict:
            # get scores from user item matrix using index of item
            score = normalized_ui_matrix[item]
            # get weights of item
            weight = sim_dict[item]
            # weighted average for prediction (average is taken at last line)
            result += np.dot(score, weight)
    else:
        return "Enter valid memory-based RS type"
    
    return result/sum(sim_dict.values())

In [86]:
def predicted_ui_matrix(type, top_n, sim_matrix):
    pred = []
    if type == 'user':
        for i in range(len(normalized_ui_matrix)):
            pred.append(predict(type, i, top_n, sim_matrix))
        pred = np.array(pred)
        return pred
    elif type == 'item':
        for i in range(len(normalized_ui_matrix.loc[0])):
            pred.append(predict(type, i, top_n, sim_matrix))
        pred = np.array(pred)
        return pred.T
    else:
        return "Enter 'user' for user-based and 'item' for item-based CF RS"

Evaluate

In [5]:
from scipy.stats import spearmanr

def evaluate(type, top_n, sim_matrix):
    pred = predicted_ui_matrix(type, top_n, sim_matrix)
    true = np.array(normalized_ui_matrix.values.tolist())
    # calculate RMSE, MAE, and Spearman correlation
    rmse = np.sqrt(np.mean((pred-true)**2))
    mae = np.mean(abs(pred-true))
    spearman_scores = []
    for i in range(len(true)):
        true_rank = np.argsort(true[i])
        predicted_rank = np.argsort(pred[i])
        spearman_scores.append(spearmanr(true_rank, predicted_rank).correlation)
    average_spearman_score = np.mean(spearman_scores)
    print("RMSE: ", rmse, "\nMAE: ", mae, "\nSpearman: ", average_spearman_score)

In [89]:
print("User-based approach with cosine similarity")
print(evaluate('user', 50, cosine_user_sim), "\n---------")

print("User-based approach with pearson similarity")
print(evaluate('user', 50, pearson_user_sim), "\n---------")

print("Item-based approach with cosine similarity")
print(evaluate('item', 50, cosine_item_sim), "\n---------")

print("Item-based approach with pearson similarity")
print(evaluate('item', 50, pearson_item_sim), "\n---------")

User-based approach with cosine similarity
RMSE:  0.26672632258637224 
MAE:  0.0943938468652809 
Spearman:  0.5938959359206174
None 
---------
User-based approach with pearson similarity
RMSE:  0.2667263225863723 
MAE:  0.09439384686528093 
Spearman:  0.5938959349277827
None 
---------
Item-based approach with cosine similarity
RMSE:  0.28283421661104957 
MAE:  0.09080784956365832 
Spearman:  0.23046860395364357
None 
---------
Item-based approach with pearson similarity
RMSE:  0.2734087079745447 
MAE:  0.08185400557506971 
Spearman:  0.24259847846933547
None 
---------
