In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import precision_score, recall_score

# Load the data

In [3]:
# Split Movielens 100K data into train and test (80-20)
# Seed NumPy's global RNG and pin the split's random_state so that a
# "Restart & Run All" reproduces the same train/test partition.
# (Assigning `np.seed = 1` only sets an attribute on the module; it seeds nothing.)
SEED = 1
np.random.seed(SEED)

# The file is read as tab-separated with no header row; only the first three
# columns (user_id, item_id, rating) are kept, the timestamp is dropped.
dataset = pd.read_csv("../data/u.data", sep='\t', names="user_id,item_id,rating,timestamp".split(","))
dataset = dataset.iloc[:, :3].copy()  # explicit copy: the columns are overwritten below

# Re-encode both id columns as category codes, i.e. contiguous 0-based integers
# that can be used directly as row/column indices of a matrix.
dataset["user_id"] = dataset["user_id"].astype('category').cat.codes.values
dataset["item_id"] = dataset["item_id"].astype('category').cat.codes.values

train, test = train_test_split(dataset, test_size=0.2, random_state=SEED)

In [4]:
# Check that we have ratings in the train set for all the users in the test set.
# Each distinct test user is checked once against a set of train users, so a
# missing user is reported a single time instead of once per test rating, and
# the lookup no longer rescans the whole train column for every test row.
train_users = set(train["user_id"].unique())
for test_user in test["user_id"].unique():
    if test_user not in train_users:
        print("User", test_user, "is in the test set but not in the train set")

# MostPop Recommender

In [19]:
# Popularity table: number of train ratings per item id.
# value_counts() orders the result from most-rated to least-rated item.
item_counts = train["item_id"].value_counts()

# define mostpop algorithm
def mostpop(data, k=5, counts=None):
    """Recommend the same k most popular items to every user found in ``data``.

    Parameters
    ----------
    data : DataFrame with a ``user_id`` column; one recommendation list is
        produced per distinct user id.
    k : number of items to recommend (default 5).
    counts : optional Series indexed by item id and ordered from most to least
        popular. Defaults to the notebook-level ``item_counts``.

    Returns
    -------
    dict mapping each user id to its own list of the first ``k`` item ids of
    ``counts``.
    """
    if counts is None:
        counts = item_counts  # popularity table built from the train set
    mostpop_items = counts.index[:k].values.tolist()
    # Give every user an independent copy: sharing one list object would let an
    # in-place edit of one user's recommendations silently change everyone's.
    return {uid: list(mostpop_items) for uid in data.user_id.unique()}

In [11]:
# Build the rating matrices that the similarity computation works on

# First create the user-item matrix (rows = users, columns = items, 0 = no rating).
# Its size comes from the full dataset so every user/item id has a slot.
unique_users = dataset.user_id.unique()
unique_items = dataset.item_id.unique()
ui_matrix = np.zeros((unique_users.shape[0], unique_items.shape[0]))

# Use train data only to fill the matrix.
# user_id / item_id are already 0-based category codes, so they are used as
# indices directly. Subtracting 1 would send id 0 to index -1 (the last
# row/column) and shift every other user and item by one position.
ui_matrix[train.user_id.values, train.item_id.values] = train.rating.values

print(ui_matrix.shape) # We should have a 943x1682 matrix

# Item-User matrix is the transpose of the User-Item matrix
iu_matrix = ui_matrix.T
print(iu_matrix.shape) # We should have a 1682x943 matrix

(943, 1682)
(1682, 943)


# Evaluation Functions

In [7]:
# Ground-truth rankings: for every user in the test set, the ids of the items
# that user rated above 3.5, ordered from highest to lowest rating.
test_gt_ranks = {}
for uid in test.user_id.unique():
    liked_mask = (test.user_id == uid) & (test.rating > 3.5)
    ranked_likes = test[liked_mask].sort_values(by=["rating"], ascending=False)
    test_gt_ranks[uid] = ranked_likes.item_id.values.tolist()

In [59]:
# Find the test-set items that never appear in the train set
def find_items_not_in_trainset(trainset, testset):
    """Return the item ids of ``testset`` that are absent from ``trainset``.

    Both arguments are DataFrames with an ``item_id`` column. The result keeps
    the order of ``testset`` and contains one entry per offending test row, so
    an unseen item rated several times in the test set appears several times.
    """
    # Set membership is O(1); testing against the raw array would rescan the
    # whole train column for every test row.
    train_items = set(trainset.item_id.values)
    return [itemId for itemId in testset.item_id.values if itemId not in train_items]

# This method just gets all the items the user has rated in the trainset
def user_seen_items(userId, trainset=None):
    """Return the list of item ids that ``userId`` has rated.

    Parameters
    ----------
    userId : user id to look up in the ``user_id`` column.
    trainset : optional DataFrame with ``user_id`` and ``item_id`` columns.
        Defaults to the notebook-level ``train`` frame.

    Returns
    -------
    list of item ids (empty when the user has no rows).
    """
    if trainset is None:
        trainset = train
    # Take the column *values*: `.index` would return the DataFrame row labels
    # of the matching ratings, which are not item ids.
    return trainset.loc[trainset.user_id == userId, "item_id"].values.tolist()

In [None]:
def precision_recall_at_k(predictions, items_not_in_train, k, threshold=3.5):
    """Compute per-user precision and recall for top-k recommendations.

    Two top-k lists are requested from ``GetTopN`` for every user: one ranked by
    the estimated rating (``criterion="est"``) and one ranked by the real rating
    (``criterion="r_ui"``). A recommended item counts as a hit when it also
    appears in the user's real top-k list, unless it is penalized (see below).

    Parameters
    ----------
    predictions : table with at least ``uid`` and ``r_ui`` columns; it is also
        passed unchanged to ``GetTopN``.
    items_not_in_train : collection of item ids that never occur in the train set.
    k : size of the recommendation lists; also the precision denominator.
    threshold : minimum rating forwarded to ``GetTopN`` and used to count the
        relevant items of a user (``r_ui >= threshold``).

    Returns
    -------
    (precisions, recalls) : two dicts keyed by user id.
    """
    estimated_topk = GetTopN(predictions, n=k, minimumRating=threshold, criterion="est")
    real_topk = GetTopN(predictions, n=k, minimumRating=threshold, criterion="r_ui")
    relevant_rows = predictions[predictions.r_ui >= threshold]

    precisions, recalls = {}, {}

    for uid, user_estimates in estimated_topk.items():
        # Items the user has already rated
        seen_by_user = user_seen_items(uid)
        # Number of relevant items for this user
        relevant_count = len(relevant_rows[relevant_rows.uid == uid])

        hits = 0
        for est_itemId, _ in user_estimates:
            # A recommendation earns nothing when:
            # - the item was never seen in the training set, or
            # - the user has already rated it.
            if est_itemId in items_not_in_train or est_itemId in seen_by_user:
                continue
            hits += sum(1 for real_itemId, _ in real_topk[uid] if est_itemId == real_itemId)

        precisions[uid] = hits/k
        recalls[uid] = hits/relevant_count if relevant_count != 0 else 0

    return precisions, recalls

In [80]:
#def calc_precision_recall(predicted, k)
#    total_precision = 0
#    total_recall
#
#    for uid in test.user_id:
#        gt_rank = test_gt_ranks[uid][:k]
#        predicted = mostpop(uid, k)
#        if (len(predicted) > len(gt_rank)):
#            predicted = predicted[:len(gt_rank)]
#
#        total_precisions += precision_score(gt_rank, predicted, average='macro')
#        total_recall += recall_score(gt_rank, predicted, average='macro')
#    
#    precisions = total_precisions/len(train)
#    recall = total_recall/len(train)
#
#    return precisions, recall
#    

  _warn_prf(average, modifier, msg_start, len(result))


In [79]:
#prec_5, rec_5 = calc_precision_recall
#print("Preci")

0.008211845238095088

In [100]:
# NOTE(review): `total_precisions` is not assigned in any cell shown here (it only
# appears inside commented-out code), so this cell presumably fails on a fresh
# kernel — confirm and either restore its definition or drop the cell.
# NOTE(review): 940 is an unexplained constant, presumably a user count — confirm.
total_precisions/940

0.014964539007092197

In [35]:
# Evaluate the MostPop recommender: per-user precision@k and recall@k
k = 5
precisions = {}
recalls = {}
predicted_topn = mostpop(test, k)

for uid, predicted_topn_rank in predicted_topn.items():
    gt_rank = test_gt_ranks[uid]

    ## Precision for this user's top-n:
    ## recommended items that are among the user's k best ground-truth items.
    ## The cut-off follows k rather than a literal 5, so changing k above keeps
    ## the recommendation list and the ground-truth window the same size.
    top_k_truth = set(gt_rank[:k])
    tp_prec = sum(1 for recom_item in predicted_topn_rank if recom_item in top_k_truth)
    precisions[uid] = tp_prec/k

    ## Recall for this user's top-n:
    ## recommended items found anywhere in the user's ground-truth list,
    ## divided by the size of that list (0 when the user has no relevant items).
    all_truth = set(gt_rank)
    tp_rec = sum(1 for recom_item in predicted_topn_rank if recom_item in all_truth)
    recalls[uid] = tp_rec/len(gt_rank) if len(gt_rank) != 0 else 0
    
    #if (len(predicted) > len(gt_rank)):
    #    predicted = predicted[:len(gt_rank)]
    #total_precisions += precision_score(gt_rank, predicted, average='weighted')
    #total_recalls += recall_score(gt_rank, predicted, average='weighted')

In [37]:
# Average the per-user scores over all evaluated users
avg_pre = np.mean(list(precisions.values()))
avg_rec = np.mean(list(recalls.values()))
print(avg_pre, avg_rec)

0.043266171792152704 0.049491027611216674
