In [1]:
import sys
sys.path.append("../")
import pandas as pd
import numpy as np
from surprise import KNNWithMeans, KNNBasic
from surprise import Dataset, Reader
from surprise.model_selection import KFold, cross_validate
from tabulate import tabulate
from collections import defaultdict

# Part 1: Using Surprise

## Read and load the data using Surprise

In [2]:
# Read both the datasets from the files using pandas
movielens_df = pd.read_csv("../data/u.data", sep="\t", header=None)
movielens_df.columns = ["userID", "itemID", "rating", "timestamp"]
pda_df = pd.read_csv("../data/train-PDA2018.csv", sep=",")
print("ML100K\n", movielens_df.head())
print("\n\n")
print("PDA2018\n", pda_df.head())

# Sample the data such that every user has rated at least 10 items and every item has been by at least 10 users
print("Shapes before sub-sampling:")
print(movielens_df.shape)
print(pda_df.shape)

# Movielens users all have at least 20 ratings so no need to subsample the user values
ml_subsampled = movielens_df[movielens_df['itemID'].isin(movielens_df['itemID'].value_counts()[movielens_df['itemID'].value_counts()>10].index)]
pda_subsampled = pda_df[pda_df['itemID'].isin(pda_df['itemID'].value_counts()[pda_df['itemID'].value_counts()>10].index)]
pda_subsampled = pda_subsampled[pda_subsampled['userID'].isin(pda_subsampled['userID'].value_counts()[pda_subsampled['userID'].value_counts()>10].index)]
print("\nShapes after sub-sampling:")
print(ml_subsampled.shape)
print(pda_subsampled.shape)

# Create the training datasets using Surprise's reader class
reader = Reader(rating_scale=(1,5)) # We have ratings from 1 to 5 so we create the rating scale

# Load the data from the dataframes
movielens_dataset = Dataset.load_from_df(movielens_df.iloc[:,0:3], reader)
pda_dataset = Dataset.load_from_df(pda_df.iloc[:,0:3], reader)

# Build full trainsets to print out the data loaded above
mls_train = movielens_dataset.build_full_trainset()
pda_train = pda_dataset.build_full_trainset()

# Print out some basic information about the datasets
print("\n Some general information on the training sets we will be using:")
print("1) Number of items in each dataset", " ML100k:", mls_train.n_items, "PDA:", pda_train.n_items)
print("2) Number of users in each dataset", " ML100k:", mls_train.n_users, "PDA:", pda_train.n_users)
print("3) Number of ratings in each dataset", " ML100k:", mls_train.n_ratings, "PDA:", pda_train.n_ratings)
print("4) Mean rating", " ML100k:", mls_train.global_mean, "PDA:", pda_train.global_mean)

ML100K
    userID  itemID  rating  timestamp
0     196     242       3  881250949
1     186     302       3  891717742
2      22     377       1  878887116
3     244      51       2  880606923
4     166     346       1  886397596



PDA2018
    userID  itemID  rating  timeStamp
0       5     648       5  978297876
1       5    1394       5  978298237
2       5    3534       5  978297149
3       5     104       4  978298558
4       5    2735       5  978297919
Shapes before sub-sampling:
(100000, 4)
(470711, 4)

Shapes after sub-sampling:
(97623, 4)
(465154, 4)

 Some general information on the training sets we will be using:
1) Number of items in each dataset  ML100k: 1682 PDA: 1824
2) Number of users in each dataset  ML100k: 943 PDA: 5690
3) Number of ratings in each dataset  ML100k: 100000 PDA: 470711
4) Mean rating  ML100k: 3.52986 PDA: 3.638361967321775


## Surprise experiment parameters and variables


### Define auxilliary functions to get Top-N and then calculate metrics

In [3]:
# Check that we have ratings in the train set for all the users in the test set
def find_items_not_in_trainset(trainset, testset):
    items_not_in_train = []
    for _,itemId, _ in testset:    
        if itemId not in trainset.ir.keys():
            items_not_in_train.append(itemId)
            
    return items_not_in_train

        
def user_seen_items(userId):
    return [train_itemId for train_itemId, rating in mls_train.ur[userId]]

In [4]:
## Get top-n from a dataframe
def get_top_n_df(predictions, n, minimumRating, criterion):
    topN = defaultdict(list)
    
    for index, row in predictions.iterrows():
        if (row[criterion] >= minimumRating):
            topN[int(row.uid)].append((int(row.iid), row[criterion]))

    for userID, ratings in topN.items():
        ratings.sort(key=lambda x: x[1], reverse=True)
        topN[int(userID)] = ratings[:n]

    return topN

In [5]:
# get top-n from the crude surprise predictions object
def get_top_n_dict(predictions, n, minimumRating):

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [6]:
def precision_recall_at_k(predictions, items_not_in_train, k, threshold=3.5):
    top_n_recoms_est = get_top_n_df(predictions, n=k, minimumRating=threshold, criterion="est")
    top_n_recoms_real = get_top_n_df(predictions, n=k, minimumRating=threshold, criterion="r_ui")
    above_threshold = predictions[predictions.r_ui >= threshold]

    precisions = {}
    recalls = {}

    for uid, est_topn in top_n_recoms_est.items():
        # Get items the user has already rated
        already_seen = user_seen_items(uid)
        # Get relevant items for the user
        n_rel_for_user = len(above_threshold[above_threshold.uid == uid])        
        tp = 0
        # Penalize the scores if:
        # - The item we are recommending was never seen in the training set (how could we recommend what we don't know?)
        # - The user has already rated this item: It's not a good recommendation since the user already knows it/has seen it
        for est_itemId, _ in est_topn:
            if(est_itemId in items_not_in_train or est_itemId in already_seen):
                tp += 0
            else:
                for real_itemId, _ in top_n_recoms_real[uid]:
                    if (est_itemId == real_itemId):
                        tp +=1
        
        precisions[uid] = tp/k
        recalls[uid] = tp/n_rel_for_user if n_rel_for_user != 0 else 0

    return precisions, recalls

In [7]:
def ndcg_for_rec(predictions, items_not_in_train, k, threshold=3.5):
    top_n_recoms_est = get_top_n_dict(predictions, n=k, minimumRating=threshold)
    
    user_ratings = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        user_ratings[uid].append((iid,true_r,est))    
        
    ndcgs = defaultdict()
    
    # First compute the idealized DCG, meaning that the ranking is perfect
    idcg = 1 + (sum([1/np.log2(i) for i in range(2, k+1)]))
    
    for user, ratings in user_ratings.items():    
        already_seen = user_seen_items(uid)

        rel_for_user = [iid for iid, true_r, est in ratings if true_r >= threshold]
        
        if len(rel_for_user) == 0:
            ndcgs[user] = 0
            continue # if there no relevant items for this user then proceed to the next one
        
        dcg = []
        for idx, item in enumerate(top_n_recoms_est[user]):
            iid, est = item[0], item[1]
            if(iid in items_not_in_train or iid in already_seen):
                dcg.append(0)
            else:    
                if iid in rel_for_user and est >= threshold:
                    if idx == 0:
                        dcg.append(1)
                    else:
                        dcg.append(1/np.log2(idx+1))

        ndcgs[user] = sum(dcg)/idcg

    return ndcgs

### Run 5-fold cross validation

In [8]:
""" EXPERIMENT VARIABLES"""
# List that will contain the results of all experiments
results_table = []
kf = KFold(n_splits=5) # define number of k splits for cross validation using Surprise KFold

# Algorithms we will be using in this section
algorithms = {
    "UserKNN": KNNBasic(k=80, sim_options={'name': 'cosine', 'user_based': True}),
    "ItemKNN": KNNBasic(k=80, sim_options={'name': 'cosine', 'user_based': False}),
}
# Datasets
datasets = {
    "ML100": movielens_dataset,
    "PDA2018": pda_dataset
}

In [9]:
for dataset in datasets.keys():
    for algorithm in algorithms.keys():
        print("Running 5-fold cross validation with", algorithm, "on", dataset, "dataset ...")
        
        # Run cross validation
        best_prec_5 = 0
        best_prec_10 = 0
        best_rec_5 = 0
        best_rec_10 = 0
        best_ndcg_5 = 0
        best_ndcg_10 = 0
        fold_nr = 0
        
        for trainset, testset in kf.split(movielens_dataset):
            print("Fold  number", fold_nr+1)
            algorithms[algorithm].fit(trainset)
            predictions = algorithms[algorithm].test(testset)            
            predictions_df = pd.DataFrame(predictions)
            predictions_df = predictions_df.iloc[:,:-1]
            items_not_in_trainset = find_items_not_in_trainset(trainset, testset)
            pres_at_5, recalls_at_5 = precision_recall_at_k(predictions_df, items_not_in_trainset, k=5)
            pres_at_10, recalls_at_10 = precision_recall_at_k(predictions_df, items_not_in_trainset, k=10)
            ndcgs_5 = ndcg_for_rec(predictions, items_not_in_trainset, 5)
            ndcgs_10 = ndcg_for_rec(predictions, items_not_in_trainset, 10)
            avg_pre_5 = np.array([pres_at_5[k] for k in pres_at_5.keys()]).mean()
            avg_rec_5 = np.array([recalls_at_5[k] for k in recalls_at_5.keys()]).mean()
            avg_pre_10 = np.array([pres_at_10[k] for k in pres_at_10.keys()]).mean()
            avg_rec_10 = np.array([recalls_at_10[k] for k in recalls_at_10.keys()]).mean()
            avg_ndcg_5 = np.array([ndcgs_5[k] for k in ndcgs_5.keys()]).mean()
            avg_ndcg_10 = np.array([ndcgs_10[k] for k in ndcgs_10.keys()]).mean()
            if (avg_pre_5 > best_prec_5):
                best_prec_5 = avg_pre_5
            if (avg_pre_10 > best_prec_10):
                best_prec_10 = avg_pre_10
            if (avg_rec_5 > best_rec_5):
                best_rec_5 = avg_rec_5
            if (avg_rec_10 > best_rec_10):
                best_rec_10 = avg_rec_10
            if (avg_ndcg_5 > best_ndcg_5):
                best_ndcg_5 = avg_ndcg_5
            if (avg_ndcg_10 > best_ndcg_10):
                best_ndcg_10 = avg_ndcg_5
            fold_nr += 1
        
        
        new_line = [dataset+"-"+algorithm, best_prec_5, best_prec_10, best_rec_5, best_rec_10, best_ndcg_5, best_ndcg_10]
        results_table.append(new_line)
        print()

Running 5-fold cross validation with UserKNN on ML100 dataset ...
Fold  number 1
Computing the cosine similarity matrix...
Done computing similarity matrix.
Fold  number 2
Computing the cosine similarity matrix...
Done computing similarity matrix.
Fold  number 3
Computing the cosine similarity matrix...
Done computing similarity matrix.
Fold  number 4
Computing the cosine similarity matrix...
Done computing similarity matrix.
Fold  number 5
Computing the cosine similarity matrix...
Done computing similarity matrix.

Running 5-fold cross validation with ItemKNN on ML100 dataset ...
Fold  number 1
Computing the cosine similarity matrix...
Done computing similarity matrix.
Fold  number 2
Computing the cosine similarity matrix...
Done computing similarity matrix.
Fold  number 3
Computing the cosine similarity matrix...
Done computing similarity matrix.
Fold  number 4
Computing the cosine similarity matrix...
Done computing similarity matrix.
Fold  number 5
Computing the cosine similarity m

# Results

In [10]:
# Display results of running the algorithms
results_table_headers = ["Recommender", "Pre@5", "Pre@10", "Rec@5", "Rec@10", "NDCG@5", "NDCG@10"]
print(tabulate(results_table, results_table_headers, tablefmt="pipe"))

| Recommender     |    Pre@5 |   Pre@10 |    Rec@5 |   Rec@10 |   NDCG@5 |   NDCG@10 |
|:----------------|---------:|---------:|---------:|---------:|---------:|----------:|
| ML100-UserKNN   | 0.353187 | 0.34989  | 0.316283 | 0.463249 | 0.678307 |  0.640064 |
| ML100-ItemKNN   | 0.341703 | 0.385859 | 0.322389 | 0.515305 | 0.501535 |  0.481618 |
| PDA2018-UserKNN | 0.345692 | 0.346674 | 0.314428 | 0.463595 | 0.659795 |  0.659795 |
| PDA2018-ItemKNN | 0.336127 | 0.379209 | 0.312074 | 0.505894 | 0.501431 |  0.490198 |


In [11]:
# Export the results to a csv file
results_df = pd.DataFrame(results_table, columns=["Recommender", "Pre@5", "Pre@10", "Rec@5", "Rec@10", "NDCG@5", "NDCG@10"])
results_df.to_csv("../data/knn_recommendation_results.csv")