In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader
from surprise.model_selection import KFold, cross_validate, train_test_split
from tabulate import tabulate


# Create a KNN Ranking algorithm version

In [15]:
from surprise import AlgoBase
from surprise import Dataset
from six import iteritems
import heapq
from surprise.prediction_algorithms.predictions import PredictionImpossible

class SymmetricAlgo(AlgoBase):
    """
    This is an abstract class aimed to ease the use of symmetric algorithms.
    A symmetric algorithm is an algorithm that can can be based on users or on
    items indifferently. When the algo is user-based x denotes a user and y an item. 
    Else, it's reversed.
    """
    def __init__(self, sim_options={}, verbose=True, **kwargs):

        AlgoBase.__init__(self, sim_options=sim_options, **kwargs)
        self.verbose = verbose

    def fit(self, trainset):

        AlgoBase.fit(self, trainset)

        ub = self.sim_options['user_based']
        self.n_x = self.trainset.n_users if ub else self.trainset.n_items
        self.n_y = self.trainset.n_items if ub else self.trainset.n_users
        self.xr = self.trainset.ur if ub else self.trainset.ir
        self.yr = self.trainset.ir if ub else self.trainset.ur

        return self

    def switch(self, u_stuff, i_stuff):
        """
        Return x_stuff and y_stuff depending on the user_based field.
        """

        if self.sim_options['user_based']:
            return u_stuff, i_stuff
        else:
            return i_stuff, u_stuff


class RankingKNNBasic(SymmetricAlgo):
    def __init__(self, k=40, min_k=1, sim_options={}, verbose=True, **kwargs):

        SymmetricAlgo.__init__(self, sim_options=sim_options, verbose=verbose,
                               **kwargs)
        self.k = k
        self.min_k = min_k

    def fit(self, trainset):

        SymmetricAlgo.fit(self, trainset)
        self.sim = self.compute_similarities()

        return self

    def estimate(self, u, i):

        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')

        x, y = self.switch(u, i)

        neighbors = [(self.sim[x, x2], r) for (x2, r) in self.yr[y]]
        k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[0])
        
        # compute the sum of similarities
        sum_sim = actual_k = 0
        ratings_from_sim = []
        for (sim, r) in k_neighbors:
            sum_sim += sim
            actual_k += 1
            ratings_from_sim.append(r)

        if actual_k < self.min_k:
            raise PredictionImpossible('Not enough neighbors.')

        est = sum_sim
    
        details = {'actual_k': actual_k}
        return est, details

# Define a way to get precision and recall

In [3]:
from collections import defaultdict

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Return precision and recall at k metrics for each user.'''

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls

# Read and load data

In [4]:
# Read both the datasets from the files using pandas
movielens_df = pd.read_csv("../data/u.data", sep="\t", header=None)
movielens_df.columns = ["userID", "itemID", "rating", "timestamp"]
pda_df = pd.read_csv("../data/train-PDA2018.csv", sep=",")
print(movielens_df.head())
print("\n\n")
print(pda_df.head())

   userID  itemID  rating  timestamp
0     196     242       3  881250949
1     186     302       3  891717742
2      22     377       1  878887116
3     244      51       2  880606923
4     166     346       1  886397596



   userID  itemID  rating  timeStamp
0       5     648       5  978297876
1       5    1394       5  978298237
2       5    3534       5  978297149
3       5     104       4  978298558
4       5    2735       5  978297919


In [5]:
# Create the training datasets using Surprise's reader class
reader = Reader(rating_scale=(1,5)) # We have ratings from 1 to 5 so we create the rating scale

# Load the data from the dataframes
movielens_dataset = Dataset.load_from_df(movielens_df.iloc[:,0:3], reader)
pda_dataset = Dataset.load_from_df(pda_df.iloc[:,0:3], reader)

# Build full trainsets to print out the data loaded above
mls_train = movielens_dataset.build_full_trainset()
pda_train = pda_dataset.build_full_trainset()

# Print out some basic information about the datasets
print("General information on the training sets we will be using \n")
print("1) Number of items in each dataset", " ML100k:", mls_train.n_items, "PDA:", pda_train.n_items)
print("2) Number of users in each dataset", " ML100k:", mls_train.n_users, "PDA:", pda_train.n_users)
print("3) Number of ratings in each dataset", " ML100k:", mls_train.n_ratings, "PDA:", pda_train.n_ratings)
print("4) Mean rating", " ML100k:", mls_train.global_mean, "PDA:", pda_train.global_mean)

General information on the training sets we will be using 

1) Number of items in each dataset  ML100k: 1682 PDA: 1824
2) Number of users in each dataset  ML100k: 943 PDA: 5690
3) Number of ratings in each dataset  ML100k: 100000 PDA: 470711
4) Mean rating  ML100k: 3.52986 PDA: 3.638361967321775


In [11]:
kf = KFold(5)
algo = RankingKNNBasic(k=80, sim_options={'name': 'cosine', 'user_based': True})
mls_train, mls_test = train_test_split(data=movielens_dataset, test_size=0.2)

algo.fit(mls_train)
predictions = algo.test(mls_test)

Computing the cosine similarity matrix...
Done computing similarity matrix.




In [12]:
predictions

[Prediction(uid=144, iid=327, r_ui=3.0, est=5, details={'actual_k': 80, 'was_impossible': False}),
 Prediction(uid=181, iid=1049, r_ui=1.0, est=5, details={'actual_k': 19, 'was_impossible': False}),
 Prediction(uid=747, iid=494, r_ui=5.0, est=5, details={'actual_k': 39, 'was_impossible': False}),
 Prediction(uid=145, iid=298, r_ui=1.0, est=5, details={'actual_k': 80, 'was_impossible': False}),
 Prediction(uid=363, iid=559, r_ui=3.0, est=5, details={'actual_k': 80, 'was_impossible': False}),
 Prediction(uid=119, iid=1265, r_ui=3.0, est=5, details={'actual_k': 16, 'was_impossible': False}),
 Prediction(uid=406, iid=469, r_ui=4.0, est=5, details={'actual_k': 54, 'was_impossible': False}),
 Prediction(uid=894, iid=289, r_ui=2.0, est=5, details={'actual_k': 80, 'was_impossible': False}),
 Prediction(uid=255, iid=121, r_ui=2.0, est=5, details={'actual_k': 80, 'was_impossible': False}),
 Prediction(uid=535, iid=179, r_ui=4.0, est=5, details={'actual_k': 80, 'was_impossible': False}),
 Predict

In [13]:
pres_5, recs_5 = precision_recall_at_k(predictions, k=5)

In [14]:
pre_5 = np.array([pres_5[k] for k in pres_5.keys()]).mean()
rec_5 = np.array([recs_5[k] for k in recs_5.keys()]).mean()
print(pre_5, rec_5)

0.5737539766702014 0.47020495974858195
