In [1]:
import pandas as pd
import numpy as np
from surprise import BaselineOnly, NormalPredictor
from surprise import Dataset, Reader
from surprise.model_selection import GridSearchCV, KFold, cross_validate, train_test_split
from tabulate import tabulate

# Create Surprise algorithms for the User, Item and Global Mean predictors

In [2]:
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from surprise.prediction_algorithms.algo_base import AlgoBase

# Global mean baseline
class GlobalMeanBaseline(AlgoBase):
    
    def __init__(self, verbose=True):
        AlgoBase.__init__(self)
        self.verbose = verbose

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        return self

    def estimate(self, u, i):
        est = self.trainset.global_mean
        return est
    
# User mean baseline
class UserMeanBaseline(AlgoBase):

    def __init__(self, verbose=True):
        AlgoBase.__init__(self)
        self.verbose = verbose
       
    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        # Calculate all the mean rating of each user 
        user_ratings = mls_train.ur
        mean_user_ratings = {}
        for user_id in user_ratings.keys():
            current_user_total_rating = 0
            for item_rating_tuple in user_ratings[user_id]:
                current_user_total_rating += item_rating_tuple[1]

            mean_user_rating = current_user_total_rating/len(user_ratings[user_id])
            mean_user_ratings[user_id] = mean_user_rating

        self.mean_user_ratings = mean_user_ratings
        return self

    def estimate(self, u, i):
        est = self.mean_user_ratings.get(u, self.trainset.global_mean)
        return est
    
# Item mean baseline
class ItemMeanBaseline(AlgoBase):

    def __init__(self, verbose=True):
        AlgoBase.__init__(self)
        self.verbose = verbose

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        item_ratings = mls_train.ir
        mean_item_ratings = {}
        for item_id in item_ratings.keys():
            current_item_total_rating = 0
            for user_rating_tuple in item_ratings[item_id]:
                current_item_total_rating += user_rating_tuple[1]

            mean_item_rating = current_item_total_rating/len(item_ratings[item_id])
            mean_item_ratings[item_id] = mean_item_rating

        self.mean_item_ratings = mean_item_ratings
        return self

    def estimate(self, u, i):
        # Cases for Unknown Items (i starts with UKN)
        if(isinstance(i, str)):
            est = 0
        else:
            est = self.mean_item_ratings.get(u, self.trainset.global_mean)

        return est

# Read and load data

In [3]:
# Read both the datasets from the files using pandas
movielens_df = pd.read_csv("../data/u.data", sep="\t", header=None)
movielens_df.columns = ["userID", "itemID", "rating", "timestamp"]
pda_df = pd.read_csv("../data/train-PDA2018.csv", sep=",")
print(movielens_df.head())
print("\n\n")
print(pda_df.head())

   userID  itemID  rating  timestamp
0     196     242       3  881250949
1     186     302       3  891717742
2      22     377       1  878887116
3     244      51       2  880606923
4     166     346       1  886397596



   userID  itemID  rating  timeStamp
0       5     648       5  978297876
1       5    1394       5  978298237
2       5    3534       5  978297149
3       5     104       4  978298558
4       5    2735       5  978297919


In [4]:
# Create the training datasets using Surprise's reader class
reader = Reader(rating_scale=(1,5)) # We have ratings from 1 to 5 so we create the rating scale

# Load the data from the dataframes
movielens_dataset = Dataset.load_from_df(movielens_df.iloc[:,0:3], reader)
pda_dataset = Dataset.load_from_df(pda_df.iloc[:,0:3], reader)

# Build full trainsets to print out the data loaded above
mls_train = movielens_dataset.build_full_trainset()
pda_train = pda_dataset.build_full_trainset()

# Print out some basic information about the datasets
print("General information on the training sets we will be using \n")
print("1) Number of items in each dataset", " ML100k:", mls_train.n_items, "PDA:", pda_train.n_items)
print("2) Number of users in each dataset", " ML100k:", mls_train.n_users, "PDA:", pda_train.n_users)
print("3) Number of ratings in each dataset", " ML100k:", mls_train.n_ratings, "PDA:", pda_train.n_ratings)
print("4) Mean rating", " ML100k:", mls_train.global_mean, "PDA:", pda_train.global_mean)

General information on the training sets we will be using 

1) Number of items in each dataset  ML100k: 1682 PDA: 1824
2) Number of users in each dataset  ML100k: 943 PDA: 5690
3) Number of ratings in each dataset  ML100k: 100000 PDA: 470711
4) Mean rating  ML100k: 3.52986 PDA: 3.638361967321775


# Cross Validation of Basic Algorithms

In [5]:
""" Experiment parameters and variables section """
# List that will contain the RMSE and MAE results
results_table = []
kf = KFold(n_splits=5) # define number of k splits for cross validation
# Algorithms we will be using in this section
algorithms = {
    "Random": NormalPredictor, 
    "GlobalMean": GlobalMeanBaseline, 
    "UserMean": UserMeanBaseline, 
    "ItemMean": ItemMeanBaseline
}
# Datasets
datasets = {
    "ML100": movielens_dataset,
    "PDA2018": pda_dataset
}

In [6]:
for dataset in datasets.keys():
    for algorithm in algorithms.keys():
        print("Running 5-fold cross validation with", algorithm, "on", dataset, "dataset ...")
        out = cross_validate(algorithms[algorithm](), datasets[dataset], ["rmse", "mae"], kf)
        mean_rmse = "{:.3f}.".format(np.mean(out["test_rmse"]))
        mean_mae = "{:.3f}.".format(np.mean(out["test_mae"]))
        new_line = [dataset+"-"+algorithm, mean_rmse, mean_mae]
        results_table.append(new_line)

Running 5-fold cross validation with Random on ML100 dataset ...
Running 5-fold cross validation with GlobalMean on ML100 dataset ...
Running 5-fold cross validation with UserMean on ML100 dataset ...
Running 5-fold cross validation with ItemMean on ML100 dataset ...
Running 5-fold cross validation with Random on PDA2018 dataset ...
Running 5-fold cross validation with GlobalMean on PDA2018 dataset ...
Running 5-fold cross validation with UserMean on PDA2018 dataset ...
Running 5-fold cross validation with ItemMean on PDA2018 dataset ...


# Results

In [7]:
# Display results of running the algorithms
results_table_headers = ["Recommender", "RMSE", "MAE"]
print(tabulate(results_table, results_table_headers, tablefmt="pipe"))

| Recommender        | RMSE   | MAE    |
|:-------------------|:-------|:-------|
| ML100-Random       | 1.519. | 1.219. |
| ML100-GlobalMean   | 1.126. | 0.945. |
| ML100-UserMean     | 1.208. | 0.972. |
| ML100-ItemMean     | 1.278. | 1.035. |
| PDA2018-Random     | 1.483. | 1.186. |
| PDA2018-GlobalMean | 1.100. | 0.911. |
| PDA2018-UserMean   | 1.136. | 0.931. |
| PDA2018-ItemMean   | 1.290. | 1.045. |


In [8]:
# Export the results to a csv file
results_df = pd.DataFrame(results_table, columns=["Recommender", "RMSE", "MAE"])
results_df.to_csv("../data/basic_algorithms_results.csv")