In [1]:
import sys
sys.path.append("../")
import os
import cornac
import surprise
import pandas as pd

from surprise.prediction_algorithms import KNNWithMeans
from reco_utils.dataset.python_splitters import python_random_split
from reco_utils.recommender.cornac.cornac_utils import predict_ranking
from reco_utils.recommender.surprise.surprise_utils import predict, compute_ranking_predictions
from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from tabulate import tabulate

print("System version: {}".format(sys.version))
print("Cornac version: {}".format(cornac.__version__))

System version: 3.7.5 (default, Nov  1 2019, 02:16:38) 
[Clang 10.0.0 (clang-1000.11.45.5)]
Cornac version: 1.6.1


# Load data

In [2]:
# Read both the datasets from the files using pandas
movielens_df = pd.read_csv("../data/u.data", sep="\t", header=None)
movielens_df.columns = ["userID", "itemID", "rating", "timestamp"]
movielens_df = movielens_df.iloc[:,0:3]
pda_df = pd.read_csv("../data/train-PDA2018.csv", sep=",")
pda_df = pda_df.iloc[:,0:3]
print("ML100K\n", movielens_df.head())
print("\n\n")
print("PDA2018\n", pda_df.head())

ML100K
    userID  itemID  rating
0     196     242       3
1     186     302       3
2      22     377       1
3     244      51       2
4     166     346       1



PDA2018
    userID  itemID  rating
0       5     648       5
1       5    1394       5
2       5    3534       5
3       5     104       4
4       5    2735       5


# Experiment Variables

In [3]:
results_table = [] # list that will store results
kfolds = 5 # nr of k folds  for cross validation
# Seed
SEED = 21

# Model parameters for BPR and NCF
NUM_FACTORS = 200
NUM_EPOCHS = 100
reg_lambda = 0.001
learning_rate = 0.01

# Datasets
datasets = {
    "ML100": movielens_df,
    "PDA2018": pda_df
}

# Build models and evaluate

In [4]:
# Most Popular model
most_pop_model = cornac.models.MostPop()

# BPR model
bpr_model = cornac.models.BPR(
    k=NUM_FACTORS,
    max_iter=NUM_EPOCHS,
    learning_rate=learning_rate,
    lambda_reg=reg_lambda,
    verbose=True
)

# NCF model
ncf_model = cornac.models.NeuMF(
    num_factors=NUM_FACTORS,
    layers=[64, 32, 16, 8],
    act_fn="tanh",
    learner="adam",
    num_epochs=NUM_EPOCHS,
    batch_size=256,
    lr=learning_rate,
    reg_mf=reg_lambda,
    seed=SEED,
    verbose=True
)

cornac_models = {
    "MostPop": most_pop_model,
    "BPR": bpr_model,
    "NCF": ncf_model
}

# Run evaluation

In [5]:
for dataset in datasets:
    print("Running eval on the", dataset, "dataset...\n")
    for model in cornac_models:
        
        print("Running 5-fold cross validation with", model, "on", dataset, "dataset ...")
        
        # Run cross validation
        avg_prec_5 = 0
        avg_prec_10 = 0
        avg_rec_5 = 0
        avg_rec_10 = 0
        avg_ndcg_5 = 0
        avg_ndcg_10 = 0
        fold_nr = 0
        for fold in range(kfolds):
            print("\n\nFold #", fold)

            # Train-test split
            train, test = python_random_split(movielens_df, 0.80)
            train_set = cornac.data.Dataset.from_uir(train.itertuples(index=False), seed=SEED)
            # Print a bit of information of the train set matrix 
            print('Number of users: {}'.format(train_set.num_users))
            print('Number of items: {}'.format(train_set.num_items))

            # Set the current model
            current_model = cornac_models[model]
            current_model.fit(train_set)

            # Predict all unseen entries
            all_predictions = predict_ranking(current_model, train, usercol='userID', itemcol='itemID', remove_seen=True)

            # Evaluation metrics
            avg_prec_5 += precision_at_k(test, all_predictions, col_prediction='prediction', k=5)
            avg_prec_10 += precision_at_k(test, all_predictions, col_prediction='prediction', k=10)

            avg_rec_5 += recall_at_k(test, all_predictions, col_prediction='prediction', k=5)
            avg_rec_10 += recall_at_k(test, all_predictions, col_prediction='prediction', k=10)

            avg_ndcg_5 += ndcg_at_k(test, all_predictions, col_prediction='prediction', k=5)
            avg_ndcg_10 += ndcg_at_k(test, all_predictions, col_prediction='prediction', k=10)
            fold_nr += 1


        new_line = [dataset+"-"+model, avg_prec_5/5, avg_prec_10/5, avg_rec_5/5, \
                    avg_rec_10/5, avg_ndcg_5/5, avg_ndcg_10/5]
        results_table.append(new_line)

        print()

Running eval on the ML100 dataset...

Running 5-fold cross validation with MostPop on ML100 dataset ...


Fold # 0
Number of users: 943
Number of items: 1653


Fold # 1
Number of users: 943
Number of items: 1653


Fold # 2
Number of users: 943
Number of items: 1653


Fold # 3
Number of users: 943
Number of items: 1653


Fold # 4
Number of users: 943
Number of items: 1653

Running 5-fold cross validation with BPR on ML100 dataset ...


Fold # 0
Number of users: 943
Number of items: 1653


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Optimization finished!


Fold # 1
Number of users: 943
Number of items: 1653


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Optimization finished!


Fold # 2
Number of users: 943
Number of items: 1653


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Optimization finished!


Fold # 3
Number of users: 943
Number of items: 1653


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Optimization finished!


Fold # 4
Number of users: 943
Number of items: 1653


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Optimization finished!

Running 5-fold cross validation with NCF on ML100 dataset ...


Fold # 0
Number of users: 943
Number of items: 1653


HBox(children=(FloatProgress(value=0.0), HTML(value='')))




Fold # 1
Number of users: 943
Number of items: 1653


HBox(children=(FloatProgress(value=0.0), HTML(value='')))




Fold # 2
Number of users: 943
Number of items: 1653


HBox(children=(FloatProgress(value=0.0), HTML(value='')))




Fold # 3
Number of users: 943
Number of items: 1653


HBox(children=(FloatProgress(value=0.0), HTML(value='')))




Fold # 4
Number of users: 943
Number of items: 1653


HBox(children=(FloatProgress(value=0.0), HTML(value='')))



Running eval on the PDA2018 dataset...

Running 5-fold cross validation with MostPop on PDA2018 dataset ...


Fold # 0
Number of users: 943
Number of items: 1653


Fold # 1
Number of users: 943
Number of items: 1653


Fold # 2
Number of users: 943
Number of items: 1653


Fold # 3
Number of users: 943
Number of items: 1653


Fold # 4
Number of users: 943
Number of items: 1653

Running 5-fold cross validation with BPR on PDA2018 dataset ...


Fold # 0
Number of users: 943
Number of items: 1653


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Optimization finished!


Fold # 1
Number of users: 943
Number of items: 1653


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Optimization finished!


Fold # 2
Number of users: 943
Number of items: 1653


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Optimization finished!


Fold # 3
Number of users: 943
Number of items: 1653


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Optimization finished!


Fold # 4
Number of users: 943
Number of items: 1653


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Optimization finished!

Running 5-fold cross validation with NCF on PDA2018 dataset ...


Fold # 0
Number of users: 943
Number of items: 1653


HBox(children=(FloatProgress(value=0.0), HTML(value='')))




Fold # 1
Number of users: 943
Number of items: 1653


HBox(children=(FloatProgress(value=0.0), HTML(value='')))




Fold # 2
Number of users: 943
Number of items: 1653


HBox(children=(FloatProgress(value=0.0), HTML(value='')))




Fold # 3
Number of users: 943
Number of items: 1653


HBox(children=(FloatProgress(value=0.0), HTML(value='')))




Fold # 4
Number of users: 943
Number of items: 1653


HBox(children=(FloatProgress(value=0.0), HTML(value='')))





# Results


In [6]:
# Display results of running the algorithms
results_table_headers = ["Recommender", "Pre@5", "Pre@10", "Rec@5", "Rec@10", "NDCG@5", "NDCG@10"]
print(tabulate(results_table, results_table_headers, tablefmt="pipe"))

| Recommender     |    Pre@5 |   Pre@10 |     Rec@5 |   Rec@10 |   NDCG@5 |   NDCG@10 |
|:----------------|---------:|---------:|----------:|---------:|---------:|----------:|
| ML100-MostPop   | 0.21617  | 0.192553 | 0.0706593 | 0.114259 | 0.2305   |  0.220767 |
| ML100-BPR       | 0.363362 | 0.309915 | 0.12789   | 0.209184 | 0.389562 |  0.369578 |
| ML100-NCF       | 0.254    | 0.235766 | 0.0844287 | 0.149687 | 0.265001 |  0.266357 |
| PDA2018-MostPop | 0.21617  | 0.192553 | 0.0706593 | 0.114259 | 0.2305   |  0.220767 |
| PDA2018-BPR     | 0.338213 | 0.288064 | 0.121397  | 0.198045 | 0.361512 |  0.344191 |
| PDA2018-NCF     | 0.253489 | 0.235872 | 0.0846865 | 0.150544 | 0.259739 |  0.263504 |


In [8]:
# Export the results to a csv file
results_df = pd.DataFrame(results_table, columns=["Recommender", "Pre@5", "Pre@10", "Rec@5", "Rec@10", "NDCG@5", "NDCG@10"])
results_df.to_csv("../data/item_recommendation_results.csv")