In [1]:
import sys
sys.path.append("../")
import os
import cornac
import surprise
import pandas as pd

from surprise.prediction_algorithms import KNNWithMeans
from reco_utils.dataset.python_splitters import python_random_split
from reco_utils.recommender.cornac.cornac_utils import predict_ranking
from reco_utils.recommender.surprise.surprise_utils import predict, compute_ranking_predictions
from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from tabulate import tabulate

print("System version: {}".format(sys.version))
print("Cornac version: {}".format(cornac.__version__))

System version: 3.7.5 (default, Nov  1 2019, 02:16:38) 
[Clang 10.0.0 (clang-1000.11.45.5)]
Cornac version: 1.6.1


# Load data

In [2]:
# Read both the datasets from the files using pandas
movielens_df = pd.read_csv("../data/u.data", sep="\t", header=None)
movielens_df.columns = ["userID", "itemID", "rating", "timestamp"]
movielens_df = movielens_df.iloc[:,0:3]
pda_df = pd.read_csv("../data/train-PDA2018.csv", sep=",")
pda_df = pda_df.iloc[:,0:3]
print("ML100K\n", movielens_df.head())
print("\n\n")
print("PDA2018\n", pda_df.head())

ML100K
    userID  itemID  rating
0     196     242       3
1     186     302       3
2      22     377       1
3     244      51       2
4     166     346       1



PDA2018
    userID  itemID  rating
0       5     648       5
1       5    1394       5
2       5    3534       5
3       5     104       4
4       5    2735       5


# Experiment Variables

In [3]:
results_table = []

# Seed
SEED = 21

# Model parameters for BPR and NCF
NUM_FACTORS = 200
NUM_EPOCHS = 100
reg_lambda = 0.001
learning_rate = 0.01

# Datasets
datasets = {
    "ML100": movielens_df,
    "PDA2018": pda_df
}

# Build models and evaluate

In [4]:
# Most Popular model
most_pop_model = cornac.models.MostPop()

# BPR model
bpr_model = cornac.models.BPR(
    k=NUM_FACTORS,
    max_iter=NUM_EPOCHS,
    learning_rate=learning_rate,
    lambda_reg=reg_lambda,
    verbose=True
)

# NCF model
ncf_model = cornac.models.NeuMF(
    num_factors=NUM_FACTORS,
    layers=[64, 32, 16, 8],
    act_fn="tanh",
    learner="adam",
    num_epochs=NUM_EPOCHS,
    batch_size=256,
    lr=learning_rate,
    reg_mf=reg_lambda,
    seed=SEED,
    verbose=True
)

cornac_models = {
    "MostPop": most_pop_model,
    "BPR": bpr_model,
    "NCF": ncf_model
}

# Run evaluation

In [5]:
# Train-test split
train, test = python_random_split(movielens_df, 0.80)
train_set = cornac.data.Dataset.from_uir(train.itertuples(index=False), seed=SEED)

print('Number of users: {}'.format(train_set.num_users))
print('Number of items: {}'.format(train_set.num_items))

Number of users: 943
Number of items: 1653


In [6]:
for dataset in datasets:
    print("Running eval on the", dataset, "dataset...\n")
    for model in cornac_models:
        print(model, "...\n")
        current_model = cornac_models[model]
        current_model.fit(train_set)
        # Predict all unseen entries
        all_predictions = predict_ranking(current_model, train, usercol='userID', itemcol='itemID', remove_seen=True)

        # Evaluate 
        precision_5 = precision_at_k(test, all_predictions, col_prediction='prediction', k=5)
        precision_10 = precision_at_k(test, all_predictions, col_prediction='prediction', k=10)

        recall_5 = recall_at_k(test, all_predictions, col_prediction='prediction', k=5)
        recall_10 = recall_at_k(test, all_predictions, col_prediction='prediction', k=10)

        ndcg_5 = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=5)
        ndcg_10 = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=10)

        new_line = [dataset+"-"+model, precision_5, precision_10, recall_5, recall_10, ndcg_5, ndcg_10]
        results_table.append(new_line)

        print("Prec@5", precision_5, "Prec@10", precision_10, "Recall@5", recall_5, "Recall@10", recall_10, \
             "NDCG@5", ndcg_5, "NDCG@10", ndcg_10)
        print()

Running eval on the ML100 dataset...

MostPop ...

Prec@5 0.21617021276595746 Prec@10 0.19255319148936173 Recall@5 0.07065926711599416 Recall@10 0.11425943901460381 NDCG@5 0.23050040907317546 NDCG@10 0.22076740254314156

BPR ...



HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Optimization finished!
Prec@5 0.3759574468085106 Prec@10 0.3232978723404255 Recall@5 0.11984306989572834 Recall@10 0.19529138286400322 NDCG@5 0.40038777517822666 NDCG@10 0.37812153785448827

NCF ...



HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Prec@5 0.2542553191489362 Prec@10 0.2372340425531915 Recall@5 0.08398726542161752 Recall@10 0.15225814409205193 NDCG@5 0.2690760884154233 NDCG@10 0.2702837792958376

Running eval on the PDA2018 dataset...

MostPop ...

Prec@5 0.21617021276595746 Prec@10 0.19255319148936173 Recall@5 0.07065926711599416 Recall@10 0.11425943901460381 NDCG@5 0.23050040907317546 NDCG@10 0.22076740254314156

BPR ...



HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Optimization finished!
Prec@5 0.38574468085106384 Prec@10 0.3337234042553192 Recall@5 0.13368465768819618 Recall@10 0.22161355205856664 NDCG@5 0.41098178715094263 NDCG@10 0.3931232445392719

NCF ...



HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Prec@5 0.2546808510638298 Prec@10 0.23882978723404255 Recall@5 0.08454772647979034 Recall@10 0.1508863193874161 NDCG@5 0.26524391856134905 NDCG@10 0.268098474553774



# Results


In [7]:
# Display results of running the algorithms
results_table_headers = ["Recommender", "Pre@5", "Pre@10", "Rec@5", "Rec@10", "NDCG@5", "NDCG@10"]
print(tabulate(results_table, results_table_headers, tablefmt="pipe"))

| Recommender     |    Pre@5 |   Pre@10 |     Rec@5 |   Rec@10 |   NDCG@5 |   NDCG@10 |
|:----------------|---------:|---------:|----------:|---------:|---------:|----------:|
| ML100-MostPop   | 0.21617  | 0.192553 | 0.0706593 | 0.114259 | 0.2305   |  0.220767 |
| ML100-BPR       | 0.375957 | 0.323298 | 0.119843  | 0.195291 | 0.400388 |  0.378122 |
| ML100-NCF       | 0.254255 | 0.237234 | 0.0839873 | 0.152258 | 0.269076 |  0.270284 |
| PDA2018-MostPop | 0.21617  | 0.192553 | 0.0706593 | 0.114259 | 0.2305   |  0.220767 |
| PDA2018-BPR     | 0.385745 | 0.333723 | 0.133685  | 0.221614 | 0.410982 |  0.393123 |
| PDA2018-NCF     | 0.254681 | 0.23883  | 0.0845477 | 0.150886 | 0.265244 |  0.268098 |


In [9]:
# Export the results to a csv file
results_df = pd.DataFrame(results_table, columns=["Recommender", "Pre@5", "Pre@10", "Rec@5", "Rec@10", "NDCG@5", "NDCG@10"])
results_df.to_csv("../data/item_recommendation_results.csv")