# Bayesian Personalized Ranking (BPR)

In [2]:
import sys
import cornac
import pandas as pd
import numpy as np

from recommenders.evaluation.python_evaluation import map, ndcg_at_k, precision_at_k, recall_at_k, diversity, novelty
from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.utils.timer import Timer
from recommenders.utils.constants import SEED

# Print the Python interpreter and Cornac versions this notebook is running with.
print(f"System version: {sys.version}")
print(f"Cornac version: {cornac.__version__}")

  from .autonotebook import tqdm as notebook_tqdm


System version: 3.9.20 (main, Oct  3 2024, 07:27:41) 
[GCC 11.2.0]
Cornac version: 1.18


## Variables

In [3]:
# Number of top-ranked tracks kept per user; also passed as k to the ranking metrics
TOP_K = 50

# Model parameters
# NUM_FACTORS is passed to BPR as k, NUM_EPOCHS as max_iter
NUM_FACTORS = 200
NUM_EPOCHS = 100

# Column names for the dataset
COL_USER = "user_id"
COL_TRACK = "track_id"
COL_COUNT = "playcount"

## Load and split data

### Load data

In [None]:
# Directory holding the pre-split, tab-separated listening-history files.
DATA_DIR = "../remappings/data/dataset"


def load_listening_history(path):
    """Read one listening-history file and return it as (user, track, playcount).

    The user and track id columns are located by their header names
    ("user_id" / "track_id"), so the result is correct whatever order the two
    columns have in the file. The previous implementation swapped the values
    of the two columns and then renamed by position, which silently mislabels
    the ids if a file is already stored in (user, track) order.

    Args:
        path: Path of a tab-separated file with a header row containing
            "user_id", "track_id" and one more column holding the play count.

    Returns:
        pandas.DataFrame with exactly the columns
        [COL_USER, COL_TRACK, COL_COUNT], in that order.

    Raises:
        KeyError: if "user_id" or "track_id" is missing from the header.
        ValueError: if the file does not have exactly three columns.
    """
    raw = pd.read_csv(filepath_or_buffer=path, header=0, delimiter="\t")

    # Header names as written in the files; every remaining column follows.
    id_columns = ["user_id", "track_id"]
    remaining = [name for name in raw.columns if name not in id_columns]

    # Selecting with a list builds a new frame, so `raw` is left untouched.
    history = raw[id_columns + remaining]
    # Positional rename to the notebook-wide names; the length check of this
    # assignment rejects files with an unexpected number of columns.
    history.columns = [COL_USER, COL_TRACK, COL_COUNT]
    return history


# Read from file
test_listening_history = load_listening_history(
    f"{DATA_DIR}/test_listening_history_OverEqual_50_Interactions.txt"
)
train_listening_history = load_listening_history(
    f"{DATA_DIR}/train_listening_history_OverEqual_50_Interactions.txt"
)

         track_id  user_id  playcount
0            3361       11          2
1           15358       11          4
2           17090       11          1
3           11236       11          3
4            9645       11          1
...           ...      ...        ...
1436854        43   962033          1
1436855      1876   962033          4
1436856      6382   962033          2
1436857      2126   962033          1
1436858       323   962033          4

[1436859 rows x 3 columns]


### Split data

In [5]:
# No splitting happens here: the two loaded frames are simply bound to the
# shorter names used by the rest of the notebook.
train = train_listening_history
test = test_listening_history

## Build a Cornac Dataset

To work with models implemented in Cornac, we need to construct an object of the [Dataset](https://cornac.readthedocs.io/en/latest/data.html#module-cornac.data.dataset) class.

The `Dataset` class in Cornac serves as the main object that the models interact with. In addition to data transformations, `Dataset` provides a number of useful iterators for looping through the data and supports different negative sampling techniques.

In [6]:
# Hand the training rows to Cornac as plain tuples, in the frame's column order.
train_set = cornac.data.Dataset.from_uir(
    data=train.itertuples(index=False),
    seed=SEED,
)

# Report how many distinct users and items the dataset object holds.
print(f'Number of users: {train_set.num_users}')
print(f'Number of items: {train_set.num_items}')

KeyboardInterrupt: 

## Train the BPR model

In [None]:
# Configure (but do not yet train) the BPR model. Size and epoch count come
# from the Variables cell; the learning rate and regularisation weight are
# fixed here.
bpr = cornac.models.BPR(
    seed=SEED,
    verbose=True,
    k=NUM_FACTORS,
    max_iter=NUM_EPOCHS,
    lambda_reg=0.001,
    learning_rate=0.01,
)

In [None]:
# Fit the model on the Cornac dataset and time the whole call.
with Timer() as t:
    bpr.fit(train_set)

print(f"Took {t} seconds for training.")

## Prediction and Evaluation

In [None]:
# Score user/track pairs with the trained model and time the call.
# remove_seen=True is passed together with the training frame; presumably this
# excludes pairs already present in `train` - confirm against predict_ranking.
with Timer() as t:
    all_predictions = predict_ranking(
        bpr,
        train,
        usercol=COL_USER,
        itemcol=COL_TRACK,
        remove_seen=True,
    )

print(f"Took {t} seconds for prediction.")

all_predictions.head()

In [None]:
# Sort by 'user' and 'prediction' in descending order
all_prediction_sorted = all_predictions.sort_values(by=[COL_USER, 'prediction'], ascending=[True, False])

# Select the top k predictions for each user
top_k_rec = all_prediction_sorted.groupby(COL_USER).head(TOP_K)

In [None]:
# Keyword arguments shared by all four ranking metrics.
# relevancy_method=None is passed because top_k_rec was already cut down to
# TOP_K rows per user in the previous cell (see the recommenders
# python_evaluation documentation for the exact semantics of None).
ranking_kwargs = dict(
    col_user=COL_USER,
    col_item=COL_TRACK,
    col_prediction='prediction',
    k=TOP_K,
    relevancy_method=None,
)

# NOTE: `map` here is the metric imported from recommenders at the top of the
# notebook, which shadows Python's built-in map().
eval_map = map(test, top_k_rec, **ranking_kwargs)

# NDCG additionally needs the column holding the ground-truth play counts.
eval_ndcg = ndcg_at_k(test, top_k_rec, col_rating=COL_COUNT, **ranking_kwargs)

eval_precision = precision_at_k(test, top_k_rec, **ranking_kwargs)

eval_recall = recall_at_k(test, top_k_rec, **ranking_kwargs)

# Diversity and novelty are computed from the training interactions and the
# recommended lists; they do not use the test set.
eval_diversity = diversity(train_df=train,
                           reco_df=top_k_rec,
                           col_user=COL_USER,
                           col_item=COL_TRACK)

eval_novelty = novelty(train_df=train,
                       reco_df=top_k_rec,
                       col_user=COL_USER,
                       col_item=COL_TRACK)

# Print every computed metric. MAP was previously computed but never shown,
# and the labels no longer say "Spark" because all metrics come from the
# pandas-based python_evaluation module.
print("MAP:\t%f" % eval_map,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall,
      "NDCG:\t%f" % eval_ndcg,
      "Diversity:\t%f" % eval_diversity,
      "Novelty:\t%f" % eval_novelty, sep='\n')