# Bayesian Personalized Ranking (BPR)

In [1]:
import os
import sys
import cornac
import pandas as pd
import numpy as np


import pyspark.sql.functions as F
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import map, ndcg_at_k, precision_at_k, recall_at_k, diversity, novelty, serendipity, catalog_coverage, distributional_coverage
from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.utils.timer import Timer
from recommenders.utils.constants import SEED
from recommenders.utils.spark_utils import start_or_get_spark
from recommenders.utils.notebook_utils import store_metadata

print(f"System version: {sys.version}")
print(f"Cornac version: {cornac.__version__}")

# The following settings work well for debugging locally on a VM - change when running on a cluster.
# Set up a giant single executor with many threads and specify a memory cap.
#
# The scratch directory is resolved relative to the current user's home instead of a
# hard-coded absolute path, so the notebook also runs on machines other than the author's.
SPARK_LOCAL_DIR = os.path.join(os.path.expanduser("~"), "spark-temp")

# NOTE: the former 'spark.cleaner.ttl': "true" entry was dropped: the option expects a
# duration (not a boolean) and was removed from Spark in 2.0, so it had no effect.
spark = start_or_get_spark(
    "BPR PySpark",
    memory="16g",
    config={'spark.local.dir': SPARK_LOCAL_DIR},
)
spark.conf.set("spark.sql.analyzer.failAmbiguousSelfJoin", "false")

# Drop anything cached by a previous run that shares this session.
spark.catalog.clearCache()

  from .autonotebook import tqdm as notebook_tqdm


System version: 3.9.20 (main, Oct  3 2024, 07:27:41) 
[GCC 11.2.0]
Cornac version: 1.18


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/06 23:10:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/11/06 23:10:43 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


## Variables

In [2]:
# Number of recommendations kept per user
TOP_K = 10

# BPR hyper-parameters, passed to cornac.models.BPR as `k` and `max_iter`
NUM_FACTORS, NUM_EPOCHS = 200, 100

# Names of the user, track and play-count columns of the dataset
COL_USER, COL_TRACK, COL_COUNT = "user_id", "track_id", "playcount"

## Load and split data

### Load data

In [None]:

# Read the tab-separated listening history; the first row holds the column names.
song_ratings = pd.read_csv(
    "../remappings/data/Modified_Listening_History.txt",
    sep="\t",
    header=0,
)

# Exchange the contents of the two id columns, then relabel the three columns positionally.
# NOTE(review): presumably the file stores the track id before the user id, so the net
# effect is a frame ordered (user, track, playcount) - confirm against the raw file.
track_values, user_values = song_ratings[COL_TRACK], song_ratings[COL_USER]
song_ratings[COL_TRACK], song_ratings[COL_USER] = user_values, track_values
song_ratings.columns = [COL_USER, COL_TRACK, COL_COUNT]

# Work on a reproducible 0.1% sample (without replacement) of the interactions.
data = song_ratings.sample(frac=0.001, replace=False, random_state=0)

data.head()

Unnamed: 0,user_id,track_id,playcount
4880322,484292,2402,1
8179472,809924,13210,1
8199006,811832,32488,1
5425022,537495,13184,1
8958665,887573,16975,1


### Split data

In [4]:
# Random split: 75% of the sampled interactions go to train, the rest to test
train, test = python_random_split(data, 0.75)

# Scaling factor of the log-playcount term in the confidence transformation
alpha = 1

# Add a "confidence" column to the training split only; the test split keeps raw playcounts
log_playcount = np.log(1 + train[COL_COUNT])
train["confidence"] = 1 + alpha * log_playcount

## Build a Cornac Dataset

To work with models implemented in Cornac, we need to construct an object from the [Dataset](https://cornac.readthedocs.io/en/latest/data.html#module-cornac.data.dataset) class.

The Dataset class in Cornac serves as the main object that the models interact with. In addition to data transformations, Dataset provides a number of useful iterators for looping through the data, and it supports different negative sampling techniques.

In [5]:
# Build the Cornac dataset from explicit (user, item, rating) triplets.
# `train` also carries a derived "confidence" column, so the triplet columns are selected
# by name rather than relying on the positional layout of the whole frame.
# NOTE(review): the rating fed to Cornac is the raw playcount, as before; the "confidence"
# column is not consumed here - confirm whether it was meant to be used instead.
uir_columns = [COL_USER, COL_TRACK, COL_COUNT]
train_set = cornac.data.Dataset.from_uir(
    train[uir_columns].itertuples(index=False),
    seed=SEED,
)

print(f'Number of users: {train_set.num_users}')
print(f'Number of items: {train_set.num_items}')

Number of users: 7194
Number of items: 4184


## Train the BPR model

In [6]:
# Hyper-parameters of the BPR model, gathered in one place before instantiation
bpr_params = dict(
    k=NUM_FACTORS,
    max_iter=NUM_EPOCHS,
    learning_rate=0.01,
    lambda_reg=0.001,
    verbose=True,
    seed=SEED,
)
bpr = cornac.models.BPR(**bpr_params)

In [7]:
# Fit the model on the Cornac training set and report the wall-clock time
with Timer() as t:
    bpr.fit(train_set)
print(f"Took {t} seconds for training.")

100%|██████████| 100/100 [00:00<00:00, 230.86it/s, correct=66.09%, skipped=0.01%]

Optimization finished!
Took 0.4739 seconds for training.





## Prediction and Evaluation

In [8]:
# Score the candidate items for the users in `train`, excluding items they already
# interacted with (remove_seen=True), and report the wall-clock time
with Timer() as t:
    all_predictions = predict_ranking(
        bpr,
        train,
        usercol=COL_USER,
        itemcol=COL_TRACK,
        remove_seen=True,
    )
print(f"Took {t} seconds for prediction.")

Took 28.5274 seconds for prediction.


In [9]:
# Order the predictions by user, and within each user from highest to lowest score.
# The user column is referenced through COL_USER (instead of a literal 'user_id') so this
# cell stays consistent with the rest of the notebook if the column names change.
all_prediction_sorted = all_predictions.sort_values(
    by=[COL_USER, 'prediction'],
    ascending=[True, False],
)

# Keep the TOP_K best-scored items of every user
top_k_rec = all_prediction_sorted.groupby(COL_USER).head(TOP_K)

In [None]:
# Evaluate with the same cutoff that was used to build `top_k_rec`, so the ranking
# metrics and the diversity/novelty metrics describe the same recommendation lists.
k = TOP_K

# Keyword arguments shared by every ranking metric
ranking_kwargs = dict(
    col_user=COL_USER,
    col_item=COL_TRACK,
    col_prediction='prediction',
    k=k,
)

# NOTE: `map` here is the MAP metric imported from recommenders, not the Python builtin.
eval_map = map(test, all_predictions, **ranking_kwargs)
eval_ndcg = ndcg_at_k(test, all_predictions, col_rating=COL_COUNT, **ranking_kwargs)
eval_precision = precision_at_k(test, all_predictions, **ranking_kwargs)
eval_recall = recall_at_k(test, all_predictions, **ranking_kwargs)

# Keyword arguments shared by the beyond-accuracy metrics, computed on the top-k lists
reco_kwargs = dict(
    train_df=train,
    reco_df=top_k_rec,
    col_user=COL_USER,
    col_item=COL_TRACK,
)
eval_diversity = diversity(**reco_kwargs)
eval_novelty = novelty(**reco_kwargs)

# TODO: serendipity, catalog_coverage and distributional_coverage are still missing, so
# this metric set does not yet match the one reported for ALS.
# The values computed here may be incorrect and should be double-checked.

# Print evaluation metrics, including diversity and novelty
print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall,
      "Diversity:\t%f" % eval_diversity,
      "Novelty:\t%f" % eval_novelty, sep='\n')

  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  item_pair_sim[col_sim].fillna(0, inplace=True)
  avg_diversity = df_user_diversity.agg({"user_diversity": "mean"})[0]
  avg_novelty = reco_item_novelty.agg({"product": "sum"})[0] / n_recommendations


MAP:	0.004630
NDCG:	0.000178
Precision@K:	0.001852
Recall@K:	0.018519
Diversity:	1.000000
Novelty:	8.181960


In [11]:
# Stop the Spark session that was started in the first cell of the notebook
spark.stop()