In [5]:
import numpy as np
import pandas as pd
from metrics import Evaluator



Here is an example of evaluating the validation performance of SASRec with CE (cross-entropy) loss, trained for 4 epochs on ML-1M data obtained with the leave-last-out splitting strategy (for each user, the single last item goes to the validation holdout, i.e., the ground truth).

**Note:**
Cases with a fixed number of several holdout items per user and cases with a varying number of holdout items across users are treated in the same way.

### Obtain data and model scores

In [6]:
# Folder with the example inputs loaded below. Keep the trailing slash:
# file paths are built by plain string concatenation (DATA_PATH + '<file name>').
DATA_PATH = 'data_ml-1m/'

In [7]:
# Training interactions. Required columns: col_user, col_item.
training = pd.read_csv(f'{DATA_PATH}training.csv')
print(training.shape)
training.head()

(794764, 4)


Unnamed: 0,userid,itemid,rating,timestamp
0,0,1141,4,975768620
1,0,3653,4,975768294
2,0,1160,4,975768106
3,0,1169,5,975768520
4,0,1176,4,975768106


In [8]:
# Per-user interaction histories for the holdout users. Required columns: col_user, col_item.
# The order in which unique userid values appear must match the row order of the scores matrix.
history = pd.read_csv(f'{DATA_PATH}history.csv')
print(history.shape)
history.head()

(793852, 4)


Unnamed: 0,userid,itemid,rating,timestamp
0,0,1141,4,975768620
1,0,3621,3,975768440
2,0,3616,4,975768294
3,0,2722,4,975768707
4,0,1109,4,975768520


In [9]:
# Holdout (ground-truth) interactions. Required columns: col_user, col_item, col_rating.
holdout = pd.read_csv(f'{DATA_PATH}holdout.csv')
print(holdout.shape)
holdout.head()

(5396, 4)


Unnamed: 0,userid,itemid,rating,timestamp
0,0,2944,4,975768707
1,1,1272,4,975753027
2,2,1260,3,975739614
3,3,3571,3,975732629
4,4,2742,3,975729161


In [10]:
# Model (here SASRec) scores for every holdout user over the whole item catalog,
# shaped [n_holdout_users, n_items].
scores = np.load(DATA_PATH + 'scores.npy')

# The score matrix carries no user ids, so its rows are matched to users purely by position.
# Fail fast on an obviously mismatched file instead of silently evaluating the wrong users.
n_holdout_users = holdout['userid'].nunique()
assert scores.ndim == 2, f'expected a 2-D score matrix, got shape {scores.shape}'
assert scores.shape[0] == n_holdout_users, (
    f'scores has {scores.shape[0]} rows but holdout has {n_holdout_users} distinct users'
)

print(scores.shape)
scores

(5396, 3660)


array([[ 1.591773  ,  0.16419992,  1.3739617 , ...,  0.6172257 ,
        -2.5894756 , -2.7629774 ],
       [ 0.01049731,  2.5119438 ,  1.3940676 , ..., -0.47033605,
        -2.2097208 , -2.289036  ],
       [ 2.9477825 , -0.77587265, -1.700124  , ..., -0.35690895,
        -3.2802098 , -3.0698538 ],
       ...,
       [ 3.8560014 , -0.25954992, -0.43528587, ...,  0.24304622,
        -4.2127867 , -4.351466  ],
       [ 2.4035373 , -1.3335705 , -0.99851793, ...,  1.0821494 ,
        -2.481551  , -2.2282383 ],
       [ 2.2156248 , -0.59569716, -2.0204065 , ..., -0.99313223,
        -2.4140756 , -2.2366862 ]], dtype=float32)

### Evaluate

By default, Evaluator calculates NDCG, HR and COV, but you can pass to Evaluator the metrics you want from [recommenders](https://microsoft-recommenders.readthedocs.io/en/latest/evaluation.html#module-recommenders.evaluation.python_evaluation). For more details on arguments, check [here](https://github.com/dalibra/recsys_evaluator/blob/main/metrics.py).

In [11]:
# Every metric is reported at each of these cutoffs (k = 5, 10, 20); all other
# Evaluator arguments are left at their defaults.
evaluator = Evaluator(top_k=[5,10,20])

Downvote the scores of items already seen by users in the past.
This makes sense for datasets where items are not repeated within a user's history (e.g., MovieLens).

In [12]:
# Downvote the scores of items that appear in each user's `history`; the result keeps
# the same [n_holdout_users, n_items] shape as `scores`.
# NOTE(review): confirm whether downvote_seen_items returns a new array or also
# modifies `scores` in place — the raw model scores are not reloaded afterwards.
scores_downvoted = evaluator.downvote_seen_items(scores, history)
scores_downvoted.shape

(5396, 3660)

Get max(top_k) recommendations for each user based on the scores.

In [13]:
# Pick the top items per user from the downvoted scores: one row per holdout user,
# max(top_k) columns (20 with the cutoffs configured above).
recs = evaluator.topk_recommendations(scores_downvoted)
recs.shape

(5396, 20)

Calculate metrics. 

In [14]:
# Without a training set, only NDCG and HR are reported for each cutoff.
metrics = evaluator.compute_metrics(holdout, recs)
metrics

{'ndcg@5': 0.043143,
 'hr@5': 0.069125,
 'ndcg@10': 0.057066,
 'hr@10': 0.112305,
 'ndcg@20': 0.073271,
 'hr@20': 0.176798}

Pass the training set to include diversity metrics as well (here, coverage: `cov@k`).

In [15]:
# Passing `training` as the third argument adds coverage (cov@k) to the report.
# Note: this rebinds `metrics`, replacing the NDCG/HR-only result computed in the previous cell.
metrics = evaluator.compute_metrics(holdout, recs, training)
metrics

{'ndcg@5': 0.043143,
 'hr@5': 0.069125,
 'cov@5': 0.109076,
 'ndcg@10': 0.057066,
 'hr@10': 0.112305,
 'cov@10': 0.147348,
 'ndcg@20': 0.073271,
 'hr@20': 0.176798,
 'cov@20': 0.196555}