In [1]:
import os
from collections import namedtuple

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

In [23]:
from polara.recommender.evaluation import assemble_scoring_matrices, build_rank_matrix, matrix_from_observations, split_positive, generate_hits_data
from polara.recommender.evaluation import get_mrr_score, get_ndcr_discounts, get_ndcg_score, get_ndcl_score
from polara.recommender.evaluation import get_hits, get_relevance_scores, get_ranking_scores
from polara.recommender.evaluation import  _get_hits, _get_relevance_scores, _get_ranking_scores
from polara.datasets.movielens import get_movielens_data
from polara.recommender.data import RecommenderData
from polara.recommender.models import SVDModel

# Simple examples

## from wiki
based on https://en.wikipedia.org/wiki/Discounted_cumulative_gain

In [11]:
# Single-user example: eight rated items, of which the first six are recommended.
swp = None  # no positive/negative rating threshold in this example

data = pd.DataFrame({
    'userid': [0] * 8,
    'movieid': list(range(8)),
    'rating': [3, 2, 3, 0, 1, 2, 3, 2],
})
recs = np.array([[0, 1, 2, 3, 4, 5]])
hsz = len(data)  # the whole frame serves as the holdout of the only user

In [12]:
topk = recs.shape[1]
shp = (recs.shape[0], max(recs.max(), data['movieid'].max())+1)

In [20]:
rankm, hrank, mrank, evalm, ehits, emiss = assemble_scoring_matrices(recs, data, 'userid', 'movieid', None, 'rating')

In [33]:
discm, idisc = get_ndcr_discounts(rankm, evalm, topk)

In [34]:
get_ndcg_score(ehits, discm, idisc, alternative=False)

0.75616402981683351

the result is slightly lower than the expected value of 0.785, because normalization is based on the full holdout rather than on the top-k elements only  
this behavior is intentional: it is required to support NDCL score calculation when switch_positive is set

## hand-crafted example

In [49]:
# Three users with two held-out items each and two recommendations per user.
swp = 3  # rating threshold used below to build the `is_positive` mask

data = pd.DataFrame({
    'userid': [0, 0, 1, 1, 2, 2],
    'movieid': list(range(6)),
    'rating': [2, 3, 1, 3, 5, 4],
})
recs = np.array([
    [1, 0],
    [2, 3],
    [5, 4],
])
hsz = 2  # holdout items per user

In [53]:
topk = recs.shape[1]
shp = (recs.shape[0], max(recs.max(), data['movieid'].max())+1)

In [54]:
data.set_index(['userid', 'movieid']).sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
userid,movieid,Unnamed: 2_level_1
0,0,2
0,1,3
1,2,1
1,3,3
2,4,5
2,5,4


In [55]:
# Boolean mask of holdout ratings at or above the threshold; None when no threshold is set.
is_positive = None if swp is None else data.rating >= swp

In [56]:
rankm, hrank, mrank, evalm, ehits, emiss = assemble_scoring_matrices(recs, data, 'userid', 'movieid', is_positive, 'rating')

In [57]:
discm, idisc = get_ndcr_discounts(rankm, evalm, topk)

In [58]:
get_ndcg_score(ehits, discm, idisc, alternative=False)

0.86062517437112918

In [60]:
get_ndcl_score(emiss, discm, idisc, swp, alternative=False)

0.86165416690705199

# Movielens

In [76]:
# Location of the MovieLens-1M archive. Set the ML1M_PATH environment variable to
# point at a local copy; the fallback is the path used when the notebook was written.
ml_data_path = os.environ.get('ML1M_PATH', 'c:/Users/evfro/Downloads/ml-1m.zip')
ml_data = get_movielens_data(ml_data_path)

In [77]:
ml_data.head()

Unnamed: 0,userid,movieid,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [78]:
dm = RecommenderData(ml_data, 'userid', 'movieid', 'rating')

In [79]:
dm.get_configuration()

{'holdout_size': 3,
 'negative_prediction': False,
 'permute_tops': False,
 'random_holdout': False,
 'shuffle_data': False,
 'test_fold': 5,
 'test_ratio': 0.2,
 'test_sample': None,
 'test_unseen_users': True}

In [80]:
dm.random_holdout = True
dm.seed = 0
dm.prepare()

Preparing data...
19 unique movieid's within 26 testset interactions were filtered. Reason: not in the training data.
1 unique movieid's within 1 holdout interactions were filtered. Reason: not in the training data.
1 of 1208 userid's were filtered out from holdout. Reason: not enough items.
1 userid's were filtered out from testset. Reason: inconsistent with holdout.


In [81]:
svd = SVDModel(dm)
svd.rank = 50

In [82]:
svd.build()

PureSVD training time: 0.409694590708s


In [83]:
swp = 4

svd.switch_positive = swp
data = dm.test.evalset
recs = svd.recommendations
hsz = dm.holdout_size

In [84]:
topk = recs.shape[1]
shp = (recs.shape[0], max(recs.max(), data['movieid'].max())+1)

In [89]:
# Boolean numpy mask of holdout ratings at or above the threshold; None when no threshold is set.
is_positive = None if swp is None else (data.rating >= swp).values

In [92]:
rankm, hrank, mrank, evalm, ehits, emiss = assemble_scoring_matrices(recs, data, 'userid', 'movieid', is_positive, 'rating')

In [95]:
evalm

<1207x3687 sparse matrix of type '<type 'numpy.int64'>'
	with 3621 stored elements in Compressed Sparse Row format>

In [93]:
ehits

<1207x3687 sparse matrix of type '<type 'numpy.int64'>'
	with 2346 stored elements in Compressed Sparse Row format>

In [96]:
emiss

<1207x3687 sparse matrix of type '<type 'numpy.int64'>'
	with 1275 stored elements in Compressed Sparse Row format>

In [97]:
discm, idisc = get_ndcr_discounts(rankm, evalm, topk)

In [98]:
discm

<1207x3687 sparse matrix of type '<type 'numpy.float64'>'
	with 12070 stored elements in Compressed Sparse Row format>

In [99]:
idisc

<1207x3687 sparse matrix of type '<type 'numpy.float64'>'
	with 3621 stored elements in Compressed Sparse Row format>

In [100]:
get_ndcg_score(ehits, discm, idisc, alternative=False)

0.1699440242225603

In [101]:
get_ndcl_score(emiss, discm, idisc, swp, alternative=False)

0.064068896990696442

In [102]:
get_mrr_score(hrank)

0.20079365079365077

compare with previous implementation

In [112]:
def get_matched_predictions(eval_data, holdout_size, recs):
    """Compare every recommended item with every held-out item, per user.

    The 'movieid' column of `eval_data` is reshaped into a matrix with
    `holdout_size` columns, so each row of that matrix is treated as the
    holdout of one user. This presumes `eval_data` rows are grouped by user,
    `holdout_size` rows each, in the same user order as `recs`; nothing here
    verifies that.

    Returns a boolean array of shape (users, topk, holdout_size) whose element
    [u, i, j] is True when recommendation i of user u equals holdout item j.
    """
    item_field = 'movieid'
    holdout_items = eval_data[item_field].values
    holdout_items = holdout_items.reshape(-1, holdout_size).astype(np.int64)

    # broadcast (users, topk, 1) against (users, 1, holdout)
    return recs[:, :, np.newaxis] == holdout_items[:, np.newaxis, :]

def get_feedback_data(eval_data, holdout_size):
    """Return the 'rating' column of `eval_data` as a 2D array.

    The values are reshaped to `holdout_size` columns, one row per consecutive
    group of `holdout_size` ratings (presumably one row per user; the grouping
    is not checked here).
    """
    ratings = eval_data['rating'].values
    return ratings.reshape(-1, holdout_size)

def get_rnkng_scores(eval_data, holdout_size, recs, switch_positive=None, alternative=False):
    """Reference (dense) computation of nDCG and nDCL, averaged over users.

    Parameters
    ----------
    eval_data : DataFrame
        Holdout data; passed to `get_matched_predictions` and
        `get_feedback_data`, which read its 'movieid' and 'rating' columns.
    holdout_size : int
        Number of holdout rows taken per row of the reshaped matrices.
    recs : ndarray
        Recommended item ids, indexed as (users, topk).
    switch_positive : number or None
        Feedback values >= `switch_positive` are treated as positive and
        contribute to nDCG; the rest contribute to nDCL as
        `feedback - switch_positive`. With None (the default) all feedback is
        treated as positive and no item contributes to nDCL.
    alternative : bool
        If True, scores are transformed with `2**score - 1` before
        discounting.

    Returns
    -------
    Ranking
        namedtuple with fields `nDCG` and `nDCL`.

    Notes
    -----
    The ideal scores are computed over the whole holdout, not only over the
    first `topk` positions. Users whose ratio is NaN (0/0) are skipped by
    `nansum` in the numerator but are still counted in `users_num`.
    """
    matched_predictions = get_matched_predictions(eval_data, holdout_size, recs)
    feedback_data = get_feedback_data(eval_data, holdout_size)

    users_num, topk, holdout = matched_predictions.shape
    # per-row column order sorting feedback from highest to lowest ...
    ideal_scores_idx = np.argsort(feedback_data, axis=1)[:, ::-1]
    # ... converted to flat indices so that it can index the raveled feedback
    ideal_scores_idx = np.ravel_multi_index((np.arange(feedback_data.shape[0])[:, None],
                                             ideal_scores_idx), dims=feedback_data.shape)

    if switch_positive is None:
        # No threshold: comparing with / subtracting None is not a valid
        # operation, so handle the default explicitly.
        positive_feedback = feedback_data
        negative_feedback = np.zeros_like(feedback_data)
    else:
        is_positive = feedback_data >= switch_positive
        positive_feedback = np.where(is_positive, feedback_data, 0)
        negative_feedback = np.where(~is_positive, feedback_data-switch_positive, 0)

    # feedback of the holdout items found at each recommendation position
    relevance_scores_pos = (matched_predictions * positive_feedback[:, None, :]).sum(axis=2)
    relevance_scores_neg = (matched_predictions * negative_feedback[:, None, :]).sum(axis=2)
    ideal_scores_pos = positive_feedback.ravel()[ideal_scores_idx]
    ideal_scores_neg = negative_feedback.ravel()[ideal_scores_idx]

    if alternative:
        relevance_scores_pos = 2**relevance_scores_pos - 1
        # float base: the negative-feedback exponents are <= 0
        relevance_scores_neg = 2.0**relevance_scores_neg - 1
        ideal_scores_pos = 2**ideal_scores_pos - 1
        ideal_scores_neg = 2.0**ideal_scores_neg - 1

    disc_num = max(topk, holdout)
    discount = np.log2(np.arange(2, disc_num+2))
    dcg = (relevance_scores_pos / discount[:topk]).sum(axis=1)
    # negative feedback is <= 0, so dividing by -discount yields a non-negative loss
    dcl = (relevance_scores_neg / -discount[:topk]).sum(axis=1)
    idcg = (ideal_scores_pos / discount[:holdout]).sum(axis=1)
    idcl = (ideal_scores_neg / -discount[:holdout]).sum(axis=1)

    # 0/0 yields NaN for users without positive (or negative) feedback
    with np.errstate(invalid='ignore'):
        ndcg = np.nansum(dcg / idcg) / users_num
        ndcl = np.nansum(dcl / idcl) / users_num

    ranking_score = namedtuple('Ranking', ['nDCG', 'nDCL'])._make([ndcg, ndcl])
    return ranking_score

In [113]:
get_rnkng_scores(data, hsz, recs, switch_positive=swp, alternative=False)

Ranking(nDCG=0.1699440242225603, nDCL=0.064068896990696442)

In [114]:
_get_ranking_scores(rankm, hrank, mrank, evalm, ehits, emiss, switch_positive=swp, topk=topk, alternative=False)

Ranking(nDCG=0.1699440242225603, nDCL=0.064068896990696442)

In [115]:
svd.not_rated_penalty = None

In [116]:
svd.evaluate('hits')

Hits(true_positive=602, true_negative=1143, false_positive=132, false_negative=1744)

In [117]:
svd._evaluate('hits')

Hits(true_positive=602, false_positive=132, true_negative=1143, false_negative=1744)

In [118]:
svd.evaluate('relevance')

Relevance(precision=0.39215686274509803, recall=0.24247445457056063, fallout=0.068903617785142215, specifity=0.6096382214857774, miss_rate=0.68710301021817177)

In [119]:
svd._evaluate('relevance')

Relevance(precision=0.39215686274509803, recall=0.24247445457056063, fallout=0.068903617785142215, specifity=0.6096382214857774, miss_rate=0.68710301021817177)

In [122]:
svd.evaluate('ranking')

Ranking(nDCG=0.1699440242225603, nDCL=0.064068896990696442)

In [123]:
svd._evaluate('ranking')

Ranking(nDCG=0.1699440242225603, nDCL=0.064068896990696442)

In [124]:
svd.evaluate('ranking', topk=1)

Ranking(nDCG=0.073593470418241991, nDCL=0.022039537078199615)

In [125]:
svd._evaluate('ranking', topk=1)

Ranking(nDCG=0.073593470418241977, nDCL=0.022039537078199615)

## Hand-picked test

In [140]:
test_user = 98
test_data = svd.data.test.evalset.query('userid=={}'.format(test_user))
test_recs = svd.recommendations[test_user, :]

In [141]:
topk = len(test_recs)

In [142]:
# single-argument print(...) prints the same text on Python 2 and also runs on Python 3
print(test_recs)
test_data

[1045 2469 1126 1173 2489  846 2638  524 1130 2553]


Unnamed: 0,userid,movieid,rating
820166,98,1130,5
820164,98,1108,5
820140,98,1045,3


In [143]:
test_data.loc[:, 'movieid'].isin(test_recs)

820166     True
820164    False
820140     True
Name: movieid, dtype: bool

In [144]:
(rankm, hrank, mrank,
 evalm, ehits, emiss) = assemble_scoring_matrices(test_recs, test_data,
                                                  svd._key, svd._target,
                                                  (test_data.rating>=swp).values, feedback='rating')

In [146]:
hrank.data

array([9], dtype=uint8)

In [147]:
hrank.indices

array([1130])

In [148]:
ehits.data

array([5, 5], dtype=int64)

In [149]:
ehits.indices

array([1130, 1108])

In [150]:
discm, idisc = get_ndcr_discounts(rankm, evalm, topn=2)

In [151]:
discm.data

array([ 1.        ,  0.63092975,  0.5       ,  0.43067656,  0.38685281,
        0.35620719,  0.33333333,  0.31546488,  0.30103   ,  0.28906483])

In [152]:
discm.indices

array([1045, 2469, 1126, 1173, 2489,  846, 2638,  524, 1130, 2553])

In [153]:
idisc.data

array([ 1.        ,  0.63092975,  0.5       ])

In [154]:
idisc.indices

array([1108, 1130, 1045])

NDCG

In [155]:
get_ndcg_score(ehits, discm, idisc, alternative=False)

0.18457569677956817

In [158]:
# Recommendations of the test user that are present in the user's holdout.
rec_hit_mask = np.isin(test_recs, test_data.movieid)
# single-argument print(...) keeps the output identical on Python 2 and valid on Python 3
print('rec rank {}'.format(np.where(rec_hit_mask)[0] + 1))  # 1-based positions
print('rec item {}'.format(test_recs[rec_hit_mask]))

rec rank [1 9]
rec item [1045 1130]


NDCL

In [164]:
emiss.data

array([3], dtype=int64)

In [165]:
emiss.indices

array([1045])

In [166]:
emiss.multiply(discm).data

array([ 3.])

In [167]:
emiss.multiply(idisc)

<1x2639 sparse matrix of type '<type 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [168]:
get_ndcl_score(emiss, discm, idisc, swp, alternative=False)

2.0

# Why normalization in NDCG is changed

basically due to NDCL metric, which is "the lower the better"  
this means that ideal score is 0

regular case

In [46]:
def cg(rel, pos):
    """Discounted gain of an item with relevance `rel` at 1-based position `pos`."""
    return rel / np.log2(1 + pos)


# single-argument print(...) keeps the output identical on Python 2 and valid on Python 3
dcg = cg(5, 9)
idcg = cg(5, 1) + cg(5, 2)
print('dcg  {}'.format(dcg))
print('idcg {}'.format(idcg))
print('ndcg {}'.format(dcg / idcg))

dcg  1.50514997832
idcg 8.15464876786
ndcg 0.18457569678


singular, but still ok

In [161]:
def cl(rel, pos):
    """Discounted loss used in this illustration for rating `rel` (relative to 4) at 1-based position `pos`."""
    return (np.exp(rel - 4) - 1) / (-np.log2(1 + pos))


# single-argument print(...) keeps the output identical on Python 2 and valid on Python 3
print('dcl  0')
print('idcl 0')
with np.errstate(invalid='ignore'):  # 0/0 is an 'invalid' operation and yields nan
    print('ndcl {}'.format(np.array([0.]) / np.array([0.])))

 dcl  0
idcl 0
ndcl [ nan]


broken case  
when dcl is above zero and idcl is exactly 0 (because only the top-k elements of the ideal ranking are used, and negative examples are not among them at all)

In [163]:
def cl(rel, pos):
    """Discounted loss used in this illustration for rating `rel` (relative to 4) at 1-based position `pos`."""
    return (np.exp(rel - 4) - 1) / (-np.log2(1 + pos))


# single-argument print(...) keeps the output identical on Python 2 and valid on Python 3
print('dcl  {}'.format(cl(3, 3)))
print('idcl 0')
# invalid='ignore' has no effect here: x/0 with x > 0 is a 'divide' condition, and the result is inf
with np.errstate(invalid='ignore'):
    print('ndcl {}'.format(cl(3, 3) / np.array([0.])))

dcl  0.316060279414
idcl 0
ndcl [ inf]


  


therefore, with standard normalization NDCL may produce inf, which doesn't make a lot of sense, especially when trying to average across many users