In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab
import seaborn as sns
# Default figure size (width, height) applied to figures created after this cell runs.
matplotlib.rcParams['figure.figsize'] = [16.0, 10.0]

In [2]:
from itertools import cycle, islice

# One hundred marker symbols, obtained by repeating the eleven base markers in order.
markers = list(islice(cycle(('X', '<', '.', 'v', '*', 'o', '>', '^', 'd', 's', '+')), 100))

# Matching line styles: each marker followed by '-' (marker plus solid line).
styles = [marker + '-' for marker in markers]


def get_markers(n):
    """Return the first ``n`` entries of the module-level ``markers`` list."""
    first_n = markers[:n]
    return first_n

In [3]:
def _load_results(path):
    """Read one results CSV, indexing it by its 'Percentage' and 'Recommender' columns."""
    return pd.read_csv(path, index_col=('Percentage', 'Recommender'))


# One results table per dataset.
df1M = _load_results("data/results-all-ml-1M-pop.csv")
dfLT = _load_results("data/results-all-libraryThing-pop.csv")
dfBA = _load_results("data/results-all-beerAdvocate-pop.csv")

In [4]:
import re


def without_cutoff(metric):
    """Tell whether ``metric`` is a metric name without a cutoff suffix.

    A name is considered to carry a cutoff when the whole string ends in an
    underscore followed by one or more digits (for example ``P_10``).
    """
    cutoff_match = re.fullmatch('.*_[0-9]+', metric)
    # A successful match object is always truthy, so negating it yields the bool.
    return not cutoff_match

def computeTau(df, metrics=None, reference=1.00):
    """Compute Kendall's tau between each percentage slice and a reference slice.

    Parameters
    ----------
    df : pandas.DataFrame
        Results whose index is a MultiIndex; the first level holds the
        percentages and the columns hold the metric values.
    metrics : list of str, optional
        Columns to correlate. When omitted, every column of ``df`` accepted
        by ``without_cutoff`` is used.
    reference : float, optional
        Value of the first index level whose rows act as the reference.
        Defaults to ``1.00``, the value that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per percentage and one column per metric, holding the
        Kendall correlation between that percentage's values and the
        reference values for the metric.
    """
    percentages = df.index.levels[0]
    if metrics is None:
        metrics = list(filter(without_cutoff, df.columns))
    tau = pd.DataFrame(0.0, columns=metrics, index=percentages)

    # The reference slice does not depend on the loop variable: look it up once.
    refdf = df.loc[reference]

    for percentage in percentages:
        # Comparison data
        mydf = df.loc[percentage]

        for metric in metrics:
            # Write through a single .loc call. Chained indexing
            # (tau[metric][percentage] = ...) may assign to a temporary copy
            # and is not supported under pandas Copy-on-Write.
            tau.loc[percentage, metric] = mydf[metric].corr(refdf[metric], method="kendall")

    return tau

In [5]:
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages


def plot_in_pdf(filename, titles, dfs, ylabel, metrics=None, xlim=(1.0, 0.8), ylim=(0.0, 1.0)):
    """Plot every DataFrame on its own page of a single PDF file.

    Parameters
    ----------
    filename : str
        Path of the PDF to write.
    titles : iterable of str
        Page titles, paired positionally with ``dfs``.
    dfs : iterable of pandas.DataFrame
        Frames to plot; each one is drawn with ``DataFrame.plot``.
    ylabel : str
        Label of the y axis on every page.
    metrics : optional
        Not used by this function; kept so existing calls remain valid.
    xlim, ylim : pair of float, optional
        Axis limits passed to ``set_xlim`` / ``set_ylim``. The defaults are
        immutable tuples so that no mutable object is shared between calls.
    """
    with PdfPages(filename) as pdf:
        for title, df in zip(titles, dfs):
            ax = df.plot(colormap='Set1', style=styles)
            # Work on the figure that was just created instead of relying on
            # pyplot's notion of the "current" figure.
            fig = ax.get_figure()
            try:
                ax.invert_xaxis()
                ax.legend(bbox_to_anchor=(1.02, 1), loc=2)
                ax.set_title(title)
                ax.set_ylabel(ylabel)
                ax.set_xlim(xlim)
                ax.set_ylim(ylim)
                pdf.savefig(fig, bbox_inches='tight')
            finally:
                # Release the figure even if drawing or saving fails.
                plt.close(fig)

In [6]:
from xlsxwriter.utility import xl_col_to_name


def write_excel(filename, dfs, names):
    """Write each DataFrame to its own sheet and colour it with a 3-colour scale.

    ``dfs`` and ``names`` are paired positionally; every frame is written to
    the sheet with the matching name. The colour scale of a sheet runs from
    the smallest value of its frame, through the mean of the per-column
    medians, up to 1.0.
    """
    with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
        for frame, sheet_name in zip(dfs, names):
            frame.to_excel(writer, sheet_name=sheet_name)
            sheet = writer.sheets[sheet_name]

            last_row = len(frame.index) + 1
            scale_mid = frame.median().mean()
            scale_min = frame.min().min()

            # Counting starts at 1, which skips the first sheet column
            # (presumably the index written by to_excel -- confirm).
            for position in range(1, len(frame.columns) + 1):
                col_letter = xl_col_to_name(position)
                cell_range = "{c}2:{c}{n}".format(c=col_letter, n=last_row)
                sheet.conditional_format(cell_range, {
                    'type': '3_color_scale',
                    'min_type': 'num',
                    'mid_type': 'num',
                    'max_type': 'num',
                    'min_value': scale_min,
                    'mid_value': scale_mid,
                    'max_value': 1.0,
                })

In [7]:
from multiprocessing import Pool


dfs = [df1M, dfLT, dfBA]

titles = ['MovieLens 1M (pop qrels)', 'LibraryThing (pop qrels)', 'BeerAdvocate (pop qrels)']
names = ['MovieLens 1M', 'LibraryThing', 'BeerAdvocate']
metrics = ['bpref', 'err', 'infAP2', 'map', 'ndcg', 'qm', 'recip_rank', 'set_F', 'set_P', 'set_recall']

# Computed serially: a multiprocessing Pool needs the worker function to be
# importable from __main__, which is not the case for functions defined in a
# notebook when processes are spawned (e.g. Windows, macOS). The result is the
# same list of tau tables, one per dataset and in the order of `dfs`.
taus = [computeTau(df, metrics) for df in dfs]

plot_in_pdf('results/pop/pop_kendall.pdf', titles, taus, "Kendall's tau")
write_excel('results/pop/pop_kendall.xlsx', taus, names)

In [8]:
from multiprocessing import Pool


dfs = [df1M, dfLT, dfBA]
titles = ['MovieLens 1M (pop qrels)', 'LibraryThing (pop qrels)', 'BeerAdvocate (pop qrels)']
names = ['MovieLens 1M', 'LibraryThing', 'BeerAdvocate']

# Rank cutoffs shared by every metric family below.
cutoffs = [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# Column names of each metric family, one per cutoff (e.g. 'P_5', ..., 'P_100').
precs = ['P_{}'.format(cutoff) for cutoff in cutoffs]
recalls = ['recall_{}'.format(cutoff) for cutoff in cutoffs]
maps = ['map_cut_{}'.format(cutoff) for cutoff in cutoffs]
ndcgs = ['ndcg_cut_{}'.format(cutoff) for cutoff in cutoffs]
ndcg2s = ['ndcg2_cut_{}'.format(cutoff) for cutoff in cutoffs]
ndcg45s = ['ndcg45_cut_{}'.format(cutoff) for cutoff in cutoffs]
qms = ['qm_cut_{}'.format(cutoff) for cutoff in cutoffs]
qm45s = ['qm45_cut_{}'.format(cutoff) for cutoff in cutoffs]

# (name used in the output file names, columns of that family)
metric_families = [
    ('precision', precs),
    ('recall', recalls),
    ('map', maps),
    ('ndcg', ndcgs),
    ('ndcg2', ndcg2s),
    ('ndcg45', ndcg45s),
    ('qm', qms),
    ('qm45', qm45s),
]

# The loop variable is deliberately not called `metrics`, so that it does not
# overwrite a notebook-level list of that name.
for metric_name, cutoff_metrics in metric_families:
    # Computed serially: functions defined in a notebook cannot be imported by
    # spawned multiprocessing workers, and the result is the same either way.
    taus_cutoffs = [computeTau(df, cutoff_metrics) for df in dfs]

    plot_in_pdf('results/pop/pop_kendall_' + metric_name + '_cutoffs.pdf', titles, taus_cutoffs, "Kendall's tau")
    write_excel('results/pop/pop_kendall_' + metric_name + '_cutoffs.xlsx', taus_cutoffs, names)

In [9]:
# (results column, display label) for every nDCG cutoff.
ndcg_columns = [
    ('ndcg_cut_5', 'nDCG@5'),
    ('ndcg_cut_10', 'nDCG@10'),
    ('ndcg_cut_20', 'nDCG@20'),
    ('ndcg_cut_30', 'nDCG@30'),
    ('ndcg_cut_40', 'nDCG@40'),
    ('ndcg_cut_50', 'nDCG@50'),
    ('ndcg_cut_60', 'nDCG@60'),
    ('ndcg_cut_70', 'nDCG@70'),
    ('ndcg_cut_80', 'nDCG@80'),
    ('ndcg_cut_90', 'nDCG@90'),
    ('ndcg_cut_100', 'nDCG@100'),
]
ndcgs = [column for column, label in ndcg_columns]

# Tau table for MovieLens 1M with display labels as column names and the
# index multiplied by 100.
df = computeTau(df1M, ndcgs).rename(columns=dict(ndcg_columns))
df.index = df.index * 100

In [10]:
# Figure size and LaTeX/serif text rendering for the final figures.
matplotlib.rcParams['figure.figsize'] = [6, 2.8]
sns.set_style("ticks")
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

# Nine Set1 colours plus two extra ones: one colour for each of the eleven series.
p = sns.color_palette("Set1", 9)
p.append((0.090196078431372548, 0.74509803921568629, 0.81176470588235294))
p.append((0.73725490196078436, 0.74117647058823533, 0.13333333333333333))

dfs = [df1M, dfLT, dfBA]
ndcgs = ['ndcg_cut_5', 'ndcg_cut_10', 'ndcg_cut_20', 'ndcg_cut_30',
         'ndcg_cut_40', 'ndcg_cut_50', 'ndcg_cut_60', 'ndcg_cut_70',
         'ndcg_cut_80', 'ndcg_cut_90', 'ndcg_cut_100']
labels = ['nDCG@5', 'nDCG@10', 'nDCG@20', 'nDCG@30', 'nDCG@40','nDCG@50',
          'nDCG@60', 'nDCG@70', 'nDCG@80', 'nDCG@90', 'nDCG@100']

with PdfPages('results/pop/pop_kendall_ndcg_cutoffs_final.pdf') as pdf:
    for df in dfs:
        tau = computeTau(df, ndcgs)
        tau.columns = labels
        tau.index = tau.index * 100
        with sns.color_palette(p):
            ax = tau.plot(style=styles, linewidth=1.3, markersize=7)
            # Use the axes/figure objects explicitly instead of pyplot's
            # implicit "current" figure.
            fig = ax.get_figure()
            ax.invert_xaxis()
            ax.legend(ncol=3)
            ax.set_ylabel("Kendall's $\\tau$")
            # "\\%" is the escaped spelling of backslash + percent; the bare
            # "\%" form is an invalid escape sequence that Python warns about.
            ax.set_xlabel("\\% least popular items in the test set")
            ax.set_xlim([100.0, 80.0])
            ax.set_ylim([0.0, 1.0])
            ax.set_xticks(np.arange(100, 79, -5))
            sns.despine(ax=ax)
            pdf.savefig(fig, bbox_inches='tight')
            plt.close(fig)

In [11]:
# Figure size and LaTeX/serif text rendering for the final figures.
matplotlib.rcParams['figure.figsize'] = [6, 2.8]
sns.set_style("ticks")
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

# Nine Set1 colours plus two extra ones.
p = sns.color_palette("Set1", 9)
p.append((0.090196078431372548, 0.74509803921568629, 0.81176470588235294))
p.append((0.73725490196078436, 0.74117647058823533, 0.13333333333333333))

# Columns to plot and, position by position, the labels shown in the legend.
metrics = ['set_P', 'set_recall', 'map',  'ndcg', 'recip_rank', 'bpref', 'infAP2']
new_names = ['P', 'Recall', 'MAP', 'nDCG', 'MRR', 'bpref', 'infAP']

with PdfPages('results/pop/pop_kendall_final.pdf') as pdf:
    for df in taus:
        # Build a relabelled frame with rename() rather than assigning to the
        # attributes of a column selection; the tables in `taus` stay untouched.
        tau = df[metrics].rename(columns=dict(zip(metrics, new_names)))
        tau.index = tau.index * 100
        with sns.color_palette(p):
            ax = tau.plot(style=styles, linewidth=1.3, markersize=7)
            # Use the axes/figure objects explicitly instead of pyplot's
            # implicit "current" figure.
            fig = ax.get_figure()
            ax.invert_xaxis()
            ax.legend(ncol=3)
            ax.set_ylabel("Kendall's $\\tau$")
            # "\\%" is the escaped spelling of backslash + percent; the bare
            # "\%" form is an invalid escape sequence that Python warns about.
            ax.set_xlabel("\\% least popular items in the test set")
            ax.set_xlim([100.0, 80.0])
            ax.set_ylim([0.0, 1.0])
            ax.set_xticks(np.arange(100, 79, -5))
            sns.despine(ax=ax)
            pdf.savefig(fig, bbox_inches='tight')
            plt.close(fig)

In [12]:
# Display the rows of df1M whose first index level ('Percentage') equals 0.95.
df1M.loc[0.95]

Unnamed: 0_level_0,P_10,P_100,P_20,P_30,P_40,P_5,P_50,P_60,P_70,P_80,...,recall_60,recall_70,recall_80,recall_90,recip_rank,rmse,set_F,set_P,set_recall,yaap
Recommender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
run-CHI2-ml-1M,0.1072,0.0479,0.089,0.0783,0.0707,0.1258,0.0649,0.0602,0.0565,0.0533,...,0.3477,0.3763,0.4011,0.4221,0.2757,3.5744,0.0768,0.0479,0.4422,-1.4522
run-BPRMF-ml-1M,0.0684,0.048,0.0644,0.062,0.0596,0.0714,0.0574,0.0553,0.0532,0.0514,...,0.3311,0.3631,0.3939,0.4223,0.1794,1.4879,0.0771,0.048,0.4475,-1.5643
run-HT-ml-1M,0.0064,0.0154,0.0092,0.0113,0.0127,0.004,0.0137,0.0144,0.0148,0.0152,...,0.1306,0.1549,0.1779,0.199,0.0301,2.911,0.0265,0.0154,0.2166,-1.9857
run-KLD-ml-1M,0.0375,0.0382,0.0423,0.0433,0.0432,0.0302,0.0426,0.0418,0.0411,0.0401,...,0.2579,0.2893,0.3165,0.3413,0.0957,3.5092,0.0613,0.0382,0.3644,-1.742
run-LDA-ml-1M,0.0625,0.0466,0.0624,0.0602,0.0581,0.0624,0.0558,0.0537,0.0516,0.0499,...,0.309,0.3424,0.3715,0.4004,0.1577,3.5275,0.0747,0.0466,0.4248,-1.6067
run-LM-WSR-UB-ml-1M,0.0431,0.0431,0.05,0.0513,0.0509,0.0341,0.05,0.0486,0.0471,0.0458,...,0.2993,0.3304,0.3601,0.3861,0.1027,27.8216,0.0693,0.0431,0.4097,-1.6863
run-LM-WSR-IB-ml-1M,0.0444,0.0435,0.0484,0.0503,0.0499,0.0396,0.0495,0.0484,0.0473,0.0459,...,0.2946,0.3263,0.3543,0.381,0.1115,25.2802,0.0698,0.0435,0.4051,-1.6688
run-NNCosNgbr-UB-ml-1M,0.0073,0.0074,0.0083,0.0085,0.0084,0.0067,0.0084,0.0081,0.0079,0.0077,...,0.0471,0.0527,0.059,0.0652,0.0252,0.8552,0.012,0.0074,0.0703,-2.0097
run-NNCosNgbr-IB-ml-1M,0.0145,0.012,0.014,0.0135,0.0133,0.0152,0.0131,0.0128,0.0126,0.0123,...,0.0638,0.0726,0.0813,0.0898,0.049,2.7531,0.0189,0.012,0.0974,-1.97
run-PLSA-ml-1M,0.0594,0.0466,0.0598,0.0587,0.0573,0.0579,0.0555,0.0534,0.0516,0.0498,...,0.3078,0.3422,0.3718,0.3985,0.1457,3.5468,0.0745,0.0466,0.4237,-1.6208


In [13]:
def get_rankings(df, metric):
    """Rank the rows of every first-level slice of ``df`` by ``metric``.

    Returns a DataFrame with one column per value of the first index level
    of ``df``. Each column lists the index labels of that slice, ordered by
    ``metric`` from highest to lowest.
    """
    rankings = pd.DataFrame()
    for level_value in df.index.levels[0]:
        ordered = df.loc[level_value].sort_values(by=[metric], ascending=False)
        rankings[level_value] = ordered.index
    return rankings

In [14]:
# Display, for every percentage level, the MovieLens 1M recommenders ordered by descending "ndcg".
get_rankings(df1M, "ndcg")

Unnamed: 0,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,...,0.91,0.92,0.93,0.94,0.95,0.96,0.97,0.98,0.99,1.0
0,run-HT-ml-1M,run-HT-ml-1M,run-HT-ml-1M,run-HT-ml-1M,run-HT-ml-1M,run-HT-ml-1M,run-HT-ml-1M,run-HT-ml-1M,run-HT-ml-1M,run-HT-ml-1M,...,run-CHI2-ml-1M,run-CHI2-ml-1M,run-CHI2-ml-1M,run-CHI2-ml-1M,run-CHI2-ml-1M,run-CHI2-ml-1M,run-CHI2-ml-1M,run-BPRMF-ml-1M,run-BPRMF-ml-1M,run-SLIM-ml-1M
1,run-Random-ml-1M,run-Random-ml-1M,run-Random-ml-1M,run-Random-ml-1M,run-Random-ml-1M,run-Random-ml-1M,run-Random-ml-1M,run-RM2-ml-1M,run-RM2-ml-1M,run-CHI2-ml-1M,...,run-BPRMF-ml-1M,run-BPRMF-ml-1M,run-BPRMF-ml-1M,run-BPRMF-ml-1M,run-BPRMF-ml-1M,run-BPRMF-ml-1M,run-BPRMF-ml-1M,run-WRMF-ml-1M,run-WRMF-ml-1M,run-BPRMF-ml-1M
2,run-NNCosNgbr-IB-ml-1M,run-SVD-ml-1M,run-SVD-ml-1M,run-NNCosNgbr-UB-ml-1M,run-NNCosNgbr-UB-ml-1M,run-NNCosNgbr-UB-ml-1M,run-RM2-ml-1M,run-Random-ml-1M,run-CHI2-ml-1M,run-RM2-ml-1M,...,run-WRMF-ml-1M,run-WRMF-ml-1M,run-WRMF-ml-1M,run-WRMF-ml-1M,run-WRMF-ml-1M,run-WRMF-ml-1M,run-WRMF-ml-1M,run-CHI2-ml-1M,run-SLIM-ml-1M,run-WRMF-ml-1M
3,run-RM1-ml-1M,run-NNCosNgbr-UB-ml-1M,run-NNCosNgbr-UB-ml-1M,run-RM2-ml-1M,run-RM2-ml-1M,run-RM2-ml-1M,run-NNCosNgbr-UB-ml-1M,run-SVD-ml-1M,run-Random-ml-1M,run-BPRMF-ml-1M,...,run-SLIM-ml-1M,run-SLIM-ml-1M,run-SLIM-ml-1M,run-SLIM-ml-1M,run-SLIM-ml-1M,run-SLIM-ml-1M,run-SLIM-ml-1M,run-SLIM-ml-1M,run-CHI2-ml-1M,run-LM-WSR-UB-ml-1M
4,run-CHI2-ml-1M,run-NNCosNgbr-IB-ml-1M,run-RM2-ml-1M,run-SVD-ml-1M,run-SVD-ml-1M,run-SVD-ml-1M,run-SVD-ml-1M,run-NNCosNgbr-UB-ml-1M,run-BPRMF-ml-1M,run-Random-ml-1M,...,run-LDA-ml-1M,run-LDA-ml-1M,run-LDA-ml-1M,run-LDA-ml-1M,run-LDA-ml-1M,run-LDA-ml-1M,run-LDA-ml-1M,run-PureSVD-ml-1M,run-PureSVD-ml-1M,run-PureSVD-ml-1M
5,run-RW-ml-1M,run-RM2-ml-1M,run-NNCosNgbr-IB-ml-1M,run-BPRMF-ml-1M,run-CHI2-ml-1M,run-CHI2-ml-1M,run-CHI2-ml-1M,run-CHI2-ml-1M,run-NNCosNgbr-UB-ml-1M,run-LM-WSR-IB-ml-1M,...,run-PLSA-ml-1M,run-PLSA-ml-1M,run-PLSA-ml-1M,run-PLSA-ml-1M,run-PLSA-ml-1M,run-PLSA-ml-1M,run-PureSVD-ml-1M,run-LDA-ml-1M,run-LDA-ml-1M,run-RM2-ml-1M
6,run-SVD-ml-1M,run-RM1-ml-1M,run-RM1-ml-1M,run-LM-WSR-IB-ml-1M,run-BPRMF-ml-1M,run-BPRMF-ml-1M,run-BPRMF-ml-1M,run-BPRMF-ml-1M,run-SVD-ml-1M,run-SVD-ml-1M,...,run-LM-WSR-IB-ml-1M,run-PureSVD-ml-1M,run-PureSVD-ml-1M,run-PureSVD-ml-1M,run-PureSVD-ml-1M,run-PureSVD-ml-1M,run-PLSA-ml-1M,run-PLSA-ml-1M,run-PLSA-ml-1M,run-LM-WSR-IB-ml-1M
7,run-SLIM-ml-1M,run-PLSA-ml-1M,run-BPRMF-ml-1M,run-NNCosNgbr-IB-ml-1M,run-LM-WSR-IB-ml-1M,run-RM1-ml-1M,run-LM-WSR-IB-ml-1M,run-LM-WSR-IB-ml-1M,run-LM-WSR-IB-ml-1M,run-SLIM-ml-1M,...,run-PureSVD-ml-1M,run-LM-WSR-IB-ml-1M,run-LM-WSR-IB-ml-1M,run-LM-WSR-IB-ml-1M,run-LM-WSR-IB-ml-1M,run-LM-WSR-IB-ml-1M,run-LM-WSR-IB-ml-1M,run-LM-WSR-IB-ml-1M,run-LM-WSR-IB-ml-1M,run-LDA-ml-1M
8,run-WRMF-ml-1M,run-UIR-ml-1M,run-LM-WSR-IB-ml-1M,run-CHI2-ml-1M,run-RM1-ml-1M,run-LM-WSR-IB-ml-1M,run-RM1-ml-1M,run-RM1-ml-1M,run-RM1-ml-1M,run-RM1-ml-1M,...,run-LM-WSR-UB-ml-1M,run-LM-WSR-UB-ml-1M,run-LM-WSR-UB-ml-1M,run-LM-WSR-UB-ml-1M,run-LM-WSR-UB-ml-1M,run-LM-WSR-UB-ml-1M,run-LM-WSR-UB-ml-1M,run-LM-WSR-UB-ml-1M,run-LM-WSR-UB-ml-1M,run-KLD-ml-1M
9,run-BPRMF-ml-1M,run-RW-ml-1M,run-CHI2-ml-1M,run-RM1-ml-1M,run-NNCosNgbr-IB-ml-1M,run-NNCosNgbr-IB-ml-1M,run-SLIM-ml-1M,run-SLIM-ml-1M,run-SLIM-ml-1M,run-NNCosNgbr-UB-ml-1M,...,run-RM2-ml-1M,run-RM2-ml-1M,run-RM2-ml-1M,run-RM2-ml-1M,run-RM2-ml-1M,run-RM2-ml-1M,run-RM2-ml-1M,run-RM2-ml-1M,run-RM2-ml-1M,run-PLSA-ml-1M
