In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab
import seaborn as sns
# Notebook-wide default figure size; the final-figure cells further down
# assign a smaller size to the same rcParam before plotting.
matplotlib.rcParams['figure.figsize'] = [16.0, 10.0]

In [2]:
from itertools import cycle, islice

# Eleven distinct marker symbols, repeated cyclically to fill 100 slots.
_MARKER_SYMBOLS = ['X', '<', '.', 'v', '*', 'o', '>', '^', 'd', 's', '+']

# markers[i] is the bare symbol; styles[i] is the same symbol followed by '-'.
markers = list(islice(cycle(_MARKER_SYMBOLS), 100))
styles = [symbol + '-' for symbol in markers]


def get_markers(n):
    """Return the first ``n`` entries of the module-level ``markers`` list."""
    leading = markers[:n]
    return leading

In [3]:
# Every results file is indexed by the same three columns.
RESULT_INDEX_COLUMNS = ('Percentage', 'Sample', 'Recommender')

# One frame per dataset.
df1M = pd.read_csv(
    "data/results-all-ml-1M.csv",
    index_col=RESULT_INDEX_COLUMNS,
)
dfLT = pd.read_csv(
    "data/results-all-libraryThing.csv",
    index_col=RESULT_INDEX_COLUMNS,
)
dfBA = pd.read_csv(
    "data/results-all-beerAdvocate.csv",
    index_col=RESULT_INDEX_COLUMNS,
)

In [4]:
import re


def without_cutoff(metric):
    """Return True when ``metric`` does not end in an underscore followed by digits.

    Names such as ``'P_10'`` or ``'ndcg_cut_5'`` yield False; names such as
    ``'map'`` or ``'set_P'`` yield True.
    """
    cutoff_suffix = re.fullmatch('.*_[0-9]+', metric)
    return cutoff_suffix is None
    
def computeTau(df, metrics=None):
    """Average Kendall's tau between each sampled slice and the reference slice.

    For every (percentage, sample) combination taken from the first two index
    levels of ``df``, each metric column is correlated (Kendall's tau, aligned
    on the remaining index) with the same column of the reference slice
    ``Percentage == 1.00, Sample == 0``. The correlations are summed per
    percentage and divided by the number of distinct samples.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a MultiIndex whose levels 0 and 1 are named 'Percentage'
        and 'Sample', and one column per metric.
    metrics : list of str, optional
        Columns to evaluate. When omitted, every column of ``df`` accepted
        by ``without_cutoff`` is used.

    Returns
    -------
    pandas.DataFrame
        Rows are the percentages, columns are the metrics.
    """
    percentages = df.index.levels[0]
    samples = df.index.levels[1]
    if metrics is None:
        metrics = list(filter(without_cutoff, df.columns))
    n_samples = len(samples)
    tau = pd.DataFrame(0.0, columns=metrics, index=percentages)

    # The reference slice is the same for every percentage and sample, so it
    # is extracted once instead of once per outer-loop iteration.
    refdf = df.xs((1.00, 0), level=('Percentage', 'Sample'))

    for percentage in percentages:
        for sample in samples:
            # Comparison data
            mydf = df.xs((percentage, sample), level=('Percentage', 'Sample'))

            for metric in metrics:
                # Accumulate through a single .loc access. The chained form
                # tau[metric][percentage] += ... writes to an intermediate
                # Series, which pandas warns about and which leaves `tau`
                # unchanged when Copy-on-Write is active.
                tau.loc[percentage, metric] += mydf[metric].corr(refdf[metric], method="kendall")

    return tau / n_samples

In [5]:
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages


def plot_in_pdf(filename, titles, dfs, ylabel, xlim=(1.0, 0.0), ylim=(0.8, 1.0)):
    """Write one line plot per DataFrame into a multi-page PDF.

    Parameters
    ----------
    filename : str
        Path of the PDF file to create.
    titles : iterable of str
        Title for each page; paired positionally with ``dfs``.
    dfs : iterable of pandas.DataFrame
        Frames to plot, one page each. Line styles come from the
        module-level ``styles`` list.
    ylabel : str
        Label for the y axis of every page.
    xlim, ylim : pair of float, optional
        Axis limits passed to ``set_xlim`` / ``set_ylim``. The default
        ``xlim`` is given in descending order.
    """
    with PdfPages(filename) as pdf:
        for title, df in zip(titles, dfs):
            ax = df.plot(colormap='Set1', style=styles)
            fig = ax.get_figure()
            try:
                ax.invert_xaxis()
                ax.legend(bbox_to_anchor=(1.02, 1), loc=2)
                ax.set_title(title)
                ax.set_ylabel(ylabel)
                ax.set_xlim(xlim)
                ax.set_ylim(ylim)
                # Save and close this page's own figure explicitly rather than
                # whatever pyplot currently considers the active figure.
                pdf.savefig(fig, bbox_inches='tight')
            finally:
                # Release the figure even if styling or saving fails.
                plt.close(fig)

In [6]:
from xlsxwriter.utility import xl_col_to_name


def write_excel(filename, dfs, names):
    """Write each DataFrame to its own worksheet with a 3-colour scale.

    ``dfs`` and ``names`` are paired positionally; ``names`` supplies the
    sheet names. In every sheet, each data column (rows 2 through the last
    data row) receives a conditional format whose minimum is the smallest
    value in the frame, whose midpoint is the mean of the per-column medians,
    and whose maximum is fixed at 1.0.
    """
    with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
        for df, name in zip(dfs, names):
            df.to_excel(writer, sheet_name=name)
            worksheet = writer.sheets[name]

            # Row 1 holds the header, so the data ends at row len(index) + 1.
            last_row = len(df.index) + 1
            color_scale = {'type': '3_color_scale',
                           'min_type': 'num',
                           'mid_type': 'num',
                           'max_type': 'num',
                           'min_value': df.min().min(),
                           'mid_value': df.median().mean(),
                           'max_value': 1.0}

            # Sheet column 0 holds the index, so data columns start at 1.
            for position in range(1, len(df.columns) + 1):
                letter = xl_col_to_name(position)
                cell_range = "{c}2:{c}{n}".format(c=letter, n=last_row)
                worksheet.conditional_format(cell_range, dict(color_scale))

In [7]:
from multiprocessing import Pool


# Datasets, sheet names and page titles share the same order.
dfs = [df1M, dfLT, dfBA]
names = ['MovieLens 1M', 'LibraryThing', 'BeerAdvocate']
titles = [name + ' (sparse qrels)' for name in names]

# Metrics evaluated in this cell.
metrics = ['bpref', 'err', 'infAP2', 'map', 'ndcg', 'qm', 'recip_rank', 'set_F', 'set_P', 'set_recall']

# One computeTau call per dataset, spread over worker processes.
with Pool() as pool:
    taus = pool.starmap(computeTau, [(frame, metrics) for frame in dfs])

plot_in_pdf(
    'results/sparse/sparse_kendall.pdf',
    titles,
    taus,
    "Kendall's tau",
)
write_excel(
    'results/sparse/sparse_kendall.xlsx',
    taus,
    names,
)

In [8]:
from multiprocessing import Pool


dfs = [df1M, dfLT, dfBA]
titles = ['MovieLens 1M (sparse qrels)', 'LibraryThing (sparse qrels)', 'BeerAdvocate (sparse qrels)']
names = ['MovieLens 1M', 'LibraryThing', 'BeerAdvocate']

# Every metric family below uses the same cutoffs, so the column names are
# generated from one list instead of being typed out eleven times per family.
cutoffs = [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

precs = ['P_%d' % cutoff for cutoff in cutoffs]
recalls = ['recall_%d' % cutoff for cutoff in cutoffs]
maps = ['map_cut_%d' % cutoff for cutoff in cutoffs]
ndcgs = ['ndcg_cut_%d' % cutoff for cutoff in cutoffs]
ndcg2s = ['ndcg2_cut_%d' % cutoff for cutoff in cutoffs]
ndcg45s = ['ndcg45_cut_%d' % cutoff for cutoff in cutoffs]
qms = ['qm_cut_%d' % cutoff for cutoff in cutoffs]
qm45s = ['qm45_cut_%d' % cutoff for cutoff in cutoffs]

# A single worker pool serves all metric families; creating a new pool on
# every loop iteration only adds process start-up cost.
with Pool() as pool:
    for metric_name, metrics in zip(
            ['precision', 'recall', 'map', 'ndcg', 'ndcg2', 'ndcg45', 'qm', 'qm45'],
            [precs, recalls, maps, ndcgs, ndcg2s, ndcg45s, qms, qm45s]):
        # One computeTau call per dataset for the current metric family.
        taus_cutoffs = pool.starmap(computeTau, [(df, metrics) for df in dfs])

        plot_in_pdf('results/sparse/sparse_kendall_' + metric_name + '_cutoffs.pdf', titles, taus_cutoffs, "Kendall's tau")
        write_excel('results/sparse/sparse_kendall_' + metric_name + '_cutoffs.xlsx', taus_cutoffs, names)

In [9]:
# Smaller figure, seaborn "ticks" style, LaTeX text rendering and a serif font
# for this figure.
matplotlib.rcParams['figure.figsize'] = [6, 2.8]
sns.set_style("ticks")
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

# Nine Set1 colours plus two extra ones, giving one colour for each of the
# eleven cutoff columns plotted below.
p = sns.color_palette("Set1", 9)
p.append((0.090196078431372548, 0.74509803921568629, 0.81176470588235294))
p.append((0.73725490196078436, 0.74117647058823533, 0.13333333333333333))

ndcgs = ['ndcg_cut_5', 'ndcg_cut_10', 'ndcg_cut_20', 'ndcg_cut_30',
         'ndcg_cut_40', 'ndcg_cut_50', 'ndcg_cut_60', 'ndcg_cut_70',
         'ndcg_cut_80', 'ndcg_cut_90', 'ndcg_cut_100']

df = computeTau(df1M, ndcgs)
# Replace the raw column names with the labels shown in the legend.
df.columns = ['nDCG@5', 'nDCG@10', 'nDCG@20', 'nDCG@30', 'nDCG@40','nDCG@50',
              'nDCG@60', 'nDCG@70', 'nDCG@80', 'nDCG@90', 'nDCG@100']
# Scale the index by 100 to match the 100..0 x axis set below.
df.index = df.index * 100

with PdfPages('results/sparse/sparse_kendall_ndcg_cutoffs_final.pdf') as pdf:
    with sns.color_palette(p):
        ax = df.plot(style=styles, linewidth=1.3, markersize=7)
        ax.invert_xaxis()
        ax.set_ylabel("Kendall's $\\tau$")
        # Raw string: "\%" is not a valid Python escape sequence, so writing
        # it in a normal literal triggers an invalid-escape warning. The text
        # handed to matplotlib is unchanged.
        ax.set_xlabel(r"\% of ratings in the test set")
        ax.set_xlim([100.0, 0.0])
        ax.set_ylim([0.85, 1.0])
        ax.legend(ncol=2)
        plt.xticks(np.arange(100, -10, -10))
        # linspace states the four ticks (0.85, 0.90, 0.95, 1.00) explicitly;
        # with arange and a float step, whether the last tick is included
        # depends on floating-point rounding.
        plt.yticks(np.linspace(0.85, 1.0, 4))
        sns.despine()
        pdf.savefig(bbox_inches='tight')
        plt.close()

In [10]:
# Smaller figure, seaborn "ticks" style, LaTeX text rendering and a serif font
# for these figures.
matplotlib.rcParams['figure.figsize'] = [6, 2.8]
sns.set_style("ticks")
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

# Nine Set1 colours plus two extra ones appended to the palette.
p = sns.color_palette("Set1", 9)
p.append((0.090196078431372548, 0.74509803921568629, 0.81176470588235294))
p.append((0.73725490196078436, 0.74117647058823533, 0.13333333333333333))

# Columns to plot and, in the same order, the labels shown in the legend.
metrics = ['set_P', 'set_recall', 'map',  'ndcg', 'recip_rank', 'bpref', 'infAP2']
new_names = ['P', 'Recall', 'MAP', 'nDCG', 'MRR', 'bpref', 'infAP']

# One page per frame in `taus`, which is computed in an earlier cell.
with PdfPages('results/sparse/sparse_kendall_final.pdf') as pdf:
    for df in taus:
        x = df[metrics]
        x.columns = new_names
        # Scale the index by 100 to match the 100..0 x axis set below.
        x.index = x.index * 100
        with sns.color_palette(p):
            ax = x.plot(style=styles, linewidth=1.3, markersize=7)
            ax.invert_xaxis()
            ax.set_ylabel("Kendall's $\\tau$")
            # Raw string: "\%" is not a valid Python escape sequence, so
            # writing it in a normal literal triggers an invalid-escape
            # warning. The text handed to matplotlib is unchanged.
            ax.set_xlabel(r"\% of ratings in the test set")
            ax.set_xlim([100.0, 0.0])
            ax.set_ylim([0.85, 1.0])
            plt.xticks(np.arange(100, -10, -10))
            # linspace states the four ticks (0.85, 0.90, 0.95, 1.00)
            # explicitly; with arange and a float step, whether the last tick
            # is included depends on floating-point rounding.
            plt.yticks(np.linspace(0.85, 1.0, 4))
            sns.despine()
            pdf.savefig(bbox_inches='tight')
            plt.close()