In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab
import seaborn as sns
# Default figure size as [width, height] (matplotlib measures figsize in inches).
# Later cells assign this rcParam again before drawing their own figures.
matplotlib.rcParams['figure.figsize'] = [16.0, 10.0]

In [2]:
def _load_results(csv_path):
    """Read one results CSV and average it per (Percentage, Recommender).

    Returns a pair: the frame indexed by ("Percentage", "Recommender") with
    the "Sample" column removed, and the group-wise mean of that frame over
    the same two index levels.
    """
    index_levels = ["Percentage", "Recommender"]
    raw = pd.read_csv(csv_path, index_col=index_levels)
    raw = raw.drop(["Sample"], axis=1)
    return raw, raw.groupby(level=index_levels).mean()


# One (raw, averaged) pair per dataset.
df1M, merged1M = _load_results("data/results-all-ml-1M.csv")
dfLT, mergedLT = _load_results("data/results-all-libraryThing.csv")
dfBA, mergedBA = _load_results("data/results-all-beerAdvocate.csv")

In [3]:
# One Kendall correlation matrix per dataset, computed between the columns of
# the rows selected by the first index level (Percentage) being 1.00.
dfs = [merged1M, mergedLT, mergedBA]
taus = list(map(lambda averaged: averaged.loc[1.00].corr(method='kendall'), dfs))

In [4]:
import matplotlib.pyplot as plt
import re
from matplotlib.backends.backend_pdf import PdfPages


def without_cutoff(metric):
    """Return True when `metric` does not end in an underscore followed by digits.

    Names such as 'P_5' or 'ndcg_cut_10' give False; names such as 'map',
    'set_P' or 'infAP2' give True.
    """
    has_numeric_suffix = re.fullmatch('.*_[0-9]+', metric) is not None
    return not has_numeric_suffix

def plot_clustermap_in_pdf(filename, titles, dfs, ylabel, metrics=None):
    """Save one hierarchically clustered heatmap per frame as pages of a PDF.

    Parameters
    ----------
    filename : str
        Path of the PDF to write.
    titles : iterable of str
        Heatmap titles, paired positionally with `dfs`.
    dfs : iterable of pandas.DataFrame
        Frames that are indexed by the same labels on rows and columns.
    ylabel : str
        Accepted so the signature matches plot_heatmap_in_pdf; not used here.
    metrics : list of str, optional
        Labels to keep on both axes. When None, every frame uses its own
        columns for which without_cutoff() is true.
    """
    with PdfPages(filename) as pdf:
        for title, df in zip(titles, dfs):
            # Resolve the default per frame. Assigning to `metrics` here would
            # make every later frame reuse the first frame's column list.
            if metrics is None:
                selected = [name for name in df.columns if without_cutoff(name)]
            else:
                selected = metrics
            cg = sns.clustermap(df.loc[selected, selected], linewidths=.5, method='ward',
                                cmap="YlGnBu", vmin=0.75, vmax=1.0, annot_kws={"size": 10})
            cg.ax_heatmap.set_title(title)
            # Keep the row labels horizontal.
            plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
            pdf.savefig(bbox_inches='tight')
            # Close the figure so pages do not accumulate in memory.
            plt.close()

def plot_heatmap_in_pdf(filename, titles, dfs, ylabel, metrics=None):
    """Save one annotated heatmap per frame as pages of a PDF.

    Parameters
    ----------
    filename : str
        Path of the PDF to write.
    titles : iterable of str
        Heatmap titles, paired positionally with `dfs`.
    dfs : iterable of pandas.DataFrame
        Frames that are indexed by the same labels on rows and columns.
    ylabel : str
        Label placed on the y axis of every heatmap.
    metrics : list of str, optional
        Labels to keep on both axes. When None, every frame uses its own
        columns for which without_cutoff() is true.
    """
    with PdfPages(filename) as pdf:
        for title, df in zip(titles, dfs):
            # Resolve the default per frame. Assigning to `metrics` here would
            # make every later frame reuse the first frame's column list.
            if metrics is None:
                selected = [name for name in df.columns if without_cutoff(name)]
            else:
                selected = metrics
            ax = sns.heatmap(df.loc[selected, selected], linewidths=.5, annot=True, fmt='.2f',
                             cmap="YlGnBu", annot_kws={"size": 10}, vmin=0.75, vmax=1.0)
            ax.set_title(title)
            ax.set_ylabel(ylabel)
            # Save and close the figure that owns this heatmap explicitly
            # instead of relying on pyplot's notion of the current figure.
            pdf.savefig(ax.figure, bbox_inches='tight')
            plt.close(ax.figure)

In [5]:
from xlsxwriter.utility import xl_col_to_name

def write_excel(filename, dfs, names):
    """Write each frame to its own worksheet and colour its values.

    Every data cell gets a 3-colour scale whose low end is the frame's
    overall minimum, whose midpoint is the mean of the per-column medians,
    and whose high end is fixed at 1.0.

    Parameters
    ----------
    filename : str
        Path of the .xlsx workbook to create.
    dfs : iterable of pandas.DataFrame
        Frames to write, one per worksheet.
    names : iterable of str
        Sheet names, paired positionally with `dfs`.

    Notes
    -----
    The formatted range starts on the second worksheet row, i.e. it assumes
    a single header row (single-level columns).
    """
    with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
        for df, name in zip(dfs, names):
            df.to_excel(writer, sheet_name=name)
            if df.empty:
                # Nothing to colour; also avoids an inverted cell range.
                continue
            worksheet = writer.sheets[name]
            # Zero-based bounds of the data area: row 0 holds the header and
            # the leftmost columns hold one column per index level.
            first_row = 1
            first_col = df.index.nlevels
            last_row = first_row + len(df.index) - 1
            last_col = first_col + len(df.columns) - 1
            midpoint = df.median().mean()
            minimum = df.min().min()
            # The scale bounds are fixed numbers, so one rule over the whole
            # data area colours each cell exactly as per-column rules would.
            worksheet.conditional_format(first_row, first_col, last_row, last_col,
                                         {'type': '3_color_scale',
                                          'min_type': 'num',
                                          'mid_type': 'num',
                                          'max_type': 'num',
                                          'min_value': minimum,
                                          'mid_value': midpoint,
                                          'max_value': 1.0})

In [6]:
from multiprocessing import Pool


titles = ['MovieLens 1M', 'LibraryThing', 'BeerAdvocate']

metrics = ['bpref', 'err', 'infAP2', 'map', 'ndcg', 'qm', 'recip_rank', 'set_F', 'set_P', 'set_recall']

# Produce the heatmap PDF, the clustermap PDF and the Excel workbook in
# parallel worker processes.
with Pool() as pool:
    pending = [
        pool.apply_async(plot_heatmap_in_pdf,
                         ['results/correlation/heatmap_kendall_metrics.pdf', titles, taus, "Kendall's tau", metrics]),
        pool.apply_async(plot_clustermap_in_pdf,
                         ['results/correlation/clustermap_kendall_metrics.pdf', titles, taus, "Kendall's tau", metrics]),
        pool.apply_async(write_excel,
                         ['results/correlation/kendall_metrics.xlsx', taus, titles]),
    ]
    pool.close()
    pool.join()
    # apply_async keeps a worker's exception inside its AsyncResult; get()
    # re-raises it here so a failed export does not pass unnoticed.
    for job in pending:
        job.get()

In [7]:
from multiprocessing import Pool


titles = ['MovieLens 1M', 'LibraryThing', 'BeerAdvocate']

# Cutoff values shared by every metric family below.
cutoff_values = [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]


def _at_cutoffs(prefix):
    """Return the column names '<prefix>_<k>' for every k in cutoff_values."""
    return ['{}_{}'.format(prefix, k) for k in cutoff_values]


precs = _at_cutoffs('P')
recalls = _at_cutoffs('recall')
maps = _at_cutoffs('map_cut')
ndcgs = _at_cutoffs('ndcg_cut')
ndcg2s = _at_cutoffs('ndcg2_cut')
ndcg45s = _at_cutoffs('ndcg45_cut')
qms = _at_cutoffs('qm_cut')
qm45s = _at_cutoffs('qm45_cut')

# One heatmap PDF per metric family, each rendered in a worker process.
with Pool() as pool:
    pending = []
    for metric_name, metrics in zip(
        ['precision', 'recall', 'map', 'ndcg', 'ndcg2', 'ndcg45', 'qm', 'qm45'],
        [precs, recalls, maps, ndcgs, ndcg2s, ndcg45s, qms, qm45s]):

        pending.append(
            pool.apply_async(plot_heatmap_in_pdf,
                             ['results/correlation/heatmap_kendall_' + metric_name + '_cutoffs.pdf',
                              titles, taus, "Kendall's tau", metrics]))
    pool.close()
    pool.join()
    # apply_async keeps a worker's exception inside its AsyncResult; get()
    # re-raises it here so a failed export does not pass unnoticed.
    for job in pending:
        job.get()

In [8]:
# Compact figure with LaTeX-rendered serif text.
plt.rc('font', family='serif')
plt.rc('text', usetex=True)
matplotlib.rcParams['figure.figsize'] = [6, 2.8]

ndcgs = [
    'ndcg_cut_5', 'ndcg_cut_10', 'ndcg_cut_20', 'ndcg_cut_30', 'ndcg_cut_40', 'ndcg_cut_50',
    'ndcg_cut_60', 'ndcg_cut_70', 'ndcg_cut_80', 'ndcg_cut_90', 'ndcg_cut_100',
]

# Short axis labels, in the same order as `ndcgs`.
cutoffs = ['@5', '@10', '@20', '@30', '@40', '@50', '@60', '@70', '@80', '@90', '@100']

# Correlations among the nDCG cutoffs for the first dataset only (taus[0]),
# relabelled on both axes with the short cutoff names.
df = taus[0].loc[ndcgs, ndcgs]
df.index = cutoffs
df.columns = cutoffs

with PdfPages('results/correlation/heatmap_kendall_ndcg_cutoffs_final.pdf') as pdf:
    ax = sns.heatmap(
        df,
        cmap="YlGnBu",
        vmin=0.80,
        vmax=1.0,
        annot=True,
        fmt='.2f',
        annot_kws={"size": 10},
        linewidths=.5,
        square=False,
        cbar=False,
    )
    # Column labels go on top; row labels stay horizontal.
    ax.xaxis.tick_top()
    plt.yticks(rotation=0)
    pdf.savefig(bbox_inches='tight')
    plt.close()

In [9]:
# Small figure with LaTeX-rendered serif text.
plt.rc('font', family='serif')
plt.rc('text', usetex=True)
matplotlib.rcParams['figure.figsize'] = [4, 2.24]

# Column names to plot and, in the same order, the labels shown for them.
metrics = ['set_P', 'set_recall', 'map',  'ndcg', 'recip_rank', 'bpref', 'infAP2']
new_names = ['P', 'Recall', 'MAP', 'nDCG', 'MRR', 'bpref', 'infAP']

# One untitled page per correlation matrix in `taus`.
with PdfPages('results/correlation/heatmap_kendall_metrics_final.pdf') as pdf:
    for df in taus:
        df = df.loc[metrics, metrics]
        df.index = new_names
        df.columns = new_names
        ax = sns.heatmap(
            df,
            cmap="YlGnBu",
            vmin=0.80,
            vmax=1.0,
            annot=True,
            fmt='.2f',
            annot_kws={"size": 10},
            linewidths=.5,
            square=False,
            cbar=False,
        )
        # Column labels go on top; row labels stay horizontal.
        ax.xaxis.tick_top()
        plt.yticks(rotation=0)
        pdf.savefig(bbox_inches='tight')
        plt.close()

In [10]:
# LaTeX-rendered serif text.
# NOTE(review): sns.clustermap takes its own figsize argument; confirm that
# the rcParams figure size set here is actually applied to these pages.
plt.rc('font', family='serif')
plt.rc('text', usetex=True)
matplotlib.rcParams['figure.figsize'] = [6, 3.4]

titles = ['MovieLens 1M', 'LibraryThing', 'BeerAdvocate']
# Column names to plot and, in the same order, the labels shown for them.
metrics = ['set_P', 'set_recall', 'map',  'ndcg', 'recip_rank', 'bpref', 'infAP2']
new_names = ['P', 'Recall', 'MAP', 'nDCG', 'MRR', 'bpref', 'infAP']

# One titled, Ward-clustered page per correlation matrix in `taus`.
with PdfPages('results/correlation/clustermap_kendall_metrics_final.pdf') as pdf:
    for df, title in zip(taus, titles):
        df = df.loc[metrics, metrics]
        df.index = new_names
        df.columns = new_names
        cg = sns.clustermap(
            df,
            method='ward',
            cmap="YlGnBu",
            vmin=0.80,
            vmax=1.0,
            linewidths=.5,
        )
        cg.ax_heatmap.set_title(title)
        # Keep the row labels horizontal.
        plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
        pdf.savefig(bbox_inches='tight')
        plt.close()