In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab

matplotlib.rcParams['figure.figsize'] = [16.0, 10.0]

In [2]:
df1M = pd.read_csv("data/discriminative-ml-1M.csv", index_col=['RunA', 'RunB'], delimiter=',')
dfLT = pd.read_csv("data/discriminative-libraryThing.csv", index_col=['RunA', 'RunB'], delimiter=',')
dfBA = pd.read_csv("data/discriminative-beerAdvocate.csv", index_col=['RunA', 'RunB'], delimiter=',')

In [3]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import re

from matplotlib.backends.backend_pdf import PdfPages
from itertools import cycle, islice


def without_cutoff(metric):
    return re.fullmatch('.*_[0-9]+', metric) is None

def plot_in_pdf(filename, names, dfs, metrics=None, xlim=[0, 50]):
    with PdfPages(filename) as pdf:
        for name, df in zip(names, dfs):
            if metrics is None:
                metrics = list(filter(without_cutoff, df.columns))
            markers = islice(cycle(['X', '<', '.', 'v', '*', 'o', '>', '^', 'd', 's', '+']), 100)
            fig, ax = plt.subplots()
            for metric in metrics:
                pvalues = df[metric].sort_values(ascending=False).values
                ax.plot(pvalues, label=metric, marker=next(markers))
            ax.set_title("Discrimination analysis on " + name)
            ax.set_ylabel('p_value')
            ax.set_xlabel('pairs')
            ax.set_xlim(xlim)
            ax.legend(loc='lower right')
            pdf.savefig(bbox_inches='tight')
            plt.close()

In [4]:
names = ['MovieLens1M', 'LibraryThing', 'BeerAdvocate']
dfs = [df1M, dfLT, dfBA]
metrics = ['bpref', 'err', 'infAP2', 'map', 'ndcg', 'qm', 'recip_rank', 'set_F', 'set_P', 'set_recall']

plot_in_pdf('results/discrimination/discrimination_metrics.pdf', names, dfs, metrics)

In [5]:
names = ['MovieLens1M', 'LibraryThing', 'BeerAdvocate']
dfs = [df1M, dfLT, dfBA]

precs =   ['P_5', 'P_10', 'P_20', 'P_30', 'P_40', 'P_50', 'P_60', 'P_70',
           'P_80', 'P_90', 'P_100']
recalls = ['recall_5', 'recall_10', 'recall_20', 'recall_30', 'recall_40',
           'recall_50', 'recall_60', 'recall_70', 'recall_80', 'recall_90',
           'recall_100']
maps =    ['map_cut_5', 'map_cut_10', 'map_cut_20', 'map_cut_30',
           'map_cut_40', 'map_cut_50', 'map_cut_60', 'map_cut_70',
           'map_cut_80', 'map_cut_90', 'map_cut_100']
ndcgs =   ['ndcg_cut_5', 'ndcg_cut_10', 'ndcg_cut_20', 'ndcg_cut_30',
           'ndcg_cut_40', 'ndcg_cut_50', 'ndcg_cut_60', 'ndcg_cut_70',
           'ndcg_cut_80', 'ndcg_cut_90', 'ndcg_cut_100']
ndcg2s =  ['ndcg2_cut_5', 'ndcg2_cut_10', 'ndcg2_cut_20', 'ndcg2_cut_30',
           'ndcg2_cut_40', 'ndcg2_cut_50', 'ndcg2_cut_60', 'ndcg2_cut_70',
           'ndcg2_cut_80', 'ndcg2_cut_90', 'ndcg2_cut_100']
ndcg45s = ['ndcg45_cut_5', 'ndcg45_cut_10', 'ndcg45_cut_20', 'ndcg45_cut_30',
           'ndcg45_cut_40', 'ndcg45_cut_50', 'ndcg45_cut_60', 'ndcg45_cut_70',
           'ndcg45_cut_80', 'ndcg45_cut_90', 'ndcg45_cut_100']
qms =     ['qm_cut_5', 'qm_cut_10', 'qm_cut_20', 'qm_cut_30',
           'qm_cut_40', 'qm_cut_50', 'qm_cut_60', 'qm_cut_70',
           'qm_cut_80', 'qm_cut_90', 'qm_cut_100']
qm45s =   ['qm45_cut_5', 'qm45_cut_10', 'qm45_cut_20', 'qm45_cut_30',
           'qm45_cut_40', 'qm45_cut_50', 'qm45_cut_60', 'qm45_cut_70',
           'qm45_cut_80', 'qm45_cut_90', 'qm45_cut_100']

for metric_name, metrics in zip(
    ['precision', 'recall', 'map', 'ndcg', 'ndcg2', 'ndcg45', 'qm', 'qm45'],
    [precs, recalls, maps, ndcgs, ndcg2s, ndcg45s, qms, qm45s]):
    plot_in_pdf('results/discrimination/discrimination_' + metric_name + '_cutoffs.pdf', names, dfs, metrics)

In [6]:
from xlsxwriter.utility import xl_col_to_name


def write_excel_quantiles(filename, dfs, names, start=0.9, end=1.0, steps=11):
    with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
        for df, name in zip(dfs, names):
            df = df.quantile(np.linspace(start, end, steps))
            df.to_excel(writer, sheet_name=name)
            n_percentages = len(df.index)
            count = 1
            worksheet = writer.sheets[name]
            for column in df.columns:
                fmt = "{c}2:{c}{n}".format(c=xl_col_to_name(count), n=n_percentages + 1)
                count += 1
                worksheet.conditional_format(fmt, {'type': '3_color_scale',
                                                   'min_type': 'num',
                                                   'mid_type': 'num',
                                                   'max_type': 'num',
                                                   'min_value': 0.0,
                                                   'mid_value': 0.5,
                                                   'max_value': 1.0})

In [7]:
names = ['MovieLens1M', 'LibraryThing', 'BeerAdvocate']
dfs = [df1M, dfLT, dfBA]

write_excel_quantiles('results/discrimination/discrimination_quantiles.xlsx', dfs, names)

In [8]:
import seaborn as sns


matplotlib.rcParams['figure.figsize'] = [6, 2.8]
sns.set_style("ticks")
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

p = sns.color_palette("Set1", 9)
p.append((0.090196078431372548, 0.74509803921568629, 0.81176470588235294))
p.append((0.73725490196078436, 0.74117647058823533, 0.13333333333333333))


ndcgs = ['ndcg_cut_5', 'ndcg_cut_10', 'ndcg_cut_20', 'ndcg_cut_30',
         'ndcg_cut_40', 'ndcg_cut_50', 'ndcg_cut_60', 'ndcg_cut_70',
         'ndcg_cut_80', 'ndcg_cut_90', 'ndcg_cut_100']
new_names = ['nDCG@5', 'nDCG@10', 'nDCG@20', 'nDCG@30', 'nDCG@40','nDCG@50',
             'nDCG@60', 'nDCG@70', 'nDCG@80', 'nDCG@90', 'nDCG@100']

with PdfPages('results/discrimination/discrimination_ndcg_cutoffs_final.pdf') as pdf:
     for df in (df1M, dfLT, dfBA):
        x = df[ndcgs]
        x.columns = new_names
        with sns.color_palette(p):
            markers = islice(cycle(['X', '<', '.', 'v', '*', 'o', '>', '^', 'd', 's', '+']), 100)
            fig, ax = plt.subplots()
            for metric in x.columns:
                pvalues = x[metric].sort_values(ascending=False).values
                ax.plot(pvalues, label=metric, marker=next(markers), linewidth=1.3, markersize=7)
            ax.set_ylabel('$p$-value')
            ax.set_xlabel('pairs of recommender systems')
            ax.set_xlim([0, 25])
            plt.xticks(np.arange(0, 26, 5))
            ax.legend(ncol=2)
            sns.despine()
            pdf.savefig(bbox_inches='tight')
            plt.close()

In [9]:
import seaborn as sns


matplotlib.rcParams['figure.figsize'] = [6, 2.8]
sns.set_style("ticks")
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

p = sns.color_palette("Set1", 9)
p.append((0.090196078431372548, 0.74509803921568629, 0.81176470588235294))
p.append((0.73725490196078436, 0.74117647058823533, 0.13333333333333333))

metrics = ['set_P', 'set_recall', 'map',  'ndcg', 'recip_rank', 'bpref', 'infAP2']
new_names = ['P', 'Recall', 'MAP', 'nDCG', 'MRR', 'bpref', 'infAP']

with PdfPages('results/discrimination/discrimination_metrics_final.pdf') as pdf:
    for df in (df1M, dfLT, dfBA):
        x = df[metrics]
        x.columns = new_names
        with sns.color_palette(p):
            markers = islice(cycle(['X', '<', '.', 'v', '*', 'o', '>', '^', 'd', 's', '+']), 100)
            fig, ax = plt.subplots()
            for metric in x.columns:
                pvalues = x[metric].sort_values(ascending=False).values
                ax.plot(pvalues, label=metric, marker=next(markers), linewidth=1.3, markersize=7)
            ax.set_ylabel('$p$-value')
            ax.set_xlabel('pairs of recommender systems')
            ax.set_xlim([0, 25])
            plt.xticks(np.arange(0, 26, 5))
            ax.legend(ncol=2)
            sns.despine()
            pdf.savefig(bbox_inches='tight')
            plt.close()

In [10]:
metrics = ['set_P', 'set_recall', 'set_F', 'map',  'ndcg', 'recip_rank', 'err', 'bpref', 'infAP2']
new_names = ['P', 'Recall', 'MAP', 'F1', 'nDCG', 'MRR', 'ERR', 'bpref', 'infAP']


for df in dfs:
    x = df[metrics]
    x.columns = new_names
    print(x.sum().sort_values())
    print()
    
for df in dfs:
    x = df[ndcgs]
    print(x.sum().sort_values())
    print()

nDCG       1.3733
P          2.5566
F1         2.8398
MAP        4.2730
Recall     7.0396
infAP      8.4436
bpref      9.8846
ERR       10.7496
MRR       15.4912
dtype: float64

nDCG      0.2011
MAP       1.1442
P         1.4629
ERR       1.9650
MRR       2.9094
F1        3.5574
infAP     3.8457
bpref     5.3558
Recall    5.9159
dtype: float64

P          1.8462
MAP        2.6661
nDCG       4.4006
infAP      4.8300
ERR        5.2994
MRR        5.8029
Recall     8.3119
F1        10.7426
bpref     12.6614
dtype: float64

ndcg_cut_90     0.9271
ndcg_cut_50     1.1742
ndcg_cut_100    1.1994
ndcg_cut_80     1.4011
ndcg_cut_60     1.8606
ndcg_cut_40     2.1630
ndcg_cut_70     2.7763
ndcg_cut_30     3.5829
ndcg_cut_20     3.9404
ndcg_cut_5      4.7496
ndcg_cut_10     6.6123
dtype: float64

ndcg_cut_100    0.1753
ndcg_cut_80     0.2270
ndcg_cut_60     0.4514
ndcg_cut_90     0.7182
ndcg_cut_70     1.0168
ndcg_cut_50     1.1950
ndcg_cut_40     2.4770
ndcg_cut_20     3.0180
ndcg_cut_5      3.1339