# Evaluate Multi-label Classification

In [1]:
%matplotlib inline
%load_ext line_profiler
%load_ext memory_profiler
%load_ext autoreload
%autoreload 2

import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns

import torchfile

In [2]:
sys.path.append('src')
from evaluate import avgPrecisionK, evaluatePrecision, evaluateF1, evaluateRankingLoss, f1_score_nowarn
from evaluate import calc_F1, calc_precisionK
from datasets import create_dataset, dataset_names, nLabels_dict
from BinaryRelevance import BinaryRelevance
from PC import MLC_pclassification, obj_pclassification, avgF1

In [3]:
dataset_names

['yeast', 'scene', 'bibtex', 'bookmarks', 'delicious', 'mediamill']

In [4]:
data_ix = 3

In [5]:
dataset_name = dataset_names[data_ix]
nLabels = nLabels_dict[dataset_name]
print(dataset_name, nLabels)

bookmarks 208


In [6]:
data_dir = 'data'
SEED = 918273645

## Load dataset

In [7]:
X_train, Y_train = create_dataset(dataset_name, train_data=True, shuffle=True, random_state=SEED)
X_test,  Y_test  = create_dataset(dataset_name, train_data=False)

Feature normalisation.

In [8]:
# Standardise every column with the mean and standard deviation of the training
# split only; the same statistics are then applied to the test split.
X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
# The small constant keeps the division finite for columns with zero variance.
X_train_std = np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)
# NOTE(review): the updates below are in place, so re-running this cell without
# reloading the data first would normalise the arrays a second time.
X_train -= X_train_mean
X_train /= X_train_std
X_test  -= X_train_mean
X_test  /= X_train_std

Save performance data.

In [9]:
def dump_perf(fname, perf_dict, key=None):
    """Merge `perf_dict` into the pickled results file `fname`.

    The file stores a single dict mapping a dataset name to that dataset's
    performance dict. If the file already exists its content is loaded first,
    so entries for other datasets are preserved; entries for the same dataset
    are updated key by key rather than replaced wholesale.

    Parameters
    ----------
    fname : str
        Path of the pickle file to create or update.
    perf_dict : dict
        Performance entries to store, e.g. ``{'Test': {...}}``.
    key : str, optional
        Dataset name under which to store the entries. Defaults to the
        notebook-level `dataset_name`.
    """
    if key is None:
        key = dataset_name

    if os.path.exists(fname):
        # Context managers make sure the handle is closed before we reopen
        # the same path for writing below.
        with open(fname, 'rb') as fd:
            results = pkl.load(fd)
    else:
        results = {}

    results.setdefault(key, {}).update(perf_dict)

    with open(fname, 'wb') as fd:
        pkl.dump(results, fd)

## Evaluate DVN

In [10]:
preds_test = np.load(os.path.join(data_dir, dataset_name + '/preds_test_dvn.npy'))

In [11]:
preds_test.shape

(27856, 208)

In [12]:
Y_test.shape

(27856, 208)

In [13]:
# Sample-averaged F1 of the DVN predictions, binarised at 0.5.
# The builtin `bool` is used because the `np.bool` alias was removed from NumPy.
f1mean = f1_score_nowarn(Y_test.astype(bool), preds_test >= 0.5, average='samples')
print(f1mean)

0.371555968605


In [14]:
# Mean of the values returned by `calc_F1` for the DVN predictions binarised at 0.5.
# The builtin `bool` is used because the `np.bool` alias was removed from NumPy.
F1_sample = np.mean(calc_F1(Y_test.astype(bool), preds_test >= 0.5))
print(F1_sample)

0.371555968605


In [15]:
# Macro-averaged F1 of the DVN predictions, binarised at 0.5.
# The builtin `bool` is used because the `np.bool` alias was removed from NumPy.
F1_label = f1_score_nowarn(Y_test.astype(bool), preds_test >= 0.5, average='macro')
print(F1_label)

0.236748114984


In [16]:
# Precision@K from the raw (un-thresholded) DVN scores, averaged over the values in `pak`.
# The builtin `bool` is used because the `np.bool` alias was removed from NumPy.
pak, ind = calc_precisionK(Y_test.astype(bool), preds_test)
avgPak = np.mean(pak)
print(avgPak)

0.422450083582


In [17]:
auc_dvn = roc_auc_score(Y_test, preds_test, average='samples')
print(auc_dvn)

0.768788536128


In [18]:
#perf_dict_test = evaluatePrecision(Y_test, preds_test, verbose=1)
#perf_dict_test.update(evaluateRankingLoss(Y_test, preds_test))
#print(label_ranking_loss(Y_test, preds_test))

In [19]:
perf_dict_test = {'F1_sample': F1_sample,
                  'F1_label':  F1_label,
                  'Precision@K': avgPak, 
                  'AUC_sample': auc_dvn,
                 }

In [20]:
fname = os.path.join(data_dir, 'perf-dvn.pkl')
dump_perf(fname, {'Test': perf_dict_test})
# Read the file back to show what is now stored; the context manager closes the handle.
with open(fname, 'rb') as fd:
    print(pkl.load(fd))

{'bibtex': {'Test': {'F1_sample': 0.44700475542993195, 'F1_label': 0.32421320747927107, 'Precision@K': 0.50324643673871838, 'AUC_sample': 0.86738763289321019}}, 'bookmarks': {'Test': {'F1_sample': 0.37155596860541956, 'F1_label': 0.23674811498350729, 'Precision@K': 0.42245008358248076, 'AUC_sample': 0.76878853612824871}}}


## Evaluate SPEN

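Sweep a grid of decision thresholds and keep the one with the best sample-averaged F1. Note that the threshold is selected against the same ground-truth labels (`gts`) that are used to report the score.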

In [21]:
gts = torchfile.load(os.path.join(data_dir, dataset_name + '/gts-500.torch'))

In [22]:
gts.shape

(27856, 208)

In [23]:
#np.all(np.equal(Y_test, gts))

In [24]:
preds = torchfile.load(os.path.join(data_dir, dataset_name + '/preds-500.torch'))

In [25]:
preds.shape

(27856, 208)

In [26]:
thresholds = [0, 0.05, 0.10, 0.15, 0.2, 0.25, 0.30, 0.35, 0.4, 0.45, 0.5, 0.55, 0.60, 0.65, 0.70, 0.75]  # SPEN

In [27]:
# For every candidate threshold, record the sample-averaged and the
# macro-averaged F1 of the thresholded SPEN predictions.
F1_all_sample, F1_all_label = [], []
for cutoff in thresholds:
    binarised = preds >= cutoff
    F1_all_sample.append(f1_score_nowarn(gts, binarised, average='samples'))
    F1_all_label.append(f1_score_nowarn(gts, binarised, average='macro'))

In [28]:
bestix = np.argmax(F1_all_sample)
print(F1_all_sample[bestix], F1_all_label[bestix], thresholds[bestix])

0.355393845379 0.240943700849 0.1


In [29]:
# Precision@K from the raw SPEN scores, averaged over the values in `pak`.
# The builtin `bool` is used because the `np.bool` alias was removed from NumPy.
pak, ind = calc_precisionK(gts.astype(bool), preds)
avgPak = np.mean(pak)
print(avgPak)

0.395885920049


In [30]:
auc_spen = roc_auc_score(gts, preds, average='samples')
print(auc_spen)

0.908463711209


In [31]:
#threshold = 0.05 # bibtex
#threshold = 0.1 # bookmarks
#threshold = 0.2 # delicious
#f1mean = f1_score_nowarn(gts, preds > threshold, average='samples')
#print(f1mean)

In [32]:
#perf_dict_test = evaluatePrecision(gts, preds, verbose=1)
#perf_dict_test.update(evaluateRankingLoss(gts, preds))
#print(label_ranking_loss(gts, preds))

In [33]:
perf_dict_test = {'F1_sample': F1_all_sample[bestix],
                  'F1_label': F1_all_label[bestix],
                  'Precision@K': avgPak, 
                  'AUC_sample': auc_spen,
                 }

In [34]:
fname = os.path.join(data_dir, 'perf-spen.pkl')
dump_perf(fname, {'Test': perf_dict_test})
# Read the file back to show what is now stored; the context manager closes the handle.
with open(fname, 'rb') as fd:
    print(pkl.load(fd))

{'bibtex': {'Test': {'F1_sample': 0.41324037236538086, 'F1_label': 0.33658438599904672, 'Precision@K': 0.45575463430655805, 'AUC_sample': 0.9259305964575445}}, 'bookmarks': {'Test': {'F1_sample': 0.3553938453788133, 'F1_label': 0.24094370084927494, 'Precision@K': 0.39588592004939738, 'AUC_sample': 0.90846371120888569}}}


## Evaluate Binary Relevance

Independent Logistic Regression.

In [35]:
fname = os.path.join(data_dir, 'br-' + dataset_name + '-base.pkl')
# NOTE: unpickling can execute arbitrary code; only load model files from a trusted source.
# The context manager closes the file handle once the model has been read.
with open(fname, 'rb') as fd:
    br = pkl.load(fd)

In [36]:
Y_pred = br.decision_function(X_test)

In [37]:
# Sample-averaged F1 for Binary Relevance: a label counts as predicted when
# its decision value is non-negative.
br_hits = Y_pred >= 0
F1_sample = np.mean(f1_score_nowarn(Y_test, br_hits, average='samples'))
print(F1_sample)

0.295202477424


In [38]:
# Macro-averaged F1 for Binary Relevance, using the same non-negative
# decision rule as the sample-averaged score.
br_hits = Y_pred >= 0
F1_label = np.mean(f1_score_nowarn(Y_test, br_hits, average='macro'))
print(F1_label)

0.210030397541


In [39]:
# Precision@K from the Binary Relevance decision values, averaged over the values in `pak`.
# The builtin `bool` is used because the `np.bool` alias was removed from NumPy.
pak, ind = calc_precisionK(Y_test.astype(bool), Y_pred)
avgPak = np.mean(pak)
print(avgPak)

0.356009405914


In [40]:
auc_br = roc_auc_score(Y_test, Y_pred, average='samples')
print(auc_br)

0.872421427693


In [41]:
perf_dict_test = {'F1_sample': F1_sample,
                  'F1_label': F1_label,
                  'Precision@K': avgPak,
                  'AUC_sample': auc_br,
                 }

In [42]:
fname = os.path.join(data_dir, 'perf-br.pkl')
dump_perf(fname, {'Test': perf_dict_test})
# Read the file back to show what is now stored; the context manager closes the handle.
with open(fname, 'rb') as fd:
    print(pkl.load(fd))

{'bibtex': {'Test': {'F1_sample': 0.37869864830540401, 'F1_label': 0.30097928695767068, 'Precision@K': 0.43144043569315471, 'AUC_sample': 0.85314482517322965}}, 'bookmarks': {'Test': {'F1_sample': 0.29520247742393496, 'F1_label': 0.2100303975414152, 'Precision@K': 0.35600940591394675, 'AUC_sample': 0.87242142769280662}}}


## Evaluate P-Classification

In [43]:
fname = os.path.join(data_dir, 'pc-' + dataset_name + '-f1.pkl')
# NOTE: unpickling can execute arbitrary code; only load model files from a trusted source.
# The context manager closes the file handle once the model has been read.
with open(fname, 'rb') as fd:
    pc = pkl.load(fd)

In [44]:
Y_pred = pc.decision_function(X_test)

In [45]:
# Sample-averaged F1 for P-Classification: decision values are binarised at
# the threshold stored on the loaded model.
pc_hits = Y_pred >= pc.best_threshold
F1_sample = np.mean(f1_score_nowarn(Y_test, pc_hits, average='samples'))
print(F1_sample)

0.376603969249


In [46]:
# Macro-averaged F1 for P-Classification, binarised at the threshold stored
# on the loaded model.
pc_hits = Y_pred >= pc.best_threshold
F1_label = np.mean(f1_score_nowarn(Y_test, pc_hits, average='macro'))
print(F1_label)

0.283567306161


In [47]:
# Precision@K from the P-Classification decision values, averaged over the values in `pak`.
# The builtin `bool` is used because the `np.bool` alias was removed from NumPy.
pak, ind = calc_precisionK(Y_test.astype(bool), Y_pred)
avgPak = np.mean(pak)
print(avgPak)

0.422562234028


In [48]:
auc_pc = roc_auc_score(Y_test, Y_pred, average='samples')
print(auc_pc)

0.917518024646


In [49]:
perf_dict_test = {'F1_sample': F1_sample,
                  'F1_label': F1_label,
                  'Precision@K': avgPak,
                  'AUC_sample': auc_pc,
                 }

In [50]:
fname = os.path.join(data_dir, 'perf-pc.pkl')
dump_perf(fname, {'Test': perf_dict_test})
# Read the file back to show what is now stored; the context manager closes the handle.
with open(fname, 'rb') as fd:
    print(pkl.load(fd))

{'bibtex': {'Test': {'F1_sample': 0.47015144430032674, 'F1_label': 0.38779796645363779, 'Precision@K': 0.51331807192145162, 'AUC_sample': 0.93270730846959027}}, 'bookmarks': {'Test': {'F1_sample': 0.37660396924850337, 'F1_label': 0.28356730616129366, 'Precision@K': 0.42256223402828319, 'AUC_sample': 0.91751802464563648}}}


## Results for PRLR

Results of PRLR algorithm from [Multi-Label Learning with Posterior Regularization](https://homes.cs.washington.edu/~luheng/files/mlnlp2014_lshtz.pdf).

In [51]:
fname = os.path.join(data_dir, 'perf-prlr.pkl')

In [52]:
# Hard-coded PRLR figures (fractions, not percentages) taken from the paper linked above.
# Precision@K and AUC are stored as NaN (presumably not reported in the paper; confirm).
perf_dict = {
    'bibtex': {'Test': {'F1_sample': 0.442, 'F1_label': 0.372,'Precision@K': np.nan, 'AUC_sample': np.nan}}, 
    'bookmarks': {'Test': {'F1_sample': 0.349, 'F1_label': 0.230,'Precision@K': np.nan,'AUC_sample': np.nan}},}

In [53]:
# Write inside a context manager so the data is flushed and the handle closed
# before the file is read back for display.
with open(fname, 'wb') as fd:
    pkl.dump(perf_dict, fd)
with open(fname, 'rb') as fd:
    print(pkl.load(fd))

{'bibtex': {'Test': {'F1_sample': 0.442, 'F1_label': 0.372, 'Precision@K': nan, 'AUC_sample': nan}}, 'bookmarks': {'Test': {'F1_sample': 0.349, 'F1_label': 0.23, 'Precision@K': nan, 'AUC_sample': nan}}}


## Generate results table

In [54]:
# (key, LaTeX label) pairs used to build the results table.
# Raw strings keep every backslash literal: in a normal string '\t' (as in
# '\textbf') is a TAB character and '\c' (as in '\cite') is an invalid escape.
algos = [('br', 'BR'),
         ('prlr', r'PRLR~\cite{lin2014multi}'),
         ('spen', r'SPEN~\cite{belanger2016structured}'),
         ('dvn', r'DVN~\cite{gygli2017deep}'),
         ('pc', 'PC (Ours)')]
dataset = [('bibtex', r'\textbf{bibtex}'), ('bookmarks', r'\textbf{bookmarks}')]
metrics = [('F1_sample', 'F$_1$ Example'), ('F1_label', 'F$_1$ Label'), ('AUC_sample', 'AUC')]
           #('Precision@K', 'Precision@K')]

In [55]:
# One results file per algorithm, in the same order as `algos`.
fperf = [os.path.join(data_dir, 'perf-' + algo + '.pkl') for algo, _ in algos]
perfs = []
for perf_path in fperf:
    # Close each handle as soon as its content has been unpickled.
    with open(perf_path, 'rb') as fd:
        perfs.append(pkl.load(fd))

In [56]:
# Row labels are the algorithms' display names; columns are every
# (dataset label, metric label) combination.
rows = [label for _, label in algos]
dataset_labels = [label for _, label in dataset]
metric_labels = [label for _, label in metrics]
cols = pd.MultiIndex.from_product([dataset_labels, metric_labels])

In [57]:
df_test = pd.DataFrame(index=rows, columns=cols)

In [58]:
# Fill the table cell by cell, converting the stored fractions to percentages.
for perf, row in zip(perfs, rows):
    for dat, dat_label in dataset:
        for metric, metric_label in metrics:
            df_test.loc[row, (dat_label, metric_label)] = 100 * perf[dat]['Test'][metric]

In [59]:
df_test

Unnamed: 0_level_0,\textbf{bibtex},\textbf{bibtex},\textbf{bibtex},\textbf{bookmarks},\textbf{bookmarks},\textbf{bookmarks}
Unnamed: 0_level_1,F$_1$ Example,F$_1$ Label,AUC,F$_1$ Example,F$_1$ Label,AUC
BR,37.8699,30.0979,85.3145,29.5202,21.003,87.2421
PRLR~\cite{lin2014multi},44.2,37.2,,34.9,23.0,
SPEN~\cite{belanger2016structured},41.324,33.6584,92.5931,35.5394,24.0944,90.8464
DVN~\cite{gygli2017deep},44.7005,32.4213,86.7388,37.1556,23.6748,76.8789
PC (Ours),47.0151,38.7798,93.2707,37.6604,28.3567,91.7518


In [60]:
# Render the table as LaTeX: floats become one-decimal math-mode values and
# missing entries become "N/A". escape=False leaves the LaTeX markup in the
# row and column labels untouched.
tab_test = df_test.to_latex(float_format=lambda x: '$%.1f$' % x, na_rep='N/A',
                            column_format='l*{6}{c}', multicolumn=True, multicolumn_format='c', escape=False)

In [61]:
#df_test.to_latex?

In [62]:
# Wrap the tabular in a table environment. Raw strings are used so that no
# backslash is read as (or warned about as) an escape sequence.
print(r'\begin{table}[!h]')
print(r'\centering')
print(r'\caption{Performance on multi-label dataset}')
print(r'\label{tab:perf_mlc}')
print(tab_test)
print(r'\end{table}')

\begin{table}[!h]
\centering
\caption{Performance on multi-label dataset}
\label{tab:perf_mlc}
\begin{tabular}{l*{6}{c}}
\toprule
{} & \multicolumn{3}{c}{\textbf{bibtex}} & \multicolumn{3}{c}{\textbf{bookmarks}} \\
{} &   F$_1$ Example & F$_1$ Label &    AUC &      F$_1$ Example & F$_1$ Label &    AUC \\
\midrule
BR                                 &          $37.9$ &      $30.1$ & $85.3$ &             $29.5$ &      $21.0$ & $87.2$ \\
PRLR~\cite{lin2014multi}           &          $44.2$ &      $37.2$ &    N/A &             $34.9$ &      $23.0$ &    N/A \\
SPEN~\cite{belanger2016structured} &          $41.3$ &      $33.7$ & $92.6$ &             $35.5$ &      $24.1$ & $90.8$ \\
DVN~\cite{gygli2017deep}           &          $44.7$ &      $32.4$ & $86.7$ &             $37.2$ &      $23.7$ & $76.9$ \\
PC (Ours)                          &          $47.0$ &      $38.8$ & $93.3$ &             $37.7$ &      $28.4$ & $91.8$ \\
\bottomrule
\end{tabular}

\end{table}
