# Multi-label classification -- binary relevance baseline

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Put the 'src' directory (relative to the current working directory) on the
# import path; presumably the project-local `evaluate` and `datasets` modules
# imported below live there -- confirm if the notebook is moved.
sys.path.append('src')
from evaluate import avgPrecision, avgPrecisionK, evaluationPrecision, evaluationF1
from datasets import create_dataset, dataset_names, nLabels_dict

In [3]:
# Show the available dataset names; `data_ix` below is an index into this list.
dataset_names

['yeast', 'scene', 'bibtex', 'bookmarks', 'delicious', 'mediamill']

In [4]:
# Position in `dataset_names` of the dataset to run on (3 -> 'bookmarks').
data_ix = 3

In [5]:
# Resolve the selected dataset and look up its number of labels; `nLabels`
# controls how many per-label classifiers are trained further down.
dataset_name = dataset_names[data_ix]
nLabels = nLabels_dict[dataset_name]
print(dataset_name, nLabels)

bookmarks 208


In [6]:
# Output locations, relative to the current working directory.
data_dir = 'data'
# Per-dataset model file name; no cell visible in this notebook reads or writes it.
fmodel = os.path.join(data_dir, 'lr-' + dataset_name + '.pkl')
# Results pickle; the save cell below stores one entry per dataset name in it.
fperf  = os.path.join(data_dir, 'perf-lr.pkl')

## Binary relevance baseline

Train a logistic regression model for each label.

In [None]:
# Load the train and test splits of the selected dataset.
# X_* are passed to the classifiers as features; Y_* are indexed as
# Y[:, label_ix] in the training loop, so presumably one column per label.
X_train, Y_train = create_dataset(dataset_name=dataset_name, train_data=True)
X_test,  Y_test  = create_dataset(dataset_name=dataset_name, train_data=False)

In [None]:
# Per-label accumulators: the loop appends one entry per label, and a later
# cell stacks each list into a matrix.
allPreds_train, allPreds_test = [ ], [ ]
allTruths_train, allTruths_test = [ ], [ ]
coefMat = [ ]        # not filled in by this cell
labelIndices = [ ]   # not filled in by this cell

# Candidate values for the regularisation parameter C: 10^-6, 10^-5, ..., 10^6.
parameters = [dict(C=[10**(e) for e in range(-6,7)])]
# Model-selection metric (alternatives tried: 'accuracy', 'precision_macro').
scoring = 'average_precision'

# Experiment log: search for a baseline with test F1 on bibtex 0.372 (Lin et al.).
#
#   classifier                                                                    test F1 on bibtex
#   ---------------------------------------------------------------------------  -----------------
#   LogisticRegression()                                                          < 0.3
#   LogisticRegression(C=10)                                                      0.3598
#   LogisticRegression(C=30)                                                      0.3670
#   LogisticRegression(C=100)                                                     0.3730
#   LogisticRegression(class_weight='balanced')                                   0.4342
#   GridSearchCV(LogisticRegression(), parameters, scoring=scoring)               0.3139
#   GridSearchCV(LogisticRegression(), parameters, cv=5, scoring=scoring)         0.3018
#   GridSearchCV(LogisticRegression(class_weight='balanced'), parameters,
#                scoring=scoring)                                                 0.4252
#   GridSearchCV(LogisticRegression(class_weight='balanced'), parameters,
#                cv=5, scoring=scoring)                                           0.4282
#
# The last variant is the one used below.

for label_ix in range(nLabels):
    print('Training for Label %d' % (label_ix+1))

    # Binary target column of the current label.
    y_train, y_test = Y_train[:, label_ix], Y_test[:, label_ix]

    allTruths_train.append(y_train)
    allTruths_test.append(y_test)

    # The training targets must contain both classes, otherwise there is
    # nothing to fit for this label.
    assert not (np.all(y_train == 0) or np.all(y_train == 1))

    # One independent classifier per label, with C chosen by 5-fold CV.
    clf = GridSearchCV(
        LogisticRegression(class_weight='balanced'),
        parameters,
        cv=5,
        scoring=scoring,
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:", clf.best_params_, "", sep='\n')

    # Keep the raw decision-function scores (not class predictions) for ranking metrics.
    allPreds_train.append(clf.decision_function(X_train))
    allPreds_test.append(clf.decision_function(X_test))

Training for Label 1
Best parameters set found on development set:
{'C': 0.001}

Training for Label 2
Best parameters set found on development set:
{'C': 0.01}

Training for Label 3
Best parameters set found on development set:
{'C': 0.001}

Training for Label 4
Best parameters set found on development set:
{'C': 0.01}

Training for Label 5
Best parameters set found on development set:
{'C': 0.001}

Training for Label 6
Best parameters set found on development set:
{'C': 0.01}

Training for Label 7
Best parameters set found on development set:
{'C': 0.001}

Training for Label 8
Best parameters set found on development set:
{'C': 0.01}

Training for Label 9
Best parameters set found on development set:
{'C': 0.01}

Training for Label 10
Best parameters set found on development set:
{'C': 0.001}

Training for Label 11
Best parameters set found on development set:
{'C': 0.001}

Training for Label 12
Best parameters set found on development set:
{'C': 0.001}

Training for Label 13
Best par

In [None]:
# Stack the per-label lists into matrices with one column per label.
#
# Each conversion is guarded so that this cell is idempotent: the names are
# reassigned in place, and without the guard a second run of the cell would
# transpose the already-stacked matrices back to one row per label.
if isinstance(allTruths_train, list):
    allTruths_train = np.array(allTruths_train).T
if isinstance(allTruths_test, list):
    allTruths_test = np.array(allTruths_test).T

if isinstance(allPreds_train, list):
    allPreds_train = np.array(allPreds_train).T
if isinstance(allPreds_test, list):
    allPreds_test = np.array(allPreds_test).T

# Sanity check: predictions and ground truth should have the same shape.
print(allPreds_test.shape)
print(allTruths_test.shape)

In [None]:
#allPreds_test[0]

In [None]:
# Evaluate the raw decision scores against the ground truth with the project's
# `evaluationPrecision` helper, on the training set and then the test set.
# The returned objects are kept and later merged with the F1 results;
# presumably they are dicts of metric name -> value (see evaluate.py).
print('Training set:')
perf_dict_train = evaluationPrecision(allTruths_train, allPreds_train)
print()
print('Test set:')
perf_dict_test = evaluationPrecision(allTruths_test, allPreds_test)

In [None]:
# F1 evaluation on hard (boolean) label predictions.
#
# `allPreds_*` hold LogisticRegression `decision_function` scores, i.e. signed
# distances to the separating hyperplane (log-odds), not probabilities. The
# classifier predicts the positive class when the score is > 0 (equivalently
# when the probability is > 0.5), so the scores are binarised at 0 here.
# Thresholding the scores at 0.5 would correspond to a probability cut-off of
# about 0.62 and systematically under-predict positive labels.
print('Training set:')
f1_train = evaluationF1(allTruths_train, allPreds_train >= 0)
print()
print('Test set:')
f1_test = evaluationF1(allTruths_test, allPreds_test >= 0)

In [None]:
# Fold the F1 results into the precision results, so each split has a single
# collection of metrics. `update` modifies perf_dict_train / perf_dict_test in
# place (presumably plain dicts -- confirm against evaluate.py).
perf_dict_train.update(f1_train)
perf_dict_test.update(f1_test)

In [None]:
# Show the combined metrics: training set first, then test set.
print(perf_dict_train)
print(perf_dict_test)

In [None]:
# Display the path of the results file that the next cell writes to.
fperf

In [None]:
# Persist the metrics of this run in the shared results pickle, which maps
# dataset name -> {'Train': ..., 'Test': ...}.
perf_dict = {'Train': perf_dict_train, 'Test': perf_dict_test}
if os.path.exists(fperf):
    # NOTE: unpickling can execute arbitrary code -- only load files produced
    # by this project.
    with open(fperf, 'rb') as fd:
        _dict = pkl.load(fd)
    # An existing entry for this dataset is kept as is; the results of the
    # current run are only stored when the dataset has no entry yet.
    if dataset_name not in _dict:
        _dict[dataset_name] = perf_dict
else:
    _dict = {dataset_name: perf_dict}
# Use a context manager so the file is flushed and closed before the next
# cell reads it back.
with open(fperf, 'wb') as fd:
    pkl.dump(_dict, fd)

In [None]:
# Read the results file back and display its contents as a check of what is
# stored on disk. The context manager closes the file handle right after
# loading. NOTE: only unpickle files produced by this project.
with open(fperf, 'rb') as fd:
    saved_perf = pkl.load(fd)
saved_perf

## Result analysis

In [None]:
#coefMat = np.array(coefMat).T
#coefMat.shape
#sns.heatmap(coefMat[:, :30])

In [None]:
# Evaluate the project's `avgPrecision` helper for every k = 1 .. nLabels on
# both splits; entry i of each list corresponds to k = i + 1. These lists are
# plotted below as the Precision@k curves.
precisions_train = [avgPrecision(allTruths_train, allPreds_train, k) for k in range(1, nLabels+1)]
precisions_test  = [avgPrecision(allTruths_test,  allPreds_test,  k) for k in range(1, nLabels+1)]

In [None]:
# One summary value per split from the project's `avgPrecisionK` helper (no k
# argument is passed; how K is chosen is defined in evaluate.py). Each value
# is drawn as a horizontal line in the plot below.
precisionK_train = avgPrecisionK(allTruths_train, allPreds_train)
precisionK_test  = avgPrecisionK(allTruths_test,  allPreds_test)

In [None]:
# Precision@k curves for both splits, with the Precision@K values overlaid as
# horizontal reference lines; the figure is also written to an SVG file.
plt.figure(figsize=[10,5])

# (y-values, line style, colour, legend label) -- drawn in this order.
curves = [
    (precisions_train,             '--', 'r', 'Train'),
    (precisions_test,              '-',  'g', 'Test'),
    ([precisionK_train] * nLabels, '-',  'r', 'Train, Precision@K'),
    ([precisionK_test] * nLabels,  '-',  'g', 'Test, Precision@K'),
]
for values, line_style, colour, legend_label in curves:
    plt.plot(values, ls=line_style, c=colour, label=legend_label)

# Points are plotted at x = 0 .. nLabels-1; label the ticks with k = 1 .. nLabels.
tick_positions = np.arange(nLabels)
plt.xticks(tick_positions, tick_positions + 1)

plt.title('Independent Logistic Regression on ' + dataset_name + ' dataset')
plt.xlabel('k')
plt.ylabel('Precision@k')
plt.legend(loc='best')
plt.savefig(dataset_name + '_lr.svg')