# Multi-label classification -- top-push loss

In [None]:
%matplotlib inline
%load_ext line_profiler
%load_ext autoreload
%autoreload 2

import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd

from scipy.optimize import minimize
from scipy.optimize import check_grad

from sklearn.base import BaseEstimator
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
sys.path.append('src')
from evaluate import avgPrecision, avgPrecisionK, printEvaluation
from datasets import create_dataset_yeast_train, create_dataset_yeast_test, yeast_nLabels
from datasets import create_dataset_scene_train, create_dataset_scene_test, scene_nLabels
from datasets import create_dataset_mediamill_subset_train, create_dataset_mediamill_subset_test, mm_nLabels

In [None]:
# One row per dataset: (name, number of labels, train loader, test loader).
# Transposing the rows yields the four parallel lists that are indexed by data_ix.
datasets, num_labels, create_dataset_train_funcs, create_dataset_test_funcs = map(list, zip(
    ('yeast',     yeast_nLabels, create_dataset_yeast_train,            create_dataset_yeast_test),
    ('scene',     scene_nLabels, create_dataset_scene_train,            create_dataset_scene_test),
    ('mediamill', mm_nLabels,    create_dataset_mediamill_subset_train, create_dataset_mediamill_subset_test),
))

In [None]:
# position of the dataset to use in the parallel lists above (0: yeast, 1: scene, 2: mediamill)
data_ix = 0

In [None]:
# Resolve everything that depends on the selected dataset in one place
dataset_name, nLabels = datasets[data_ix], num_labels[data_ix]
create_dataset_train, create_dataset_test = (
    create_dataset_train_funcs[data_ix],
    create_dataset_test_funcs[data_ix],
)

The sigmoid function.

In [None]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated with NumPy (elementwise for arrays)"""
    denom = 1.0 + np.exp(-x)
    return 1.0 / denom

## top-push loss

Multi-label learning with the top-push loss.

In [None]:
def obj_top_push(w, X, Y, C):
    r"""
        Objective with L2 regularisation and top push loss, evaluated on the dual
        variables, together with its gradient
        
        Input:
            - w: current dual variables, length 2 * N * K: the N x K matrix alpha
                 (flattened) followed by the N x K matrix beta (flattened)
            - X: feature matrix, N x D
            - Y: label matrix,   N x K (entries are compared against 1 and 0)
            - C: regularisation constant, is consistent with scikit-learn C = 1 / (N * \lambda)
                 if we normalise the objective J by dividing C, \lambda = 1/C
        
        Output:
            - (J, G): objective value and its gradient w.r.t. w, G has the same layout as w
    """
    N, D = X.shape
    K = Y.shape[1]
    assert(w.shape[0] == 2 * N * K)
    assert(C > 0)
    
    A = w[:N * K].reshape(N, K)  # alpha
    B = w[N * K:].reshape(N, K)  # beta
        
    J = 0.0  # cost
    GA = np.zeros_like(A)  # gradient w.r.t. alpha, N x K
    GB = np.zeros_like(B)  # gradient w.r.t. beta,  N x K
    nPosAll = np.sum(Y, axis=1)  # number of positive labels for each example, N by 1
    nNegAll = K - nPosAll        # number of negative labels for each example, N by 1
    
    for k in range(K):
        a  = A[:, k]  # alpha
        b  = B[:, k]  # beta
        Yk = Y[:, k]
        isPos = (Yk == 1)
        
        # per-example weights 1 / (N * #positives) and 1 / (N * #negatives), zero elsewhere
        # NOTE: an example with no positive (or no negative) label gives 0/0 = NaN here
        posVec = np.zeros(N)
        negVec = np.zeros(N)
        posVec[isPos] = 1
        negVec[Yk == 0] = 1
        posVec = np.divide(posVec, nPosAll) / N
        negVec = np.divide(negVec, nNegAll) / N
        
        c = np.multiply(a, posVec) + np.multiply(b, negVec)  # gamma
        d = np.dot(X.T, c)  # sum_i gamma_i * x_i, length D
        
        # the logarithms below are only defined for alpha in (-1, 0)
        a1 = a[isPos]
        t1 = -np.log(-a1)
        t2 = np.log(1 + a1)
        t3 = np.multiply(a1, t1) + np.multiply(1+a1, t2)
        p1 = posVec[isPos]
        
        J += 0.5 * C * np.dot(d, d) + np.dot(p1, t3)
        
        # gradient of 0.5 * C * ||d||^2 w.r.t. alpha_i (beta_i) is C * <x_i, d> * posVec_i (negVec_i);
        # gradient of p_i * t3_i w.r.t. alpha_i is p_i * (t1_i + t2_i).
        # Only column k of the gradient matrices depends on this label.
        Xd = np.dot(X, d)
        GA[:, k] = C * np.multiply(Xd, posVec)
        GA[isPos, k] += np.multiply(p1, t1 + t2)
        GB[:, k] = C * np.multiply(Xd, negVec)
        
    return (J, np.concatenate([GA.ravel(), GB.ravel()]))

In [None]:
def dual2primal(w, X, Y, C):
    r"""
        Compute primal variable values given dual variable values
        
        Input:
            - w: dual variables, length 2 * N * K: the N x K matrix alpha
                 (flattened) followed by the N x K matrix beta (flattened)
            - X: feature matrix, N x D
            - Y: label matrix,   N x K (entries are compared against 1 and 0)
            - C: regularisation constant, is consistent with scikit-learn C = 1 / (N * \lambda)
                 if we normalise the objective J by dividing C, \lambda = 1/C
        
        Output:
            - W: primal weight matrix, K x D, one row per label
    """
    N, D = X.shape
    K = Y.shape[1]
    assert(w.shape[0] == 2 * N * K)
    assert(C > 0)
    
    W = np.zeros((K, D))  # shape must be passed as a tuple
    A = w[:N * K].reshape(N, K)  # alpha
    B = w[N * K:].reshape(N, K)  # beta
    
    nPosAll = np.sum(Y, axis=1)  # number of positive labels for each example, N by 1
    nNegAll = K - nPosAll        # number of negative labels for each example, N by 1
    
    for k in range(K):
        a  = A[:, k]  # alpha
        b  = B[:, k]  # beta
        Yk = Y[:, k]
        
        # per-example weights 1 / (N * #positives) and 1 / (N * #negatives), zero elsewhere
        posVec = np.zeros(N)
        negVec = np.zeros(N)
        posVec[Yk == 1] = 1
        negVec[Yk == 0] = 1
        posVec = np.divide(posVec, nPosAll) / N
        negVec = np.divide(negVec, nNegAll) / N
        
        c = np.multiply(a, posVec) + np.multiply(b, negVec)  # gamma
        W[k, :] = -C * np.dot(X.T, c)  # -C * sum_i gamma_i * x_i
        
    return W

In [None]:
def initVar(X, Y, seed=None):
    """
        Draw a random starting point for the dual variables
        
        Input:
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
            - seed: if given, re-seeds NumPy's global random generator first
        
        Output:
            - vector of length 2 * N * K: N * K values in (-1, 0] followed by
              N * K values in [0, 1)
    """
    nExamples, _ = X.shape
    nVars = nExamples * Y.shape[1]
    
    if seed is not None:
        np.random.seed(seed)
    
    negHalf = -np.random.rand(nVars)
    posHalf = np.random.rand(nVars)
    return np.concatenate([negHalf, posHalf])

Check gradient

In [None]:
# load the train / test split of the selected dataset via the loaders chosen above
X_train, Y_train = create_dataset_train()
X_test,  Y_test  = create_dataset_test()

In [None]:
# notebook display: dimensions of the training feature matrix
X_train.shape

In [None]:
#%%script false
# Ad-hoc check: evaluate the objective at a random starting point and at a copy
# with one coordinate changed, then print the squared distance between the two
# points and the difference of the objective values.
C = 1
w0 = initVar(X_train, Y_train)
J, G = obj_top_push(w0, X_train, Y_train, C)
w1 = w0.copy()
# NOTE(review): this sets w1[0] to w1[5] + 1e5, a very large positive value; w1[0] is
# an alpha entry and obj_top_push takes log(-alpha) for positive labels -- confirm intent
w1[0] = w1[5] + 10**(5)
J1, G1 = obj_top_push(w1, X_train, Y_train, C)
print(np.dot(w1-w0, w1-w0))
print(J1 - J, J, J1)

In [None]:
#%%script false
#C = 1  # if C is lambda
#C = 1/X_train.shape[0]
#C = 1
#C = 10**(-6)
#C = 10**(6)
C = 10
p = 1  # not consumed by obj_top_push, whose signature is (w, X, Y, C)
# obj_top_push asserts len(w) == 2 * N * K (dual variables alpha and beta),
# so the starting point must come from initVar rather than have length D * nLabels
w0 = initVar(X_train, Y_train)
#%lprun -f obj_top_push check_grad(lambda w: obj_top_push(w, X_train, Y_train, C)[0], \
#                                  lambda w: obj_top_push(w, X_train, Y_train, C)[1], w0)

In [None]:
class MLC_toppush(BaseEstimator):
    """Multi-label classifier trained by minimising obj_top_push over the dual variables.

    All methods are necessary for a scikit-learn estimator.
    After a successful fit, the primal weights are stored in self.W (K x D).
    """
    
    def __init__(self, p=1, C=1):
        """Initialisation
        
        Input:
            - p: stored as-is; not used by the top-push objective in this class
            - C: regularisation constant, must be positive
        """
        
        assert C > 0
        # every __init__ argument has to be stored under its own name, otherwise
        # BaseEstimator.get_params() / clone() (used by GridSearchCV) cannot read it back
        self.p = p
        self.C = C
        self.trained = False
        
    def fit(self, X_train, Y_train):
        """Model fitting by optimising the objective
        
        Input:
            - X_train: feature matrix, N x D
            - Y_train: label matrix,   N x K
        
        Output:
            - self, following the scikit-learn convention
        """
        
        opt_method = 'SLSQP'
        options = {'disp': True}
        if options['disp']: 
            print('\nC: %g' % self.C)
            
        # obj_top_push expects 2 * N * K dual variables (alpha, beta), not L * D primal weights
        w0 = initVar(X_train, Y_train)  # initial guess
        # NOTE(review): obj_top_push takes log(-alpha) and log(1 + alpha), so alpha has to stay
        # in (-1, 0); no bounds or constraints are passed to the optimiser here -- confirm
        opt = minimize(obj_top_push, w0, args=(X_train, Y_train, self.C),
                       method=opt_method, jac=True, options=options)
        # truthiness instead of `is True`: the flag may be a NumPy bool rather than the True singleton
        if opt.success:
            self.W = dual2primal(opt.x, X_train, Y_train, self.C)
            self.trained = True
        else:
            sys.stderr.write('Optimisation failed')
            self.trained = False
        return self
            
            
    def decision_function(self, X_test):
        """Make predictions (score is real number)
        
        Returns an array with one row per example in X_test and one column per label.
        """
        
        assert self.trained is True, "Can't make prediction before training"
        return np.dot(X_test, self.W.T)
        
    
    def predict(self, X_test):
        """Make predictions (score is boolean): True where the real-valued score is positive"""
        
        preds = self.decision_function(X_test)
        return (preds > 0)
    
    
    def score(self, X, Y):
        """Compute scoring metric: avgPrecisionK of the real-valued scores against Y"""
        
        allPreds = self.decision_function(X)
        return avgPrecisionK(Y, allPreds)
    
    # inherit from BaseEstimator instead of re-implement
    #
    #def get_params(self, deep = True):
    #def set_params(self, **params):

In [None]:
# Grid-search the regularisation constant C over 1e-6 ... 1e-2 with cv=5
parameters = [{'C': [10**e for e in range(-6, -1)]}]

clf = GridSearchCV(estimator=MLC_toppush(), param_grid=parameters, cv=5)
clf.fit(X_train, Y_train)

print("\nBest parameters set found on development set:", clf.best_params_, sep='\n')

In [None]:
# one line per parameter setting: mean test score, twice its standard deviation, and the setting
for mean, std, params in zip(*(clf.cv_results_[key]
                               for key in ('mean_test_score', 'std_test_score', 'params'))):
    print("%0.3f (+/-%0.03f) for %r" % (mean, 2 * std, params))

In [None]:
# real-valued label scores from the grid search's decision_function, train first, then test
preds_train, preds_test = map(clf.decision_function, (X_train, X_test))

In [None]:
# report the evaluation from printEvaluation on both splits, separated by a blank line
print('Training set:')
printEvaluation(Y_train, preds_train)
print()
print('Test set:')
printEvaluation(Y_test, preds_test)

## Result analysis

In [None]:
# avgPrecision at every cut-off k = 1 .. nLabels, for the train split and then the test split
precisions_train, precisions_test = (
    [avgPrecision(labels, scores, k) for k in range(1, nLabels + 1)]
    for labels, scores in ((Y_train, preds_train), (Y_test, preds_test))
)

In [None]:
# single avgPrecisionK value per split; drawn as horizontal reference lines in the plot below
precisionK_train = avgPrecisionK(Y_train, preds_train)
precisionK_test  = avgPrecisionK(Y_test,  preds_test)

In [None]:
plt.figure(figsize=[10,5])
# colour encodes the split (red: train, green: test); line style encodes the quantity
# (dashed: precision at each k, solid: constant Precision@K level), so that the two
# green lines are distinguishable
plt.plot(precisions_train, ls='--', c='r', label='Train')
plt.plot(precisions_test,  ls='--', c='g', label='Test')
plt.plot([precisionK_train] * nLabels, ls='-', c='r', label='Train, Precision@K')
plt.plot([precisionK_test]  * nLabels, ls='-', c='g', label='Test, Precision@K')
# the curves are plotted at x = 0 .. nLabels-1; label the ticks with k = 1 .. nLabels
plt.xticks(np.arange(nLabels), np.arange(1,nLabels+1))
plt.xlabel('k')
plt.ylabel('Precision@k')
plt.legend(loc='best')
plt.title('MLC w. Top-push Loss on ' + dataset_name + ' dataset')
plt.savefig(dataset_name + '_tp.svg')