# Multi-label classification -- top-push loss

In [1]:
%matplotlib inline
%load_ext line_profiler
%load_ext memory_profiler
%load_ext autoreload
%autoreload 2

import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd

from scipy.optimize import minimize
from scipy.optimize import check_grad
from scipy.misc import logsumexp

from sklearn.base import BaseEstimator
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Extend the module search path with the local 'src' directory before importing the
# project helpers below (presumably evaluate.py and datasets.py live there -- verify
# against the repository layout).
sys.path.append('src')
from evaluate import avgPrecision, avgPrecisionK, printEvaluation
from datasets import create_dataset_yeast_train, create_dataset_yeast_test, yeast_nLabels
from datasets import create_dataset_emotions_train, create_dataset_emotions_test, emotions_nLabels
from datasets import create_dataset_scene_train, create_dataset_scene_test, scene_nLabels
from datasets import create_dataset_mediamill_subset_train, create_dataset_mediamill_subset_test, mm_nLabels

In [3]:
# One row per benchmark: (name, number of labels, training-set loader, test-set loader).
# The four parallel lists used by the rest of the notebook are obtained by unzipping the
# rows, so they stay aligned by construction.
_dataset_table = [
    ('yeast',     yeast_nLabels,    create_dataset_yeast_train,            create_dataset_yeast_test),
    ('emotions',  emotions_nLabels, create_dataset_emotions_train,         create_dataset_emotions_test),
    ('scene',     scene_nLabels,    create_dataset_scene_train,            create_dataset_scene_test),
    ('mediamill', mm_nLabels,       create_dataset_mediamill_subset_train, create_dataset_mediamill_subset_test),
]
datasets, num_labels, create_dataset_train_funcs, create_dataset_test_funcs = \
    (list(column) for column in zip(*_dataset_table))

In [4]:
# Position in `datasets` of the benchmark used for this run (0 selects 'yeast').
data_ix = 0

In [5]:
# Look up everything that depends on the chosen benchmark from the parallel lists.
dataset_name, nLabels = datasets[data_ix], num_labels[data_ix]
create_dataset_train, create_dataset_test = (
    create_dataset_train_funcs[data_ix],
    create_dataset_test_funcs[data_ix],
)
print('Dataset:', dataset_name)

Dataset: yeast


The sigmoid function.

In [6]:
def sigmoid(x):
    """
        Logistic function 1 / (1 + exp(-x)), applied element-wise.

        Evaluated as exp(-log(1 + exp(-x))) with np.logaddexp so that large negative
        inputs do not overflow inside exp(-x).

        Input:
            - x: scalar or numpy array

        Output:
            - value(s) in [0, 1] with the same shape as x
    """
    # log(1 + exp(-x)) == logaddexp(0, -x), which numpy evaluates without overflow
    return np.exp(-np.logaddexp(0.0, -x))

## top-push loss

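Multi-label learning with the top-push loss: for every example, the score of each positive label is pushed above a smooth log-sum-exp approximation (with parameter $r$) of the highest-scoring negative label.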

In [7]:
# Scratch check: a length-3 vector times a 3x2 matrix contracts over the rows and yields a length-2 vector.
np.dot(np.ones(3)*2, np.array([[1,2],[3,4],[5,6]]))

array([ 18.,  24.])

In [8]:
# Scratch check: a 3x2 matrix times a length-2 vector contracts over the columns and yields a length-3 vector.
np.dot(np.array([[1,2],[3,4],[5,6]]), np.ones(2)*2)

array([  6.,  14.,  22.])

In [9]:
def obj_toppush(w, X, Y, C, r=1):
    """
        Objective with L2 regularisation and top push loss (vectorised)

        With scores f[n, k] = <W[k], X[n]>, positive labels P_n and negative labels N_n
        of example n, the objective is
            J(w) = |w|^2 / (2 C)
                   + 1/N * sum_n 1/|P_n| * sum_{k in P_n}
                         log(1 + exp(-f[n, k]) * (sum_{j in N_n} exp(r * f[n, j]))^(1/r))
        All exponentials are evaluated in the log domain, so large scores do not overflow.
        Examples without positive labels, or without negative labels, contribute zero
        loss and zero gradient.

        Input:
            - w: current weight vector, flattened K x D
            - X: feature matrix, N x D
            - Y: label matrix,   N x K (entries 0/1 or boolean)
            - C: regularisation constant, C = 1 / lambda
            - r: parameter for log-sum-exp approximation

        Output:
            - (J, g): objective value and its gradient w.r.t. w, flattened K x D
    """
    N, D = X.shape
    K = Y.shape[1]
    assert(w.shape[0] == K * D)
    assert(r > 0)
    assert(C > 0)

    W = w.reshape(K, D)  # theta

    pos = np.asarray(Y).astype(bool)  # N by K, True at positive labels
    neg = ~pos                        # N by K, True at negative labels
    nPos = pos.sum(axis=1)            # number of positive labels for each example, N
    # per-entry weight 1/|P_n| at positive labels, 0 elsewhere (and 0 when |P_n| == 0)
    posWeight = pos * ((nPos > 0) / np.maximum(nPos, 1))[:, None]  # N by K

    T1 = np.dot(X, W.T)  # scores, N by K

    # log-sum-exp of r * score over the negative labels, shifted by the row maximum
    Z = np.where(neg, r * T1, -np.inf)             # N by K, -inf masks positive labels
    Zmax = Z.max(axis=1)                           # N
    Zmax = np.where(np.isfinite(Zmax), Zmax, 0.0)  # rows without negatives: shift by 0
    E = np.exp(Z - Zmax[:, None])                  # N by K, exactly 0 at positive labels
    S = E.sum(axis=1)                              # N, 0 only for rows without negatives
    softmaxNeg = E / np.where(S > 0, S, 1.0)[:, None]  # exp(r f_j) / sum_neg exp(r f_j)
    with np.errstate(divide='ignore'):
        logT3 = (Zmax + np.log(S)) / r             # log of the smoothed max; -inf without negatives

    T4log = logT3[:, None] - T1           # log(exp(-f_k) * T3), N by K
    softplus = np.logaddexp(0.0, T4log)   # log(1 + T4), N by K
    sig = np.exp(T4log - softplus)        # T4 / (1 + T4), N by K

    J = np.dot(w, w) * 0.5 / C + np.sum(softplus * posWeight) / N

    # Derivative of the loss w.r.t. the scores:
    #   positive label k: -sig[n, k] / |P_n|
    #   negative label j: softmaxNeg[n, j] * sum_{k in P_n} sig[n, k] / |P_n|
    A = sig * posWeight                                       # N by K, non-zero at positives only
    dScores = softmaxNeg * A.sum(axis=1)[:, None] - A         # N by K

    G = W / C + np.dot(dScores.T, X) / N

    return (J, G.ravel())

In [21]:
def obj_toppush_loop(w, X, Y, C, r=1):
    """
        Objective with L2 regularisation and top push loss
        
        Explicit-loop implementation: the objective and its gradient are accumulated
        one (example, label) pair at a time.
        
        Input:
            - w: current weight vector, flattened K x D
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
            - C: regularisation constant, C = 1 / lambda
            - r: parameter for log-sum-exp approximation
        
        Output:
            - (J, g): objective value and its gradient w.r.t. w, flattened K x D
    """
    N, D = X.shape
    K = Y.shape[1]
    assert(w.shape[0] == K * D)
    assert(r > 0)
    assert(C > 0)
    
    W = w.reshape(K, D)  # theta
    
    J = 0.0  # cost
    G = np.zeros_like(W)  # gradient matrix
    KPosAll = np.sum(Y, axis=1)  # number of positive labels for each example, N by 1
    
    # Loss: for every positive label k of example n, sum exp(r * (score_j - score_k)) over the
    # negative labels j, take the 1/r power, and add log(1 + .) averaged over the positives of n.
    for n in range(N):
        for k in range(K):
            if Y[n, k] == 1:
                s1 = np.sum([np.exp(r * np.dot(W[j, :] - W[k, :], X[n, :])) for j in range(K) if Y[n, j] == 0])
                J += np.log1p(np.power(s1, 1.0 / r)) / KPosAll[n]
    J = np.dot(w, w) * 0.5 / C + J / N
    
    # Gradient: row k of G collects the contribution of every example n.
    for k in range(K):
        for n in range(N):
            if Y[n, k] == 1:
                # k is a positive label of n: derivative of its own loss term w.r.t. W[k]
                t1 = np.sum([np.exp(r * np.dot(W[j, :] - W[k, :], X[n, :])) for j in range(K) if Y[n, j] == 0])
                t2 = -1.0 / (1 + np.power(t1, -1.0 / r))
                G[k, :] = G[k, :] + X[n, :] * t2 / KPosAll[n]
            else:
                # k is not a positive label of n: W[k] appears inside the sum over negatives of
                # every positive label k1 of n, so accumulate over those k1
                # NOTE(review): if example n has no positive label, sk stays 0.0 and the division
                # below is by KPosAll[n] == 0 -- confirm the data never contains such rows
                sk = 0.0
                for k1 in range(K):
                    if Y[n, k1] == 1:
                        t3 = np.sum([np.exp(r * np.dot(W[j,:] - W[k1, :], X[n, :])) \
                                     for j in range(K) if Y[n, j] == 0])
                        t4 = np.exp(r * np.dot(W[k, :] - W[k1, :], X[n, :]))
                        sk += t4 / (np.power(t3, 1.0 - 1.0 / r) + t3)
                G[k, :] = G[k, :] + X[n, :] * sk / KPosAll[n]
                        
    # add the gradient of the L2 term and average the loss gradient over the examples
    G = W / C + G / N
    
    return (J, G.ravel())

Check gradient

In [12]:
#aa = np.array([0,1,2, 0])
#print(aa)
#print([aa[i] for i in range(4) if aa[i] != 0])
#print([aa[i] if aa[i] != 0 else 10 for i in range(4)])

In [38]:
# Load the train and test splits of the selected dataset; each loader returns a (features, labels) pair.
X_train, Y_train = create_dataset_train()
X_test,  Y_test  = create_dataset_test()

In [39]:
#X_train = X_train[:100, :]
#Y_train = Y_train[:100, :]

In [40]:
# Dimensions of the training feature matrix.
X_train.shape

(1500, 103)

In [41]:
# Regularisation constant and starting point used by the gradient checks below.
C = 1
# Draw w0 from a seeded generator so that the gradient-check results are reproducible
# across kernel restarts; w0 has one entry per (label, feature) pair.
rng = np.random.RandomState(0)
w0 = rng.rand(Y_train.shape[1] * X_train.shape[1])

In [42]:
%%script false
# Disabled cell (the body is handed to `false` and never run as Python): would compare the
# analytic gradient of the vectorised obj_toppush with scipy's finite-difference estimate at w0.
check_grad(lambda w: obj_toppush(w, X_train, Y_train, C)[0], 
           lambda w: obj_toppush(w, X_train, Y_train, C)[1], w0)

In [43]:
#%%script false
# Manual gradient check for obj_toppush_loop: approximate each partial derivative at w0 with a
# central difference and compare against the analytic gradient.
# NOTE(review): this calls obj_toppush_loop 2 * len(w0) + 1 times, so it is slow on the full data.
eps = 1.49e-08  # finite-difference step (about the square root of double machine epsilon)
w = np.zeros_like(w0)  # numerical gradient, filled one coordinate at a time
for i in range(len(w0)):
    wi1 = w0.copy()
    wi2 = w0.copy()
    wi1[i] = wi1[i] - eps
    wi2[i] = wi2[i] + eps
    J1, _ = obj_toppush_loop(wi1, X_train, Y_train, C)
    J2, _ = obj_toppush_loop(wi2, X_train, Y_train, C)
    w[i] = (J2 - J1) / (2 * eps)
    #print(w[i])
J, w1 = obj_toppush_loop(w0, X_train, Y_train, C)  # w1: analytic gradient at w0
diff = w1 - w
# Euclidean norm of the difference between the analytic and the numerical gradient
np.sqrt(np.dot(diff, diff))

3.1925345092963838e-05

In [None]:
%%script false
# Disabled cell. NOTE(review): obj_toppush_loop0 is not defined anywhere in the visible
# notebook, so enabling this cell as-is would presumably fail with a NameError.
check_grad(lambda w: obj_toppush_loop0(w, X_train, Y_train, C)[0], 
           lambda w: obj_toppush_loop0(w, X_train, Y_train, C)[1], w0)

In [None]:
#%%script false
# Compare the analytic gradient of obj_toppush_loop with scipy's finite-difference estimate at w0;
# the displayed value is the norm of the difference, so smaller is better.
check_grad(lambda w: obj_toppush_loop(w, X_train, Y_train, C)[0], 
           lambda w: obj_toppush_loop(w, X_train, Y_train, C)[1], w0)

In [None]:
%%script false
# Disabled cell: for C = 10^-6 ... 10^9, evaluate the loop and the vectorised objective at w0 and
# print the difference in cost, both costs, and the squared norm of the gradient difference.
# NOTE(review): if enabled, the loop rebinds the notebook-level C used by the other cells.
print('%15s %15s %15s %15s' % ('J_Diff', 'J_loop', 'J_vec', 'G_Diff'))
for e in range(-6, 10):
    C = 10**(e)
    #w0 = init_var(X_train, Y_train)
    J,  G  = obj_toppush_loop(w0, X_train, Y_train, C)
    J1, G1 = obj_toppush(w0, X_train, Y_train, C)
    Gdiff = G1 - G
    #print('%-15g %-15g %-15g' % (J1 - J, J, J1))
    print('%15g %15g %15g %15g' % (J1 - J, J, J1, np.dot(Gdiff, Gdiff)))

In [None]:
def minimise_cost(X, Y, C):
    """
        Minimise the L2-regularised top-push objective `obj_toppush` over the weights.

        Input:
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
            - C: regularisation constant, C = 1 / lambda

        Output:
            - (optx, success): the weight vector found by the optimiser (flattened K x D)
              and a bool telling whether the optimiser reported success
    """
    K, D = Y.shape[1], X.shape[1]
    w0 = np.zeros(K * D)  # deterministic starting point
    # obj_toppush returns the pair (objective, gradient), hence jac=True.
    # NOTE(review): the fit is only as good as that gradient -- validate obj_toppush with
    # check_grad before trusting the result.
    opt = minimize(obj_toppush, w0, args=(X, Y, C), method='L-BFGS-B', jac=True)
    return opt.x, bool(opt.success)


class MLC_toppush(BaseEstimator):
    """All methods are necessary for a scikit-learn estimator"""
    
    def __init__(self, p=1, C=1):
        """
            Initialisation

            Input:
                - p: not used by this estimator; stored so that the parameter introspection
                     inherited from BaseEstimator (get_params / set_params) can find it
                - C: regularisation constant, must be positive
        """
        
        assert C > 0
        self.p = p
        self.C = C
        self.trained = False
        
    def fit(self, X_train, Y_train):
        """
            Model fitting by optimising the objective

            On success the learned weights are stored in self.W (K x D) and the model is
            flagged as trained; otherwise it is flagged as untrained. Returns self.
        """
        optx, success = minimise_cost(X_train, Y_train, self.C)
        if success:
            # the optimiser works on the flattened weights; restore the K x D layout
            self.W = optx.reshape(Y_train.shape[1], X_train.shape[1])
            self.trained = True
        else:
            self.trained = False
        return self
            
            
    def decision_function(self, X_test):
        """Make predictions (score is real number), one row per example and one column per label"""
        
        assert self.trained is True, "Can't make prediction before training"
        return np.dot(X_test, self.W.T)
        
    
    def predict(self, X_test):
        """Make predictions (score is boolean): a label is predicted when its score is positive"""
        
        preds = self.decision_function(X_test)
        return (preds > 0)
    
    
    def score(self, X, Y):
        """Compute scoring metric: avgPrecisionK of the real-valued scores against Y"""
        
        allPreds = self.decision_function(X)
        return avgPrecisionK(Y, allPreds)
    
    # inherit from BaseEstimator instead of re-implement
    #
    #def get_params(self, deep = True):
    #def set_params(self, **params):

In [None]:
#%mprun
# Fit a model with the default parameters on the first 50 training examples only.
model = MLC_toppush()
model.fit(X_train[:50], Y_train[:50])
#%memit model.fit(X_train[:30], Y_train[:30])
#%mprun -f minimize model.fit(X_train[:100], Y_train[:100])
#%mprun -f _minimize_slsqp model.fit(X_train[:10], Y_train[:10])

In [None]:
# Candidate regularisation constants C = 10^-6 ... 10^-2.
parameters = [{'C': [10**(e) for e in range(-6,-1)]}]

# 5-fold cross-validated grid search on the first 100 training examples; candidates are
# ranked with the estimator's own score() method because no `scoring` argument is given.
clf = GridSearchCV(MLC_toppush(), parameters, cv=5, n_jobs=1)
clf.fit(X_train[:100], Y_train[:100])

print("\nBest parameters set found on development set:")
print(clf.best_params_)

In [None]:
# Report, for every candidate parameter setting, the mean cross-validation score and twice its
# standard deviation across the folds.
for mean, std, params in zip(clf.cv_results_['mean_test_score'], clf.cv_results_['std_test_score'], \
                             clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [None]:
# NOTE(review): this rebinds `clf` to the single model fitted on 50 examples, so the evaluation
# below does not use the grid-search result -- confirm this is intended.
clf = model

In [None]:
# Real-valued label scores on the full training and test sets.
preds_train = clf.decision_function(X_train)
preds_test  = clf.decision_function(X_test)

In [None]:
# Print the evaluation report produced by the project helper printEvaluation for both splits.
print('Training set:')
printEvaluation(Y_train, preds_train)
print()
print('Test set:')
printEvaluation(Y_test, preds_test)

## Result analysis

In [None]:
# avgPrecision evaluated at every cutoff k = 1 ... nLabels (presumably precision@k averaged over
# the examples -- confirm against evaluate.py).
precisions_train = [avgPrecision(Y_train, preds_train, k) for k in range(1, nLabels+1)]
precisions_test  = [avgPrecision(Y_test,  preds_test,  k) for k in range(1, nLabels+1)]

In [None]:
# Single summary value per split from avgPrecisionK (the same metric MLC_toppush.score uses);
# presumably the cutoff K is each example's own number of positive labels -- confirm against evaluate.py.
precisionK_train = avgPrecisionK(Y_train, preds_train)
precisionK_test  = avgPrecisionK(Y_test,  preds_test)

In [None]:
# Precision@k curves (solid) together with the constant Precision@K reference levels (dashed).
# Colour encodes the split (red = train, green = test) and line style encodes the quantity, so
# all four legend entries are visually distinct.
plt.figure(figsize=[10,5])
plt.plot(precisions_train, ls='-', c='r', label='Train')
plt.plot(precisions_test,  ls='-', c='g', label='Test')
plt.plot([precisionK_train for k in range(nLabels)], ls='--', c='r', label='Train, Precision@K')
plt.plot([precisionK_test  for k in range(nLabels)], ls='--', c='g', label='Test, Precision@K')
# the curves are plotted against positions 0 ... nLabels-1; relabel the ticks as k = 1 ... nLabels
plt.xticks(np.arange(nLabels), np.arange(1,nLabels+1))
plt.xlabel('k')
plt.ylabel('Precision@k')
plt.legend(loc='best')
plt.title('MLC w. Top-push Loss on ' + dataset_name + ' dataset')
plt.savefig(dataset_name + '_tp.svg')