# Multi-label classification -- top-push loss

In [1]:
%matplotlib inline
%load_ext line_profiler
%load_ext memory_profiler
%load_ext autoreload
%autoreload 2

import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd

from scipy.sparse import coo_matrix
from scipy.optimize import minimize
from scipy.optimize import check_grad
from scipy.optimize.slsqp import _minimize_slsqp
import nlopt
import ipopt

from sklearn.base import BaseEstimator
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
sys.path.append('src')
from evaluate import avgPrecision, avgPrecisionK, printEvaluation
from datasets import create_dataset_yeast_train, create_dataset_yeast_test, yeast_nLabels
from datasets import create_dataset_emotions_train, create_dataset_emotions_test, emotions_nLabels
from datasets import create_dataset_scene_train, create_dataset_scene_test, scene_nLabels
from datasets import create_dataset_mediamill_subset_train, create_dataset_mediamill_subset_test, mm_nLabels

In [3]:
datasets = ['yeast', 'emotions', 'scene', 'mediamill']
num_labels = [yeast_nLabels, emotions_nLabels, scene_nLabels, mm_nLabels]
create_dataset_train_funcs = [create_dataset_yeast_train, 
                              create_dataset_emotions_train,
                              create_dataset_scene_train, 
                              create_dataset_mediamill_subset_train]
create_dataset_test_funcs  = [create_dataset_yeast_test,
                              create_dataset_emotions_test,
                              create_dataset_scene_test,
                              create_dataset_mediamill_subset_test]

In [4]:
data_ix = 1

In [5]:
dataset_name = datasets[data_ix]
nLabels = num_labels[data_ix]
create_dataset_train = create_dataset_train_funcs[data_ix]
create_dataset_test  = create_dataset_test_funcs [data_ix]
print('Dataset:', dataset_name)

Dataset: emotions


The sigmoid function.

In [6]:
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

## top-push loss

Multi-label learning with top push loss.

In [7]:
def obj_top_push(w, X, Y, C):
    """
        Objective with L2 regularisation and top push loss
        
        Input:
            - w: current weight vector, flattened L x D
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
            - C: regularisation constant, is consistent with scikit-learn C = 1 / (N * \lambda)
                 if we normalise the objective J by dividing C, \lambda = 1/C
    """
    N, D = X.shape
    K = Y.shape[1]
    assert(w.shape[0] == N * K)
    assert(C > 0)
    
    T = w.reshape(N, K)  # theta
        
    J = 0.0  # cost
    G = np.zeros_like(T)  # gradient matrix
    KPosAll = np.sum(Y, axis=1)  # number of positive labels for each example, N by 1
    KNegAll = K - KPosAll        # number of negative labels for each example, N by 1
    
    for k in range(K):
        y = Y[:, k]
        
        posVec = np.zeros(N, dtype=np.float)
        negVec = np.zeros(N, dtype=np.float)
        posVec[y == 1] = 1
        negVec[y == 0] = 1
        posVec = np.divide(posVec, KPosAll * N)  # 1 / NK+, N by 1
        negVec = np.divide(negVec, KNegAll * N)  # 1 / NK-, N by 1
        
        a = np.multiply(T[:, k], posVec)  # alpha / NK+ if y_nk = 1 else 0
        b = np.multiply(T[:, k], negVec)  # beta  / NK- if y_nk = 0 else 0
        c = a + b  # gamma
        d = np.sum(X * c[:, None], axis=0)
        
        t1 = T[y == 1, k]
        p1 = posVec[y == 1]
        s1 = -np.log(-t1)
        s2 = np.log(1 + t1)
        J += 0.5 * C * np.dot(d, d) + np.dot(a[y == 1], s1) + np.dot(p1, np.multiply(1+t1, s2))
        
        p2 = negVec[y == 0]
        t2 = np.dot(X, d) * C
        G[y == 1, k] = np.multiply(p1, t2[y == 1] + s1 + s2)
        G[y == 0, k] = np.multiply(p2, t2[y == 0])
        
    return (J, G.ravel())

In [8]:
def obj_top_push_loop(w, X, Y, C):
    """
        Objective with L2 regularisation and top push loss
        
        Input:
            - w: current weight vector, flattened L x D
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
            - C: regularisation constant, is consistent with scikit-learn C = 1 / (N * \lambda)
                 if we normalise the objective J by dividing C, \lambda = 1/C
    """
    N, D = X.shape
    K = Y.shape[1]
    assert(w.shape[0] == N * K)
    assert(C > 0)
    
    T = w.reshape(N, K)  # theta
    
    J = 0.0  # cost
    G = np.zeros_like(T)  # gradient matrix
    KPosAll = np.sum(Y, axis=1)  # number of positive labels for each example, N by 1
    KNegAll = K - KPosAll        # number of negative labels for each example, N by 1
    
    for k in range(K):
        y = Y[:, k]
        
        # cost
        X1 = np.zeros_like(X)
        for n in range(N):
            t = T[n, k] / (N * KPosAll[n]) if y[n] == 1 else T[n, k] / (N * KNegAll[n])
            X1[n, :] = X[n, :] * t
        xs = np.sum(X1, axis=0)
        J += np.dot(xs, xs) * 0.5 * C
        #print(J)
        
        for n in range(N):
            if y[n] == 1:
                print(T[n,k], n, k)
                assert(T[n, k] < 0)
                assert(T[n, k] + 1 > 0)
                J += (-T[n, k] * np.log(-T[n, k]) + (1 + T[n, k]) * np.log(1 + T[n, k])) / (N * KPosAll[n])
        
        # gradient
        for n in range(N):
            if y[n] == 1:
                ga = C * np.dot(np.sum(X1, axis=0), X[n, :]) - np.log(-T[n, k]) + np.log(1 + T[n, k])
                G[n, k] = G[n, k] + ga / (N * KPosAll[n])
            else:
                gb = C * np.dot(np.sum(X1, axis=0), X[n, :])
                G[n, k] = G[n, k] + gb / (N * KNegAll[n])
        
    return (J, G.ravel())

In [9]:
def dual2primal(w, X, Y, C):
    """
        Compute primal variable values given dual variable values
        
        Input:
            - w: current weight vector, flattened L x D
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
            - C: regularisation constant, is consistent with scikit-learn C = 1 / (N * \lambda)
                 if we normalise the objective J by dividing C, \lambda = 1/C
    """
    N, D = X.shape
    K = Y.shape[1]
    assert(w.shape[0] == N * K)
    assert(C > 0)
    
    W = np.zeros((K, D), dtype=np.float)
    T = w.reshape(N, K)  # theta
    
    KPosAll = np.sum(Y, axis=1)  # number of positive labels for each example, N by 1
    KNegAll = K - KPosAll        # number of negative labels for each example, N by 1
    
    for k in range(K):
        y = Y[:, k]
        
        posVec = np.zeros(N, dtype=np.float)
        negVec = np.zeros(N, dtype=np.float)
        posVec[y == 1] = 1
        negVec[y == 0] = 1
        posVec = np.divide(posVec, KPosAll / N)
        negVec = np.divide(negVec, KNegAll / N)
        
        t = np.multiply(T[:, k], posVec) + np.multiply(T[:, k], negVec)  # gamma
        W[k, :] = -C * np.sum(X * t[:, None], axis=0)
        
    return W

In [10]:
def dual2primal_loop(w, X, Y, C):
    """
        Compute primal variable values given dual variable values
        
        Input:
            - w: current weight vector, flattened L x D
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
            - C: regularisation constant, is consistent with scikit-learn C = 1 / (N * \lambda)
                 if we normalise the objective J by dividing C, \lambda = 1/C
    """
    N, D = X.shape
    K = Y.shape[1]
    assert(w.shape[0] == N * K)
    assert(C > 0)
    
    W = np.zeros((K, D), dtype=np.float)
    T = w.reshape(N, K)  # theta
    
    KPosAll = np.sum(Y, axis=1)  # number of positive labels for each example, N by 1
    KNegAll = K - KPosAll        # number of negative labels for each example, N by 1
    
    for k in range(K):
        y = Y[:, k]
        X1 = X.copy()
        for n in range(N):
            t = T[n, k] / (N * KPosAll[n]) if y[n] == 1 else T[n, k] / (N * KNegAll[n])
            X1[n, :] = X1[n, :] * t
        W[k, :] = -C * np.sum(X1, axis=0)
        
    return W

In [11]:
#aa = np.array([0,1])
#-1 * (2 * aa - 1)

In [12]:
def init_var(X, Y, seed=None):
    """
        Initialise the dual variables
        
        Input:
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
        
        Output:
            - W: matrix with random numbers
                 W_{n,k} \in (-1,0) if Y_{n,k} = 1, 
                 W_{n,k} \in (0, 1) if Y_{n,k} = 0.
    """
    N, D = X.shape
    K = Y.shape[1]
    
    I = -1 * (2 * Y - 1)
    if seed is not None: np.random.seed(seed)
    w0 = np.multiply(np.random.rand(N * K).reshape(N, K), I).ravel()
    
    return w0

In [13]:
def init_var_feasible(X, Y):
    """
        Initialise the dual variables
        
        Input:
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
        
        Output:
            - W: matrix with real numbers that satisfy all constraints
                 W_{n,k} \in (-1,0) if Y_{n,k} = 1, 
                 W_{n,k} \in (0, 1) if Y_{n,k} = 0.
    """
    N, D = X.shape
    K = Y.shape[1]
    
    #I = -1 * (2 * Y - 1)
    #if seed is not None: np.random.seed(seed)
    #w0 = np.multiply(np.random.rand(N * K).reshape(N, K), I).ravel()
    #return w0
    
    # NOTE: 
    # simply let
    # alpha_{n,k} = -v, 0 < v < 1
    # beta_{n,k} = v
    # will satisfy the equality constraints

    I = -1 * (2 * Y - 1)
    
    return np.ravel(0.1 * I)

In [14]:
def obj_constraint(w, n, X, Y):
    """
        Compute the constraints for top push loss
        
        Input:
            - w: weights vector, N*K x 1
            - n: the n-th constraint
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
        
        Output:
            - 0 if all equality constraits satisfy (equal to 0)
            - 1 otherwise
    """
    N, D = X.shape
    K = Y.shape[1]
    T = w.reshape(N, K)  # theta
    assert 0 <= n < N
    
    KPos = np.sum(Y[n, :])  # number of positive labels for the n-th example
    KNeg = K - KPos         # number of negative labels for the n-th example
    yn = Y[n, :]
    
    return (np.sum(T[n, yn == 1]) / KPos + np.sum(T[n, yn == 0]) / KNeg) / N

In [15]:
def grad_constraint(w, n, X, Y):
    """
        Compute the gradient of constraints for top push loss
        
        Input:
            - w: weights vector, N*K x 1
            - n: the n-th constraint
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
        
        Output:
            - an array of gradients
    """
    N, D = X.shape
    K = Y.shape[1]
    T = w.reshape(N, K)  # theta
    assert 0 <= n < N
    
    KPos = np.sum(Y[n, :])  # number of positive labels for the n-th example
    KNeg = K - KPos         # number of negative labels for the n-th example
    yn = Y[n, :]
    
    pos = np.zeros(K, dtype=np.float)
    neg = np.zeros(K, dtype=np.float)
    pos[yn == 1] = 1
    neg[yn == 0] = 1
    pos = np.divide(pos, KPos / N)
    neg = np.divide(neg, KNeg / N)
        
    return pos + neg

In [196]:
class toppush(object):
    
    def __init__(self, X, Y, C):
        """
        Initialisation and computing shared data.
            Input:
                - X: feature matrix, N x D
                - Y: label matrix,   N x K
                - C: regularisation constant
        """
        self.N, self.D = X.shape
        self.K = Y.shape[1]
        assert(C > 0)
        self.C = C
        self.X = X
        self.Y = Y

        print(X.shape, Y.shape)
        KPosAll = np.sum(Y, axis=1)  # number of positive labels for each example, N by 1
        print(KPosAll.shape)
        KNegAll = self.K - KPosAll        # number of negative labels for each example, N by 1
        self.coefMat = np.ones((self.N, self.K), dtype=np.float)  # v_{n,k} = 1/NK+ is y_{n,k} = 1 else 1/NK-
        
        for k in range(self.K):
            y = Y[:, k]
            posVec = np.zeros(self.N, dtype=np.float)
            negVec = np.zeros(self.N, dtype=np.float)
            posVec[y == 1] = 1
            negVec[y == 0] = 1
            posVec = np.divide(posVec, KPosAll * self.N)  # 1 / NK+, N by 1
            negVec = np.divide(negVec, KNegAll * self.N)  # 1 / NK-, N by 1
            self.coefMat[:, k] = posVec + negVec


    def objective(self, x):
        #
        # The callback for calculating the objective
        #
        w = x
        assert(w.shape[0] == self.N * self.K)
        T = w.reshape(self.N, self.K)  # theta
        J = 0.0
        Gama = np.multiply(T, self.coefMat)  # gamma
        for k in range(self.K):
            y = self.Y[:, k]
            d = np.sum(self.X * Gama[:, k][:, None], axis=0)
            a = T[y == 1, k]  # alpha
            J += 0.5 * self.C * np.dot(d, d)
            J += np.dot(self.coefMat[y == 1, k], np.multiply(-a, np.log(-a)) + np.multiply(1+a, np.log(1+a)))
        return J
    

    def gradient(self, w):
        #
        # The callback for calculating the gradient
        #
        T = w.reshape(self.N, self.K)   # theta
        Gama = np.multiply(T, self.coefMat)  # gamma
        G = np.zeros_like(T)  # gradient matrix
        for k in range(self.K):
            y = self.Y[:, k]
            a = T[y == 1, k]  # alpha
            d = np.sum(self.X * Gama[:, k][:, None], axis=0)
            t = np.dot(self.X, d) * self.C
            G[y == 1, k] = np.multiply(self.coefMat[y == 1, k], t[y == 1] - np.log(-a) + np.log(1+a))
            G[y == 0, k] = np.multiply(self.coefMat[y == 0, k], t[y == 0])
        return G.ravel()
    

    def constraints(self, w):
        #
        # The callback for calculating the constraints
        #
        T = w.reshape(self.N, self.K)   # theta
        Gama = np.multiply(T, self.coefMat)  # gamma
        return np.sum(Gama, axis=1)
    

    def jacobian(self, w):
        #
        # The callback for calculating the Jacobian of constraints
        #
        jac = np.tile(np.zeros((self.N, self.K)), (N,1))
        ix = [n * (self.N + 1) for n in range(self.N)]  # [0, N+1, 2N+2, ..., N^2-1]
        jac[ix, :] = self.coefMat
        return jac.ravel()

    
    def hessian(self, w, lagrange_multipliers, obj_factor):
        #
        # The callback for calculating the Hessian of both objective and constraints
        #
        T1 = np.repeat(self.X, repeats=self.K, axis=0)  # repeat each row K times, NK by K
        T2 = T1 * self.coefMat.flatten()[:, None]  # scale each row in T1
        H = obj_factor * np.tril(np.dot(T2, T2.T)) # NK by NK
        
        # Linear constraints: Hession of all constraints are zeros
        return H.ravel()

    
    def intermediate(
            self,
            alg_mod,
            iter_count,
            obj_value,
            inf_pr,
            inf_du,
            mu,
            d_norm,
            regularization_size,
            alpha_du,
            alpha_pr,
            ls_trials
        ):
        #
        # Example for the use of the intermediate callback.
        #
        print("Objective value at iteration #%d is - %g" % (iter_count, obj_value))

Check gradient

In [197]:
X_train, Y_train = create_dataset_train()
X_test,  Y_test  = create_dataset_test()

In [198]:
X_train.shape

(391, 72)

In [199]:
%%script false
# testing func
def obj_func(w):
    # f = xy + x^2
    x = w[0]
    y = w[1]
    return (x*y + x*x, np.asarray([y + 2*x, x]))

eps = 1.49e-08
w0 = np.random.rand(2)
w = np.zeros_like(w0)
for i in range(len(w0)):
    wi1 = w0.copy()
    wi2 = w0.copy()
    wi1[i] = wi1[i] - eps
    wi2[i] = wi2[i] + eps
    J1, _ = obj_func(wi1)
    J2, _ = obj_func(wi2)
    w[i] = (J2 - J1) / (2 * eps)
J, w1 = obj_func(w0)
wdiff = w1 - w
print(np.dot(wdiff, wdiff))

In [200]:
%%script false
C = 1
eps = 1.49e-08
w0 = init_var(X_train[:100, :], Y_train[:100, :])
w = np.zeros_like(w0)
for i in range(len(w0)):
    wi1 = w0.copy()
    wi2 = w0.copy()
    wi1[i] = wi1[i] - eps
    wi2[i] = wi2[i] + eps
    J1, _ = obj_top_push_loop(wi1, X_train[:100, :], Y_train[:100, :], C)
    J2, _ = obj_top_push_loop(wi2, X_train[:100, :], Y_train[:100, :], C)
    w[i] = (J2 - J1) / (2 * eps)
    #print(w[i])
J, w1 = obj_top_push_loop(w0, X_train[:100, :], Y_train[:100, :], C)
diff = w1 - w
np.dot(diff, diff)

In [201]:
%%script false
C = 1
w0 = init_var(X_train[:100, :], Y_train[:100, :])
check_grad(lambda w: obj_top_push_loop(w, X_train[:100, :], Y_train[:100, :], C)[0], 
           lambda w: obj_top_push_loop(w, X_train[:100, :], Y_train[:100, :], C)[1], w0)

In [202]:
%%script false
N = X_train.shape[0]
K = Y_train.shape[1]
print('%15s %15s %15s %15s' % ('J_Diff', 'J_loop', 'J_vec', 'G_Diff'))
for e in range(-6, 10):
    C = 10**(e)
    w0 = init_var(X_train, Y_train)
    J,  G  = obj_top_push_loop(w0, X_train, Y_train, C)
    J1, G1 = obj_top_push(w0, X_train, Y_train, C)
    Gdiff = G1 - G
    #print('%-15g %-15g %-15g' % (J1 - J, J, J1))
    print('%15g %15g %15g %15g' % (J1 - J, J, J1, np.dot(Gdiff, Gdiff)))

In [203]:
%%script false
C = 1
w0 = init_var(X_train, Y_train)
check_grad(lambda w: obj_top_push(w, X_train, Y_train, C)[0], 
           lambda w: obj_top_push(w, X_train, Y_train, C)[1], w0)
#%lprun -f obj_top_push check_grad(lambda w: obj_top_push(w, X_train, Y_train, C)[0], \
#                                  lambda w: obj_top_push(w, X_train, Y_train, C)[1], w0)

In [204]:
#w0 = init_var(X_train[:100, :], Y_train[:100, :])
#check_grad(lambda w: obj_constraint(w, 0, X_train[:100, :], Y_train[:100, :]), 
#           lambda w: grad_constraint(w, 0, X_train[:100, :], Y_train[:100, :]), w0)

In [205]:
X_train.shape

(391, 72)

In [206]:
Y_train.shape

(391, 6)

In [207]:
def minimise_cost_ipopt(X, Y, C):
    """
        Minimise the cost using Interior Point method provided by ipopt
        see https://projects.coin-or.org/Ipopt for details.
        
        Input:
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
            - C: regularisation constant
        
        Output:
            - optx: the optimal solution (N*K x 1) if succeed, None otherwise
            - success: True if succeed else False
    """
    N, D = X.shape
    K = Y.shape[1]
    M = N * K  # number of parameters
    
    w0 = init_var_feasible(X, Y)
    eps = 1e-12
    lb_var = np.array([-1 + eps if wi < 0 else 0 for wi in w0],      dtype=np.float64)  # lower bounds
    ub_var = np.array([ 0 - eps if wi < 0 else 2.0e19 for wi in w0], dtype=np.float64)  # upper bounds
    lb_cons = np.array([0 for n in range(N)])
    ub_cons = np.array([0 for n in range(N)])
    
    prob = ipopt.problem(
        n = len(w0), 
        m = N, 
        problem_obj = toppush(X, Y, C), 
        lb = lb_var, 
        ub = ub_var,
        cl = lb_cons,
        cu = ub_cons
    )
    #prob.addOption('mu_strategy', 'adaptive')
    #prob.addOption('tol', 1e-7)
    xopt, info = prob.solve(w0)
    
    print(info.items())
    return (xopt, True)

In [208]:
def minimise_cost_nlopt(X, Y, C):
    """
        Minimise the cost using Augmented Lagrangian algorithm and L-BFGS provided by NLopt,
        see https://nlopt.readthedocs.io/en/latest/NLopt_Algorithms/#augmented-lagrangian-algorithm for details.
        
        Input:
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
            - C: regularisation constant
        
        Output:
            - optx: the optimal solution (N*K x 1) if succeed, None otherwise
            - success: True if succeed else False
    """
    N, D = X.shape
    K = Y.shape[1]
    M = N * K  # number of parameters
    
    obj_func = obj_top_push_loop
    #obj_func = obj_top_push
    
    # objective
    def _func(w, grad):
        obj, G = obj_func(w, X, Y, C)
        if grad.size > 0:
            grad[:] = G  # inplace write, eqv to np.copyto(grad, G)
        return obj
    
    # constraints
    def _cons(w, grad, n):
        if grad.size > 0:
            T = np.zeros((N, K))
            T[n, :] = grad_constraint(w, n, X, Y)
            grad[:] = T.ravel()
        return obj_constraint(w, n, X, Y)
        
    opt = nlopt.opt(nlopt.LD_AUGLAG, M)
    opt.set_min_objective(_func)  # to minimise objective function
    
    local_opt = nlopt.opt(nlopt.LD_LBFGS, M)
    opt.set_local_optimizer(local_opt)
    
    eps = 1e-12
    w0 = init_var_feasible(X, Y)
    lb = np.array([-1 + eps if wi < 0 else 0 for wi in w0],      dtype=np.float64)  # lower bounds
    ub = np.array([ 0 - eps if wi < 0 else np.inf for wi in w0], dtype=np.float64)  # upper bounds
    opt.set_lower_bounds(lb)
    opt.set_upper_bounds(ub)
    
    for n in range(N): 
        opt.add_equality_constraint(lambda x, grad: _cons(x, grad, n))
        
    opt.set_ftol_rel(2.220446049250313e-09)  # same as scipy.optimize.minimize for L-BFGS-B
    opt.set_maxtime(2 * 60 * 60)  # time limit: 2 hours
    
    try:
        xopt = opt.optimize(w0)                # optimal solution
        opt_val  = opt.last_optimum_value()    # optimal value
        res_code = opt.last_optimize_result()  # result code
        if res_code == nlopt.SUCCESS:
            return (xopt, True)
        else:
            sys.stderr.write('Optimisation failed: (result code %d)' % res)
            return (None, False)
    except:
        sys.stderr.write('Optimisation failed')
        raise

In [209]:
def minimise_cost_slsqp(X, Y, C):
    """
        Minimise the cost using SLSQP (Sequential Least SQuares Programming) algorithm
        
        Input:
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
            - C: regularisation constant
        
        Output:
            - optx: the optimal solution (N*K x 1) if succeed, None otherwise
            - success: True if succeed else False
        
        NOTE: SLSQP is less practical for optimizing more than a few thousand parameters due to space and 
              time complexity, see https://nlopt.readthedocs.io/en/latest/NLopt_Algorithms/#slsqp for details.
    """

    opt_method = 'SLSQP'
    options = {'disp': True, 'maxiter': 100}
    if options['disp']: 
        print('\nC: %g' % C)

    N = X.shape[0]

    # NOTE: initial point should be feasible, otherwise optimisation will fail
    w0 = init_var_feasible(X, Y)
    #bnds = [(-1, 0) if wi < 0 else (0, None) for wi in w0]  # the min/left bound is inclusion
    eps = 1e-12
    bnds = [(-1+eps, 0-eps) if wi < 0 else (0, None) for wi in w0]
    cons = [{'type': 'eq', 'fun': obj_constraint, 'args': (n, X, Y)} for n in range(N)]
    #obj_func = obj_top_push_loop
    obj_func = obj_top_push
    opt = minimize(obj_func, w0, args=(X, Y, C), method=opt_method, jac=True, \
                   bounds=bnds, constraints=cons, options=options)
    
    #print(opt.success)
    #print(type(opt.success))
    #print(opt.success == np.True_)
    # NOTE: opt.success is of type 'numpy.bool_' which is different from python bool
    #if opt.success is True:
    if opt.success is np.True_:        
        return (opt.x, True)
    else:
        sys.stderr.write('Optimisation failed: \n')
        #print(opt.items())
        return (None, False)

In [210]:
N, K = X_train.shape

# code in scipy.optimize.slsqp._minimize_slsqp()
meq = N
mieq = 0
m = meq + mieq
la = np.array([1, m]).max()
n = N*K
n1 = n + 1
mineq = m - meq + n1 + n1
len_w = (3*n1+m)*(n1+1)+(n1-meq+1)*(mineq+2) + 2*mineq+(n1+mineq)*(n1-meq) + \
        2*meq + n1 + ((n+1)*n)//2 + 2*m + 3*n + 3*n1 + 1
print(len_w)
print(np.log2(len_w))
#e = 32
#np.zeros(2**(e))
#np.zeros(len_w)

6693489535
32.6401113851


In [211]:
def minimise_cost(X, Y, C):
    #opt_func = minimise_cost_slsqp
    #opt_func = minimise_cost_nlopt
    opt_func = minimise_cost_ipopt
    return opt_func(X, Y, C)

class MLC_toppush(BaseEstimator):
    """All methods are necessary for a scikit-learn estimator"""
    
    def __init__(self, p=1, C=1):
        """Initialisation"""
        
        assert C > 0
        self.C = C
        self.trained = False
        
    def fit(self, X_train, Y_train):
        """Model fitting by optimising the objective"""
        optx, success = minimise_cost(X_train, Y_train, self.C)
        if success is True:
            self.W = dual2primal(optx, X_train, Y_train, self.C)
            self.trained = True
        else:
            self.trained = False
            
            
    def decision_function(self, X_test):
        """Make predictions (score is real number)"""
        
        assert self.trained is True, "Can't make prediction before training"
        D = X_test.shape[1]
        return np.dot(X_test, self.W.T)
        
    
    def predict(self, X_test):
        """Make predictions (score is boolean)"""
        
        preds = self.decision_function(X_test)
        return (preds > 0)
    
    
    def score(self, X, Y):
        """Compute scoring metric"""
        
        allPreds = self.decision_function(X)
        return avgPrecisionK(Y, allPreds)
    
    # inherit from BaseEstimator instead of re-implement
    #
    #def get_params(self, deep = True):
    #def set_params(self, **params):

In [212]:
#%mprun
model = MLC_toppush()
model.fit(X_train[:10], Y_train[:10])
#%memit model.fit(X_train[:30], Y_train[:30])
#%mprun -f minimize model.fit(X_train[:100], Y_train[:100])
#%mprun -f _minimize_slsqp model.fit(X_train[:10], Y_train[:10])

(10, 72) (10, 6)
(10,)
Objective value at iteration #0 is - 7.28373
Objective value at iteration #1 is - 6.87495
Objective value at iteration #2 is - 5.75794
Objective value at iteration #3 is - 3.21841
Objective value at iteration #4 is - 2.66169
Objective value at iteration #5 is - 1.40639
Objective value at iteration #6 is - 1.13946
Objective value at iteration #7 is - 0.54494
Objective value at iteration #8 is - 0.420379
Objective value at iteration #9 is - 0.167796
Objective value at iteration #10 is - 0.105889
Objective value at iteration #11 is - 0.0864474
Objective value at iteration #12 is - 0.0348501
Objective value at iteration #13 is - 0.0188838
Objective value at iteration #14 is - -0.0229579
Objective value at iteration #15 is - -0.0357429
Objective value at iteration #16 is - -0.0687118
Objective value at iteration #17 is - -0.0787148
Objective value at iteration #18 is - -0.10389
Objective value at iteration #19 is - -0.111578
Objective value at iteration #20 is - -0.13

Objective value at iteration #189 is - -0.203014
Objective value at iteration #190 is - -0.203164
Objective value at iteration #191 is - -0.203208
Objective value at iteration #192 is - -0.203282
Objective value at iteration #193 is - -0.203304
Objective value at iteration #194 is - -0.203293
Objective value at iteration #195 is - -0.203292
Objective value at iteration #196 is - -0.203169
Objective value at iteration #197 is - -0.203149
Objective value at iteration #198 is - -0.202858
Objective value at iteration #199 is - -0.202824
Objective value at iteration #200 is - -0.202216
Objective value at iteration #201 is - -0.202191
Objective value at iteration #202 is - -0.200622
Objective value at iteration #203 is - -0.200711
Objective value at iteration #204 is - -0.164902
Objective value at iteration #205 is - -0.172229
Objective value at iteration #206 is - -0.174332
Objective value at iteration #207 is - -0.179621
Objective value at iteration #208 is - -0.181116
Objective value at i

Objective value at iteration #431 is - -0.212891
Objective value at iteration #432 is - -0.212997
Objective value at iteration #433 is - -0.213434
Objective value at iteration #434 is - -0.213563
Objective value at iteration #435 is - -0.21361
Objective value at iteration #436 is - -0.21375
Objective value at iteration #437 is - -0.213801
Objective value at iteration #438 is - -0.213954
Objective value at iteration #439 is - -0.214011
Objective value at iteration #440 is - -0.214181
Objective value at iteration #441 is - -0.214244
Objective value at iteration #442 is - -0.214434
Objective value at iteration #443 is - -0.214504
Objective value at iteration #444 is - -0.214719
Objective value at iteration #445 is - -0.214798
Objective value at iteration #446 is - -0.215042
Objective value at iteration #447 is - -0.215131
Objective value at iteration #448 is - -0.215414
Objective value at iteration #449 is - -0.215514
Objective value at iteration #450 is - -0.215869
Objective value at ite

Objective value at iteration #605 is - -0.235511
Objective value at iteration #606 is - -0.235586
Objective value at iteration #607 is - -0.235821
Objective value at iteration #608 is - -0.235905
Objective value at iteration #609 is - -0.236201
Objective value at iteration #610 is - -0.236296
Objective value at iteration #611 is - -0.236332
Objective value at iteration #612 is - -0.236438
Objective value at iteration #613 is - -0.236478
Objective value at iteration #614 is - -0.236597
Objective value at iteration #615 is - -0.236642
Objective value at iteration #616 is - -0.236776
Objective value at iteration #617 is - -0.236826
Objective value at iteration #618 is - -0.236976
Objective value at iteration #619 is - -0.237032
Objective value at iteration #620 is - -0.237202
Objective value at iteration #621 is - -0.237265
Objective value at iteration #622 is - -0.237458
Objective value at iteration #623 is - -0.237529
Objective value at iteration #624 is - -0.237749
Objective value at i

Objective value at iteration #787 is - -0.255058
Objective value at iteration #788 is - -0.255143
Objective value at iteration #789 is - -0.255175
Objective value at iteration #790 is - -0.255271
Objective value at iteration #791 is - -0.255307
Objective value at iteration #792 is - -0.255414
Objective value at iteration #793 is - -0.255454
Objective value at iteration #794 is - -0.255575
Objective value at iteration #795 is - -0.25562
Objective value at iteration #796 is - -0.255756
Objective value at iteration #797 is - -0.255807
Objective value at iteration #798 is - -0.255961
Objective value at iteration #799 is - -0.256018
Objective value at iteration #800 is - -0.256193
Objective value at iteration #801 is - -0.256257
Objective value at iteration #802 is - -0.256461
Objective value at iteration #803 is - -0.256533
Objective value at iteration #804 is - -0.25683
Objective value at iteration #805 is - -0.256923
Objective value at iteration #806 is - -0.256957
Objective value at ite

Objective value at iteration #1039 is - -0.276798
Objective value at iteration #1040 is - -0.276886
Objective value at iteration #1041 is - -0.276919
Objective value at iteration #1042 is - -0.277017
Objective value at iteration #1043 is - -0.277054
Objective value at iteration #1044 is - -0.277165
Objective value at iteration #1045 is - -0.277206
Objective value at iteration #1046 is - -0.277331
Objective value at iteration #1047 is - -0.277378
Objective value at iteration #1048 is - -0.27752
Objective value at iteration #1049 is - -0.277573
Objective value at iteration #1050 is - -0.27774
Objective value at iteration #1051 is - -0.277799
Objective value at iteration #1052 is - -0.277821
Objective value at iteration #1053 is - -0.277887
Objective value at iteration #1054 is - -0.277912
Objective value at iteration #1055 is - -0.277986
Objective value at iteration #1056 is - -0.278014
Objective value at iteration #1057 is - -0.278097
Objective value at iteration #1058 is - -0.278128
Ob

Objective value at iteration #1263 is - -0.292277
Objective value at iteration #1264 is - -0.292296
Objective value at iteration #1265 is - -0.29235
Objective value at iteration #1266 is - -0.292371
Objective value at iteration #1267 is - -0.292432
Objective value at iteration #1268 is - -0.292455
Objective value at iteration #1269 is - -0.292524
Objective value at iteration #1270 is - -0.29255
Objective value at iteration #1271 is - -0.292627
Objective value at iteration #1272 is - -0.292657
Objective value at iteration #1273 is - -0.292744
Objective value at iteration #1274 is - -0.292776
Objective value at iteration #1275 is - -0.292874
Objective value at iteration #1276 is - -0.292911
Objective value at iteration #1277 is - -0.293021
Objective value at iteration #1278 is - -0.293062
Objective value at iteration #1279 is - -0.293189
Objective value at iteration #1280 is - -0.293235
Objective value at iteration #1281 is - -0.293267
Objective value at iteration #1282 is - -0.293321
Ob

Objective value at iteration #1442 is - -0.301921
Objective value at iteration #1443 is - -0.301972
Objective value at iteration #1444 is - -0.301991
Objective value at iteration #1445 is - -0.302049
Objective value at iteration #1446 is - -0.302071
Objective value at iteration #1447 is - -0.302136
Objective value at iteration #1448 is - -0.30216
Objective value at iteration #1449 is - -0.302233
Objective value at iteration #1450 is - -0.30226
Objective value at iteration #1451 is - -0.302342
Objective value at iteration #1452 is - -0.302373
Objective value at iteration #1453 is - -0.302465
Objective value at iteration #1454 is - -0.3025
Objective value at iteration #1455 is - -0.302604
Objective value at iteration #1456 is - -0.302643
Objective value at iteration #1457 is - -0.302771
Objective value at iteration #1458 is - -0.302814
Objective value at iteration #1459 is - -0.302831
Objective value at iteration #1460 is - -0.302879
Objective value at iteration #1461 is - -0.302897
Obje

Objective value at iteration #1662 is - -0.312136
Objective value at iteration #1663 is - -0.312205
Objective value at iteration #1664 is - -0.312232
Objective value at iteration #1665 is - -0.31231
Objective value at iteration #1666 is - -0.31234
Objective value at iteration #1667 is - -0.312429
Objective value at iteration #1668 is - -0.312462
Objective value at iteration #1669 is - -0.312573
Objective value at iteration #1670 is - -0.31261
Objective value at iteration #1671 is - -0.312624
Objective value at iteration #1672 is - -0.312665
Objective value at iteration #1673 is - -0.312681
Objective value at iteration #1674 is - -0.312727
Objective value at iteration #1675 is - -0.312745
Objective value at iteration #1676 is - -0.312797
Objective value at iteration #1677 is - -0.312817
Objective value at iteration #1678 is - -0.312875
Objective value at iteration #1679 is - -0.312897
Objective value at iteration #1680 is - -0.312963
Objective value at iteration #1681 is - -0.312988
Obj

Objective value at iteration #1834 is - -0.31915
Objective value at iteration #1835 is - -0.319197
Objective value at iteration #1836 is - -0.319215
Objective value at iteration #1837 is - -0.319267
Objective value at iteration #1838 is - -0.319287
Objective value at iteration #1839 is - -0.319345
Objective value at iteration #1840 is - -0.319368
Objective value at iteration #1841 is - -0.319434
Objective value at iteration #1842 is - -0.319459
Objective value at iteration #1843 is - -0.319534
Objective value at iteration #1844 is - -0.319562
Objective value at iteration #1845 is - -0.319649
Objective value at iteration #1846 is - -0.31968
Objective value at iteration #1847 is - -0.319692
Objective value at iteration #1848 is - -0.319727
Objective value at iteration #1849 is - -0.319741
Objective value at iteration #1850 is - -0.31978
Objective value at iteration #1851 is - -0.319795
Objective value at iteration #1852 is - -0.31984
Objective value at iteration #1853 is - -0.319856
Obje

Objective value at iteration #2058 is - -0.327145
Objective value at iteration #2059 is - -0.327155
Objective value at iteration #2060 is - -0.327186
Objective value at iteration #2061 is - -0.327197
Objective value at iteration #2062 is - -0.327232
Objective value at iteration #2063 is - -0.327245
Objective value at iteration #2064 is - -0.327284
Objective value at iteration #2065 is - -0.327298
Objective value at iteration #2066 is - -0.327342
Objective value at iteration #2067 is - -0.327358
Objective value at iteration #2068 is - -0.327407
Objective value at iteration #2069 is - -0.327426
Objective value at iteration #2070 is - -0.32748
Objective value at iteration #2071 is - -0.327501
Objective value at iteration #2072 is - -0.327563
Objective value at iteration #2073 is - -0.327586
Objective value at iteration #2074 is - -0.327657
Objective value at iteration #2075 is - -0.327683
Objective value at iteration #2076 is - -0.327693
Objective value at iteration #2077 is - -0.327722
O

Objective value at iteration #2290 is - -0.334223
Objective value at iteration #2291 is - -0.334252
Objective value at iteration #2292 is - -0.334263
Objective value at iteration #2293 is - -0.334295
Objective value at iteration #2294 is - -0.334308
Objective value at iteration #2295 is - -0.334344
Objective value at iteration #2296 is - -0.334358
Objective value at iteration #2297 is - -0.334399
Objective value at iteration #2298 is - -0.334415
Objective value at iteration #2299 is - -0.334461
Objective value at iteration #2300 is - -0.334478
Objective value at iteration #2301 is - -0.33453
Objective value at iteration #2302 is - -0.334549
Objective value at iteration #2303 is - -0.334607
Objective value at iteration #2304 is - -0.334629
Objective value at iteration #2305 is - -0.334637
Objective value at iteration #2306 is - -0.334662
Objective value at iteration #2307 is - -0.334671
Objective value at iteration #2308 is - -0.334699
Objective value at iteration #2309 is - -0.334709
O

Objective value at iteration #2495 is - -0.339501
Objective value at iteration #2496 is - -0.339549
Objective value at iteration #2497 is - -0.339567
Objective value at iteration #2498 is - -0.339622
Objective value at iteration #2499 is - -0.339642
Objective value at iteration #2500 is - -0.33965
Objective value at iteration #2501 is - -0.339673
Objective value at iteration #2502 is - -0.339682
Objective value at iteration #2503 is - -0.339708
Objective value at iteration #2504 is - -0.339717
Objective value at iteration #2505 is - -0.339746
Objective value at iteration #2506 is - -0.339757
Objective value at iteration #2507 is - -0.33979
Objective value at iteration #2508 is - -0.339802
Objective value at iteration #2509 is - -0.339839
Objective value at iteration #2510 is - -0.339853
Objective value at iteration #2511 is - -0.339894
Objective value at iteration #2512 is - -0.339909
Objective value at iteration #2513 is - -0.339955
Objective value at iteration #2514 is - -0.339973
Ob

Objective value at iteration #2703 is - -0.344307
Objective value at iteration #2704 is - -0.344342
Objective value at iteration #2705 is - -0.344355
Objective value at iteration #2706 is - -0.344394
Objective value at iteration #2707 is - -0.344408
Objective value at iteration #2708 is - -0.344452
Objective value at iteration #2709 is - -0.344468
Objective value at iteration #2710 is - -0.344517
Objective value at iteration #2711 is - -0.344535
Objective value at iteration #2712 is - -0.344542
Objective value at iteration #2713 is - -0.344563
Objective value at iteration #2714 is - -0.344571
Objective value at iteration #2715 is - -0.344594
Objective value at iteration #2716 is - -0.344603
Objective value at iteration #2717 is - -0.344629
Objective value at iteration #2718 is - -0.344639
Objective value at iteration #2719 is - -0.344668
Objective value at iteration #2720 is - -0.344679
Objective value at iteration #2721 is - -0.344712
Objective value at iteration #2722 is - -0.344725


Objective value at iteration #2904 is - -0.348545
Objective value at iteration #2905 is - -0.348592
Objective value at iteration #2906 is - -0.34861
Objective value at iteration #2907 is - -0.348616
Objective value at iteration #2908 is - -0.348636
Objective value at iteration #2909 is - -0.348643
Objective value at iteration #2910 is - -0.348665
Objective value at iteration #2911 is - -0.348674
Objective value at iteration #2912 is - -0.348699
Objective value at iteration #2913 is - -0.348708
Objective value at iteration #2914 is - -0.348736
Objective value at iteration #2915 is - -0.348746
Objective value at iteration #2916 is - -0.348778
Objective value at iteration #2917 is - -0.348789
Objective value at iteration #2918 is - -0.348825
Objective value at iteration #2919 is - -0.348838
Objective value at iteration #2920 is - -0.348877
Objective value at iteration #2921 is - -0.348892
Objective value at iteration #2922 is - -0.348936
Objective value at iteration #2923 is - -0.348953
O

In [None]:
nlopt.algorithm_name(nlopt.AUGLAG)

In [None]:
parameters = [{'C': [10**(e) for e in range(-6,-1)]}]

clf = GridSearchCV(MLC_toppush(), parameters, cv=5, n_jobs=1)
clf.fit(X_train[:100], Y_train[:100])

print("\nBest parameters set found on development set:")
print(clf.best_params_)

In [None]:
for mean, std, params in zip(clf.cv_results_['mean_test_score'], clf.cv_results_['std_test_score'], \
                             clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [None]:
preds_train = clf.decision_function(X_train)
preds_test  = clf.decision_function(X_test)

In [None]:
print('Training set:')
printEvaluation(Y_train, preds_train)
print()
print('Test set:')
printEvaluation(Y_test, preds_test)

## Result analysis

In [None]:
precisions_train = [avgPrecision(Y_train, preds_train, k) for k in range(1, nLabels+1)]
precisions_test  = [avgPrecision(Y_test,  preds_test,  k) for k in range(1, nLabels+1)]

In [None]:
precisionK_train = avgPrecisionK(Y_train, preds_train)
precisionK_test  = avgPrecisionK(Y_test,  preds_test)

In [None]:
plt.figure(figsize=[10,5])
plt.plot(precisions_train, ls='--', c='r', label='Train')
plt.plot(precisions_test,  ls='-',  c='g', label='Test')
plt.plot([precisionK_train for k in range(nLabels)], ls='-', c='r', label='Train, Precision@K')
plt.plot([precisionK_test  for k in range(nLabels)], ls='-', c='g', label='Test, Precision@K')
plt.xticks(np.arange(nLabels), np.arange(1,nLabels+1))
plt.xlabel('k')
plt.ylabel('Precision@k')
plt.legend(loc='best')
plt.title('MLC w. Top-push Loss on ' + dataset_name + ' dataset')
plt.savefig(dataset_name + '_tp.svg')