# A simple example of  multilable learning

In [1]:
%matplotlib inline

import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd
import sklearn as sk
import cython
from scipy.io import arff
from scipy.optimize import minimize

from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data_dir = 'data'
yeast_ftrain = os.path.join(data_dir, 'yeast/yeast-train.arff')
yeast_ftest  = os.path.join(data_dir, 'yeast/yeast-test.arff')

## Data loading

Load yeast dataset.

In [3]:
data_train, meta_train = arff.loadarff(yeast_ftrain)

In [4]:
data_test, meta_test = arff.loadarff(yeast_ftest)

In [5]:
type(data_train)

numpy.ndarray

Features

In [6]:
np.array(list(data_train[0])[:-14], dtype=np.float).shape

(103,)

Labels

In [7]:
np.array(list(data_train[0])[-14:], dtype=np.int).shape

(14,)

In [8]:
nFeatures = 103
nLabels = 14

## Data analysis

In [9]:
len(data_train)

1500

In [10]:
len(data_test)

917

## Dataset creation

Train a logistic regression model for each label.

In [11]:
def gen_training_set(label_ix, data):
    """
        Create the labelled dataset for a given label index
        
        Input:
            - label_ix: label index, number in { 0, ..., # labels }
            - data: original data with features + labels
            
        Output:
            - (Feature, Label) pair (X, y)
              X comprises the features for each example
              y comprises the labels of the corresponding example
    """

    assert(label_ix >= 0)
    assert(label_ix < nLabels)

    N = len(data)
    d = nFeatures

    X = np.zeros((N, d), dtype = np.float)
    y = np.zeros(N, dtype = np.int)
       
    for i in range(N):
        X[i, :] = list(data[i])[:-14]
        y[i]    = list(data[i])[-14:][label_ix]

    return X, y

## Evaluation

Loss between a ground truth and a prediction.

In [12]:
def evalPred(truth, pred, lossType = 'Hamming'):
    """
        Compute loss given ground truth and prediction
        
        Input:
            - truth:    binary array of true labels
            - pred:     real-valued array of predictions
            - lossType: can be subset 0-1, Hamming, ranking, and Precision@K where K = # positive labels.
    """

    assert(len(truth) == len(pred))
    L = len(truth)
    nPos = np.sum(truth)
    
    predBin = np.array((pred > 0), dtype=np.int)
    
    if lossType == 'Subset01':
        return 1 - int(np.all(truth == predBin))
    
    elif lossType == 'Hamming':
        return np.sum(truth != predBin) / L
    
    elif lossType == 'Ranking':
        loss = 0
        for i in range(L-1):
            for j in range(i+1, L):
                if truth[i] > truth[j]:
                    if pred[i] < pred[j]: 
                        loss += 1
                    if pred[i] == pred[j]:
                        loss += 0.5
        #return loss / (nPos * (L-nPos))
        return loss
        
    elif lossType == 'Precision@K':
        # sorted indices of the labels most likely to be +'ve
        idx = np.argsort(pred)[::-1]
        
        # true labels according to the sorted order
        y = truth[idx]
        
        # fraction of +'ves in the top K predictions
        return np.mean(y[:nPos])
    
    else:
        assert(False)

In [13]:
def printEvaluation(allPreds, allTruths):

    for lossType in ['Subset01', 'Hamming', 'Ranking', 'Precision@K']:
        losses = [ ]
        for i in range(allPreds.shape[0]):
            pred  = allPreds[i, :]
            truth = allTruths[i, :]
            losses.append(evalPred(truth, pred, lossType))

            #print(allPreds[i])
            #print(pred)
            #print(truth)
            #break

        print('%24s: %1.4f' % ('Average %s Loss' % lossType, np.mean(losses)))
        #plt.hist(aucs, bins = 10);

## Binary relevance baseline

In [14]:
classifiers = [ LogisticRegression(class_weight = 'balanced', C = 10**0) for i in range(nLabels) ]

In [15]:
allPreds  = [ ]
allTruths = [ ]
coefMat = [ ]
labelIndices = [ ]

for label_ix in range(nLabels):
    X_train, y_train = gen_training_set(label_ix, data = data_train)
    X_test, y_test   = gen_training_set(label_ix, data = data_test)
    
    allTruths.append(y_test) 
    
    assert( (not np.all(y_train == 0)) or (not np.all(y_train == 1)) )

    classifiers[label_ix].fit(X_train, y_train)
    allPreds.append(classifiers[label_ix].decision_function(X_test))

    coefMat.append(classifiers[label_ix].coef_.reshape(-1))
    #labelIndices.append(label_ix)
    #print(classifiers[label_ix].coef_)
    #print(classifiers[label_ix].intercept_)

In [16]:
allPreds  = np.array(allPreds).T
allTruths = np.array(allTruths).T

print(allPreds.shape)
print(allTruths.shape)

(917, 14)
(917, 14)


In [17]:
allPreds[0]

array([-0.15997494, -0.51477752, -1.18553419,  0.21972233,  0.96896183,
        0.00853919,  0.32721911,  0.37575265, -0.80121226, -2.4663948 ,
       -1.79701745,  0.92752986,  0.89316058, -2.04288556])

In [18]:
printEvaluation(allPreds, allTruths)

   Average Subset01 Loss: 0.9302
    Average Hamming Loss: 0.3325
    Average Ranking Loss: 5.2203
Average Precision@K Loss: 0.5149


## Result analysis

Coefficient matrix `(#Genres, #Songs)`.

In [19]:
coefMat = np.array(coefMat).T

In [20]:
coefMat.shape

(103, 14)

In [21]:
#sns.heatmap(coefMat[:, :30])

## Binary relevance with bipartite ranking

In [22]:
#%load_ext Cython

In [23]:
#%%cython -a

import numpy as np
#cimport numpy as np

#cpdef obj_rank(w, X, y):

def obj_rank(w, X, y):
    """
        Pairwise ranking objective
        
        Input:
            - w: current weight vector
            - X: feature matrix, N x D
            - y: label vector,   N x 1
    """
    assert(len(y) == X.shape[0])
    assert(len(w) == X.shape[1])

    #cdef int nPos, nNeg, i, j
    #cdef double J, term, denom
    nPos = np.sum(y)      # num of positive examples
    nNeg = len(y) - nPos  # num of negative examples
    
    ixPos = np.nonzero(y)[0].tolist()                    # indices positive examples
    ixNeg = list(set(np.arange(len(y))) - set(ixPos))    # indices negative examples
    
    J = 0.0  # cost
    g = np.zeros_like(w)  # gradient    

    scorePos = X[ixPos, :].dot(w)[:,np.newaxis] # nPos x 1
    scoreNeg = X[ixNeg, :].dot(w)[:,np.newaxis] # nNeg x 1
    scoreDif = scorePos - scoreNeg.T            # nPos x nNeg
    #J = np.mean(np.log(1 + np.exp(-scoreDif)))
    J = 0.5 * np.dot(w, w) + np.mean(np.log1p(np.exp(-scoreDif)))
    
    A = -1/(1 + np.exp(scoreDif))

    T1 = X[ixPos, :].T.dot(A.sum(axis = 1))
    T2 = X[ixNeg, :].T.dot(A.sum(axis = 0))
    g  = w + 1/(nPos * nNeg) * (T1 - T2)
    
    return (J, g)

In [24]:
params    = [ ]
allPreds  = [ ]
allTruths = [ ]

for label_ix in range(nLabels):
    #sys.stdout.write('\r%d / %d' % (label_ix + 1, nLabels))
    #sys.stdout.flush()
    print('\r%d / %d ' % (label_ix + 1, nLabels))
    
    X_train, y_train = gen_training_set(label_ix, data = data_train)
    X_test, y_test   = gen_training_set(label_ix, data = data_test)
    
    allTruths.append(y_test) 
    
    assert( (not np.all(y_train == 0)) or (not np.all(y_train == 1)) )
        
    opt_method = 'BFGS' #'Newton-CG' 
    #opt_method = 'nelder-mead'
    options = {'disp': True}
    
    w = np.random.rand(X_train.shape[1])  # initial guess
    opt = minimize(obj_rank, w, args=(X_train, y_train), method=opt_method, jac=True, options=options)
    
    if opt.success == True:
        w = opt.x
        params.append(w)
        allPreds.append(np.dot(X_test, w))
    else:
        sys.stderr.write('Optimisation failed, label_ix=%d\n' % label_ix)
        w = np.zeros(X_train.shape[1])
        params.append(w)
        allPreds.append(np.dot(X_test, w))

1 / 14 
Optimization terminated successfully.
         Current function value: 0.687650
         Iterations: 5
         Function evaluations: 6
         Gradient evaluations: 6
2 / 14 
Optimization terminated successfully.
         Current function value: 0.690264
         Iterations: 5
         Function evaluations: 6
         Gradient evaluations: 6
3 / 14 
Optimization terminated successfully.
         Current function value: 0.687208
         Iterations: 5
         Function evaluations: 6
         Gradient evaluations: 6
4 / 14 
Optimization terminated successfully.
         Current function value: 0.684365
         Iterations: 5
         Function evaluations: 6
         Gradient evaluations: 6
5 / 14 
Optimization terminated successfully.
         Current function value: 0.687649
         Iterations: 5
         Function evaluations: 6
         Gradient evaluations: 6
6 / 14 
Optimization terminated successfully.
         Current function value: 0.689686
         Iterations: 5
    

In [25]:
allPreds = np.array(allPreds).T
allTruths = np.array(allTruths).T

print(allPreds.shape)
print(allTruths.shape)

(917, 14)
(917, 14)


In [26]:
allPreds[0]

array([-0.01041515, -0.02343381, -0.0149204 ,  0.0425496 ,  0.03156229,
       -0.02135378, -0.01013787, -0.01079791, -0.02023371, -0.05559105,
       -0.03785112,  0.02253723,  0.02141053, -0.00042832])

In [27]:
printEvaluation(allPreds, allTruths)

   Average Subset01 Loss: 0.9335
    Average Hamming Loss: 0.4301
    Average Ranking Loss: 7.2246
Average Precision@K Loss: 0.4400


## Ranking loss optimisation

In [22]:
#%load_ext Cython

In [23]:
#%%cython -a

import numpy as np
#cimport numpy as np

#cpdef obj_rank(w, X, y):

def obj_ranking_loss(w, X, y):
    """
        Pairwise ranking objective
        
        Input:
            - w: current weight matrix, flattened D x L
            - X: feature matrix, N x D
            - y: label matrix,   N x L
    """
    assert(len(y) == X.shape[0])
    assert(w.shape[0] == X.shape[1])
    assert(w.shape[1] == Y.shape[1])

    #cdef int nPos, nNeg, i, j
    #cdef double J, term, denom
    nPos = np.sum(y)      # num of positive examples
    nNeg = len(y) - nPos  # num of negative examples
    
    ixPos = np.nonzero(y)[0].tolist()                    # indices positive examples
    ixNeg = list(set(np.arange(len(y))) - set(ixPos))    # indices negative examples
    
    J = 0.0  # cost
    g = np.zeros_like(w)  # gradient    

    scorePos = X[ixPos, :].dot(w)[:,np.newaxis] # nPos x 1
    scoreNeg = X[ixNeg, :].dot(w)[:,np.newaxis] # nNeg x 1
    scoreDif = scorePos - scoreNeg.T            # nPos x nNeg
    #J = np.mean(np.log(1 + np.exp(-scoreDif)))
    J = 0.5 * np.dot(w, w) + np.mean(np.log1p(np.exp(-scoreDif)))
    
    A = -1/(1 + np.exp(scoreDif))

    T1 = X[ixPos, :].T.dot(A.sum(axis = 1))
    T2 = X[ixNeg, :].T.dot(A.sum(axis = 0))
    g  = w + 1/(nPos * nNeg) * (T1 - T2)
    
    return (J, g)

In [24]:
params    = [ ]
allPreds  = [ ]
allTruths = [ ]

for label_ix in range(nLabels):
    #sys.stdout.write('\r%d / %d' % (label_ix + 1, nLabels))
    #sys.stdout.flush()
    print('\r%d / %d ' % (label_ix + 1, nLabels))
    
    X_train, y_train = gen_training_set(label_ix, data = data_train)
    X_test, y_test   = gen_training_set(label_ix, data = data_test)
    
    allTruths.append(y_test) 
    
    assert( (not np.all(y_train == 0)) or (not np.all(y_train == 1)) )
        
    opt_method = 'BFGS' #'Newton-CG' 
    #opt_method = 'nelder-mead'
    options = {'disp': True}
    
    w = np.random.rand(X_train.shape[1])  # initial guess
    opt = minimize(obj_rank, w, args=(X_train, y_train), method=opt_method, jac=True, options=options)
    
    if opt.success == True:
        w = opt.x
        params.append(w)
        allPreds.append(np.dot(X_test, w))
    else:
        sys.stderr.write('Optimisation failed, label_ix=%d\n' % label_ix)
        w = np.zeros(X_train.shape[1])
        params.append(w)
        allPreds.append(np.dot(X_test, w))

1 / 14 
Optimization terminated successfully.
         Current function value: 0.687650
         Iterations: 5
         Function evaluations: 6
         Gradient evaluations: 6
2 / 14 
Optimization terminated successfully.
         Current function value: 0.690264
         Iterations: 5
         Function evaluations: 6
         Gradient evaluations: 6
3 / 14 
Optimization terminated successfully.
         Current function value: 0.687208
         Iterations: 5
         Function evaluations: 6
         Gradient evaluations: 6
4 / 14 
Optimization terminated successfully.
         Current function value: 0.684365
         Iterations: 5
         Function evaluations: 6
         Gradient evaluations: 6
5 / 14 
Optimization terminated successfully.
         Current function value: 0.687649
         Iterations: 5
         Function evaluations: 6
         Gradient evaluations: 6
6 / 14 
Optimization terminated successfully.
         Current function value: 0.689686
         Iterations: 5
    

In [25]:
allPreds = np.array(allPreds).T
allTruths = np.array(allTruths).T

print(allPreds.shape)
print(allTruths.shape)

(917, 14)
(917, 14)


In [26]:
allPreds[0]

array([-0.01041515, -0.02343381, -0.0149204 ,  0.0425496 ,  0.03156229,
       -0.02135378, -0.01013787, -0.01079791, -0.02023371, -0.05559105,
       -0.03785112,  0.02253723,  0.02141053, -0.00042832])

In [27]:
printEvaluation(allPreds, allTruths)

   Average Subset01 Loss: 0.9335
    Average Hamming Loss: 0.4301
    Average Ranking Loss: 7.2246
Average Precision@K Loss: 0.4400
