# A simple example of  multilable learning

In [34]:
%matplotlib inline

import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd
import sklearn as sk
import cython
from scipy.io import arff
from scipy.optimize import minimize

from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data_dir = 'data'
yeast_ftrain = os.path.join(data_dir, 'yeast/yeast-train.arff')
yeast_ftest  = os.path.join(data_dir, 'yeast/yeast-test.arff')

## Data loading

Load yeast dataset.

In [3]:
data_train, meta_train = arff.loadarff(yeast_ftrain)

In [4]:
data_test, meta_test = arff.loadarff(yeast_ftest)

In [5]:
type(data_train)

numpy.ndarray

Features

In [6]:
np.array(list(data_train[0])[:-14], dtype=np.float).shape

(103,)

Labels

In [7]:
np.array(list(data_train[0])[-14:], dtype=np.int).shape

(14,)

In [8]:
nFeatures = 103
nLabels = 14

## Data analysis

In [9]:
len(data_train)

1500

In [10]:
len(data_test)

917

## Training & Testing

Train a logistic regression model for each label.

In [11]:
def gen_training_set(label_ix, data):
    """
        Create the labelled dataset for a given label index
        
        Input:
            - label_ix: label index, number in { 0, ..., # labels }
            - data: original data with features + labels
            
        Output:
            - (Feature, Label) pair (X, y)
              X comprises the features for each example
              y comprises the labels of the corresponding example
    """

    assert(label_ix >= 0)
    assert(label_ix < nLabels)

    N = len(data)
    d = nFeatures

    X = np.zeros((N, d), dtype = np.float)
    y = np.zeros(N, dtype = np.int)
       
    for i in range(N):
        X[i, :] = list(data[i])[:-14]
        y[i]    = list(data[i])[-14:][label_ix]

    return X, y

In [12]:
classifiers = [LogisticRegression(class_weight='balanced') for i in range(nLabels)]

In [13]:
allPreds  = [ ]
allTruths = [ ]
coefMat = [ ]
labelIndices = [ ]

for label_ix in range(nLabels):
    X_train, y_train = gen_training_set(label_ix, data = data_train)
    X_test, y_test   = gen_training_set(label_ix, data = data_test)
        
    # by fixing random seed, the same playlists will be in the test set each time
    #X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, \
    #                                                                       test_size = 0.33, \
    #                                                                       random_state = 31)    
    
    allTruths.append(y_test) 
    
    assert( (not np.all(y_train == 0)) or (not np.all(y_train == 1)) )
    
    #if np.all(y_train == 0) or np.all(y_train == 1): continue
    
    #if np.all(y_train == 0):
    #    allPreds.append(np.zeros(X_test.shape[0], dtype=np.int))
    #    continue
    #elif np.all(y_train == 1):
    #    allPreds.append(np.ones(X_test.shape[0], dtype=np.int))
    #    continue

    classifiers[label_ix].fit(X_train, y_train)
    allPreds.append(classifiers[label_ix].decision_function(X_test))

    coefMat.append(classifiers[label_ix].coef_.reshape(-1))
    #labelIndices.append(label_ix)
    #print(classifiers[label_ix].coef_)
    #print(classifiers[label_ix].intercept_)

In [14]:
allPreds = np.array(allPreds).T
allTruths = np.array(allTruths).T

print(allPreds.shape)
print(allTruths.shape)

(917, 14)
(917, 14)


In [15]:
allPreds[0]

array([-0.15997494, -0.51477752, -1.18553419,  0.21972233,  0.96896183,
        0.00853919,  0.32721911,  0.37575265, -0.80121226, -2.4663948 ,
       -1.79701745,  0.92752986,  0.89316058, -2.04288556])

## Evaluation

Loss between a ground truth and a prediction.

In [16]:
def loss(truth, pred, lossType='Hamming'):
    """
    compute a few losses given ground truth and prediction:
    subset 0-1, Hamming, ranking, and Precision@K where K = # positive labels.
    """
    assert(len(truth) == len(pred))
    L = len(truth)
    nPos = np.sum(truth)
    
    if lossType == 'Subset01':
        return 1 - int(np.all(truth == pred))
    
    elif lossType == 'Hamming':
        return np.sum(truth != pred) / L
    
    elif lossType == 'Ranking':
        loss = 0
        for i in range(L-1):
            for j in range(i+1, L):
                if truth[i] > truth[j]:
                    if pred[i] < pred[j]: 
                        loss += 1
                    if pred[i] == pred[j]:
                        loss += 0.5
        #return loss / (nPos * (L-nPos))
        return loss
        
    elif lossType == 'Precision@K':
        loss = 0
        for i in range(L):
            if truth[i] == 1 and pred[i] == 0:
                loss += 1
        return loss / nPos
    
    else:
        assert(False)

Compute losses.

In [17]:
for lossType in ['Subset01', 'Hamming', 'Ranking', 'Precision@K']:
    losses = [ ]
    for i in range(allPreds.shape[0]):
        pred  = np.array((allPreds[i, :] > 0.5), dtype=np.int)
        truth = allTruths[i, :]
        losses.append(loss(truth, pred, lossType))
        
        #print(allPreds[i])
        #print(pred)
        #print(truth)
        #break
    
    print('Average %s Loss: %1.4f' % (lossType, np.mean(losses)))
    #plt.hist(aucs, bins = 10);

Average Subset01 Loss: 0.9182
Average Hamming Loss: 0.2874
Average Ranking Loss: 6.9967
Average Precision@K Loss: 0.5956


Compute average precision.

## Result analysis

Coefficient matrix `(#Genres, #Songs)`.

In [18]:
coefMat = np.array(coefMat).T

In [19]:
coefMat.shape

(103, 14)

In [20]:
#sns.heatmap(coefMat[:, :30])

## Rank loss

In [35]:
%load_ext Cython

In [36]:
%%cython -a
import numpy as np
cimport numpy as np

cpdef obj_rank(w, X, y):
    assert(len(y) == X.shape[0])
    assert(len(w) == X.shape[1])

    cdef int nPos, nNeg, i, j
    cdef double J, term, denom
    nPos = np.sum(y)      # num of positive examples
    nNeg = len(y) - nPos  # num of negative examples
    
    ixPos = np.nonzero(y)[0].tolist()                    # indices positive examples
    ixNeg = sorted(set(np.arange(len(y))) - set(ixPos))  # indices negative examples
    
    J = 0.0  # cost
    g = np.zeros_like(w)  # gradient
    
    for i in ixPos:
        for j in ixNeg:
            dx = X[i, :] - X[j, :]
            term = np.dot(w, dx)
            J += np.log1p(np.exp(term))
            g = g + dx / (np.expm1(-term) + 2)    
    
    denom = nPos * nNeg
    J = 0.5 * np.dot(w, w) + J / denom
    g = w + g / denom
    
    return (J, g)

In [37]:
def decision(X_test, w):
    assert(len(w) == X_test.shape[1])
    
    scores = np.dot(X_test, w)
    probs = 1 / (2 + np.expm1(-scores))
    
    return probs

In [38]:
params = [ ]
allPreds  = [ ]
allTruths = [ ]

In [39]:
for label_ix in range(nLabels):
    #sys.stdout.write('\r%d / %d' % (label_ix + 1, nLabels))
    #sys.stdout.flush()
    print('\r%d / %d ' % (label_ix + 1, nLabels))
    
    X_train, y_train = gen_training_set(label_ix, data = data_train)
    X_test, y_test   = gen_training_set(label_ix, data = data_test)
    
    allTruths.append(y_test) 
    
    assert( (not np.all(y_train == 0)) or (not np.all(y_train == 1)) )
    
    #if np.all(y_train == 0) or np.all(y_train == 1): continue
    
    #if np.all(y_train == 0):
    #    allPreds.append(np.zeros(X_test.shape[0], dtype=np.int))
    #    continue
    #elif np.all(y_train == 1):
    #    allPreds.append(np.ones(X_test.shape[0], dtype=np.int))
    #    continue
    
    w = np.random.rand(X_train.shape[1])  # initial guess
    opt_method = 'BFGS' #'Newton-CG' 
    options = {'disp': True}
    opt = minimize(obj_rank, w, args=(X_train, y_train), method=opt_method, jac=True, options=options)
    if opt.success == True:
        w = opt.x
        params.append(w)
        allPreds.append(np.dot(X_test, w))
    else:
        sys.stderr.write('Optimisation failed, label_ix=%d\n' % label_ix)
        w = np.zeros(X_train.shape[1])
        params.append(w)
        allPreds.append(decision(X_test, w))

1 / 14Optimization terminated successfully.
         Current function value: 0.687650
         Iterations: 5
         Function evaluations: 6
         Gradient evaluations: 6
2 / 14Optimization terminated successfully.
         Current function value: 0.690264
         Iterations: 4
         Function evaluations: 5
         Gradient evaluations: 5
3 / 14Optimization terminated successfully.
         Current function value: 0.687208
         Iterations: 5
         Function evaluations: 6
         Gradient evaluations: 6
4 / 14Optimization terminated successfully.
         Current function value: 0.684365
         Iterations: 5
         Function evaluations: 6
         Gradient evaluations: 6
5 / 14Optimization terminated successfully.
         Current function value: 0.687649
         Iterations: 5
         Function evaluations: 6
         Gradient evaluations: 6
6 / 14Optimization terminated successfully.
         Current function value: 0.689686
         Iterations: 5
         Functio

In [28]:
allPreds = np.array(allPreds).T
allTruths = np.array(allTruths).T

print(allPreds.shape)
print(allTruths.shape)

(917, 14)
(917, 14)


In [31]:
allPreds[0]

array([ 0.01041459,  0.02343311,  0.01492048, -0.04255027, -0.03156382,
        0.02135366,  0.01013724,  0.01079779,  0.02023349,  0.05559079,
        0.03785101, -0.02253766, -0.02141111,  0.00042809])

In [32]:
for lossType in ['Subset01', 'Hamming', 'Ranking', 'Precision@K']:
    losses = [ ]
    for i in range(allPreds.shape[0]):
        pred  = np.array((allPreds[i, :] > 0.5), dtype=np.int)
        truth = allTruths[i, :]
        losses.append(loss(truth, pred, lossType))
        
        #print(allPreds[i])
        #print(pred)
        #print(truth)
        #break
    
    print('Average %s Loss: %1.4f' % (lossType, np.mean(losses)))
    #plt.hist(aucs, bins = 10);

Average Subset01 Loss: 1.0000
Average Hamming Loss: 0.3037
Average Ranking Loss: 9.9280
Average Precision@K Loss: 1.0000
