### Multi-label classification on FMA

Recast genre classification as a multilabel problem

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import scipy as sp
import IPython.display as ipd

import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

from sklearn import metrics

import utils
import top_genres

RANDOM_STATE = 53

### Load features and tracks

* enrich tracks with top_genre1, top_genre2

note: genre enrichment takes a minute

In [2]:
(features, tracks) = utils.load_features()
genres =  utils.load_genres()

# enrich tracks with top_genres
top_genres.add_top_genres(tracks, genres)
tracks.shape

(106574, 57)

### Create a multilabel dataset using top_genre1 and top_genre2

* drop easy listening and spoken (small support, oddball categories)
* choose tracks with top_genre_count <= 3
* Use entire dataset; will be imbalanced

In [3]:
prune = ~(tracks[('track','top_genre1')].isin(['Spoken','Easy Listening']) |
    tracks[('track','top_genre2')].isin(['Spoken','Easy Listening']) |
    tracks[('track','top_genre3')].isin(['Spoken','Easy Listening']))

f1 = features[prune]
t1 = tracks[prune]


In [4]:
keep = (t1[('track','top_genre_count')] < 4) & (t1[('track','top_genre_type')].isin(['major','hybrid']))
t1 = t1[keep].copy()
f1 = f1[keep].copy()

# generate label set and append to t1
tops = t1[[('track','top_genre1'),('track','top_genre2')]].values.tolist()
tops = [[j for j in i if j] for i in tops]
t1[('track'),('top_genre_list')] = tops


### binarize label set

In [5]:
from sklearn.preprocessing import MultiLabelBinarizer

X = f1

y = t1[('track'),('top_genre_list')]
mlb = MultiLabelBinarizer()
mlb.fit(y)
y = mlb.transform(y)
y.shape

(87151, 14)

### Examine the frequency of each label in the multi-label set

* we see that it is pretty unbalanced - Rock/Experimental/Jazz > 25,000, others < 1,000

In [6]:
from collections import defaultdict

def count_labels(y):
    label_counts = defaultdict(int)
    for blabels in y:
        for i in range(len(blabels)):
            if blabels[i]:
                label_counts[mlb.classes_[i]] += 1

    return dict(label_counts)

count_labels(y)

{'Hip-Hop': 6810,
 'Pop': 9076,
 'Folk': 8272,
 'Rock': 25554,
 'Experimental': 28001,
 'Jazz': 2359,
 'Electronic': 26084,
 'International': 3623,
 'Instrumental': 9079,
 'Blues': 689,
 'Soul-RnB': 926,
 'Old-Time / Historic': 792,
 'Classical': 2695,
 'Country': 1191}

### Build train/test datasets

* without under/oversampling

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,
                                                    random_state=RANDOM_STATE,
                                                    shuffle=True)

scaler = MinMaxScaler(copy=False)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(y_train.shape, y_test.shape)
print(count_labels(y_train))
print(count_labels(y_test))


(69720, 14) (17431, 14)
{'Folk': 6659, 'Pop': 7257, 'Rock': 20441, 'Experimental': 22428, 'Electronic': 20817, 'Classical': 2158, 'Jazz': 1881, 'International': 2891, 'Instrumental': 7282, 'Hip-Hop': 5460, 'Country': 960, 'Soul-RnB': 727, 'Blues': 550, 'Old-Time / Historic': 631}
{'Experimental': 5573, 'Electronic': 5267, 'Instrumental': 1797, 'Hip-Hop': 1350, 'Classical': 537, 'Rock': 5113, 'Folk': 1613, 'Jazz': 478, 'Pop': 1819, 'Blues': 139, 'International': 732, 'Soul-RnB': 199, 'Country': 231, 'Old-Time / Historic': 161}


### Train MLPClassifier with defaults

NOTE: will take a few minutes

In [8]:
cl1 = MLPClassifier(max_iter=1000, random_state=RANDOM_STATE)
cl1.fit(X_train, y_train)

MLPClassifier(max_iter=1000, random_state=53)

### Examine what prediction looks like

* we can examine prediction probabilities - probability of each genre
* since it's a multilabel problem, MLPClassifier automatically converts it a multilabel prediction for us

In [9]:
yp_pred = cl1.predict_proba(X_test)
yp_pred[0:1]

array([[1.32311224e-04, 6.12750287e-03, 7.08087414e-05, 1.43214400e-01,
        9.17763730e-01, 5.27479993e-02, 2.03602122e-02, 9.52826222e-02,
        1.13800623e-03, 2.94989496e-02, 8.61440099e-07, 4.98122534e-02,
        7.30919992e-02, 5.78823986e-04]])

### it appears MLPClassifier only predicts a positive if > 50%

In [10]:
y_pred = cl1.predict(X_test)
print(yp_pred[18])
print(y_pred[18])

[2.86087203e-03 1.92798975e-05 2.90843967e-02 3.66304377e-03
 2.98989294e-02 2.40836169e-02 1.52075495e-03 5.88636759e-04
 2.04422827e-02 5.93362784e-03 1.46660001e-07 1.72776376e-01
 9.68741172e-01 5.63375679e-03]
[0 0 0 0 0 0 0 0 0 0 0 0 1 0]


### Scoring the classifier

* accuracy will look for an exact match
* this is a harsh metric; no credit for partial match

In [11]:
metrics.accuracy_score(y_test, y_pred,  normalize=True, sample_weight=None)

0.32075038724112215

### Haming loss measures 'distance' between set of predicted labels vs set of true labels

* because our labels are sparse, this is not a good measure.
* for example, predicting all 0 will produce a similar Hamming score

In [12]:
print(metrics.hamming_loss(y_test, y_pred))
z = np.zeros(y_test.shape)
print(metrics.hamming_loss(y_test, z))

0.07571076161518477
0.10248162141340961


### precision / recall from classification report

* on a per label basis.
* recall is poor across the board
* this is probably due to underpredicting. perhaps if we lower the prediction threshold?

In [13]:
my_metrics = metrics.classification_report( y_test, y_pred, target_names=mlb.classes_)
print(my_metrics)

                     precision    recall  f1-score   support

              Blues       1.00      0.01      0.01       139
          Classical       0.85      0.40      0.54       537
            Country       0.00      0.00      0.00       231
         Electronic       0.79      0.40      0.54      5267
       Experimental       0.66      0.64      0.65      5573
               Folk       0.70      0.32      0.44      1613
            Hip-Hop       0.77      0.29      0.42      1350
       Instrumental       0.62      0.12      0.20      1797
      International       0.72      0.18      0.29       732
               Jazz       0.51      0.15      0.24       478
Old-Time / Historic       0.96      0.72      0.82       161
                Pop       0.41      0.03      0.06      1819
               Rock       0.78      0.61      0.69      5113
           Soul-RnB       0.00      0.00      0.00       199

          micro avg       0.72      0.42      0.53     25009
          macro avg   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### investigate/confirm f1 score from classification report

In [14]:
y_prednp = np.array(y_pred)
y_prednp.shape
p4 = y_prednp[:,4]

y_testnp = np.array(y_test)
t4 = y_testnp[:,4]

d = p4 - t4
print(d[0:3])
pos = np.sum(p4 > 0)
neg = np.sum(p4 == 0)
fp = np.sum(d > 0)
fn = np.sum(d < 0)
tp = pos- fp
tn = neg - fn
print(pos, neg, fp, fn, tp, tn)

precision = tp/(tp+fp)
recall = tp/(tp+fn)
print(precision, recall)

[0 0 0]
5450 11981 1869 1992 3581 9989
0.6570642201834862 0.6425623542077875


### Rudimentary attempt to lower prediction probablity threshold

* note: we would prefer this as an option to MLPClassifier so that it would train on these labels.

In [15]:
yp_pred = np.array(cl1.predict_proba(X_test))
for prob in np.linspace(.2,.5, 7):
    y_pred = (yp_pred > prob).astype(int)
    print(prob, metrics.f1_score(y_test, y_pred, average='macro'))

0.2 0.433847240165463
0.25 0.43224623870464424
0.3 0.42694022006081955
0.35 0.41428545212213014
0.4 0.39799155680827986
0.44999999999999996 0.3714522239406773
0.5 0.34960747488672383


In [16]:
y_pred = (yp_pred > .30).astype(int)
my_metrics = metrics.classification_report( y_test, y_pred, target_names=mlb.classes_)
print(my_metrics)

                     precision    recall  f1-score   support

              Blues       0.83      0.04      0.07       139
          Classical       0.63      0.49      0.55       537
            Country       0.35      0.04      0.07       231
         Electronic       0.67      0.66      0.67      5267
       Experimental       0.53      0.82      0.65      5573
               Folk       0.52      0.53      0.52      1613
            Hip-Hop       0.63      0.43      0.51      1350
       Instrumental       0.48      0.30      0.37      1797
      International       0.55      0.34      0.42       732
               Jazz       0.34      0.31      0.33       478
Old-Time / Historic       0.94      0.76      0.84       161
                Pop       0.32      0.27      0.30      1819
               Rock       0.62      0.77      0.69      5113
           Soul-RnB       0.00      0.00      0.00       199

          micro avg       0.57      0.61      0.59     25009
          macro avg   

  _warn_prf(average, modifier, msg_start, len(result))


### We see substantial improvement across the board
### Try upsampling

* However poorly supported genres still don't perform well.
* Multilabel upsampling is not well-defined
* our rudimentary attempt is simply to duplicate all tracks outside the top 3 one time.

In [17]:
mlb = MultiLabelBinarizer()
mlb.fit(t1[('track'),('top_genre_list')])

f1_train, f1_test, t1_train, t1_test = train_test_split(f1,t1, 
                                                    test_size=0.2,
                                                    random_state=RANDOM_STATE,
                                                    shuffle=True)




In [18]:
up = ~(t1_train[('track','top_genre1')].isin(['Experimental','Electronic','Rock']) |
    t1_train[('track','top_genre2')].isin(['Experimental','Electronic','Rock']))
t1_train.shape, t1_train[up].shape

((69720, 58), (16036, 58))

In [19]:
t2 = t1_train.append(t1_train[up])
f2 = f1_train.append(f1_train[up])
f2.shape, t2.shape

((85756, 518), (85756, 58))

In [20]:
X_train = f2
X_test = f1_test
y_train = mlb.transform(t2[('track'),('top_genre_list')])
y_test = mlb.transform(t1_test[('track'),('top_genre_list')])

####
scaler = MinMaxScaler(copy=False)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(y_train.shape, y_test.shape)
print(count_labels(y_train))
print(count_labels(y_test))

(85756, 14) (17431, 14)
{'Folk': 11017, 'Pop': 10529, 'Rock': 20441, 'Experimental': 22428, 'Electronic': 20817, 'Classical': 3608, 'Jazz': 2639, 'International': 4840, 'Instrumental': 10068, 'Hip-Hop': 8908, 'Country': 1613, 'Soul-RnB': 1173, 'Blues': 882, 'Old-Time / Historic': 1226}
{'Experimental': 5573, 'Electronic': 5267, 'Instrumental': 1797, 'Hip-Hop': 1350, 'Classical': 537, 'Rock': 5113, 'Folk': 1613, 'Jazz': 478, 'Pop': 1819, 'Blues': 139, 'International': 732, 'Soul-RnB': 199, 'Country': 231, 'Old-Time / Historic': 161}


In [21]:
cl2 = MLPClassifier(max_iter=1000, random_state=RANDOM_STATE)
cl2.fit(X_train, y_train)

MLPClassifier(max_iter=1000, random_state=53)

In [22]:
y_pred = cl2.predict(X_test)
my_metrics = metrics.classification_report( y_test, y_pred, target_names=mlb.classes_)
print(my_metrics)

                     precision    recall  f1-score   support

              Blues       1.00      0.05      0.10       139
          Classical       0.75      0.46      0.57       537
            Country       0.20      0.01      0.02       231
         Electronic       0.75      0.50      0.60      5267
       Experimental       0.70      0.55      0.62      5573
               Folk       0.63      0.43      0.51      1613
            Hip-Hop       0.59      0.50      0.54      1350
       Instrumental       0.61      0.12      0.20      1797
      International       0.62      0.29      0.40       732
               Jazz       0.75      0.09      0.15       478
Old-Time / Historic       0.95      0.74      0.83       161
                Pop       0.59      0.01      0.01      1819
               Rock       0.83      0.56      0.67      5113
           Soul-RnB       0.00      0.00      0.00       199

          micro avg       0.73      0.43      0.54     25009
          macro avg   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### We see a modest performance improvement over our initial attempt

* Try loosening the probability threshold as before

In [23]:
yp_pred = np.array(cl2.predict_proba(X_test))
for prob in np.linspace(.2,.5, 7):
    y_pred = (yp_pred > prob).astype(int)
    print(prob, metrics.f1_score(y_test, y_pred, average='macro'))

0.2 0.434711916882421
0.25 0.4323520233647122
0.3 0.42233055442781936
0.35 0.4129627547470914
0.4 0.4003222395468951
0.44999999999999996 0.3851358852324299
0.5 0.37289388125600276


In [35]:
y_pred = (yp_pred > .30).astype(int)
my_metrics = metrics.classification_report( y_test, y_pred, target_names=mlb.classes_)
print(my_metrics)

                     precision    recall  f1-score   support

              Blues       0.30      0.14      0.20       139
          Classical       0.60      0.55      0.58       537
            Country       0.19      0.33      0.24       231
         Electronic       0.62      0.76      0.68      5267
       Experimental       0.60      0.75      0.67      5573
               Folk       0.41      0.68      0.51      1613
            Hip-Hop       0.50      0.58      0.54      1350
       Instrumental       0.48      0.34      0.40      1797
      International       0.52      0.39      0.44       732
               Jazz       0.40      0.36      0.38       478
Old-Time / Historic       0.91      0.77      0.84       161
                Pop       0.34      0.35      0.34      1819
               Rock       0.65      0.75      0.69      5113
           Soul-RnB       0.22      0.06      0.09       199

          micro avg       0.56      0.64      0.60     25009
          macro avg   

  _warn_prf(average, modifier, msg_start, len(result))


### Again we see improvement

### Resize the network: hidden_layer_size=(300,)

NOTE: SLOW

In [25]:
cl3 = MLPClassifier(max_iter=1000, hidden_layer_sizes=(300,), alpha=0.001, random_state=RANDOM_STATE)
cl3.fit(X_train, y_train)

MLPClassifier(alpha=0.001, hidden_layer_sizes=(300,), max_iter=1000,
              random_state=53)

In [26]:
y_pred = cl3.predict(X_test)
my_metrics = metrics.classification_report( y_test, y_pred, target_names=mlb.classes_)
print(my_metrics)

                     precision    recall  f1-score   support

              Blues       0.50      0.06      0.11       139
          Classical       0.73      0.47      0.57       537
            Country       0.53      0.11      0.18       231
         Electronic       0.75      0.51      0.61      5267
       Experimental       0.78      0.44      0.56      5573
               Folk       0.69      0.36      0.47      1613
            Hip-Hop       0.47      0.62      0.54      1350
       Instrumental       0.47      0.36      0.41      1797
      International       0.68      0.32      0.44       732
               Jazz       0.56      0.22      0.31       478
Old-Time / Historic       0.93      0.83      0.88       161
                Pop       0.41      0.17      0.24      1819
               Rock       0.85      0.55      0.67      5113
           Soul-RnB       0.24      0.08      0.12       199

          micro avg       0.70      0.44      0.54     25009
          macro avg   

  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
yp_pred = np.array(cl3.predict_proba(X_test))
for prob in np.linspace(.2,.5, 7):
    y_pred = (yp_pred > prob).astype(int)
    print(prob, metrics.f1_score(y_test, y_pred, average='macro'))

0.2 0.46757995499835453
0.25 0.4748667384589266
0.3 0.4727577311930825
0.35 0.4701596211564342
0.4 0.46299077082335016
0.44999999999999996 0.45244240733630214
0.5 0.43649228916261634


In [36]:
y_pred = (yp_pred > .35).astype(int)
my_metrics = metrics.classification_report( y_test, y_pred, target_names=mlb.classes_)
print(my_metrics)

                     precision    recall  f1-score   support

              Blues       0.32      0.13      0.18       139
          Classical       0.64      0.53      0.58       537
            Country       0.20      0.29      0.24       231
         Electronic       0.64      0.71      0.68      5267
       Experimental       0.63      0.71      0.67      5573
               Folk       0.44      0.64      0.52      1613
            Hip-Hop       0.54      0.55      0.54      1350
       Instrumental       0.52      0.30      0.38      1797
      International       0.56      0.37      0.45       732
               Jazz       0.43      0.32      0.37       478
Old-Time / Historic       0.93      0.76      0.84       161
                Pop       0.36      0.29      0.32      1819
               Rock       0.69      0.71      0.70      5113
           Soul-RnB       0.26      0.05      0.08       199

          micro avg       0.59      0.60      0.60     25009
          macro avg   

  _warn_prf(average, modifier, msg_start, len(result))


### hidden layer size (400,)

NOTE: VERY SLOW!!!

In [29]:
cl4 = MLPClassifier(max_iter=1000, hidden_layer_sizes=(400,), alpha=0.001, random_state=RANDOM_STATE)
cl4.fit(X_train, y_train)

MLPClassifier(alpha=0.001, hidden_layer_sizes=(400,), max_iter=1000,
              random_state=53)

In [30]:
y_pred = cl4.predict(X_test)
my_metrics = metrics.classification_report( y_test, y_pred, target_names=mlb.classes_)
print(my_metrics)


                     precision    recall  f1-score   support

              Blues       0.47      0.10      0.17       139
          Classical       0.73      0.47      0.57       537
            Country       0.26      0.23      0.24       231
         Electronic       0.72      0.57      0.64      5267
       Experimental       0.71      0.58      0.64      5573
               Folk       0.53      0.54      0.54      1613
            Hip-Hop       0.64      0.46      0.53      1350
       Instrumental       0.57      0.19      0.29      1797
      International       0.66      0.30      0.42       732
               Jazz       0.50      0.23      0.32       478
Old-Time / Historic       0.95      0.75      0.84       161
                Pop       0.45      0.15      0.23      1819
               Rock       0.78      0.62      0.69      5113
           Soul-RnB       0.47      0.04      0.07       199

          micro avg       0.69      0.49      0.57     25009
          macro avg   

  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
yp_pred = np.array(cl4.predict_proba(X_test))
for prob in np.linspace(.2,.5, 7):
    y_pred = (yp_pred > prob).astype(int)
    print(prob, metrics.f1_score(y_test, y_pred, average='macro'))

0.2 0.46163336561900686
0.25 0.4698236549723124
0.3 0.4715551761091227
0.35 0.4672156038868742
0.4 0.46154914431051036
0.44999999999999996 0.45212814481496677
0.5 0.44024483731621894


In [41]:
y_pred = (yp_pred > .35).astype(int)
my_metrics = metrics.classification_report( y_test, y_pred, target_names=mlb.classes_)
print(my_metrics)

                     precision    recall  f1-score   support

              Blues       0.32      0.13      0.18       139
          Classical       0.64      0.53      0.58       537
            Country       0.20      0.29      0.24       231
         Electronic       0.64      0.71      0.68      5267
       Experimental       0.63      0.71      0.67      5573
               Folk       0.44      0.64      0.52      1613
            Hip-Hop       0.54      0.55      0.54      1350
       Instrumental       0.52      0.30      0.38      1797
      International       0.56      0.37      0.45       732
               Jazz       0.43      0.32      0.37       478
Old-Time / Historic       0.93      0.76      0.84       161
                Pop       0.36      0.29      0.32      1819
               Rock       0.69      0.71      0.70      5113
           Soul-RnB       0.26      0.05      0.08       199

          micro avg       0.59      0.60      0.60     25009
          macro avg   

  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
metrics.accuracy_score(y_test, y_pred,  normalize=True, sample_weight=None)

0.322987780391257