# Feature Extraction and Fully-connected Layers Training

This notebook extracts features with the CNNs trained in `Train Local Feature Extractors.ipynb` and then trains a classifier on the extracted features. See each section below for detailed instructions.

In [None]:
import os
# Configure Theano through the THEANO_FLAGS environment variable: float32
# tensors, the CUDA device, and verbose exception messages.
# The variable is assigned before `import theano` below so that it is already
# in place when Theano is imported.
# Comment out the line below if not using GPU
os.environ['THEANO_FLAGS'] = "floatX=float32,device=cuda,exception_verbosity=high"
import theano

In [None]:
import keras

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
from keras.layers import Input, Dense, merge, Flatten, Dropout
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.layers.normalization import BatchNormalization
from keras.layers.pooling import GlobalAveragePooling2D
from keras.models import Model
from keras.regularizers import l2
import matplotlib.pyplot as plt
from keras.utils.visualize_util import plot
import os

## Define CNN Model

The function below provides a template for defining a CNN model used as a local feature extractor.

In [None]:
def gen_cnn(seg_length, pool_sizes_hori, path_weights=None):
    '''
    Build one local feature extractor (CNN) and load its pre-trained weights.

    Each CNN model handles one segment of mel-spectrograms at a time.
    The size of mel-spectrogram segments is defined by "seg_length".
    "seg_length" and the pooling layer sizes are adjusted to different scales.

    Inputs:
        - seg_length: length of the last input axis, i.e. the model input
          has shape (1, 96, seg_length). Also used as suffix of the conv
          layer names ("conv1_<seg_length>", ...).
        - pool_sizes_hori: 4 horizontal pooling sizes, one per conv block
        - path_weights: weight file to load (matched by layer name).
          Defaults to './weights/local_cnn_<seg_length>.h5'.
    Return:
        Model whose output is the concatenation of the globally
        average-pooled outputs of the 4 conv blocks (4 x 32 values).
    '''
    # vertical pooling sizes of the 4 conv blocks (fixed for every scale)
    pool_sizes_vert = [2, 3, 2, 2]

    # input
    x = Input(shape=(1, 96, seg_length))

    # 4 conv blocks: frozen conv -> ELU -> max-pooling; the globally
    # averaged output of every block is kept as an intermediate feature
    hidden = x
    pooled = []
    for block in range(4):
        pool_size = (pool_sizes_vert[block], pool_sizes_hori[block])
        layer_name = 'conv{}_{}'.format(block + 1, seg_length)
        hidden = Convolution2D(32, 3, 3, border_mode='same', init='he_normal',
                               name=layer_name, trainable=False)(hidden)
        hidden = keras.layers.advanced_activations.ELU(alpha=1.0)(hidden)
        hidden = MaxPooling2D(pool_size=pool_size)(hidden)
        pooled.append(GlobalAveragePooling2D()(hidden))

    # Concatenate the 4 intermediate outputs
    concatenated = merge(pooled, mode='concat', concat_axis=1)

    # define model
    model = Model(input=x, output=concatenated)

    # load pre-trained weights
    # (the path used to be read from a global PATH_WEIGHTS that is never
    # defined at module level, which raised a NameError on a fresh kernel)
    if path_weights is None:
        path_weights = './weights/local_cnn_{}.h5'.format(seg_length)
    model.load_weights(path_weights, by_name=True)

    return model

## Load Dataset

Load the "raw" dataset (the spectrograms from which features will be extracted).
- `X_train`: Training data
- `Y_train`: Labels of training data
- `X_test`: Heldout test data
- `Y_test`: Labels of test data

In [None]:
# Load Train Dataset (X_train, Y_train)
X_train = np.load('./dataset/X_train.npy')
Y_train_pre = np.load('./dataset/Y_train.npy').astype(int)

Y_train = np.zeros((Y_train_pre.shape[0], 10))
Y_train[np.arange(Y_train_pre.shape[0]), Y_train_pre] = 1

# Load Test Dataset (X_test, Y_test)
X_test = np.load('./dataset/X_test.npy')
Y_test_pre = np.load('./dataset/Y_test.npy').astype(int)

Y_test = np.zeros((Y_test_pre.shape[0], 10))
Y_test[np.arange(Y_test_pre.shape[0]), Y_test_pre] = 1

print X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

## Feature Extraction

The code in this section extracts features from all song clips in GTZAN using the local feature extractors (4 CNN models) pre-trained in `Train Local Feature Extractors.ipynb`. Since the extraction process is quite slow, the extracted features are saved under `./local_features/` once extraction is done.

In [None]:
def extract_feature(X, seg_lengths, pool_sizes_horis, flag_save=True, tr_te='tr'):
    '''
    Extract features in different scales for all mel-spectrograms.
    Inputs:
        - X: spectrograms to be processed (4-D array; axis 0 indexes the
          spectrograms, axis 3 is the one cut into segments)
        - seg_lengths, pool_sizes_horis: parameters to build CNN models
          (pool_sizes_horis[i] belongs to seg_lengths[i])
        - flag_save: Set True to save extracted feature vectors
        - tr_te: 'tr' saves to './local_features/X_train_extracted.npy',
          any other value saves to './local_features/X_test_extracted.npy'
    Return:
        extracted features from all images in X,
        shape (X.shape[0], 128 * len(seg_lengths))
    Raises:
        ValueError if a segment length is longer than the spectrograms
    '''
    # size of the vector produced by one CNN (see gen_cnn: 4 pooled conv
    # outputs with 32 filters each)
    feat_dim = 128

    num_pics, length = X.shape[0], X.shape[3]

    # Fail early: a segment longer than the spectrogram yields zero segments,
    # and the mean below would divide by zero and store garbage features.
    too_long = [s for s in seg_lengths if s > length]
    if too_long:
        raise ValueError('seg_lengths {} exceed the spectrogram length {}'.format(too_long, length))

    # Build cnn model and load pre-trained weights
    models = []
    for i, seg_length in enumerate(seg_lengths):
        pool_sizes_hori = pool_sizes_horis[i] # select pool layer sizes
        cnn = gen_cnn(seg_length, pool_sizes_hori) # generate model
        path_weights = './weights/local_cnn_{}.h5'.format(seg_length)
        cnn.load_weights(path_weights, by_name=True) # load pre-trained weights
        models.append((cnn, seg_length))

    num_models = len(models) # Number of CNNs
    new_X = np.zeros((num_pics, feat_dim * num_models))
    for i, (cnn, seg_length) in enumerate(models):
        # number of whole segments; a trailing partial segment is ignored
        num_segs = length // seg_length
        features = np.zeros((num_pics, feat_dim))
        for j in range(num_segs):
            start = j * seg_length
            features += cnn.predict(X[:, :, :, start:start + seg_length]) # extract feature using cnn
        features /= num_segs # compute mean on feature vectors among all segments
        assert features.shape == (num_pics, feat_dim)
        # every CNN owns its own slice of columns in the output
        new_X[:, i * feat_dim:(i + 1) * feat_dim] = features

    if flag_save:
        if not os.path.exists('./local_features/'):
            os.mkdir('./local_features/')
        if tr_te == 'tr':
            np.save('./local_features/X_train_extracted.npy', new_X)
        else:
            np.save('./local_features/X_test_extracted.npy', new_X)
    return new_X

In [None]:
# Segment lengths handled by the local CNNs (one CNN per length)
seg_lengths = [30, 60, 120, 240]
# Horizontal sizes of the pooling layers, one list per CNN
pool_sizes_horis = [
    [2, 2, 2, 2],
    [3, 2, 2, 2],
    [3, 3, 2, 2],
    [4, 4, 3, 2],
]

# Extract features from each song clip of both splits
# (flag_save defaults to True, so the features are saved as well)
X_train_ext = extract_feature(X_train, seg_lengths, pool_sizes_horis, tr_te='tr')
X_test_ext = extract_feature(X_test, seg_lengths, pool_sizes_horis, tr_te='te')

In [None]:
# -- Or load the features if extraction has been done before.
# These are the files written by extract_feature when flag_save is True.
# "ext" stands for "extracted feature".
X_train_ext = np.load('./local_features/X_train_extracted.npy')
X_test_ext = np.load('./local_features/X_test_extracted.npy')

In [None]:
# Stack the train and test splits along the sample axis (axis 0, the default)
# so that cross-validation can draw its own folds from all samples
X_ext = np.concatenate([X_train_ext, X_test_ext])
Y_ext = np.concatenate([Y_train, Y_test])

## Train Classifier

Define a neural-network-based classifier and train it on the features extracted in the `Feature Extraction` section.

In [None]:
def gen_classifier(lambd=0.2, input_dim=512, hidden_units=256, num_classes=10):
    '''
    Generate a classifier model with 1 hidden dense layer
    and dropout regularization.
    Inputs:
        - lambd: dropout rate applied to the hidden layer output
        - input_dim: length of the input feature vectors
        - hidden_units: number of units in the hidden layer
        - num_classes: number of output classes
    Return:
        uncompiled model mapping (input_dim,) vectors to class probabilities
    '''
    # Model input
    x = Input(shape=(input_dim,))

    # Hidden layer (its input size is inferred from x; the explicit
    # "input_shape" arguments used before were redundant, and the one on
    # the output layer did not even match the hidden layer size)
    dense1 = Dense(hidden_units, activation='relu')(x)
    dr1 = Dropout(lambd)(dense1)

    # Model output
    out = Dense(num_classes, activation='softmax')(dr1)

    # Create model
    model = Model(input=x, output=out)

    return model

In [None]:
# Build the classifier with default settings
# (this cell only creates the model; it does not compile or train it)
classifier = gen_classifier()
# Uncomment to print the layer summary:
# classifier.summary()

In [None]:
# Compile the classifier, then fit it on the extracted training features.
# The held-out test split is passed as validation data.
classifier.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model_his = classifier.fit(
    X_train_ext,
    Y_train,
    validation_data=(X_test_ext, Y_test),
    nb_epoch=500,
    verbose=1,
)

In [None]:
# Save Training History (for plotting)
import os, pickle, datetime
# Create the log directory on first use; open() below fails if it is missing
# (same approach as extract_feature uses for './local_features/')
if not os.path.exists('./train_logs/'):
    os.mkdir('./train_logs/')
# Timestamp in the file name keeps histories of different runs apart
now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
with open('./train_logs/train_his_{}_{}'.format('FC', now), 'wb') as file_pi:
    pickle.dump(model_his.history, file_pi)

## Compute and Plot Accuracy per Genre

A similar implementation can be found in the "Multi-level CNN" part. 

Run all cells to get accuracy per genre.

In [None]:
from sklearn.model_selection import KFold
# Genre names, used as x-axis labels of the per-genre accuracy plot
names = [
    'blues', 'classical', 'country', 'disco', 'hiphop',
    'jazz', 'metal', 'pop', 'reggae', 'rock',
]
# Per-genre counters filled by acc_per_genre:
# number of test samples seen / number of them predicted correctly
total = [0.0] * len(names)
correct = [0.0] * len(names)

def acc_per_genre(model_gen, X, Y, epoches=500):
    '''
    Run 10-fold cross-validation and count hits per class.
    Inputs:
        - model_gen: callable returning a new, uncompiled model
        - X, Y: features and one-hot labels of all samples
        - epoches: number of training epochs per fold
    Return:
        nothing; the module-level lists "total" and "correct" are
        incremented at the index of each test sample's gold class
    '''
    splitter = KFold(n_splits=10, shuffle=True, random_state=6)
    for train_idx, test_idx in splitter.split(X, Y):
        # a fresh model is created, compiled and fitted for every fold
        fold_model = model_gen()
        fold_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        fold_model.fit(X[train_idx], Y[train_idx], nb_epoch=epoches, verbose=0)

        # class indices: predictions on the held-out fold vs. gold labels
        predicted = np.argmax(fold_model.predict(X[test_idx]), axis=1)
        gold = np.argmax(Y[test_idx], axis=1)
        assert predicted.shape[0] == Y[test_idx].shape[0]

        # update the per-class counters
        for gold_cls, pred_cls in zip(gold, predicted):
            total[gold_cls] += 1
            if gold_cls == pred_cls:
                correct[gold_cls] += 1
    return

In [None]:
# Cross-validate on the merged (train + test) features;
# the results are accumulated in "total" and "correct"
acc_per_genre(model_gen=gen_classifier, X=X_ext, Y=Y_ext)

In [None]:
# Plot function
import matplotlib.pyplot as plt
def plt_genre_acc(names, percentage):
    '''
    Draw a bar chart with one bar per entry of "names".
    Inputs:
        - names: labels shown under the bars
        - percentage: bar heights, in the same order as names
    Return:
        the figure (it is also displayed via plt.show())
    '''
    positions = np.arange(len(names))  # x location of every bar
    bar_width = 0.5

    fig, ax = plt.subplots()
    ax.bar(positions, percentage, bar_width)

    # y label, title, and one rotated tick label per bar
    ax.set_ylabel('Accuracy')
    ax.set_title('Accuracy by genre')
    ax.set_xticks(positions)
    ax.set_xticklabels(names, rotation=45)

    plt.show()
    return fig

In [None]:
# Accuracy per genre = correct / total, from the counters filled by
# acc_per_genre. "percentage" was plotted without ever being defined, so it
# is derived here; a genre without any test sample gets 0.0 instead of
# dividing by zero.
percentage = [hits / seen if seen else 0.0 for hits, seen in zip(correct, total)]
fig = plt_genre_acc(names, percentage)
fig.savefig('./acc_by_genre_p2.png')