# Ensemble Learning

Ensemble learning is a very well-studied field; the type of ensemble learning we'll be doing here is called stacking. We take the models that we've trained before, and we train another model (the meta-model) on the outputs of those models. If we have 3 models, and each model has 6 outputs, the meta-model has 18 inputs and 6 outputs. In this notebook we stack 2 models (amplitude and MFCC), so the meta-model has 12 inputs and 6 outputs.

In [2]:
%load_ext autoreload
%autoreload 2

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Tensorflow
import tensorflow as tf
from tensorflow.python.tools import freeze_graph
from tensorflow.python.tools import optimize_for_inference_lib

# Keras
import keras
from keras import regularizers
from keras.models import Sequential
from keras.layers import (Activation, Dense, Dropout, Flatten, Conv2D, Conv1D, 
                          MaxPooling2D, GlobalAveragePooling2D, MaxPooling1D, Lambda, LSTM)
from keras.layers.normalization import BatchNormalization
from keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint
from keras.utils import to_categorical, multi_gpu_model
import keras.backend as K

import librosa
import multiprocessing as mp
import numpy as np
import scipy.io.wavfile
from scipy.fftpack import dct
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook as tqdm
import time
from pprint import pprint
import uuid
import glob
import math

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# --- Data locations ---------------------------------------------------------
L6_RAW_DATA_DIR = "data/"
MFCC_PROCESSED_DATA_DIR = "mfcc-processed-data/"
AMP_PROCESSED_DATA_DIR = "amp-processed-data/"
ENSEMBLE_PROCESSED_DATA_DIR = "ensemble-processed-data/"
MODEL_NAME = 'EdgeAnalytics_AudioSet'

# --- Training settings ------------------------------------------------------
batch_size = 128
epochs = 20
channel = 1
verbose = 1
l6_classes = 6

# --- Audio / feature extraction settings ------------------------------------
# Window length in samples (defined once; it was previously assigned twice).
AUDIO_LENGTH = 44100
nmfcc = 128
nmels = 128
hop_length = 512
n_fft = 1024
# Number of frames per window: one frame every `hop_length` samples, rounded up.
nframe = math.ceil(AUDIO_LENGTH / hop_length)

In [4]:
def get_labels(path):
    """Return the class labels found under ``path``.

    Every directory entry whose name does not start with a dot is treated as
    a label; labels are returned in sorted order.

    Returns:
        A 3-tuple ``(labels, label_indices, one_hot)`` where ``labels`` is the
        sorted list of names, ``label_indices`` is ``0..len(labels)-1`` as a
        NumPy array, and ``one_hot`` is ``to_categorical(label_indices)``.
    """
    visible_entries = (entry for entry in os.listdir(path) if not entry.startswith("."))
    labels = sorted(visible_entries)
    label_indices = np.arange(len(labels))
    one_hot = to_categorical(label_indices)
    return labels, label_indices, one_hot

def wav2mfcc(file_path):
    """Split one audio file into overlapping windows and return their MFCCs.

    The file is loaded as mono at 44100 Hz and standardised (zero mean, unit
    variance). It is then cut into windows of ``AUDIO_LENGTH`` samples, each
    window starting half a window after the previous one. The final, shorter
    remainder is zero-padded to ``AUDIO_LENGTH`` so every window yields an
    MFCC matrix of the same shape.

    Args:
        file_path: Path of the audio file to load with ``librosa.load``.

    Returns:
        A list with one MFCC array per window (always at least one entry).
    """
    def _mfcc(window):
        # `y` is passed by keyword: recent librosa releases reject a
        # positional audio argument, and older releases accept the keyword.
        return librosa.feature.mfcc(y=window, sr=44100, S=None, n_mfcc=nmfcc,
                                    n_fft=n_fft, hop_length=hop_length, n_mels=nmels)

    audio_buf, _ = librosa.load(file_path, mono=True, sr=44100)

    # Standardise the signal. A constant (e.g. silent) clip has zero standard
    # deviation; dividing by it would fill every window with NaN, so in that
    # case only the mean is removed.
    centered = audio_buf - np.mean(audio_buf)
    std = np.std(audio_buf)
    audio_buf = centered / std if std > 0 else centered

    half_window = AUDIO_LENGTH // 2
    mfcc_vectors = []
    remaining_buf = audio_buf
    while remaining_buf.shape[0] > AUDIO_LENGTH:
        # Use the first AUDIO_LENGTH samples of the buffer as a new window...
        mfcc_vectors.append(_mfcc(remaining_buf[:AUDIO_LENGTH]))
        # ...then advance by half a window so consecutive windows overlap 50%.
        remaining_buf = remaining_buf[half_window:]

    # Whatever is left is at most AUDIO_LENGTH long: zero-pad it to full size.
    padding = np.zeros(shape=(AUDIO_LENGTH - len(remaining_buf)))
    mfcc_vectors.append(_mfcc(np.concatenate((remaining_buf, padding))))

    return mfcc_vectors

In [5]:
def label_to_amplitude_vecs(args) -> tuple:
    """Convert every audio file of one label into raw-amplitude windows.

    Each file under ``input_path/label`` is loaded as mono at 44100 Hz,
    standardised, and cut into 50%-overlapping windows of ``AUDIO_LENGTH``
    samples (the last one zero-padded). All windows for the label are stacked
    and saved to ``output_path/<label>.npy``.

    Args:
        args: A 4-tuple ``(label, input_path, output_path, tqdm_position)``.
            Packed into a single argument so the function can be used with
            ``Pool.map``.

    Returns:
        The shape of the saved array; ``(n_windows, AUDIO_LENGTH, 1)`` when
        the label contains at least one file.
    """
    label, input_path, output_path, tqdm_position = args

    # Get all audio files for this label
    label_dir = os.path.join(input_path, label)
    wavfiles = [os.path.join(label_dir, wavfile) for wavfile in os.listdir(label_dir)]

    # NOTE(review): the blank print before creating the bar is presumably a
    # workaround so the notebook progress bar renders from worker processes.
    print(" ", end="", flush=True)
    twavs = tqdm(wavfiles, position=tqdm_position)
    # The description never changes, so set it once instead of per file.
    twavs.set_description("Label - '{}'".format(label))

    half_window = AUDIO_LENGTH // 2
    vectors = []
    for wavfile in twavs:
        # Load the audio file; this also works for .flac files
        audio_buf, _ = librosa.load(wavfile, mono=True, sr=44100)
        audio_buf = audio_buf.reshape(-1, 1)

        # Standardise. A constant (e.g. silent) clip has zero standard
        # deviation; dividing by it would produce NaN windows, so only the
        # mean is removed in that case.
        centered = audio_buf - np.mean(audio_buf)
        std = np.std(audio_buf)
        remaining_buf = centered / std if std > 0 else centered

        while remaining_buf.shape[0] > AUDIO_LENGTH:
            # Add the first AUDIO_LENGTH of the buffer as a new vector to train on
            vectors.append(remaining_buf[:AUDIO_LENGTH])
            # Shrink the buffer by 1/2 * AUDIO_LENGTH
            remaining_buf = remaining_buf[half_window:]

        # Whatever is left, pad and stick in the training data
        padding = np.zeros(shape=(AUDIO_LENGTH - len(remaining_buf), 1))
        vectors.append(np.concatenate((remaining_buf, padding)))

    np_vectors = np.array(vectors)
    # np.save does not create missing directories.
    os.makedirs(output_path, exist_ok=True)
    np.save(os.path.join(output_path, label + '.npy'), np_vectors)
    return np_vectors.shape

def label_to_mfcc_vecs(args) -> tuple:
    """Convert every audio file of one label into MFCC windows.

    Each file under ``input_path/label`` is passed to ``wav2mfcc``; the
    resulting per-window MFCC arrays of all files are stacked and saved to
    ``output_path/<label>.npy``.

    Args:
        args: A 4-tuple ``(label, input_path, output_path, tqdm_position)``.
            Packed into a single argument so the function can be used with
            ``Pool.map``.

    Returns:
        The shape of the saved array.
    """
    label, input_path, output_path, tqdm_position = args

    label_dir = os.path.join(input_path, label)
    wavfiles = [os.path.join(label_dir, wavfile) for wavfile in os.listdir(label_dir)]

    # NOTE(review): the blank print before creating the bar is presumably a
    # workaround so the notebook progress bar renders from worker processes.
    print(" ", end="", flush=True)
    twavs = tqdm(wavfiles, position=tqdm_position)
    # The description never changes, so set it once instead of per file.
    twavs.set_description("Label - '{}'".format(label))

    vectors = []
    for wavfile in twavs:
        # One file may yield several windows; keep them all, in order.
        vectors.extend(wav2mfcc(wavfile))

    np_vectors = np.array(vectors)
    # np.save does not create missing directories.
    os.makedirs(output_path, exist_ok=True)
    np.save(os.path.join(output_path, label + '.npy'), np_vectors)
    return np_vectors.shape
    
def process_data_amplitude(input_path, output_path):
    """Build the amplitude dataset for every label, one worker task per label.

    Args:
        input_path: Directory containing one sub-directory per label.
        output_path: Directory that receives one ``<label>.npy`` per label.

    Returns:
        A list with the saved array shape for each label, in label order.
    """
    labels, _, _ = get_labels(input_path)
    jobs = [(label, input_path, output_path, tqdm_position)
            for tqdm_position, label in enumerate(labels)]
    # The context manager releases the worker processes even if a task raises;
    # `map` blocks until every task has finished, so no work is cut short.
    with mp.Pool() as pool:
        return pool.map(label_to_amplitude_vecs, jobs)

def process_data_mfcc(input_path, output_path):
    """Build the MFCC dataset for every label, one worker task per label.

    Args:
        input_path: Directory containing one sub-directory per label.
        output_path: Directory that receives one ``<label>.npy`` per label.

    Returns:
        A list with the saved array shape for each label, in label order.
    """
    labels, _, _ = get_labels(input_path)
    jobs = [(label, input_path, output_path, tqdm_position)
            for tqdm_position, label in enumerate(labels)]
    # The context manager releases the worker processes even if a task raises;
    # `map` blocks until every task has finished, so no work is cut short.
    with mp.Pool() as pool:
        return pool.map(label_to_mfcc_vecs, jobs)

In [None]:
# Serial (single-process) amplitude preprocessing: handle the labels one at a
# time and print the shape of the array saved for each.
labels, _, _ = get_labels(L6_RAW_DATA_DIR)
for tqdm_position, label in enumerate(labels):
    job = (label, L6_RAW_DATA_DIR, AMP_PROCESSED_DATA_DIR, tqdm_position)
    shape = label_to_amplitude_vecs(job)
    print(shape)

 

HBox(children=(IntProgress(value=0, max=202), HTML(value='')))


(4255, 44100, 1)
 

HBox(children=(IntProgress(value=0, max=286), HTML(value='')))

(2930, 44100, 1)
 

HBox(children=(IntProgress(value=0, max=169), HTML(value='')))

(911, 44100, 1)
 

HBox(children=(IntProgress(value=0, max=146), HTML(value='')))

(2204, 44100, 1)
 

HBox(children=(IntProgress(value=0, max=407), HTML(value='')))

(3043, 44100, 1)
 

HBox(children=(IntProgress(value=0, max=398), HTML(value='')))

In [None]:
# Serial (single-process) MFCC preprocessing: handle the labels one at a time
# and print the shape of the array saved for each.
labels, _, _ = get_labels(L6_RAW_DATA_DIR)
for tqdm_position, label in enumerate(labels):
    job = (label, L6_RAW_DATA_DIR, MFCC_PROCESSED_DATA_DIR, tqdm_position)
    shape = label_to_mfcc_vecs(job)
    print(shape)

In [None]:
# Create the amplitude model's training data: convert the raw audio of every
# label into amplitude windows, in parallel (no model is loaded here).
process_data_amplitude(L6_RAW_DATA_DIR, AMP_PROCESSED_DATA_DIR)

In [None]:
# Create the MFCC model's training data: convert the raw audio of every label
# into MFCC windows, in parallel (no model is loaded here).
process_data_mfcc(L6_RAW_DATA_DIR, MFCC_PROCESSED_DATA_DIR)

In [None]:
# Load the saved MFCC base model; its predictions form one half of the
# meta-model's input features.
mfcc_model = keras.models.load_model("out/mfcc-l6-acc94.hdf5")

In [None]:
# Load the saved amplitude base model; its predictions form the other half of
# the meta-model's input features.
amp_model = keras.models.load_model("out/amplitude-l6-acc91.hdf5")

In [None]:
def vecs_to_predictions(model, vecs):
    """Run ``model.predict`` on ``vecs`` and return the predictions.

    The input shape and the output shape are printed as a quick sanity check.
    """
    print(vecs.shape)
    outputs = model.predict(vecs)
    print(outputs.shape)
    return outputs

def load_processed_data(labels, processed_data_path):
    """Load the per-label ``.npy`` files and stack them into one dataset.

    Args:
        labels: Sequence of label names; ``<label>.npy`` is loaded for each,
            and the label's position in this sequence becomes its class id.
        processed_data_path: Directory containing the ``.npy`` files.

    Returns:
        A tuple ``(X, y)`` where ``X`` stacks all loaded arrays along the
        first axis and ``y`` is a float array holding the class id of each
        row of ``X``.

    Raises:
        ValueError: If ``labels`` is empty.
    """
    if len(labels) == 0:
        raise ValueError("labels must contain at least one label")

    feature_chunks = []
    target_chunks = []
    for class_id, label in enumerate(tqdm(labels)):
        npy_path = os.path.join(processed_data_path, label + '.npy')
        chunk = np.load(npy_path)
        print(npy_path)
        print(chunk.shape)
        feature_chunks.append(chunk)
        # Float class ids keep `y` float64 whatever the number of labels.
        target_chunks.append(np.full(chunk.shape[0], fill_value=float(class_id)))

    # Stack once at the end: stacking inside the loop re-copies everything
    # loaded so far on every iteration.
    X = feature_chunks[0] if len(feature_chunks) == 1 else np.vstack(feature_chunks)
    y = np.concatenate(target_chunks)

    # Internal sanity check: exactly one target per sample.
    assert X.shape[0] == len(y)

    return X, y

In [None]:
# Reload previously saved base-model predictions and the merged meta-model
# features from disk (the same three files are written by the cell that
# computes the predictions), so that computation can be skipped.
amp_predictions = np.load(os.path.join(ENSEMBLE_PROCESSED_DATA_DIR, "amplitude-predictions.npy"))
mfcc_predictions = np.load(os.path.join(ENSEMBLE_PROCESSED_DATA_DIR, "mfcc-predictions.npy"))
merged_data = np.load(os.path.join(ENSEMBLE_PROCESSED_DATA_DIR, "merged-predictions.npy"))

In [None]:
# Amplitude data
all_labels, _, _ = get_labels(L6_RAW_DATA_DIR)
amp_inputs, amp_labels = load_processed_data(all_labels, AMP_PROCESSED_DATA_DIR)


# MFCC data (same label list as the amplitude data)
mfcc_inputs, mfcc_labels = load_processed_data(all_labels, MFCC_PROCESSED_DATA_DIR)

# The two base models' predictions are paired row by row below and trained
# against a single label set, so both datasets must describe the same windows
# in the same order. Fail loudly instead of silently pairing mismatched rows.
if not np.array_equal(amp_labels, mfcc_labels):
    raise ValueError(
        "Amplitude and MFCC datasets are not row-aligned: "
        "{} vs {} samples".format(len(amp_labels), len(mfcc_labels)))

# MFCCs require reshaping
mfcc_inputs = mfcc_inputs.reshape(mfcc_inputs.shape[0], nmfcc, nframe, channel)
mfcc_labels = to_categorical(mfcc_labels)


# Get predictions
amp_predictions = vecs_to_predictions(amp_model, amp_inputs)
mfcc_predictions = vecs_to_predictions(mfcc_model, mfcc_inputs)


# Convert predictions to input features to ensemble: each row is the amplitude
# model's outputs followed by the MFCC model's outputs.
merged_data = np.concatenate((amp_predictions, mfcc_predictions), axis=1)


# Save (np.save does not create missing directories)
os.makedirs(ENSEMBLE_PROCESSED_DATA_DIR, exist_ok=True)
np.save(os.path.join(ENSEMBLE_PROCESSED_DATA_DIR, "amplitude-predictions.npy"), amp_predictions)
np.save(os.path.join(ENSEMBLE_PROCESSED_DATA_DIR, "mfcc-predictions.npy"), mfcc_predictions)
np.save(os.path.join(ENSEMBLE_PROCESSED_DATA_DIR, "merged-predictions.npy"), merged_data)

In [None]:
# Train test split: hold out half of the stacked predictions for validation,
# shuffled with a fixed seed for reproducibility.
# The targets are the one-hot MFCC labels; merged_data pairs amplitude and
# MFCC predictions row by row, so both datasets must be in the same order.
X_train, X_test, y_train, y_test = train_test_split(merged_data, mfcc_labels, test_size=0.50, random_state=42, shuffle=True)

In [None]:
def get_ensemble(n_inputs=12, n_classes=6, hidden_units=50, dropout_rate=0.25):
    """Build the (uncompiled) meta-model that combines base-model outputs.

    The network is three fully connected ReLU layers, each followed by
    dropout, and a softmax output layer named ``'label'``.

    Args:
        n_inputs: Number of input features, i.e. number of base models times
            outputs per model. The default 12 is 2 models x 6 outputs.
        n_classes: Number of output classes.
        hidden_units: Width of each hidden layer.
        dropout_rate: Dropout rate applied after each hidden layer.

    Returns:
        A ``Sequential`` model. Calling ``get_ensemble()`` with no arguments
        builds the same architecture as before the parameters were added.
    """
    m = Sequential()
    m.add(Dense(hidden_units, activation='relu', input_shape=[n_inputs]))
    m.add(Dropout(dropout_rate))
    m.add(Dense(hidden_units, activation='relu'))
    m.add(Dropout(dropout_rate))
    m.add(Dense(hidden_units, activation='relu'))
    m.add(Dropout(dropout_rate))
    m.add(Dense(n_classes, activation='softmax', name='label'))
    return m

def train(model, X_train, y_train_hot, X_test, y_test_hot):
    """Compile ``model`` and fit it, checkpointing the best validation epochs.

    Uses the Adam optimizer with categorical cross-entropy. The module-level
    ``batch_size``, ``epochs`` and ``verbose`` settings are read at call time.
    The learning rate is halved after 10 epochs without improvement in
    ``val_acc`` (down to 1e-5), and a checkpoint is written under ``out/``
    whenever ``val_acc`` improves.

    Args:
        model: Uncompiled Keras model to train (compiled in place).
        X_train: Training features.
        y_train_hot: One-hot training targets.
        X_test: Validation features.
        y_test_hot: One-hot validation targets.

    Returns:
        Whatever ``model.fit`` returns (the Keras training history).
    """
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line.
    model.summary()

    # NOTE(review): 'val_acc' is the metric name in older Keras releases; newer
    # ones report 'val_accuracy' -- confirm against the installed version.
    reduce_lr = ReduceLROnPlateau(monitor='val_acc', factor=0.5, patience=10, min_lr=0.00001, verbose=1)
    mcp_save = ModelCheckpoint('out/ensemble.{epoch:02d}-{val_acc:.2f}.hdf5', save_best_only=True, monitor='val_acc', mode='max')
    history = model.fit(X_train,
                        y_train_hot,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=verbose,
                        validation_data=(X_test, y_test_hot),
                        callbacks=[reduce_lr, mcp_save],
                        shuffle=True)
    return history

In [None]:
# Reset the Keras backend state before building and training the ensemble.
# NOTE(review): models loaded earlier in the session may need reloading after
# this call -- confirm before reusing mfcc_model / amp_model.
keras.backend.clear_session()

In [None]:
# Override the notebook-wide epoch count; train() reads the global `epochs`,
# so this assignment must run before the call below.
epochs = 100
model = get_ensemble()
train(model, X_train, y_train, X_test, y_test)
# Save the model as it is after the last epoch; the best-validation epochs are
# checkpointed separately by train() under out/ensemble.<epoch>-<val_acc>.hdf5.
model.save("out/ensemble.hdf5")

In [None]:
# Convert a saved ensemble checkpoint to a TensorFlow Lite flatbuffer.
converter = tf.lite.TFLiteConverter.from_keras_model_file('out/ensemble.25-0.97.hdf5')
tfmodel = converter.convert()
# Write through a context manager so the file is flushed and closed
# deterministically instead of whenever the handle is garbage collected.
with open("out/ensemble-l6-acc97.tflite", "wb") as tflite_file:
    tflite_file.write(tfmodel)