In [None]:
import os
import random as rn
from collections import defaultdict, Counter

import librosa
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from kapre.utils import Normalization2D
from keras import Input
from keras import backend as K
from keras import optimizers, losses, activations, models
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.engine import Model
from keras.layers import Dense, TimeDistributed, Dropout, Bidirectional, Activation, CuDNNLSTM, Flatten, Permute, Conv2D
from keras.layers import Input, Activation, Concatenate, Permute, Reshape, Flatten, Lambda, Dot, Softmax
from keras.layers import concatenate
from keras.layers.core import Dense, Activation
from keras.models import Model, load_model
from keras.models import Sequential
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.utils import np_utils
from keras.utils import to_categorical
from scipy import signal
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [None]:
# Directory containing the .wav clips.
DATA_DIR = '../data/wav/'

# Training metadata; only the 'path' and 'word' columns are used here.
df_train = pd.read_csv('../data/train.csv')

# Lookup table: clip path -> word label (a later duplicate path wins).
train_dict = dict(zip(df_train['path'], df_train['word']))

The following function extracts MFCC, delta, and delta-delta features from the .wav files and pads or truncates them to a standard shape of (60, 44). It then stacks the features into a 3D array and also returns two dictionaries keyed by file path whose values are the labels; the labels are `None` in the test dictionary.

In [None]:
def getMFCCfeatures(data_dir=None, labels=None, max_files=100, n_frames=44):
    """Extract stacked MFCC, delta and delta-delta features from .wav files.

    Each file is loaded with librosa, its MFCCs and their first- and
    second-order deltas are stacked vertically, and the result is truncated
    or zero-padded on the right to exactly ``n_frames`` columns. Files whose
    name has no entry in ``labels`` are collected as test data.

    Parameters
    ----------
    data_dir : str, optional
        Directory to scan. Defaults to the notebook-level ``DATA_DIR``.
    labels : dict, optional
        Mapping of file name -> label. Defaults to the notebook-level
        ``train_dict``.
    max_files : int or None, optional
        Only the first ``max_files`` directory entries (in sorted order) are
        considered. Pass ``None`` to process the whole directory.
    n_frames : int, optional
        Number of columns every feature matrix is padded/truncated to.

    Returns
    -------
    train_mfcc : np.ndarray
        Feature matrices of the labelled files, stacked along axis 0.
    test_mfcc : np.ndarray
        Feature matrices of the unlabelled files, stacked along axis 0.
    train_y : dict
        File name -> label for the labelled files (same order as train_mfcc).
    test_y : dict
        File name -> None for the unlabelled files (same order as test_mfcc).

    Raises
    ------
    Exception
        Any error raised while loading or processing a file is printed
        together with the file name and then re-raised.
    """
    # Resolve defaults at call time so edits to the config cell are honoured.
    if data_dir is None:
        data_dir = DATA_DIR
    if labels is None:
        labels = train_dict

    def _fit_width(feat, width):
        """Truncate ``feat`` to ``width`` columns or right-pad it with zeros."""
        if feat.shape[1] > width:
            return feat[:, :width]
        padding = np.zeros((feat.shape[0], width - feat.shape[1]))
        return np.hstack((feat, padding))

    train_mfcc, train_y = [], {}
    test_mfcc, test_y = [], {}

    # os.listdir() order is arbitrary; sort so the selected subset (and the
    # resulting sample order) is reproducible across runs and machines.
    fnames = sorted(os.listdir(data_dir))[:max_files]

    for fname in tqdm(fnames, desc='dir'):
        if '.wav' not in fname or 'dima' in fname:
            continue
        try:
            label = labels.get(fname)
            wav, sr = librosa.load(os.path.join(data_dir, fname))
            # Keyword arguments: recent librosa releases reject positional
            # audio, and passing sr keeps the MFCCs tied to the loaded rate.
            mfcc = librosa.feature.mfcc(y=wav, sr=sr)
            delta1 = librosa.feature.delta(mfcc)
            delta2 = librosa.feature.delta(mfcc, order=2)
            stacked = np.vstack((mfcc, delta1, delta2))
            padded_mfcc = _fit_width(stacked, n_frames)

            if label is None:
                test_mfcc.append(padded_mfcc)
                test_y[fname] = label
            else:
                train_mfcc.append(padded_mfcc)
                train_y[fname] = label
        except Exception as e:
            # Report which file failed, then let the error propagate.
            print(fname, e)
            raise

    return np.array(train_mfcc), np.array(test_mfcc), train_y, test_y

The following function preprocesses the data and the steps included are:
1. Encode the y labels to a one-hot encoded vector
2. Resample the data with random oversampling so that every label has as many examples as the majority class (an overall majority-to-minority ratio of 1).
3. Reshape the arrays into 4D arrays by adding an extra trailing dimension so that they can be fed to the neural network.

In [None]:
def getPreProcessedData(train_, test_, y_train):
    """Encode labels, oversample the training set and add a trailing axis.

    Parameters
    ----------
    train_ : np.ndarray
        3-D array of training features, indexed (sample, row, column).
    test_ : np.ndarray
        3-D array of test features, or an empty array when there are none.
    y_train : dict
        Mapping whose values are the training labels, in the same order as
        the samples in ``train_``.

    Returns
    -------
    X_train : np.ndarray
        Oversampled training features with a trailing axis of size 1.
    X_test : np.ndarray
        Test features with a trailing axis of size 1.
    y_cat : np.ndarray
        One-hot encoded labels matching ``X_train``.
    encoder : LabelEncoder
        Fitted encoder, needed to map predictions back to the labels.
    """
    encoder = LabelEncoder()
    labels = np.array(list(y_train.values()))
    encoded_Y = encoder.fit_transform(labels)

    # RandomOverSampler expects 2-D input, so flatten each feature matrix.
    n_samples, n_rows, n_cols = train_.shape
    mfcc2d = train_.reshape(n_samples, n_rows * n_cols)
    ros = RandomOverSampler(sampling_strategy='all', random_state=2019)
    mfcc_resampled, y_resampled = ros.fit_resample(mfcc2d, encoded_Y)

    # Restore the per-sample 2-D layout and append the trailing axis.
    # (The original reshaped with an undefined name `mfcc_raw`; the shape of
    # the `train_` argument is what was intended.)
    X_train = mfcc_resampled.reshape(-1, n_rows, n_cols, 1)

    if test_.size:
        X_test = test_.reshape(test_.shape + (1,))
    else:
        # No unlabelled files: return an empty batch with a compatible shape
        # instead of failing on test_.shape[1].
        X_test = np.empty((0, n_rows, n_cols, 1), dtype=train_.dtype)

    # Pin the width to the number of known classes rather than max(label) + 1.
    y_cat = np_utils.to_categorical(y_resampled, num_classes=len(encoder.classes_))

    return X_train, X_test, y_cat, encoder


The model is defined below with 3 convolution layers interspersed with Dropout layers; their output is fed into bidirectional LSTM (BiLSTM) layers and then passed through a time-distributed dense layer. The result is flattened and passed through two dense layers before the final softmax output.

In [None]:
def defineModel(input_, nclass=35):
    """Build the Conv2D -> BiLSTM -> Dense classification graph.

    Parameters
    ----------
    input_ : Keras tensor
        Model input (as created by ``Input(shape=...)``). The Permute and
        squeeze steps below require three non-batch axes.
    nclass : int, optional
        Number of output classes, i.e. the width of the softmax layer.
        Must match the width of the one-hot encoded targets.

    Returns
    -------
    Keras tensor
        Softmax output of width ``nclass``, from the layer named 'output'.

    Notes
    -----
    Relies on the Keras backend being available as the module-level name
    ``K`` (``from keras import backend as K``) for the squeeze Lambda.
    """
    x = Normalization2D(int_axis=0)(input_)
    # Swap the first two non-batch axes.
    # NOTE(review): presumably puts time first for the recurrent layers - confirm.
    x = Permute((2, 1, 3))(x)

    # Three (4, 1) convolutions with L2 regularisation; 'same' padding keeps
    # the spatial size unchanged.
    x = Conv2D(10, (4, 1), activation='relu', padding='same',
               kernel_regularizer=l2(0.001), bias_regularizer=l2(0.001))(x)

    x = Dropout(rate=0.4)(x)
    x = Conv2D(10, (4, 1), activation='relu', padding='same',
               kernel_regularizer=l2(0.001), bias_regularizer=l2(0.001))(x)

    x = Dropout(rate=0.25)(x)
    # A single filter leaves a channel axis of size 1 ...
    x = Conv2D(1, (4, 1), activation='relu', padding='same',
               kernel_regularizer=l2(0.001), bias_regularizer=l2(0.001))(x)

    # ... which is dropped here so the tensor is 3-D for the LSTM layers.
    x = Lambda(lambda q: K.squeeze(q, -1), name='squeeze_last_dim')(x)

    x = Bidirectional(CuDNNLSTM(512, return_sequences=True))(x)
    x = Dropout(rate=0.25)(x)
    x = Bidirectional(CuDNNLSTM(512, return_sequences=True))(x)

    # Dense layer applied independently at every step of the sequence.
    x = TimeDistributed(Dense(1024))(x)
    x = Dropout(rate=0.25)(x)

    x = Flatten()(x)
    x = Dense(1024, activation='sigmoid')(x)
    x = Dropout(rate=0.25)(x)
    # No activation is specified for this layer (linear).
    x = Dense(128)(x)

    output = Dense(nclass, activation='softmax', name='output')(x)
    return output

This function predicts the model output on the test set and then inverse-transforms the predictions into the original labels.

In [None]:
def getYLabels(testfeat, encoder):
    """Predict a label for every row of ``testfeat``.

    Uses the notebook-level ``model`` to score the features, takes the
    highest-scoring class index per sample and converts those indices back
    to labels with ``encoder.inverse_transform``.
    """
    class_scores = model.predict(testfeat)
    # Index of the best-scoring class for each sample.
    winning_idx = np.argmax(class_scores, axis=1)
    return encoder.inverse_transform(winning_idx)

The test path index created earlier will not be in the same order as the test paths given, so this function gets the code for the paths, which can be used to rearrange the output labels so that they correspond to the test paths as provided.

In [None]:
# Extract the padded MFCC feature stacks for the labelled and unlabelled files.
# y_train / y_test are dicts keyed by file name; every value in y_test is None.
train_mfcc, test_mfcc, y_train, y_test = getMFCCfeatures()

In [None]:
# Encode the labels, oversample the training set and add the trailing axis.
# `encoder` is kept so predicted class indices can be mapped back to labels.
Xtrain, test_feat, ytrain, encoder = getPreProcessedData(train_mfcc, test_mfcc, y_train)

The model is defined, compiled and fitted below. We use two Keras callbacks: EarlyStopping, and ReduceLROnPlateau, which reduces the learning rate when the validation loss plateaus.

In [None]:
# Hold out 5% of the preprocessed data; fit() below carves a further 10% of
# the remainder off as the validation set.
# NOTE(review): this rebinds y_train / y_test, which previously held the
# label dicts from feature extraction - re-run that cell before re-running
# the preprocessing cell.
X_train, X_test, y_train, y_test = train_test_split(Xtrain, ytrain, test_size=0.05, random_state=2019)

# Per-sample shape of the training data (the original read it from an
# undefined name `X`).
input_shape = X_train[0].shape
inp = Input(shape=input_shape)
output = defineModel(inp)
model = Model(inputs=inp, outputs=[output])

opt = optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.99, amsgrad=True)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

# Stop once val_loss has not improved for 7 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=7)
# min_lr must lie below the initial learning rate (0.0005); the previous
# floor of 0.001 was above it, so the callback could never lower the rate.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.25, patience=5, min_lr=1e-6)
model.summary()

# Train on the split produced above (the original passed undefined `X`, `y`).
model.fit(X_train, y_train, batch_size=128, validation_split=0.1,
        epochs=100, shuffle=True, verbose=1, callbacks=[reduce_lr, es])