In [1]:
import librosa as lb
import os
import numpy as np
import sklearn as sk

from glob import glob
from keras.optimizers import Adam
from keras.models import Sequential
from keras.utils import to_categorical
from keras.layers import Dense, Dropout, Conv1D, MaxPool1D, Flatten
from librosa import load
from librosa.feature import mfcc
from sklearn.model_selection import train_test_split

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
DEFAULT_BITRATE = 22050
DESIRED_DURATION = 25
PADDED_LENGTH = DEFAULT_BITRATE * DESIRED_DURATION
NUM_MFCCS = 64

In [3]:
def load_audio(folder):
    # load original mp3s
    loaded_mp3_files = [load(f) for f in glob(folder + '*.mp3')[:40]]
    
    # make same length
    fix_length_mp3 = normalize_audio(loaded_mp3_files)
    
    mfccs = [mfcc(y=y, sr=DEFAULT_BITRATE, n_mfcc=NUM_MFCCS) for y in fix_length_mp3]
    return mfccs

In [4]:
def normalize_audio(source_files):
#     fixed_lengths = [lb.util.fix_length(y, PADDED_LENGTH) for y, _ in source_files]
#     return [(f - np.mean(f)) / np.std(f) for f in fixed_lengths]
    return [lb.util.fix_length(y, PADDED_LENGTH) for y, _ in source_files]

In [5]:
def flatten_input(input):
    return np.array([x.reshape(x.shape[0] * x.shape[1]) for x in input])

In [6]:
us_X = load_audio('../data/speech-accent-archive/recordings/usa/')
uk_X = load_audio('../data/speech-accent-archive/recordings/uk/')
hk_X = load_audio('../data/speech-accent-archive/recordings/hongkong/')
cn_X = load_audio('../data/speech-accent-archive/recordings/china/')
ger_X = load_audio('../data/speech-accent-archive/recordings/germany/')

In [7]:
us_X[0].shape
us_X = flatten_input(us_X)
uk_X = flatten_input(uk_X)
hk_X = flatten_input(hk_X)
cn_X = flatten_input(cn_X)
ger_X = flatten_input(ger_X)

In [8]:
X = np.concatenate([us_X, uk_X, hk_X, cn_X, ger_X])
Y = np.zeros(len(us_X) + len(uk_X) + len(hk_X) + len(cn_X) + len(ger_X))
Y[21:40] = 1
Y[41:60] = 2
Y[61:80] = 3
Y[81:100] = 4

In [10]:
Y_categorical = to_categorical(Y)
print(X.shape)
print(Y_categorical.shape)

(175, 68928)
(175, 5)


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, Y_categorical, test_size=0.2, random_state=420)

In [12]:
print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

(140, 68928)
(140, 5)
(35, 68928)
(35, 5)


In [13]:
def ff_model(X_train, y_train, X_test, y_test):
    X_val = X_train[:15]
    y_val = y_train[:15]
    X_train_true = X_train[15:]
    y_train_true = y_train[15:]
    
    model = Sequential()
    model.add(Dense(1024, activation='relu', input_shape=(68928,)))
    model.add(Dropout(0.3))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(5, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    model.fit(x=X_train_true, y=y_train_true, epochs=16, validation_data=(X_val, y_val))
    
    loss, accuracy = model.evaluate(X_test, y_test)
    print('Test loss: %s' % loss)
    print('Test accuracy: %s' % accuracy)
    
    return model

In [14]:
model = ff_model(X_train, y_train, X_test, y_test)

Train on 125 samples, validate on 15 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Test loss: 7.82878938402
Test accuracy: 0.514285715137


In [21]:
def cnn_model(X_train, y_train, X_test, y_test):
    X_val = X_train[:15]
    y_val = y_train[:15]
    X_train_true = X_train[15:]
    y_train_true = y_train[15:]
    
    model = Sequential()
    model.add(Conv1D(8, 7, activation='relu', input_shape=(68928, 1)))
    model.add(MaxPool1D())
    model.add(Dropout(0.3))
    model.add(Conv1D(16, 7, activation='relu'))
    model.add(MaxPool1D())
    model.add(Dropout(0.2))
    model.add(Conv1D(32, 7, activation='relu'))
    model.add(MaxPool1D())
    model.add(Dropout(0.3))
    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(5, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    model.fit(x=X_train_true, y=y_train_true, epochs=16, validation_data=(X_val, y_val))
    
    loss, accuracy = model.evaluate(X_test, y_test)
    print('Test loss: %s' % loss)
    print('Test accuracy: %s' % accuracy)
    
    return model

In [None]:
cnn_model = cnn_model(X_train.reshape(X_train.shape[0], X_train.shape[1], 1), 
                      y_train, 
                      X_test.reshape(X_test.shape[0], X_test.shape[1], 1), 
                      y_test)

Train on 125 samples, validate on 15 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
