In [24]:
import librosa as lb
import os
import numpy as np
import sklearn as sk

from glob import glob
from keras.models import Sequential
from keras.utils import to_categorical
from keras.layers import Dense
from librosa import load
from librosa.feature import mfcc
from sklearn.model_selection import train_test_split

In [4]:
# Sampling rate in Hz (despite the "bitrate" name): it is passed as `sr` to the
# MFCC extraction in load_audio and equals librosa.load's default of 22050.
DEFAULT_BITRATE = 22050
# Target clip length; presumably in seconds, since it is multiplied by the
# sampling rate below to obtain a sample count.
DESIRED_DURATION = 25
# Number of samples every clip is padded or truncated to in normalize_audio.
PADDED_LENGTH = DEFAULT_BITRATE * DESIRED_DURATION

In [5]:
def load_audio(folder, max_files=40):
    """Load the mp3 files in ``folder`` and return one MFCC matrix per clip.

    Parameters
    ----------
    folder : str
        Directory containing ``*.mp3`` recordings (a trailing slash is
        optional).
    max_files : int or None
        Maximum number of recordings to load; ``None`` loads all of them.

    Returns
    -------
    list
        One MFCC array (40 coefficients per frame) for each loaded clip,
        computed after the clips were length-fixed and standardized by
        ``normalize_audio``.
    """
    # glob returns paths in arbitrary order; sort them so the same subset of
    # recordings is selected on every run and on every machine.
    mp3_paths = sorted(glob(os.path.join(folder, '*.mp3')))[:max_files]

    # Decode at the same sampling rate that is later handed to the MFCC
    # extraction, instead of relying on the two defaults agreeing.
    loaded_mp3_files = [load(path, sr=DEFAULT_BITRATE) for path in mp3_paths]

    # make same length (and zero-mean / unit-variance)
    fix_length_mp3 = normalize_audio(loaded_mp3_files)

    return [mfcc(y=y, sr=DEFAULT_BITRATE, n_mfcc=40) for y in fix_length_mp3]

In [6]:
def normalize_audio(source_files):
    """Bring every clip to a common length and standardize its amplitude.

    Parameters
    ----------
    source_files : iterable of (samples, sample_rate)
        Pairs as returned by ``librosa.load``; the sample rate is ignored.

    Returns
    -------
    list of numpy.ndarray
        One array of ``PADDED_LENGTH`` samples per clip, shifted to zero mean
        and scaled to unit standard deviation.
    """
    normalized = []
    for samples, _ in source_files:
        # `size` is passed by keyword: it is keyword-only in recent librosa
        # releases, and the keyword form also works on older ones.
        fixed = lb.util.fix_length(samples, size=PADDED_LENGTH)
        centered = fixed - np.mean(fixed)
        spread = np.std(fixed)
        # A silent (constant) clip has zero spread; dividing would fill the
        # features with NaN/inf, so keep the centered (all-zero) signal.
        normalized.append(centered / spread if spread > 0 else centered)
    return normalized

In [7]:
def flatten_input(input):
    """Flatten each 2-D array in ``input`` into a 1-D vector.

    Returns a numpy array holding one flattened row per element of ``input``.
    """
    flattened_rows = []
    for matrix in input:
        n_rows, n_cols = matrix.shape[0], matrix.shape[1]
        flattened_rows.append(matrix.reshape(n_rows * n_cols))
    return np.array(flattened_rows)

In [8]:
# Recording folders, one per accent, listed in the same order as the feature
# lists unpacked below.
accent_folders = [
    '../data/speech-accent-archive/recordings/usa/',
    '../data/speech-accent-archive/recordings/uk/',
    '../data/speech-accent-archive/recordings/hongkong/',
    '../data/speech-accent-archive/recordings/china/',
    '../data/speech-accent-archive/recordings/germany/',
]

# Extract the per-clip MFCC features for every accent.
us_X, uk_X, hk_X, cn_X, ger_X = [load_audio(folder) for folder in accent_folders]

In [9]:
# Collapse each clip's 2-D MFCC matrix into a single feature vector so the
# clips can be fed to the dense network.
# NOTE: the names are rebound in place, so this cell must be run exactly once
# after the loading cell; re-running it on already-flattened data fails.
us_X = flatten_input(us_X)
uk_X = flatten_input(uk_X)
hk_X = flatten_input(hk_X)
cn_X = flatten_input(cn_X)
ger_X = flatten_input(ger_X)

In [12]:
# Per-class feature matrices; the position in this list is the class label
# (0 = usa, 1 = uk, 2 = hongkong, 3 = china, 4 = germany).
class_features = [us_X, uk_X, hk_X, cn_X, ger_X]
X = np.concatenate(class_features)

# Build the labels from the actual number of clips in each class so that they
# always line up with the rows of X. Hard-coded slice boundaries silently
# mislabel samples whenever a class does not contain exactly the assumed
# number of clips. dtype=float matches the previous np.zeros-based array.
Y = np.repeat(
    np.arange(len(class_features), dtype=float),
    [len(features) for features in class_features],
)
assert len(X) == len(Y), "every feature row needs exactly one label"

In [25]:
# One-hot encode the class labels in Y, as required by the
# categorical cross-entropy loss used for training.
Y_categorical = to_categorical(Y)
# Sanity check: prints (number of samples, number of classes).
print(Y_categorical.shape)

(175, 5)


In [26]:
# Hold out 10% of the samples as a test set; the fixed random_state keeps the
# split reproducible.
# NOTE(review): the split is not stratified, so with this few samples the
# class balance of the test set may be skewed - consider passing `stratify`.
X_train, X_test, y_train, y_test = train_test_split(X, Y_categorical, test_size=0.1, random_state=1337)

In [27]:
# Report the dimensions of the training arrays, then of the test arrays
# (features first, labels second).
for split_array in (X_train, y_train, X_test, y_test):
    print(split_array.shape)

(157, 43080)
(157, 5)
(18, 43080)
(18, 5)


In [30]:
def ff_model(X_train, y_train, X_test, y_test, n_val=17, epochs=10):
    """Train and evaluate a dense feed-forward classifier.

    The first ``n_val`` rows of the training data are held out for validation
    during fitting; the remaining rows are used for training. After training,
    the loss and accuracy on the test data are printed.

    Parameters
    ----------
    X_train, X_test : 2-D arrays of shape (samples, features)
        Flattened feature vectors.
    y_train, y_test : 2-D arrays of shape (samples, classes)
        One-hot encoded labels.
    n_val : int
        Number of leading training rows used as the validation set.
    epochs : int
        Number of training epochs.

    Returns
    -------
    The fitted Keras ``Sequential`` model.
    """
    X_val, y_val = X_train[:n_val], y_train[:n_val]
    X_fit, y_fit = X_train[n_val:], y_train[n_val:]

    # Take the layer sizes from the data instead of hard-coding them, so the
    # model keeps working when the feature length or class count changes.
    n_features = X_train.shape[1]
    n_classes = y_train.shape[1]

    model = Sequential()
    model.add(Dense(1024, activation='relu', input_shape=(n_features,)))
    model.add(Dense(1024, activation='relu'))
    model.add(Dense(1024, activation='relu'))
    model.add(Dense(512, activation='relu'))
    # categorical_crossentropy expects a probability distribution over the
    # classes, which softmax provides; independent sigmoids do not sum to 1
    # and are the wrong pairing for single-label classification.
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

    model.fit(x=X_fit, y=y_fit, epochs=epochs, validation_data=(X_val, y_val))

    loss, accuracy = model.evaluate(X_test, y_test)
    print('Test loss: %s' % loss)
    print('Test accuracy: %s' % accuracy)

    return model

In [None]:
# Train the feed-forward classifier and print its loss/accuracy on the
# held-out test set; the fitted model is kept for later use.
model = ff_model(X_train, y_train, X_test, y_test)

Train on 140 samples, validate on 17 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
 32/140 [=====>........................] - ETA: 3s - loss: 2.3070 - acc: 0.6250