In [4]:
import glob
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
%matplotlib inline
plt.style.use('ggplot')

In [5]:
data_dir = "npy"

mfcc=True
if mfcc:
    data_dir = os.path.join(data_dir, 'mfcc')
else:
    data_dir = os.path.join(data_dir, 'all')
print(data_dir)

def add_folds(data_dir, mfcc=False):
    num_folds = len(os.listdir(data_dir)) // 2
        
    for k in range(num_folds-3):
        fold_name = 'fold_' + str(k)
        print("\nAdding " + fold_name)
        if mfcc:
            fold_name += '_mfcc'
        feature_file = os.path.join(data_dir, fold_name + '_x.npy')
        labels_file = os.path.join(data_dir, fold_name + '_y.npy')
        loaded_features = np.load(feature_file)
        loaded_labels = np.load(labels_file)
        print("New Features: ", loaded_features.shape)

        if k > 0:
            features = np.concatenate((features, loaded_features))
            labels = np.concatenate((labels, loaded_labels))
        else:
            features = loaded_features
            labels = loaded_labels
    return features, labels

train_x, train_y = add_folds(data_dir, mfcc=mfcc)

# use a fold for train-dev
valid_fold_name = 'fold_7'
if mfcc:
    valid_fold_name += '_mfcc'
feature_file = os.path.join(data_dir, valid_fold_name + '_x.npy')
labels_file = os.path.join(data_dir, valid_fold_name + '_y.npy')
train_dev_x = np.load(feature_file)
train_dev_y = np.load(labels_file) 

# use a fold for dev
valid_fold_name = 'fold_8'
if mfcc:
    valid_fold_name += '_mfcc'
feature_file = os.path.join(data_dir, valid_fold_name + '_x.npy')
labels_file = os.path.join(data_dir, valid_fold_name + '_y.npy')
dev_x = np.load(feature_file)
dev_y = np.load(labels_file) 

# and a fold for testing
test_fold_name = 'fold_9'
if mfcc:
    test_fold_name += '_mfcc'
feature_file = os.path.join(data_dir, test_fold_name + '_x.npy')
labels_file = os.path.join(data_dir, test_fold_name + '_y.npy')
test_x = np.load(feature_file)
test_y = np.load(labels_file)

# # One hot encode labels
ohe = preprocessing.OneHotEncoder(sparse=False)
ohe.fit(train_y.reshape(-1, 1))

train_y = ohe.transform(train_y.reshape((-1, 1)))
num_labels = len(train_y[1])
train_dev_y = ohe.transform(train_dev_y.reshape((-1, 1)))
dev_y = ohe.transform(dev_y.reshape((-1, 1)))
test_y = ohe.transform(test_y.reshape((-1, 1)))

print(f"\nTraining Set: {train_x.shape}, Labels: {train_y.shape}")
print(f"Train-dev Set: {train_dev_x.shape}, Labels: {train_dev_y.shape}")
print(f"Dev Set: {dev_x.shape}, Labels: {dev_y.shape}")
print(f"Test Set: {test_x.shape}, Labels: {test_y.shape}")


npy/mfcc

Adding fold_0
New Features:  (2216, 44)

Adding fold_1
New Features:  (1954, 44)

Adding fold_2
New Features:  (2162, 44)

Adding fold_3
New Features:  (2120, 44)

Adding fold_4
New Features:  (2136, 44)

Adding fold_5
New Features:  (2206, 44)

Adding fold_6
New Features:  (2155, 44)

Training Set: (14949, 44), Labels: (14949, 6)
Train-dev Set: (2000, 44), Labels: (2000, 6)
Dev Set: (2436, 44), Labels: (2436, 6)
Test Set: (2060, 44), Labels: (2060, 6)


# Train 5 layer ffn

In [6]:
import tensorflow as tf
import keras as K
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam, SGD
from keras.callbacks import EarlyStopping

In [7]:
tf.set_random_seed(0)
np.random.seed(0)

def create_model(activation='relu', dropout_rate=0.2):
    model = Sequential()
    n = train_x.shape[1]
    model.add(Dense(n, input_dim=n, activation=activation))
    model.add(Dense(400, activation=activation))
    model.add(Dropout(dropout_rate))
    model.add(Dense(200, activation=activation))
    model.add(Dropout(dropout_rate))
    model.add(Dense(num_labels, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=SGD(), metrics=['acc'])
    return model

model = create_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 44)                1980      
_________________________________________________________________
dense_2 (Dense)              (None, 400)               18000     
_________________________________________________________________
dropout_1 (Dropout)          (None, 400)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 200)               80200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 1206      
Total params: 101,386
Trainable params: 101,386
Non-trainable params: 0
_________________________________________________________________


In [9]:
earlystop = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='auto')
earlystop_acc = EarlyStopping(monitor='val_acc', patience=3, verbose=1, mode='auto')
history = model.fit(train_x, train_y, 
                    validation_data=(dev_x, dev_y), 
                    callbacks=[earlystop, earlystop_acc], 
                    epochs=5, 
                    batch_size=20)


Train on 14949 samples, validate on 2436 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 00004: early stopping
Epoch 00004: early stopping


In [10]:
from sklearn import metrics 
from keras.utils import np_utils

# Accuracy
score, accuracy = model.evaluate(test_x, test_y, batch_size=28)
print("\nAccuracy = {:.2f}".format(accuracy))



Accuracy = 0.83


In [11]:
import librosa

def windows(data, window_size):
    start = 0
    while start < len(data):
        yield start, start + window_size
        start += (window_size // 2)

def extract_features_mfcc(file_name, bands=60, frames=32, n_mfcc=42):
    """
    Generate multiple windows to be used as examples in the sound file. 
    """
    window_size = 512 * (frames - 1)
    X, sr = librosa.load(file_name)
    features = []

    for (start, end) in windows(X, window_size):
        if end > len(X):
            break
        curr_win_size = len(X[start:end])
        if(curr_win_size != window_size):
            break
        signal = X[start:end]
        mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)  # mfcc.shape = (42, 1002) 
        mfccs = np.mean(mfcc.T,axis=0)  # mean along mfcc columns mfccs.shape = (42, )
        mfcc_ds = np.mean(librosa.feature.delta(mfcc.T, axis=0))
        mfcc_d_ds = np.mean(librosa.feature.delta(mfcc.T, order=2))
        feature_vecs = np.hstack([mfccs, mfcc_ds, mfcc_d_ds])
        features.append(feature_vecs)
    return features

sample_filename = "data/speech-accent-archive/recordings/afghanistan/dari1.mp3"
features = extract_features_mfcc(sample_filename)
f = features[0]
print(f)
model.predict(np.array([f]))

[ -5.14738010e+02   3.15824851e+01   4.11918949e-01   1.14085354e+01
   4.92827374e+00  -4.00655012e+00   2.61576622e+00  -1.53363998e+00
  -9.63345583e-01  -3.68915570e+00  -1.45786379e+00   1.06148858e+00
  -2.06182512e+00   2.88264994e-01  -2.18621514e-02   1.29873050e+00
   6.26811781e-02   3.60362166e-01  -5.79874571e+00   8.62707083e-01
  -1.73691011e+00  -1.63855675e+00  -1.98191873e+00   3.67625277e-01
  -3.11123766e+00   2.20682403e+00  -1.88472337e+00  -4.98203053e-01
  -1.39676903e-01   3.58229408e+00  -1.66338026e-01   5.04445329e+00
   2.72010043e+00   4.41178897e+00   1.31283229e+00   1.11532746e+00
  -3.82165983e-02   4.65353289e-01   4.52625281e-01   7.75063905e-01
  -2.46275152e+00   2.10422973e+00   3.82571361e-01  -1.82751553e-02]


array([[ 0.10391551,  0.16527979,  0.11279739,  0.19691423,  0.29134184,
         0.09986485]], dtype=float32)