In [1]:
import glob
import os
import librosa
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np

  return f(*args, **kwds)


# Extract features

In [168]:
def windows(data, window_size):
    start = 0
    while start < len(data):
        yield start, start + window_size
        start += (window_size // 2)

def extract_feature_array(filename, bands=60, frames=41):
    window_size = 512 * (frames-1)
    log_specgrams = []
    sound_clip,s = librosa.load(filename)        
    for (start, end) in windows(sound_clip, window_size):
        start = int(start)
        end = int(end)
        if(len(sound_clip[start:end]) == window_size):
            signal = sound_clip[start:end]
            melspec = librosa.feature.melspectrogram(signal, n_mels = bands)
            logspec = librosa.logamplitude(melspec)
            logspec = logspec.T.flatten()[:, np.newaxis].T
            log_specgrams.append(logspec)
            
    log_specgrams = np.asarray(log_specgrams).reshape(len(log_specgrams),bands,frames,1)
    features = np.concatenate((log_specgrams, np.zeros(np.shape(log_specgrams))), axis = 3)
    for i in range(len(features)):
        features[i, :, :, 1] = librosa.feature.delta(features[i, :, :, 0])
    
    return np.array(features)

sample_filename = 'data/speech-accent-archive/recordings/afghanistan/dari2.mp3'
features = extract_feature_array(sample_filename)
data_points, _ = librosa.load(sample_filename)
print ('IN: Initial Data Points =', len(data_points))
print ('OUT: Total features =', np.shape(features))

IN: Initial Data Points = 918554
OUT: Total features = (88, 60, 41, 2)


In [15]:
from sklearn import preprocessing

def iter_label_files(parent_dir):
    for root, dirpaths, fnames in os.walk(parent_dir):
        if len(dirpaths) > 0:
            continue
        label = root.split('/')[-1]
        yield root, label, fnames

parent_dir = 'data/speech-accent-archive/recordings/'
labels = []
for _, label, _ in iter_label_files(parent_dir):
    labels.append(label)
print(f'{len(labels)} labels found, 1st 5: {labels[:5]}')

176 labels found, 1st 5: ['afghanistan', 'albania', 'algeria', 'andorra', 'angola']


# Classes

In [123]:
cap = 0
counts = []
for _, label, fnames in iter_label_files(parent_dir):
    counts.append(len(fnames))

top_indices_count = [i_count for i_count in sorted(enumerate(counts), key=lambda x:x[1], reverse=True)][4:10]
most_freq_egs = [labels[i] for (i, count) in top_indices_count if count >= cap]
print(most_freq_egs)    

['canada', 'south_korea', 'brazil', 'belgium', 'turkey', 'poland']


# Save features

In [169]:
import random 


def get_label_enc(fn, le):
    label_txt = fn.split('/')[-2]  # y_i
    label_int = le.transform([label_txt])
    return label_int[0]


def extract_feature(fnames, le, bands=60, frames=41):
    """
    Extract features from filenames to features and labels arrays
    """
    window_size = 512 * (frames-1)
    log_specgrams = []
    labels = []
    for fn in fnames:
        X,s = librosa.load(fn)
        label_enc = get_label_enc(fn, le)
        for (start, end) in windows(X, window_size):
            labels.append(label_enc)
            curr_win_size = len(X[start:end])
            if(curr_win_size != window_size):
                break
            signal = X[start:end]
            melspec = librosa.feature.melspectrogram(signal, n_mels=bands)
            logspec = librosa.logamplitude(melspec)
            logspec = logspec.T.flatten()[:, np.newaxis].T
            log_specgrams.append(logspec)
    log_specgrams = np.array(log_specgrams).reshape(len(log_specgrams),bands,frames,1)
    features = np.concatenate((log_specgrams, np.zeros(log_specgrams.shape)), axis=3)
    for i in range(len(features)):
        features[i, :, :, 1] = librosa.feature.delta(features[i, :, :, 0])
    return np.array(features), np.array(labels, dtype='int')


def get_subset_fnames(cap=10):
    fnames = glob.glob('**/*.mp3', recursive=True)
    subset_fnames = []
    for root, label, fnames in iter_label_files(parent_dir):
        if label not in most_freq_egs:
            continue
        for fn in fnames[:cap]:
            subset_fnames.append(os.path.join(root, fn))
    random.seed(1)
    random.shuffle(subset_fnames)
    return subset_fnames


def save_folds(save_dir, le, num_folds=10):
    fnames = get_subset_fnames()
    egs_per_fold = len(fnames) // num_folds
    curr = 0
    for k in range(num_folds):
        fold_name = 'fold_' + str(k)
        
        print('Saving ' + fold_name)
        start_i = k * egs_per_fold
        end_i = (k + 1) * egs_per_fold
        fnames_fold = fnames[start_i: end_i]
        features, labels = extract_feature(fnames_fold, le)
        
        print(f'Features of {fold_name} = {features.shape}')
        print(f'Labels of {fold_name} = {labels.shape}')

        feature_file = os.path.join(save_dir, fold_name + '_x.npy')
        labels_file = os.path.join(save_dir, fold_name + '_y.npy')

        random.seed(1)
        random.shuffle(features)
        random.shuffle(labels)

        np.save(feature_file, features)
        np.save(labels_file, labels)
        print('Saved ' + feature_file)
        print('Saved ' + labels_file + '\n')
        

le = preprocessing.LabelEncoder()
le.fit(most_freq_egs)
save_dir=os.path.join('npy', 'spec')
save_folds(save_dir, le)

Saving fold_0
Features of fold_0 = (297, 60, 41, 2)
Labels of fold_0 = (303,)
Saved npy/spec/fold_0_x.npy
Saved npy/spec/fold_0_y.npy

Saving fold_1
Features of fold_1 = (306, 60, 41, 2)
Labels of fold_1 = (312,)
Saved npy/spec/fold_1_x.npy
Saved npy/spec/fold_1_y.npy

Saving fold_2
Features of fold_2 = (376, 60, 41, 2)
Labels of fold_2 = (382,)
Saved npy/spec/fold_2_x.npy
Saved npy/spec/fold_2_y.npy

Saving fold_3
Features of fold_3 = (357, 60, 41, 2)
Labels of fold_3 = (363,)
Saved npy/spec/fold_3_x.npy
Saved npy/spec/fold_3_y.npy

Saving fold_4
Features of fold_4 = (321, 60, 41, 2)
Labels of fold_4 = (327,)
Saved npy/spec/fold_4_x.npy
Saved npy/spec/fold_4_y.npy

Saving fold_5
Features of fold_5 = (336, 60, 41, 2)
Labels of fold_5 = (342,)
Saved npy/spec/fold_5_x.npy
Saved npy/spec/fold_5_y.npy

Saving fold_6
Features of fold_6 = (341, 60, 41, 2)
Labels of fold_6 = (347,)
Saved npy/spec/fold_6_x.npy
Saved npy/spec/fold_6_y.npy

Saving fold_7
Features of fold_7 = (325, 60, 41, 2)
Lab

# Load Features

In [204]:
data_dir = os.path.join('npy', 'spec')
print(data_dir)

def add_folds(data_dir):
    num_folds = len(os.listdir(data_dir)) // 2
        
    for k in range(num_folds-3):
        fold_name = 'fold_' + str(k)
        print("\nAdding " + fold_name)
        feature_file = os.path.join(data_dir, fold_name + '_x.npy')
        labels_file = os.path.join(data_dir, fold_name + '_y.npy')
        loaded_features = np.load(feature_file)
        loaded_labels = np.load(labels_file)
        print("New Features: ", loaded_features.shape)

        if k > 0:
            features = np.concatenate((features, loaded_features))
            labels = np.concatenate((labels, loaded_labels))
        else:
            features = loaded_features
            labels = loaded_labels
        
    return features, labels

train_x, train_y = add_folds(data_dir)

# use a fold for train-dev
valid_fold_name = 'fold_7'
feature_file = os.path.join(data_dir, valid_fold_name + '_x.npy')
labels_file = os.path.join(data_dir, valid_fold_name + '_y.npy')
train_dev_x = np.load(feature_file)
train_dev_y = np.load(labels_file) 

# use a fold for dev
valid_fold_name = 'fold_8'
feature_file = os.path.join(data_dir, valid_fold_name + '_x.npy')
labels_file = os.path.join(data_dir, valid_fold_name + '_y.npy')
dev_x = np.load(feature_file)
dev_y = np.load(labels_file) 

# and a fold for testing
test_fold_name = 'fold_9'
feature_file = os.path.join(data_dir, test_fold_name + '_x.npy')
labels_file = os.path.join(data_dir, test_fold_name + '_y.npy')
test_x = np.load(feature_file)
test_y = np.load(labels_file)

# # # One hot encode labels
ohe = preprocessing.OneHotEncoder(sparse=False)
ohe.fit(train_y.reshape(-1, 1))

train_y = ohe.transform(train_y.reshape((-1, 1)))
num_labels = len(train_y[1])
train_dev_y = ohe.transform(train_dev_y.reshape((-1, 1)))
dev_y = ohe.transform(dev_y.reshape((-1, 1)))
test_y = ohe.transform(test_y.reshape((-1, 1)))

# trim

last = min([len(train_x), len(train_dev_x), len(dev_x), len(test_x)])
train_x, train_y = train_x[:last], train_y[:last]
train_dev_x, train_dev_y = train_dev_x[:last], train_dev_y[:last]
dev_x, dev_y = dev_x[:last], dev_y[:last]
test_x, test_y = test_x[:last], test_y[:last]

print(f"\nTraining Set: {train_x.shape}, Labels: {train_y.shape}")
print(f"Train-dev Set: {train_dev_x.shape}, Labels: {train_dev_y.shape}")
print(f"Dev Set: {dev_x.shape}, Labels: {dev_y.shape}")
print(f"Test Set: {test_x.shape}, Labels: {test_y.shape}")



npy/spec

Adding fold_0
New Features:  (297, 60, 41, 2)

Adding fold_1
New Features:  (306, 60, 41, 2)

Adding fold_2
New Features:  (376, 60, 41, 2)

Adding fold_3
New Features:  (357, 60, 41, 2)

Adding fold_4
New Features:  (321, 60, 41, 2)

Adding fold_5
New Features:  (336, 60, 41, 2)

Adding fold_6
New Features:  (341, 60, 41, 2)

Training Set: (308, 60, 41, 2), Labels: (308, 6)
Train-dev Set: (308, 60, 41, 2), Labels: (308, 6)
Dev Set: (308, 60, 41, 2), Labels: (308, 6)
Test Set: (308, 60, 41, 2), Labels: (308, 6)


In [210]:
tf.set_random_seed(0)
np.random.seed(0)

from keras.models import Sequential
import keras.layers as lyers
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.optimizers import SGD, Adam
from keras.callbacks import EarlyStopping
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from keras.regularizers import l2
from keras.utils import np_utils

frames = 41
tfbands = 60
num_channels = 2
feature_size = bands * frames #60x4
num_labels = 6
input_shape=(bands, frames, num_channels)


def build_model():
    
    model = Sequential()
    # input: 60x41 data frames with 2 channels => (60,41,2) tensors

    # filters of size 3x3 - paper describes using 5x5, but their input data is 128x128
    f_size = 3

    # Layer 1 - 24 filters with a receptive field of (f,f), i.e. W has the
    # shape (24,1,f,f).  This is followed by (4,2) max-pooling over the last
    # two dimensions and a ReLU activation function
    model.add(Convolution2D(24, f_size, f_size, border_mode='same', input_shape=(bands, frames, num_channels)))
    model.add(MaxPooling2D(pool_size=(4, 2)))
    model.add(Activation('relu'))

    # Layer 2 - 48 filters with a receptive field of (f,f), i.e. W has the 
    # shape (48, 24, f, f). Like L1 this is followed by (4,2) max-pooling 
    # and a ReLU activation function.
    model.add(Convolution2D(48, f_size, f_size, border_mode='same'))
    model.add(MaxPooling2D(pool_size=(4, 2)))
    model.add(Activation('relu'))

    # Layer 3 - 48 filters with a receptive field of (f,f), i.e. W has the
    # shape (48, 48, f, f). This is followed by a ReLU but no pooling.
    model.add(Convolution2D(48, f_size, f_size, border_mode='valid'))
    model.add(Activation('relu'))

    # flatten output into a single dimension, let Keras do shape inference
    model.add(Flatten())

    # Layer 4 - a fully connected NN layer of 64 hidden units, L2 penalty of 0.001
    model.add(Dense(64, W_regularizer=l2(0.001)))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))

    # Layer 5 - an output layer with one output unit per class, with L2 penalty, 
    # followed by a softmax activation function
    model.add(Dense(num_labels, W_regularizer=l2(0.001)))
    model.add(Dropout(0.5))
    # softmax on single output will always normalise the value to 1.0, so need sigmoid
    model.add(Activation('sigmoid'))

    # create an optimiser
    # adagrad = Adagrad(lr=0.01, epsilon=1e-08, decay=0.0)
    sgd = SGD(lr=0.001, momentum=0.9, decay=0.0, nesterov=True)

    # compile and fit model, reduce epochs if you want a result faster
    # the validation set is used to identify parameter settings (epoch) that achieves 
    # the highest classification accuracy (note binary rather than categorial crossentropy)
    model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer=sgd)
    
    return model


In [211]:
print("Building model...")
model = build_model()
model.summary()

Building model...




_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_112 (Conv2D)          (None, 60, 41, 24)        456       
_________________________________________________________________
max_pooling2d_61 (MaxPooling (None, 15, 20, 24)        0         
_________________________________________________________________
activation_87 (Activation)   (None, 15, 20, 24)        0         
_________________________________________________________________
conv2d_113 (Conv2D)          (None, 15, 20, 48)        10416     
_________________________________________________________________
max_pooling2d_62 (MaxPooling (None, 3, 10, 48)         0         
_________________________________________________________________
activation_88 (Activation)   (None, 3, 10, 48)         0         
_________________________________________________________________
conv2d_114 (Conv2D)          (None, 1, 8, 48)          20784     
__________

In [214]:
from sklearn.metrics import roc_auc_score

def evaluate(model):
    y_prob = model.predict_proba(test_x, verbose=0)
    y_pred = model.predict(test_x)
    y_true = np.argmax(test_y, 1)

    # evaluate the model
    score, accuracy = model.evaluate(test_x, test_y, batch_size=32)
    print("Accuracy = {:.2f}".format(accuracy))
    
    return roc, accuracy

In [215]:
print("Training model...")
earlystop = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='auto')
model.fit(train_x, train_y, validation_data=(dev_x, dev_y), callbacks=[earlystop], batch_size=20, epochs=3)

# now evaluate the trained model against the unseen test data
print("Evaluating model...")
roc, acc = evaluate(model)


Training model...
Train on 308 samples, validate on 308 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Evaluating model...
Accuracy = 0.80
