In [12]:
import numpy as np
import librosa
from matplotlib import pyplot as plt
from scipy.io import wavfile
import scipy
import os

In [13]:
# Index the dataset, which is expected on disk as audio/<word>/<clip>.
#   fpaths[i] - path of clip i
#   labels[i] - name of the directory clip i was found in
#   spoken    - each distinct directory name, in first-seen order
AUDIO_ROOT = 'audio'
fpaths = []
labels = []
spoken = []
for f in os.listdir(AUDIO_ROOT):
    word_dir = os.path.join(AUDIO_ROOT, f)
    if not os.path.isdir(word_dir):
        # A stray file next to the word folders would make the inner
        # os.listdir() raise, so skip anything that is not a directory.
        continue
    for w in os.listdir(word_dir):
        fpaths.append(os.path.join(word_dir, w))
        labels.append(f)
        if f not in spoken:
            spoken.append(f)
print('Words spoken:', spoken)

# Encode every label as its position in `spoken`, so all_labels[i] == k means
# the clip's word is spoken[k]. Enumerating set(labels) instead would give a
# word -> code mapping that depends on string hash order, which can change
# between kernel restarts.
label_index = {word: k for k, word in enumerate(spoken)}
all_labels = np.array([label_index[l] for l in labels], dtype=float)

Words spoken: ['lime', 'apple', 'kiwi', 'pineapple', 'orange', 'banana', 'peach']


In [14]:
# Turn every clip into a fixed-size (n_mfcc, max_frames) MFCC matrix.
# Clips with fewer than max_frames frames keep the zero padding that `data`
# was initialised with; clips with more frames are cut to max_frames.
n_mfcc = 6
max_frames = 38
data = np.zeros((len(fpaths), n_mfcc, max_frames))
maxsize = -1  # not read by any cell visible here; kept in case later cells use it
n_truncated = 0
for n, file in enumerate(fpaths):
    X, sample_rate = librosa.load(file, res_type='kaiser_fast')
    mfccs = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=n_mfcc)
    # Assigning all of `mfccs` into a max_frames-wide slot raises a broadcast
    # error as soon as one clip is longer than max_frames, so clip it first.
    n_frames = min(mfccs.shape[1], max_frames)
    if mfccs.shape[1] > max_frames:
        n_truncated += 1
    data[n, :, :n_frames] = mfccs[:, :n_frames]
if n_truncated:
    print('Clips truncated to %d frames: %d' % (max_frames, n_truncated))

In [15]:
# sklearn.cross_validation is gone from current scikit-learn releases; the
# splitter now lives in sklearn.model_selection and takes the labels through
# .split() instead of the constructor.
from sklearn.model_selection import StratifiedShuffleSplit

SPLIT_SEED = 0  # fixed so the train/test partition is the same on every run

# Scale each frame (column) of every clip by (sum of its coefficients + 1).
# NOTE: this updates `data` in place, so running this cell a second time
# without re-running the feature-extraction cell normalises twice.
data /= data.sum(axis=1, keepdims=True) + 1

# A single stratified 80/20 split. Only the labels drive the split, so a
# placeholder array of the right length stands in for X.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=SPLIT_SEED)
train_index, test_index = next(sss.split(np.zeros(len(all_labels)), all_labels))
X_train, X_test = data[train_index, ...], data[test_index, ...]
y_train, y_test = all_labels[train_index], all_labels[test_index]
print('Size of training matrix:', X_train.shape)
print('Size of testing matrix:', X_test.shape)

Size of training matrix: (84, 6, 38)
Size of testing matrix: (21, 6, 38)


In [16]:
from gmmhmm import gmmhmm

# One model per class. `ys` is the sorted array of distinct label values, so
# row k of `res` below always belongs to class ys[k].
ys = np.unique(all_labels)
# The argument 6 is passed straight to the gmmhmm constructor; its meaning is
# defined in the gmmhmm module, which is not visible here.
ms = [gmmhmm(6) for y in ys]
for m, y in zip(ms, ys):
    # Fit each model on the training clips of its own class only.
    m.fit(X_train[y_train == y, :, :])

# Score the test clips under every model and stack the results, one row per
# model (assumes transform() returns one score per test clip - TODO confirm).
ps = [m.transform(X_test) for m in ms]
res = np.vstack(ps)

# argmax yields a row index into `ms`, not a label value; translate it back
# through `ys` before comparing with y_test.
predicted_labels = ys[np.argmax(res, axis=0)]
missed = (predicted_labels != y_test)
print('Test accuracy: %.2f percent' % (100 * (1 - np.mean(missed))))

Test accuracy: 66.67 percent
