In [1]:
import numpy as np
import matplotlib.pyplot as plt
import librosa
from librosa.feature import mfcc
from scipy.io import wavfile
from hmmlearn import hmm
import numpy as np
import os
import warnings
import scipy.stats as sp

In [2]:
warnings.filterwarnings("ignore")

fpaths = []
labels = []
spoken = []
features = []

for f in os.listdir('audio'):
    for w in os.listdir('audio/' + f):
        fpaths.append('audio/' + f + '/' + w)
        labels.append(f)
        if f not in spoken:
            spoken.append(f)
print('Words spoken:', spoken)

Words spoken: ['lime', 'apple', 'kiwi', 'pineapple', 'orange', 'banana', 'peach']


In [3]:
features = []
for n, file in enumerate(fpaths):
    d, sample_rate = librosa.load(file, res_type='kaiser_fast')
    data = np.zeros((6,38))
    mfccs = mfcc(d, sr=sample_rate, n_mfcc= 6)
    data[:,0:mfccs.shape[1]] = mfccs
    features.append(data)

In [4]:
c = list(zip(features, labels))
np.random.shuffle(c)
features, labels = zip(*c)
m_trainingsetfeatures = features[0:84]
m_trainingsetlabels = labels[0:84]
m_testingsetfeatures = features[84:105]
m_testingsetlabels = labels[84:105]

In [5]:
gmmhmmindexdict = {}
index = 0
for word in spoken:
    gmmhmmindexdict[word] = index
    index = index +1

#Parameters needed to train GMMHMM
m_num_of_HMMStates = 3  # number of states
m_num_of_mixtures = 2  # number of mixtures for each hidden state
m_covarianceType = 'diag'  # covariance type
m_n_iter = 10  # number of iterations
m_bakisLevel = 2

In [6]:
def initByBakis(inumstates, ibakisLevel):
    startprobPrior = np.zeros(inumstates)
    startprobPrior[0: ibakisLevel - 1] = 1/float((ibakisLevel - 1))
    transmatPrior = getTransmatPrior(inumstates, ibakisLevel)
    return startprobPrior, transmatPrior


def getTransmatPrior(inumstates, ibakisLevel):
    transmatPrior = (1 / float(ibakisLevel)) * np.eye(inumstates)

    for i in range(inumstates - (ibakisLevel - 1)):
        for j in range(ibakisLevel - 1):
            transmatPrior[i, i + j + 1] = 1. / ibakisLevel

    for i in range(inumstates - ibakisLevel + 1, inumstates):
        for j in range(inumstates - i - j):
            transmatPrior[i, i + j] = 1. / (inumstates - i)

    return transmatPrior

In [7]:
m_startprobPrior ,m_transmatPrior = initByBakis(m_num_of_HMMStates,m_bakisLevel)

In [8]:
class SpeechModel:
    def __init__(self,Class,label):
        self.traindata = np.zeros((0,6))
        self.Class = Class
        self.label = label
        self.traindata = None
        self.model  = hmm.GMMHMM(n_components = m_num_of_HMMStates, n_mix = m_num_of_mixtures, \
                           transmat_prior = m_transmatPrior, startprob_prior = m_startprobPrior, \
                                        covariance_type = m_covarianceType, n_iter = m_n_iter)


In [9]:
speechmodels = [None] * 7


for key in gmmhmmindexdict:
    speechmodels[gmmhmmindexdict[key]] = SpeechModel(gmmhmmindexdict[key],key)

for i in range(0,len(m_trainingsetfeatures)):
     for j in range(0,len(speechmodels)):
        if int(speechmodels[j].Class) == int(gmmhmmindexdict[m_trainingsetlabels[i]]):
            if speechmodels[j].traindata is not None:
                speechmodels[j].traindata = np.concatenate((speechmodels[j].traindata,
                                                        m_trainingsetfeatures[i]))
            else:
                speechmodels[j].traindata =  m_trainingsetfeatures[i]
                
            #speechmodels[j].traindata = m_trainingsetfeatures[i]

In [10]:
for speechmodel in speechmodels:
    print(speechmodel)
    speechmodel.model.fit(speechmodel.traindata)

<__main__.SpeechModel object at 0x7ff76efffeb8>


ValueError: 

In [15]:
m_PredictionlabelList = []

for i in range(0,len(m_testingsetfeatures)):
    scores = []
    for speechmodel in speechmodels:
         scores.append(speechmodel.model.score(m_testingsetfeatures[i]))
    id  = scores.index(max(scores))
    m_PredictionlabelList.append(speechmodels[id].Class)
    print(str(np.round(scores, 3)) + " " + str(max(np.round(scores, 3))) +" "+":"+ speechmodels[id].label)

accuracy = 0.0
count = 0


print("")
print("Prediction for Testing DataSet:")

for i in range(0,len(m_testingsetlabels)):
    print( "Label"+str(i+1)+":"+m_testingsetlabels[i])
    if gmmhmmindexdict[m_testingsetlabels[i]] == m_PredictionlabelList[i]:
       count = count+1

accuracy = 100.0*count/float(len(m_testingsetlabels))

print("")
print("accuracy ="+str(accuracy))
print("")

ValueError: The shape of X  is not compatible with self

In [11]:
import numpy as np
import librosa
from matplotlib import pyplot as plt
from scipy.io import wavfile
import scipy
import os

In [13]:
fpaths = []
labels = []
spoken = []
for f in os.listdir('audio'):
    for w in os.listdir('audio/' + f):
        fpaths.append('audio/' + f + '/' + w)
        labels.append(f)
        if f not in spoken:
            spoken.append(f)
print('Words spoken:', spoken)
all_labels = np.zeros(len(labels))
for n, l in enumerate(set(labels)):
    all_labels[np.array([i for i, _ in enumerate(labels) if _ == l])] = n

Words spoken: ['lime', 'apple', 'kiwi', 'pineapple', 'orange', 'banana', 'peach']


In [14]:
n_mfcc = 40
max_frames = 40
data = np.zeros((len(fpaths), n_mfcc, max_frames))
maxsize = -1
for n, file in enumerate(fpaths):
    X, sample_rate = librosa.load(file, res_type='kaiser_fast') 
    mfccs = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=n_mfcc)
    data[n,:,0:mfccs.shape[1]] = mfccs

In [17]:
from sklearn.cross_validation import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(all_labels, test_size=0.2)
for n,i in enumerate(data):
    data[n] /= (data[n].sum(axis=0)+1)

for train_index, test_index in sss:
    X_train, X_test = data[train_index, ...], data[test_index, ...]
    y_train, y_test = all_labels[train_index], all_labels[test_index]
print('Size of training matrix:', X_train.shape)
print('Size of testing matrix:', X_test.shape)

Size of training matrix: (84, 40, 40)
Size of testing matrix: (21, 40, 40)


In [18]:
from gmmhmm import gmmhmm
ys = set(all_labels)
ms = [gmmhmm(6) for y in ys]
_ = [m.fit(X_train[y_train == y, :, :]) for m, y in zip(ms, ys)]
ps = [m.transform(X_test) for m in ms]
res = np.vstack(ps)
predicted_labels = np.argmax(res, axis=0)
missed = (predicted_labels != y_test)
print('Test accuracy: %.2f percent' % (100 * (1 - np.mean(missed))))

Test accuracy: 85.71 percent
