# Speech recognition using HMM

In [2]:
from os import listdir
from os.path import isdir, join
import numpy as np
import matplotlib.pyplot as plt
import python_speech_features
from scipy.io import wavfile

### Load features

In [20]:
# Path to the NumPy .npz archive that holds the feature sets.
feature_sets_file = 'mfcc_sets.npz'
feature_sets = np.load(feature_sets_file)
# Last expression of the cell: list the names of the arrays stored in the archive.
feature_sets.files

['x_train', 'y_train', 'x_test', 'y_test', 'words']

In [21]:
# Array of target words from the archive; later cells index it by integer label.
words = feature_sets['words']
print(words)

['yes' 'no' 'up' 'down' 'left' 'right' 'on' 'off' 'stop' 'go' 'forward'
 'backward']


In [22]:
# Pull the train/test feature arrays and their label vectors out of the archive.
x_train, y_train = feature_sets['x_train'], feature_sets['y_train']
x_test, y_test = feature_sets['x_test'], feature_sets['y_test']

# Report the shape of each split, one per line, in train/test order.
for split_array in (x_train, y_train, x_test, y_test):
    print(split_array.shape)

(25450, 20, 99)
(25450,)
(12536, 20, 99)
(12536,)


### Standardization, or mean removal and variance scaling

In [8]:
# Peek at the first training sample as loaded from the archive.
print(x_train[0])

[[ 9.98268157e+00  9.86446075e+00  9.66131497e+00 ...  1.34171584e+01
   1.29648415e+01  1.21614665e+01]
 [-9.81641741e+00 -7.17963250e+00 -8.89570469e+00 ... -3.00880893e+01
  -3.00827968e+01 -2.63430335e+01]
 [-4.21179078e+00  2.76909162e+00 -2.63032587e+00 ...  9.40435391e+00
   1.31365694e+01  7.16339859e+00]
 ...
 [ 7.62392838e+00 -1.61575584e+00  1.03481596e+00 ...  2.96046042e+00
   2.65182215e+00 -3.02095466e+00]
 [ 4.56104028e+00  5.87183192e-01  1.09039655e+00 ... -9.90274602e-02
  -4.52956197e+00 -6.40532541e+00]
 [ 2.98161635e-02  8.76717745e-01 -2.82792954e+00 ...  7.07371179e-01
   2.24843616e+00  1.37236158e+00]]


In [9]:
from sklearn import preprocessing

# Standardize every training sample on its own, overwriting x_train in place.
# NOTE(review): each sample is handed to StandardScaler as a 2D matrix, which
# scales column by column; with (20, 99) samples that presumably normalizes
# each of the 99 columns across the 20 rows - confirm this is the intended axis.
for i, sample in enumerate(x_train):
    scaler = preprocessing.StandardScaler()
    x_train[i] = scaler.fit_transform(sample)

In [10]:
# Show the first training sample again to compare against the earlier printout.
print(x_train[0])

[[ 1.34713475  1.53554902  2.33592641 ...  1.0357664   1.17445545
   1.32814343]
 [-1.15834885 -0.94287151 -1.46875671 ... -2.11980029 -2.78226949
  -2.20435892]
 [-0.4491095   0.50379519 -0.18418724 ...  0.7447056   1.19023983
   0.86960776]
 ...
 [ 1.04864554 -0.1338155   0.56726443 ...  0.27731059  0.22653398
  -0.06473113]
 [ 0.66105134  0.1865189   0.57865993 ...  0.05539672 -0.43354315
  -0.37522205]
 [ 0.08764607  0.22862078 -0.22470125 ...  0.11388724  0.18945674
   0.33832306]]


In [14]:
from sklearn import preprocessing

# Standardize every test sample on its own, overwriting x_test in place.
# A fresh scaler is fitted per sample, so no statistics are shared between samples.
for i, sample in enumerate(x_test):
    scaler = preprocessing.StandardScaler()
    x_test[i] = scaler.fit_transform(sample)

# Number of test samples processed.
print(len(x_test))

12536


In [16]:
# Show the label vectors; later cells cast each entry to int and use it as an index into `words`.
print(y_train)
print(y_test)

[0. 8. 9. ... 3. 0. 4.]
[8. 2. 1. ... 4. 8. 8.]


## HMM

In [19]:
from hmmlearn import hmm

# Class to handle HMM processing (from Python-Machine-Learning-Cookbook)
    
class HMMTrainer(object):
    """Small wrapper that builds one hmmlearn model and trains/scores it.

    Parameters
    ----------
    model_name : str
        Kind of model to build. Only ``'GaussianHMM'`` is supported.
    n_components : int
        Forwarded to the model as ``n_components``.
    cov_type : str
        Forwarded to the model as ``covariance_type``.
    n_iter : int
        Forwarded to the model as ``n_iter``.
    random_state : int, RandomState instance or None
        Forwarded to the model as ``random_state`` so training can be made
        reproducible. The default ``None`` leaves the model unseeded.

    Raises
    ------
    TypeError
        If ``model_name`` is not a supported model type.
    """

    def __init__(self, model_name='GaussianHMM', n_components=4, cov_type='diag',
                 n_iter=1000, random_state=None):
        self.model_name = model_name
        self.n_components = n_components
        self.cov_type = cov_type
        self.n_iter = n_iter
        self.random_state = random_state
        self.models = []

        if self.model_name == 'GaussianHMM':
            self.model = hmm.GaussianHMM(n_components=self.n_components,
                    covariance_type=self.cov_type, n_iter=self.n_iter,
                    random_state=self.random_state)
        else:
            raise TypeError('Invalid model type')

    def train(self, X, lengths):
        """Fit the wrapped model and record the fit result in ``self.models``.

        X is a 2D array with one feature vector per row, holding all training
        sequences stacked one after another; ``lengths`` lists the number of
        rows belonging to each sequence.
        """
        # Silence NumPy floating-point warnings only for the duration of the
        # fit, instead of changing the global error state for the whole kernel.
        with np.errstate(all='ignore'):
            self.models.append(self.model.fit(X, lengths))

    def get_score(self, input_data, lengths):
        """Return the wrapped model's score for ``input_data``.

        ``input_data`` and ``lengths`` follow the same layout as in ``train``.
        """
        # Same scoped suppression as in train(), so scoring stays quiet
        # without relying on a leaked global setting.
        with np.errstate(all='ignore'):
            return self.model.score(input_data, lengths)

#### Train HMM models

In [23]:
# Build, for every word index, one stacked feature matrix (X) and the number
# of training samples that went into it (lengths).
#
# Each sample is transposed so that rows are frames and columns are features,
# then all samples of a class are stacked along axis 0 in their original order.
# Selecting a whole class with a boolean mask and reshaping once avoids calling
# np.append per sample, which recopies the growing matrix on every iteration.
labels_train = y_train.astype(int)

X = []
lengths = []
for label in range(len(words)):
    class_samples = x_train[labels_train == label]
    lengths.append(len(class_samples))

    if len(class_samples) == 0:
        # Keep the empty placeholder used for classes without any sample.
        X.append(np.array([]))
    else:
        # (n_samples, n_features, n_frames) -> (n_samples * n_frames, n_features)
        n_features = class_samples.shape[1]
        X.append(class_samples.transpose(0, 2, 1).reshape(-1, n_features))

In [25]:
# Number of training samples per label, in label order.
print(lengths)

[2457, 2359, 2156, 2402, 2417, 2285, 2361, 2290, 2393, 2334, 965, 1031]


In [26]:
hmm_models = []  # one (trainer, label, word) entry per label (word)

for label in range(len(X)):
    word = words[label]

    # All samples of a class contribute the same number of rows to X[label],
    # so derive the per-sequence length from the data instead of hard-coding it.
    n_sequences = lengths[label]
    n_frames = X[label].shape[0] // n_sequences if n_sequences else 0
    lengths_in = [n_frames] * n_sequences
    print('training word:', word, X[label].shape, lengths[label])

    # Train a fresh model for this word and keep it together with its label.
    hmm_trainer = HMMTrainer()
    hmm_trainer.train(X[label], lengths_in)
    hmm_models.append((hmm_trainer, label, word))

training word: yes (243243, 20) 2457
training word: no (233541, 20) 2359
training word: up (213444, 20) 2156
training word: down (237798, 20) 2402
training word: left (239283, 20) 2417
training word: right (226215, 20) 2285
training word: on (233739, 20) 2361
training word: off (226710, 20) 2290
training word: stop (236907, 20) 2393
training word: go (231066, 20) 2334
training word: forward (95535, 20) 965
training word: backward (102069, 20) 1031


#### Test

In [27]:
# Classify every test sample by scoring it against each trained word model
# and keeping the label of the highest-scoring one.
predicted_labels = []

for i in range(len(y_test)):
    # Transpose so rows are frames, matching the layout used for training.
    mfcc_features = x_test[i].T

    # A single sequence whose length is taken from the sample itself.
    lengths_in = [mfcc_features.shape[0]]

    max_score = float('-inf')
    output_label = None

    # Strict '>' keeps the first model on ties; a NaN score never wins.
    for hmm_model, label, _word in hmm_models:
        score = hmm_model.get_score(mfcc_features, lengths_in)
        if score > max_score:
            max_score = score
            output_label = label

    predicted_labels.append(output_label)

# Build the prediction vector once; a sample for which no model produced a
# comparable score (output_label is None) becomes NaN in the float array.
y_pred = np.array(predicted_labels, dtype=float)

In [29]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Confusion matrix of true labels (y_test) against predicted labels (y_pred).
results = confusion_matrix(y_test, y_pred)
print('Confusion Matrix :')
print(results)

# Overall accuracy, expressed as a percentage.
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print('Accuracy Score :', accuracy_pct)

# Per-class metrics, with rows named after the words.
print('Report : ')
report_text = classification_report(y_test, y_pred, target_names=words)
print(report_text)

Confusion Matrix :
[[870   9   2  53 108  95   8   3  64  10   3  10]
 [ 18 451  28 177  63  59  94  13  11 176  27  69]
 [  1  69 567 123  66  13  40  65  87  31   6  45]
 [  7 176  40 440  82  95 131   8  31  55   5 108]
 [ 84  43  49  99 523  92  26  29  56  23   8  53]
 [ 54  60  13 149 122 519  14   6  25  51  23 127]
 [  2 131  77 177  30  20 506  61  19  29  27  32]
 [  9  29 106  66  42   6  73 645  74  20  55  12]
 [ 56  10  57  64  54  35  23  85 739  16  20  11]
 [ 11 243  45 129  59  69  50  34  21 297 102  84]
 [  2  19   5  13  10  10  15  11   2  35 333  32]
 [  5  36   9  47  30  52  12   3   4  21  22 286]]
Accuracy Score : 49.266113592852584
Report : 
              precision    recall  f1-score   support

         yes       0.78      0.70      0.74      1235
          no       0.35      0.38      0.37      1186
          up       0.57      0.51      0.54      1113
        down       0.29      0.37      0.32      1178
        left       0.44      0.48      0.46      10