# Experimento 3: HMM varios locutores

In [1]:
from os import listdir
from os.path import isdir, join
import numpy as np
import matplotlib.pyplot as plt
import python_speech_features
from scipy.io import wavfile

### Load features

In [2]:
# Open the .npz archive of precomputed features; the bare last expression
# displays the names of the arrays stored in it.
feature_sets_file = 'mfcc_sets_BD2.npz'
feature_sets = np.load(feature_sets_file)
feature_sets.files

['x', 'y', 'words']

In [3]:
# Vocabulary array: later cells look a word up as words[label], so the position
# of a word in this array is its numeric label.
words = feature_sets['words']
print(words)

['zero' 'one' 'two' 'three' 'four' 'five' 'six' 'seven' 'eight' 'nine'
 'yes' 'no' 'up' 'down' 'left' 'right' 'on' 'off' 'stop' 'go']


In [4]:
# Pull the feature tensor and the label vector out of the archive and report
# their shapes and the container type, one item per line.
x, y = feature_sets['x'], feature_sets['y']
print(x.shape, y.shape, type(x), sep='\n')

(70608, 20, 99)
(70608,)
<class 'numpy.ndarray'>


### Standardization, or mean removal and variance scaling

In [5]:
#print(x_train[0])

In [6]:
#from sklearn import preprocessing
#for i in range(len(x_train)):
    #scaler = preprocessing.StandardScaler().fit(x_train[i])
    #x_train[i] = scaler.transform(x_train[i])

In [7]:
#for i in range(len(x_test)):
    #scaler = preprocessing.StandardScaler().fit(x_test[i])
    #x_test[i] = scaler.transform(x_test[i])
#print(len(x_test))

In [8]:
#print(x_train[0])

In [9]:
#print(y_train)
#print(y_test)

In [10]:
# Split the MFCC coefficients into random train and test subsets
# (80 % / 20 %, fixed seed so the split is repeatable).
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=42,
)

print("Train files : ", len(x_train))
print("Test files : ", len(x_test))
print("y_train", y_train, sep="\n")
print("y_test", y_test, sep="\n")

Train files :  56486
Test files :  14122
y_train
[14. 12.  9. ... 15.  0.  4.]
y_test
[17. 11.  8. ...  4.  4. 15.]


## HMM

In [11]:
from hmmlearn import hmm

# Class to handle HMM processing (Python-Machine-Learning-Cookbook)
    
class HMMTrainer(object):
    """Small wrapper that owns one hmmlearn model and trains / scores it.

    Parameters
    ----------
    model_name : str
        Kind of model to build. Only ``'GaussianHMM'`` is supported.
    n_components : int
        Number of hidden states, forwarded to ``hmm.GaussianHMM``.
    cov_type : str
        Covariance type, forwarded as ``covariance_type``.
    n_iter : int
        Iteration limit, forwarded to ``hmm.GaussianHMM``.

    Raises
    ------
    TypeError
        If ``model_name`` is not ``'GaussianHMM'``.
    """

    def __init__(self, model_name='GaussianHMM', n_components=10, cov_type='diag', n_iter=100):
        self.model_name = model_name
        self.n_components = n_components
        self.cov_type = cov_type
        self.n_iter = n_iter
        # Every value returned by fit() is recorded here (see train()).
        self.models = []

        if self.model_name == 'GaussianHMM':
            self.model = hmm.GaussianHMM(n_components=self.n_components,
                    covariance_type=self.cov_type, n_iter=self.n_iter)
        else:
            raise TypeError('Invalid model type')

    def train(self, X, lengths):
        """Fit the wrapped model and record the result in ``self.models``.

        X is a 2D array with one feature vector per row (one row per frame);
        ``lengths`` gives the number of rows belonging to each individual
        sequence stacked in X. Both are passed straight to ``model.fit``.
        """
        # Floating-point warnings are silenced only for the duration of the
        # fit; a global np.seterr() would leave them disabled for every cell
        # executed afterwards.
        with np.errstate(all='ignore'):
            self.models.append(self.model.fit(X, lengths))

    def get_score(self, input_data, lengths):
        """Return ``model.score(input_data, lengths)`` for the wrapped model."""
        # Same scoped silencing as train(), so scoring stays as quiet as it
        # was when train() switched the warnings off globally.
        with np.errstate(all='ignore'):
            return self.model.score(input_data, lengths)

In [12]:
# Build, for every word, one array with the MFCC frames of all its training
# utterances concatenated, so that a single HMM can be trained per word.
#   X[label]       -> (n_utterances * n_frames, n_coefficients)
#   lengths[label] -> number of training utterances of that word
#
# Each word is gathered with one boolean mask and one reshape. Growing the
# arrays with np.append inside a per-utterance loop recopies the whole
# accumulated array on every step, which is quadratic in the number of
# utterances.
labels_train = np.asarray(y_train).astype(int)  # same truncation as int(y_train[i])
n_coefficients = x_train.shape[1]               # x_train is (utterance, coefficient, frame)

X = []
lengths = []
for label in range(len(words)):
    utterances = x_train[labels_train == label]
    # Put the frames on the rows (the transpose) and lay the utterances end
    # to end in their original training order (the reshape).
    X.append(utterances.transpose(0, 2, 1).reshape(-1, n_coefficients))
    lengths.append(len(utterances))

# Every training utterance must have landed in exactly one word bucket.
assert sum(lengths) == len(labels_train), "label outside range(len(words))"

In [13]:
# Number of training utterances collected for each word, indexed by label.
print(lengths)

[3005, 2791, 2804, 2752, 2726, 2971, 2887, 2913, 2732, 2912, 2955, 2810, 2599, 2864, 2788, 2764, 2824, 2764, 2826, 2799]


#### Train HMM models

In [14]:
# Hyperparameters of this run (same values as the HMMTrainer defaults).
N_COMPONENTS = 10
N_ITER = 100

hmm_models = []  # one (trainer, label, word) tuple per label (word)

# Every utterance contributes the same number of frames: the last axis of the
# feature tensor. Read it from the data instead of hard-coding it.
frames_per_utterance = x_train.shape[2]

for label in range(len(X)):
    word = words[label]
    # X[label] stacks lengths[label] utterances; the trainer needs the number
    # of rows of each one to know where a sequence ends and the next begins.
    lengths_in = [frames_per_utterance] * lengths[label]
    print('training word:', word, X[label].shape, lengths[label])

    # Train and keep the HMM of this word
    hmm_trainer = HMMTrainer(n_components=N_COMPONENTS, n_iter=N_ITER)
    hmm_trainer.train(X[label], lengths_in)
    hmm_models.append((hmm_trainer, label, word))

training word: zero (297495, 20) 3005
training word: one (276309, 20) 2791
training word: two (277596, 20) 2804
training word: three (272448, 20) 2752
training word: four (269874, 20) 2726
training word: five (294129, 20) 2971
training word: six (285813, 20) 2887
training word: seven (288387, 20) 2913
training word: eight (270468, 20) 2732
training word: nine (288288, 20) 2912
training word: yes (292545, 20) 2955
training word: no (278190, 20) 2810
training word: up (257301, 20) 2599
training word: down (283536, 20) 2864
training word: left (276012, 20) 2788
training word: right (273636, 20) 2764
training word: on (279576, 20) 2824
training word: off (273636, 20) 2764
training word: stop (279774, 20) 2826
training word: go (277101, 20) 2799


#### Test

In [15]:
# Classify every test utterance as the word whose HMM scores it highest.
predicted_labels = []  # collected in a list; converted to an array once at the end

for i in range(len(y_test)):

    # Frames on the rows, the same layout used for training
    mfcc_features = x_test[i].T
    lengths_in = [mfcc_features.shape[0]]  # one sequence spanning every frame

    # Best candidate so far. These start empty on every utterance rather than
    # inheriting `word` from the training cell or from the previous utterance.
    max_score = float('-inf')
    output_label = None
    output_word = None

    # Iterate through all HMM models and pick
    # the one with the highest score
    for hmm_model, label, word in hmm_models:
        score = hmm_model.get_score(mfcc_features, lengths_in)
        if score > max_score:
            max_score = score
            output_label = label
            output_word = word

    predicted_labels.append(output_label)

# Float dtype matches what appending to an empty np.array([]) produced; an
# utterance that no model could score (label None) becomes NaN.
y_pred = np.array(predicted_labels, dtype=float)

## Resultados

n_components=4, n_iter=100

In [22]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# NOTE(review): this cell only reads y_test, y_pred and words as they currently
# exist in the kernel; the hyperparameters named in the heading are not set here.
results = confusion_matrix(y_test, y_pred)
accuracy_percent = accuracy_score(y_test, y_pred) * 100
report = classification_report(y_test, y_pred, target_names=words)

print('Confusion Matrix :')
print(results)
print('Accuracy Score :', accuracy_percent)
print('Report : ')
print(report)

Confusion Matrix :
[[437   7  20  24   5  13  21  21  10  13  20  30   0  35  17  55   2   0
   10  12]
 [ 28 234   5   0  14  30   7   6   0  28   4  81  10 116  27   8  55  11
    8  29]
 [ 68  11 317  28  13  15  36  13  28   3  35  38   2  36  32  11   2   1
    1  31]
 [ 50   2  22 310   1   6  22   9  81  10  42  15   0  28  11  33   1   1
    1   6]
 [ 17  23   3   3 432   8  10   7   0   0  12  22   4  33  13  19  11  29
    3  28]
 [ 24   6   3   8   8 298   8  20   1  28  16  19  46  97  48  48  17  25
   27   9]
 [  9   0  20  15   1   9 361  31  31   2 187   1   3   8  13   4   0   6
    8   2]
 [ 29   6  16   3   3  33  65 363   2  11  68  16   0  50  31  30   2   2
   21   5]
 [ 20   0   5  83   2   5  43   2 407   3  70  14   1  18  11   3   0   2
    4   4]
 [ 19  35   5  10   2  39   7  24   5 317  13  44  10 124  23  20   6   1
    2  11]
 [ 10   3  13   9   3   6 142  34  29   3 403   3   2  22  22  13   1   5
    6   8]
 [ 57  51  15   2  14  18   7  16   0  29   9 

n_components=6, n_iter=1000

In [26]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# NOTE(review): this cell only reads y_test, y_pred and words as they currently
# exist in the kernel; the hyperparameters named in the heading are not set here.
results = confusion_matrix(y_test, y_pred)
accuracy_percent = accuracy_score(y_test, y_pred) * 100
report = classification_report(y_test, y_pred, target_names=words)

print('Confusion Matrix :')
print(results)
print('Accuracy Score :', accuracy_percent)
print('Report : ')
print(report)

Confusion Matrix :
[[488  13  29  14  10   8  22  18  14  20  10  18   9  16  15  16   1   2
    6  23]
 [  4 290   9   0  15  13   3   8   3  32   1  58  57  57  28   7  56   3
   10  47]
 [ 11  36 397  14   7   6  30  18  44  16   0  10  15  10  23  12   2   1
    1  68]
 [ 14  24  40 381   0   2  20   9  85  11  11   6   6   1   7  20   2   1
    0  11]
 [  1  35   6   0 424   8   9   2   0   5   4  18  24   5  17  16   3  30
    2  68]
 [  3  34   5   2   7 306   8  12   3  30   7   8 113  34  59  36  14  25
   24  26]
 [ 14   2  12  10   0  10 486  46  36   1  39   0   8   4  10  15   0   4
    5   9]
 [ 11   9  13   1   2  10  30 496   5  13  19   6   9  23  51  19   4   2
   17  16]
 [ 14   8  16  87   0   4  51   8 414   6  41   5   9   3  13  11   0   2
    1   4]
 [  2  68   5   3   2  23   3  24   9 344   6  36  27  61  38  20  17   1
    8  20]
 [ 18   7   7   9   0   5 107  33  35   6 428   1   8   5  48  10   1   3
    1   5]
 [  6  99  13   3  15   5   3  18   5  35   3 

n_components=10, n_iter=100

In [16]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# NOTE(review): this cell only reads y_test, y_pred and words as they currently
# exist in the kernel; the hyperparameters named in the heading are not set here.
results = confusion_matrix(y_test, y_pred)
accuracy_percent = accuracy_score(y_test, y_pred) * 100
report = classification_report(y_test, y_pred, target_names=words)

print('Confusion Matrix :')
print(results)
print('Accuracy Score :', accuracy_percent)
print('Report : ')
print(report)

Confusion Matrix :
[[500   9  43  14   9  15  17  10   5  12  16  25   1  12  14  28   2   3
   12   5]
 [  3 305  25   0  22  33   8   5   2  34   4  63  25  25  37   9  52   2
   10  37]
 [ 18  15 465  17  18  17  20  16  15   9   5  19  11   5  17  13   2   1
    4  34]
 [ 26   4  37 416   0  13  13   5  55  18  11  10   5   1  13  18   3   1
    2   0]
 [  7  28  21   2 421  13  14   4   1   2   1  31   8   4  10   8  12  46
    5  39]
 [  3  12  11   3  10 352  15  14   2  39  25  10  62  28  42  45  14  23
   37   9]
 [ 19   1  21   8   3  18 409  40  36   5  99   3   3   1  13  10   2   1
   17   2]
 [  7   6  22   2   1  20  22 484   1   7  62  16   7  14  48  13   5   2
   10   7]
 [  7   0  17  92   2  18  35   2 404  16  49   4   5   0  16  26   0   0
    4   0]
 [  6  31  19  15   6  41   5  13   8 388  17  39  13  27  36  23  18   1
    3   8]
 [ 18   5  13   2   1  13  86  32  38   9 454   4   5   9  23  14   1   2
    7   1]
 [ 10  44  33   1   9  19   9  14   4  22  12 