# SPEECH RECOGNITION USING HMM:
*Isolated words from a single speaker*

In [1]:
import os
import numpy as np
from scipy.io import wavfile
from hmmlearn import hmm
from python_speech_features import mfcc

# Class to handle HMM processing (from Python-Machine-Learning-Cookbook)
class HMMTrainer(object):
    """Build, fit and score a single hmmlearn hidden Markov model.

    Args:
        model_name: Kind of HMM to build. Only 'GaussianHMM' is supported.
        n_components: Forwarded to the model as ``n_components``.
        cov_type: Forwarded to the model as ``covariance_type``.
        n_iter: Forwarded to the model as ``n_iter``.

    Raises:
        TypeError: If ``model_name`` is not 'GaussianHMM'.
    """

    def __init__(self, model_name='GaussianHMM', n_components=4, cov_type='diag', n_iter=1000):
        # Reject unsupported model types before touching any state.
        if model_name != 'GaussianHMM':
            raise TypeError('Invalid model type')

        self.model_name = model_name
        self.n_components = n_components
        self.cov_type = cov_type
        self.n_iter = n_iter
        # Every model returned by fit() is recorded here, one per train() call.
        self.models = []

        self.model = hmm.GaussianHMM(n_components=self.n_components,
                                     covariance_type=self.cov_type,
                                     n_iter=self.n_iter)

    def train(self, X, lengths=None):
        """Fit the model on ``X`` and append the fitted model to ``self.models``.

        Args:
            X: 2D array-like with one feature vector per row. Several
                sequences may be stacked row-wise.
            lengths: Optional lengths of the individual sequences stacked in
                ``X``, forwarded to ``fit``. With the default ``None`` the
                whole of ``X`` is fit as one sequence, as before.
        """
        # Silence NumPy floating-point warnings only for the duration of the
        # fit, instead of changing the process-wide error state for good.
        with np.errstate(all='ignore'):
            self.models.append(self.model.fit(X, lengths=lengths))

    def get_score(self, input_data):
        """Return the model's score for ``input_data``.

        Args:
            input_data: 2D array-like of feature vectors, one per row.

        Returns:
            Whatever ``self.model.score`` returns for ``input_data``.
        """
        # Scoring used to run with warnings silenced (a side effect of
        # train()); keep that, but scoped to this call.
        with np.errstate(all='ignore'):
            return self.model.score(input_data)

#### Parse and split audio files

In [2]:
input_folder = "data"

# Parse the input audio files. Each sub-folder of input_folder is treated as
# one word, and every file inside it as a recording of that word.
fpaths = []       # path of every audio file
labels = []       # word (folder name) for every file, parallel to fpaths
spoken = []       # distinct words that have at least one file

# sorted() makes the listing order, and therefore the seeded split below,
# independent of the order in which the file system returns entries.
for word in sorted(os.listdir(input_folder)):
    word_dir = os.path.join(input_folder, word)
    # Skip hidden entries such as ".DS_Store" and anything that is not a
    # folder. The old test `f not in ".DS_Store"` was a substring check and
    # also dropped folders named e.g. "S" or "Store".
    if word.startswith(".") or not os.path.isdir(word_dir):
        continue
    for fname in sorted(os.listdir(word_dir)):
        if fname.startswith("."):
            continue  # hidden files are not recordings
        # Build paths from input_folder rather than a hard-coded "data/".
        fpaths.append(input_folder + "/" + word + "/" + fname)
        labels.append(word)
        if word not in spoken:
            spoken.append(word)

print('Words:', spoken)
#print(fpaths)
#print(labels)

# Split audio files into random train and test subsets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(fpaths, labels, test_size=0.33, random_state=42)
print("Train files : ",len(X_train))
#print(y_train)
print("Test files : ", len(X_test))
#print(y_test)

Words: ['apple', 'banana', 'kiwi', 'lime', 'orange', 'peach', 'pineapple']
Train files :  70
Test files :  35


#### Train HMM models

In [4]:
hmm_models = []

# Train one HMM per word. Iterate over `spoken` (the words kept while parsing)
# rather than the raw folder listing, so entries that were skipped there
# (e.g. ".DS_Store") are not trained on an empty feature matrix.
for dirname in spoken:
    print("Training word : " + dirname)

    # MFCC matrices of this word's training files, and their labels
    word_features = []
    y_words = []

    # Walk the training paths and labels in lockstep
    for filepath, label in zip(X_train, y_train):
        if label != dirname:
            continue
        #print(filepath)
        # Read the input file
        sampling_freq, audio = wavfile.read(filepath)

        # Extract MFCC features
        mfcc_features = mfcc(audio, sampling_freq, numcep=20, winlen=0.02, winstep=0.01, winfunc=np.hamming)

        word_features.append(mfcc_features)
        y_words.append(label)

    # A word can end up with no files in the training split; there is
    # nothing to fit in that case.
    if not word_features:
        print("No training files for word : " + dirname + ", skipping")
        continue

    # Stack all frames row-wise once, instead of re-copying the growing
    # array with np.append for every file.
    X = np.concatenate(word_features, axis=0)

    print("features size: ",X.shape)
    print(y_words,len(y_words))
    # Train and save HMM model
    hmm_trainer = HMMTrainer()
    hmm_trainer.train(X)
    hmm_models.append((hmm_trainer, dirname))


Training word : apple
features size:  (360, 20)
['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'apple'] 9
Training word : banana
features size:  (519, 20)
['banana', 'banana', 'banana', 'banana', 'banana', 'banana', 'banana', 'banana', 'banana', 'banana'] 10
Training word : kiwi
features size:  (361, 20)
['kiwi', 'kiwi', 'kiwi', 'kiwi', 'kiwi', 'kiwi', 'kiwi', 'kiwi', 'kiwi'] 9
Training word : lime
features size:  (470, 20)
['lime', 'lime', 'lime', 'lime', 'lime', 'lime', 'lime', 'lime', 'lime', 'lime', 'lime'] 11
Training word : orange
features size:  (936, 20)
['orange', 'orange', 'orange', 'orange', 'orange', 'orange', 'orange', 'orange', 'orange', 'orange', 'orange', 'orange'] 12
Training word : peach
features size:  (583, 20)
['peach', 'peach', 'peach', 'peach', 'peach', 'peach', 'peach', 'peach', 'peach'] 9
Training word : pineapple
features size:  (578, 20)
['pineapple', 'pineapple', 'pineapple', 'pineapple', 'pineapple', 'pineapple', 'pineapple', 'pine

#### Test HMM 

In [5]:
# Classify every held-out recording with the trained per-word models.
for test_file in X_test:
    # Load the recording and compute its MFCC matrix with the same settings
    # used for training.
    rate, signal = wavfile.read(test_file)
    features = mfcc(signal, rate, numcep=20, winlen=0.02, winstep=0.01, winfunc=np.hamming)

    # Track the best-scoring model seen so far. The strict comparison keeps
    # the first model on ties, and leaves the label as None if no score
    # exceeds -inf.
    output_label, best_score = None, float('-inf')
    for trainer, word in hmm_models:
        candidate = trainer.get_score(features)
        #print(candidate)
        if candidate > best_score:
            best_score, output_label = candidate, word

    # Report the file and the predicted word
    print("Word Test:", test_file)
    print("Word Predicted:", output_label)#, best_score)
    print("================================")

Word Test: data/kiwi/kiwi01.wav
Word Predicted: kiwi
Word Test: data/orange/orange06.wav
Word Predicted: orange
Word Test: data/orange/orange05.wav
Word Predicted: orange
Word Test: data/lime/lime09.wav
Word Predicted: lime
Word Test: data/lime/lime01.wav
Word Predicted: lime
Word Test: data/pineapple/pineapple05.wav
Word Predicted: pineapple
Word Test: data/pineapple/pineapple15.wav
Word Predicted: pineapple
Word Test: data/lime/lime03.wav
Word Predicted: lime
Word Test: data/apple/apple11.wav
Word Predicted: apple
Word Test: data/apple/apple01.wav
Word Predicted: apple
Word Test: data/banana/banana04.wav
Word Predicted: banana
Word Test: data/kiwi/kiwi02.wav
Word Predicted: kiwi
Word Test: data/peach/peach15.wav
Word Predicted: peach
Word Test: data/pineapple/pineapple07.wav
Word Predicted: pineapple
Word Test: data/peach/peach03.wav
Word Predicted: peach
Word Test: data/apple/apple05.wav
Word Predicted: apple
Word Test: data/peach/peach06.wav
Word Predicted: peach
Word Test: data/ki