# Lab 10: Keyword Recognition

## Edward Passagi (passagi2)

In [1]:
# Library Import and basic function definition
from scipy.io import wavfile

import random

import scipy
import scipy.spatial.distance as dis
import matplotlib.pyplot as plt
import scipy.signal as signal
import numpy as np
import IPython.display as ipd
import librosa

from tqdm.auto import tqdm, trange

from tqdm import tqdm # show progress bar, low overhead

# Print Sound
def sound(x, rate=8000, label=''):
    """Show an inline audio player for a signal, optionally next to a text label.

    Parameters
    ----------
    x : samples handed to ``IPython.display.Audio``.
    rate : sample rate passed to ``Audio`` (default 8000).
    label : optional caption. When empty, only the player is displayed;
        otherwise the caption and the player are rendered side by side
        in a borderless HTML table.
    """
    from IPython.display import display, Audio, HTML

    player = Audio(x, rate=rate)
    # Test by truthiness: `label is ''` compares object identity, which is
    # implementation-dependent for strings and emits a SyntaxWarning on 3.8+.
    if not label:
        display(player)
    else:
        # [3:] drops the first three characters of the player's HTML repr
        # (presumably leading whitespace -- TODO confirm).
        display(HTML(
            '<style> table, th, td {border: 0px; }</style> <table><tr><td>' + label +
            '</td><td>' + player._repr_html_()[3:] + '</td></tr></table>'
        ))

### Fetch audio data

In [2]:
# sr = 44100
# Read each recording once; wavfile.read returns (sample_rate, samples).
sr, template_wav = wavfile.read("./digits_samples/template.wav")
test_sr, test_wav = wavfile.read("./digits_samples/test.wav")

# take L channel
template = np.array(template_wav[:, 0], dtype=float)
test = np.array(test_wav[:, 0], dtype=float)

# find MFCC for both sets
# (audio and sample rate are passed by keyword: they are keyword-only in librosa >= 0.10)
templateMFCC = librosa.feature.mfcc(y=template, sr=sr, n_mfcc=50)
testMFCC = librosa.feature.mfcc(y=test, sr=sr, n_mfcc=50)


  
  """
  


### Parse to each digits

In [3]:
# Split the template recording into 10 near-equal clips and the test
# recording into 110. The clips are kept as plain lists: wrapping the result
# of array_split in np.array() fails on recent NumPy when the clips differ
# in length (ragged input).
tempDigs = np.array_split(template, 10)
testDigs = np.array_split(test, 110)

# One MFCC matrix per clip (keyword arguments are required by librosa >= 0.10)
tempMFList = [librosa.feature.mfcc(y=clip, sr=sr, n_mfcc=50) for clip in tempDigs]
testMFList = [librosa.feature.mfcc(y=clip, sr=sr, n_mfcc=50) for clip in testDigs]


#### The true digit of each test clip is its index modulo 10 (`testIndex % 10`)

In [4]:
# Play every other template clip (indices 0, 2, 4, 6, 8)
print("template digits")
for clip_idx in range(0, 10, 2):
    sound(tempDigs[clip_idx], sr, "number: " + str(clip_idx))

# Play every other test clip from index 90 up to 98
print("test digits")
for clip_idx in range(90, 100, 2):
    sound(testDigs[clip_idx], sr, "number: " + str(clip_idx))

template digits


0,1
number: 0,Your browser does not support the audio element.


0,1
number: 2,Your browser does not support the audio element.


0,1
number: 4,Your browser does not support the audio element.


0,1
number: 6,Your browser does not support the audio element.


0,1
number: 8,Your browser does not support the audio element.


test digits


0,1
number: 90,Your browser does not support the audio element.


0,1
number: 92,Your browser does not support the audio element.


0,1
number: 94,Your browser does not support the audio element.


0,1
number: 96,Your browser does not support the audio element.


0,1
number: 98,Your browser does not support the audio element.


## Part 1: Making a Digit Recognizer

In this section we will design a simple spoken digit recognizer, based on Dynamic Time Warping (DTW). In order to make such a system we need to first collect some data, and then design a DTW routine that can compare new inputs with templates for each digit.

To start with make a set of data that will be used here. Make a dozen or so recordings of yourself speaking each of the ten digits (0 to 9). We will use one recording from each digit as the template, and the rest as testing data. In order to not spend too much time collecting the data, record all these utterances in a single (long) sound file. Use your voice activity detector to split that file into the individual spoken digits.

In order to design a digit recognizer we will take a spoken input of a digit and compare it to each digit’s template. By finding which template is the most similar we can classify the input as belonging to that template’s digit. In order to measure the distance between the two sequences we have to use DTW on an appropriate feature space.

Decide which feature to use to represent your speech signals. It can be any feature that we used in the past (e.g. some type of an STFT, MFCCs, etc). When comparing a template with a new input you need to perform the following steps:

1. Compute the distance matrix between all the features of each input. This will be a $M$ by $N$ matrix in which the $(i, j)$ element will represent the distance between the $i$-th frame of the template and the $j$-th frame of the input. We will use the cosine distance which is defined as:

$$D(\mathbf{a},\mathbf{b}) = 1 - \frac{\sum_i a_i b_i}{\sqrt{\sum_i a_i^2}\,\sqrt{\sum_i b_i^2}}$$

2. Once you obtain the distance matrix, you need to compute the cost matrix that encodes the cost of passing through a node given a previously optimal path. We will use the local constraint that to reach node $(i, j)$ you can either come from nodes $(i-1, j-1)$, $(i, j-1)$ or $(i-1, j)$.

3. Starting from the first element of the matrix, $(1, 1)$, perform the following steps for each element of the cost matrix. For node $(i, j)$ you need to examine the nodes from which you can reach it – these will be nodes $(i-1, j-1)$, $(i, j-1)$ or $(i-1, j)$ – and see which one has the lowest cost. Therefore, reaching that node from the optimal path will have the cost of the optimal preceding node plus the distance that corresponds to being at node $(i, j)$. Iterate until you calculate the cost of passing through every node. As you do that, for each node keep track of which of the three preceding nodes was the optimal one.

4. Now you can backtrack and find the optimal path. Start from the final point of the cost matrix and find the node from which you arrived there (it will be the same one that had the lowest cost above). Once you get to that node, repeat this process until you reach the beginning indexes of the two sequences. The path that you took in this process will be the optimal path that aligns the two sequences.

5. The distance between the two sequences will be the cost of being at the final node. Use this to perform the digit classification.

In [5]:
def cost_mat(D, anchor_start=False):
    """Accumulate a DTW cost matrix from a local-distance matrix.

    Each interior cell (i, j) becomes its own distance plus the cheapest of
    its three predecessors: left (i, j-1), diagonal (i-1, j-1) and up (i-1, j).
    NaN predecessors are ignored by ``np.nanmin``.

    Parameters
    ----------
    D : 2-D array of local distances. It is not modified; a copy is returned.
    anchor_start : bool, default False
        False keeps the first row and first column as plain local distances,
        so a path may begin anywhere along those edges at no extra cost.
        True makes them cumulative sums, the standard DTW boundary condition
        where every path starts at cell (0, 0).

    Returns
    -------
    Array of the same shape as ``D`` holding the accumulated costs.
    """
    C = D.copy()
    n_rows, n_cols = C.shape

    if anchor_start:
        # Edge cells can only be reached by walking along the edge from (0, 0).
        C[0, :] = np.cumsum(C[0, :])
        C[:, 0] = np.cumsum(C[:, 0])

    for i in range(1, n_rows):
        for j in range(1, n_cols):
            # cheapest of the three allowed predecessors (W, NW, N)
            best_prev = np.nanmin([C[i, j-1], C[i-1, j-1], C[i-1, j]])
            C[i, j] = C[i, j] + best_prev
    return C

def D_mat(a,b):
    """Pairwise cosine-distance matrix between the frames of two feature matrices.

    Parameters
    ----------
    a, b : 2-D arrays whose columns are frames (feature vectors); both must
        have the same number of rows (features).

    Returns
    -------
    Array of shape (frames in a, frames in b) where entry (i, j) is the cosine
    distance between column i of ``a`` and column j of ``b``.
    """
    # One vectorised cdist call replaces a Python-level dis.cosine call per
    # frame pair, which dominated the run time of the classifier.
    return dis.cdist(a.T, b.T, metric='cosine')

def classify(inputMFCC, templateMFCC, window = 40):
    """Return the index of the template that aligns most cheaply with the input.

    For every template the cosine-distance matrix and its accumulated cost
    matrix are built, and the template's score is the smallest accumulated
    cost found on the last column (rows ``window`` onwards) or on the last
    row (columns ``window`` onwards).

    Parameters
    ----------
    inputMFCC : 2-D feature matrix of the clip to classify (frames in columns).
    templateMFCC : sequence of 2-D feature matrices, one per class.
    window : first row/column index considered on the final edges (default 40).
        It is clamped so that the final cell is always included; without the
        clamp a clip with ``window`` frames or fewer gave an empty slice and
        ``min()`` raised ValueError.

    Returns
    -------
    Index into ``templateMFCC`` of the lowest-scoring template (``np.argmin``).
    """
    retval = np.zeros(len(templateMFCC))

    for i, templateFrame in enumerate(templateMFCC):
        C = cost_mat(D_mat(inputMFCC, templateFrame))

        # Keep each edge slice non-empty for short clips.
        row_start = min(window, C.shape[0] - 1)
        col_start = min(window, C.shape[1] - 1)

        # cheapest end point on either the last column or the last row
        retval[i] = min(min(C[row_start:, -1]), min(C[-1, col_start:]))
    return np.argmin(retval)


In [6]:
# Classify one test clip and compare against its true digit (index mod 10)
testIndex = 93
actual = testIndex%10

predicted = classify(testMFList[testIndex], tempMFList)

print("Actual: {}, Predicted: {}".format(actual, predicted))
for clip, caption in ((testDigs[testIndex], "Actual digit"),
                      (tempDigs[predicted], "Template digit")):
    sound(clip, sr, caption)

Actual: 3, Predicted: 3


0,1
Actual digit,Your browser does not support the audio element.


0,1
Template digit,Your browser does not support the audio element.


In [8]:
# Run the classifier over all 110 test clips, tallying hits per digit
correct = np.zeros(10)
for i in tqdm(range(0,110)):
    digit = i%10
    guessedval = classify(testMFList[i], tempMFList)
    if guessedval == digit:
        correct[digit] += 1

  5%|████▍                                                                             | 6/110 [00:12<03:39,  2.11s/it]

KeyboardInterrupt: 

In [None]:
## Data Summary
totalCorrectDigit = int(np.sum(correct))
overallPct = totalCorrectDigit/110*100
wrongCount = 110-totalCorrectDigit

print("Data Summary:\n")
print("Total Accuracy: {}%, Correct Guesses: {}, False Guesses: {}\n".format(overallPct, totalCorrectDigit, wrongCount))

# per-digit accuracy: each digit appears 11 times in the test set
for idx in range(len(correct)):
    print("Digit {} Accuracy: {}%".format(idx, correct[idx]/11*100) )

## Part 2. Making a voice-driven dialer

Suppose you just started working for a phone company and the first thing they ask you is to make a hands-free interface for their phones so that people can dial in their friends by voice. During setup, the users speak the name of a contact and then associate it with a number to call. Make a system for which you use the full name of 4-5 of your friends, so that when you speak their name the system recognizes it (and thus could subsequently call their number).

### Fetch Audio Data

In [None]:
# sr = 44100
# Read each recording once; wavfile.read returns (sample_rate, samples).
sr, input_wav = wavfile.read("./voice_dialler/input.wav")
names_sr, names_wav = wavfile.read("./voice_dialler/names.wav")

# take L channel
tempVD = np.array(input_wav[:, 0], dtype=float)
testVD = np.array(names_wav[:, 0], dtype=float)


In [9]:
# Play back both raw recordings before splitting them
print("Input data:")
for recording, caption in ((tempVD, "Input template"), (testVD, "Test names")):
    sound(recording, sr, caption)

Input data:


NameError: name 'tempVD' is not defined

#### Sample Contact List:
| Names | Phone Number |
| --- | --- |
| Furkan | 1379 |
| Simon | 5240 |
| Mohamed | 6683 |
| Edward | 7134 |
| Amir | 9523 |

### Parsing names and numbers

In [None]:
# Number of contacts handled by the dialer
recipientNum = 5
# Number of digits each phone-number recording is split into
phoneDigitsAmt = 4

In [None]:
# Split the setup recording into 10 near-equal chunks. The chunks are kept as
# a plain list: np.array() over array_split output fails on recent NumPy when
# the chunks differ in length.
tempMFListVD = []  # not populated in this cell
tempWAV = np.array_split(tempVD, 10)

# Split the spoken-name test recording into 10 chunks and compute one MFCC
# matrix per chunk (keyword arguments are required by librosa >= 0.10).
testWAV = np.array_split(testVD, 10)
testMFListVD = [librosa.feature.mfcc(y=chunk, sr=sr, n_mfcc=50) for chunk in testWAV]


In [None]:
# Play every chunk of the setup recording
print("template chunks")
for chunk_idx, chunk in enumerate(tempWAV[:10]):
    sound(chunk, sr, "chunk: " + str(chunk_idx))

# Play every chunk of the spoken-name test recording
print("test names")
for chunk_idx, chunk in enumerate(testWAV[:10]):
    sound(chunk, sr, "chunk: " + str(chunk_idx))

### Group template names and parse each phone number

In [None]:
# Even-indexed chunks hold names: keep only the first quarter of each.
tempNames = [np.array_split(tempWAV[k], 4)[0] for k in range(0, 10, 2)]
# Odd-indexed chunks hold the phone-number recordings, kept whole.
phoneNumber = [tempWAV[k] for k in range(1, 10, 2)]

In [None]:
# One MFCC matrix per contact-name template.
# Audio and sample rate are passed by keyword (keyword-only in librosa >= 0.10).
tempNamesMF = [
    librosa.feature.mfcc(y=tempNames[i], sr=sr, n_mfcc=50)
    for i in range(recipientNum)
]

In [None]:
# Cut each contact's phone-number recording into one clip per digit
phoneDigs = [
    np.array_split(phoneNumber[contact], phoneDigitsAmt)
    for contact in range(recipientNum)
]

In [None]:
# Recognise every spoken digit of every contact's phone number by matching it
# against the digit templates from Part 1.
phoneNumArr = np.zeros((recipientNum,phoneDigitsAmt))

for i in trange(recipientNum, desc='recipients'):
    for j in tqdm(range(phoneDigitsAmt), desc='digits'):
        # MFCC of the current digit clip (keyword arguments are required by librosa >= 0.10)
        curDigMFCC = librosa.feature.mfcc(y=phoneDigs[i][j], sr=sr, n_mfcc=50)
        curDigit = classify(curDigMFCC, tempMFList)
        phoneNumArr[i][j]=curDigit

In [None]:
# Join each contact's recognised digits into a single string
phoneNumStr = [
    "".join(str(int(phoneNumArr[i][j])) for j in range(phoneDigitsAmt))
    for i in range(recipientNum)
]

It correctly identifies the number 90% of the time.

In [None]:
# Display the recognised phone-number string of every contact
phoneNumStr

In [None]:
# Contact names, indexed in parallel with tempNames / tempNamesMF / phoneNumStr
names = ["Furkan", "Simon","Mohamed","Edward","Amir"]

### Testing the feature

In [None]:
# Testing to call "Mohamed", phone number 6683
testIdx = 2
window = 40

# Play the same clip that is classified below (index testIdx, not testIdx%5),
# so the audio and the prediction always refer to one recording.
title = "input: "+ names[testIdx%5]
sound(testWAV[testIdx], sr, title)
guessedNameIdx = classify(testMFListVD[testIdx], tempNamesMF, window)

print("matches with:")
title = "template: "+ names[guessedNameIdx]
sound(tempNames[guessedNameIdx], sr, title)
print("Dialling {}, with phone number: {}".format(names[guessedNameIdx], phoneNumStr[guessedNameIdx]))

In [None]:
# Testing to call "Furkan", phone number 1379
testIdx = 0
window = 40

# Play the same clip that is classified below (index testIdx, not testIdx%5),
# so the audio and the prediction always refer to one recording.
title = "input: "+ names[testIdx%5]
sound(testWAV[testIdx], sr, title)
guessedNameIdx = classify(testMFListVD[testIdx], tempNamesMF, window)

print("matches with:")
title = "template: "+ names[guessedNameIdx]
sound(tempNames[guessedNameIdx], sr, title)
print("Dialling {}, with phone number: {}".format(names[guessedNameIdx], phoneNumStr[guessedNameIdx]))

### Voice Detection accuracy

In [None]:
# Tally correct name matches per contact over the 10 test chunks;
# the expected contact of chunk i is i mod 5.
correctVD = np.zeros(5)

for i in trange(10, desc='Test Set'):
    expectedIdx = i%5
    guessedNameIdx = classify(testMFListVD[i], tempNamesMF, window)
    if guessedNameIdx == expectedIdx:
        correctVD[expectedIdx] += 1

In [None]:
## Data Summary
totalVDCorrect = int(np.sum(correctVD))
overallVDPct = totalVDCorrect/10*100
wrongVD = 10-totalVDCorrect

print("Data Summary:\n")
print("Accuracy: {}%, Correct Guesses: {}, False Guesses: {}\n".format(overallVDPct, totalVDCorrect, wrongVD))

# per-name accuracy: each name is tested twice
for idx in range(len(correctVD)):
    print("Name {} Accuracy: {}%".format(names[idx%5], correctVD[idx]/2*100) )