# Final Exercise

Use the `chatbot`, `template_matching` and `speech_processing` notebooks to create a voice activated chatbot that answers yes/know questions.

Solution:

- Use the `Bot` class and the `yes_no_processor` to get a ready made chatbot
- Create a new `speech_source` for your `Bot` instance
- Use the `AudioManager` from `speech_processing` to record audio
- Extract MFCCs for the audio clips corresponding to yes and no
- Use the `Trellis` idea from `template_matching` to recognize yes/no 

In [14]:
from collections import defaultdict

import importer
from chatbot import StatementProcessor, get_yes_no_processor, get_keyboard_source, Bot
from template_matching import Trellis
from speech_processing import AudioManager

from python_speech_features import mfcc
from python_speech_features.base import delta
import numpy as np

In [11]:
# Install python_speech_features that contains a routine to extract mfcc
!pip3 install -U python_speech_features

Collecting python_speech_features
Installing collected packages: python-speech-features
Successfully installed python-speech-features-0.5
[33mYou are using pip version 8.1.1, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [302]:
# Creating templates

samples_per_word = 8
words = ["one", "five"]

audioManager = AudioManager(chunk=128, rate=8000)
audioManager.build_silence_model(duration=5, factor=1.1)

Please stay quiet. Measuring ambient noise...
* recording
* done recording


In [303]:
def feature_extractor(samples):
    samples = np.concatenate(samples)
    samples = samples/np.abs(samples).max()
    samples = samples - samples.mean()
    mfcc_features = mfcc(samples, samplerate=8000, winlen=0.032, winstep=0.016, numcep=13, appendEnergy=True, preemph=0)
    #features = np.vstack((mfcc_features, delta(mfcc_features, 1)))
    features = mfcc_features
    return features

templates = dict.fromkeys(words)
for k, v in templates.items():
    templates[k] = []

for word in words:
    for ii in range(samples_per_word):
        print("(", (ii+1), "/", samples_per_word, ") Say", word)
        ok = False
        while not ok:
            samples = audioManager.record(2, filter_silence=True)
            audioManager.play(samples)
            ok = input("Is this ok?").lower()
            if ok == "yes" or ok=="y":
                ok = True
            else:
                ok = False
        features = feature_extractor(samples)
        templates[word].append(features)

( 1 / 8 ) Say one
Press Enter to start recording...
* recording
* done recording
before 123
after 114
Is this ok?y
( 2 / 8 ) Say one
Press Enter to start recording...
* recording
* done recording
before 125
after 112
Is this ok?y
( 3 / 8 ) Say one
Press Enter to start recording...
* recording
* done recording
before 115
after 102
Is this ok?y
( 4 / 8 ) Say one
Press Enter to start recording...
* recording
* done recording
before 124
after 109
Is this ok?y
( 5 / 8 ) Say one
Press Enter to start recording...
* recording
* done recording
before 120
after 113
Is this ok?
Press Enter to start recording...
* recording
* done recording
before 119
after 113
Is this ok?y
( 6 / 8 ) Say one
Press Enter to start recording...
* recording
* done recording
before 113
after 107
Is this ok?y
( 7 / 8 ) Say one
Press Enter to start recording...
* recording
* done recording
before 100
after 93
Is this ok?y
( 8 / 8 ) Say one
Press Enter to start recording...
* recording
* done recording
before 122
after 12

In [345]:
def scoring_func(x, y):
    #print(x.shape, y.shape)
    return np.abs(x - y).sum()

def do_word_detection(audioManager, templates):
    trellis = Trellis(match_weight=1.0, delete_weight=1.0, add_weight=1.0, scoring_func=scoring_func)
    samples = audioManager.record(2, filter_silence=True, wait_for_kb=True)
    test_features = feature_extractor(samples)
    lowest_word = None
    lowest_score = 1e9
    for word, template_features in templates.items():
        word_score = 1e9
        for template in template_features:
            score, _ = trellis.match(template, test_features, normalize_score=True)
            #if score < word_score:
            #    word_score = score
            word_score += score
        word_score = word_score / len(template_features)
        if score < lowest_score:
            lowest_word = word
            lowest_score = score
        print(word, score)
    print("Detected", lowest_word)

In [361]:
do_word_detection(audioManager, templates)
#templates["yes"][0]

Press Enter to start recording...
* recording
* done recording
before 119
after 117
five 63.5607267903
one 94.1193006599
Detected five
