In [None]:
"""
Please run notebook locally (if you have all the dependencies and a GPU). 
Technically you can run this notebook on Google Colab but you need to set up microphone for Colab.
 
Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
5. Set up microphone for Colab
"""
# If you're using Google Colab and not running locally, run this cell.

## Install dependencies
!pip install wget
!apt-get install sox libsndfile1 ffmpeg portaudio19-dev
!pip install unidecode
!pip install pyaudio

# ## Install NeMo
BRANCH = 'r1.0.0rc1'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]

In [15]:
## Install TorchAudio
!pip install torchaudio>=0.6.0 -f https://download.pytorch.org/whl/torch_stable.html


This notebook demonstrates offline and online (from a microphone's stream in NeMo) speech commands recognition 

It is **not a recommended** way to do inference in production workflows. If you are interested in 
production-level inference using NeMo ASR models, please sign-up to Jarvis early access program: https://developer.nvidia.com/nvidia-jarvis

The notebook requires PyAudio library to get a signal from an audio device.
For Ubuntu, please run the following commands to install it:
```
sudo apt-get install -y portaudio19-dev
pip install pyaudio
```

This notebook requires the `torchaudio` library to be installed for MatchboxNet. Please follow the instructions available at the [torchaudio Github page](https://github.com/pytorch/audio#installation) to install the appropriate version of torchaudio.

If you would like to install the latest version, please run the following command to install it:

```
conda install -c pytorch torchaudio
```

In [1]:
import numpy as np
import pyaudio as pa
import os, time
import librosa
import IPython.display as ipd
import matplotlib.pyplot as plt
%matplotlib inline

import nemo
import nemo.collections.asr as nemo_asr

[NeMo W 2021-03-25 02:57:25 experimental:28] Module <class 'nemo.collections.asr.models.clustering_diarizer.ClusteringDiarizer'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2021-03-25 02:57:26 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################



In [2]:
# sample rate, Hz
SAMPLE_RATE = 16000

## Restore the model from NGC

In [3]:
for model in nemo_asr.models.EncDecClassificationModel.list_available_models():
    print(f'{model.pretrained_model_name}: {model.description}')

MatchboxNet-3x1x64-v1: MatchboxNet model trained on Google Speech Commands dataset (v1, 30 classes) which obtains 97.32% accuracy on test set.
MatchboxNet-3x2x64-v1: MatchboxNet model trained on Google Speech Commands dataset (v1, 30 classes) which obtains 97.68% accuracy on test set.
MatchboxNet-3x1x64-v2: MatchboxNet model trained on Google Speech Commands dataset (v2, 35 classes) which obtains 97.12% accuracy on test set.
MatchboxNet-3x1x64-v2: MatchboxNet model trained on Google Speech Commands dataset (v2, 30 classes) which obtains 97.29% accuracy on test set.
MatchboxNet-3x1x64-v2-subset-task: MatchboxNet model trained on Google Speech Commands dataset (v2, 10+2 classes) which obtains 98.2% accuracy on test set.
MatchboxNet-3x2x64-v2-subset-task: MatchboxNet model trained on Google Speech Commands dataset (v2, 10+2 classes) which obtains 98.4% accuracy on test set.
MatchboxNet-VAD-3x2: Voice Activity Detection MatchboxNet model trained on google speech command (v2) and freesound 

In [3]:
mbn_model = nemo_asr.models.EncDecClassificationModel.from_pretrained("MatchboxNet-3x1x64-v2")

[NeMo I 2021-03-25 02:57:27 cloud:56] Found existing object /home/cjbayron/.cache/torch/NeMo/NeMo_1.0.0rc2/MatchboxNet-3x1x64-v2/92ca210d37c6546a81285e054524be15/MatchboxNet-3x1x64-v2.nemo.
[NeMo I 2021-03-25 02:57:27 cloud:62] Re-using file from: /home/cjbayron/.cache/torch/NeMo/NeMo_1.0.0rc2/MatchboxNet-3x1x64-v2/92ca210d37c6546a81285e054524be15/MatchboxNet-3x1x64-v2.nemo
[NeMo I 2021-03-25 02:57:27 common:615] Instantiating model from pre-trained checkpoint


[NeMo W 2021-03-25 02:57:27 modelPT:133] Please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    sample_rate: 16000
    labels:
    - visual
    - wow
    - learn
    - backward
    - dog
    - two
    - left
    - happy
    - nine
    - go
    - up
    - bed
    - stop
    - one
    - zero
    - tree
    - seven
    - 'on'
    - four
    - bird
    - right
    - eight
    - 'no'
    - six
    - forward
    - house
    - marvin
    - sheila
    - five
    - 'off'
    - three
    - down
    - cat
    - follow
    - 'yes'
    batch_size: 128
    shuffle: true
    augmentor:
      shift:
        prob: 1.0
        min_shift_ms: -5.0
        max_shift_ms: 5.0
      white_noise:
        prob: 1.0
        min_level: -90
        max_level: -46
    
[NeMo W 2021-03-25 02:57:27 modelPT:140] Please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_dat

[NeMo I 2021-03-25 02:57:28 modelPT:376] Model EncDecClassificationModel was successfully restored from /home/cjbayron/.cache/torch/NeMo/NeMo_1.0.0rc2/MatchboxNet-3x1x64-v2/92ca210d37c6546a81285e054524be15/MatchboxNet-3x1x64-v2.nemo.


In [7]:
!du -h /home/cjbayron/.cache/torch/NeMo/NeMo_1.0.0rc2/MatchboxNet-3x1x64-v2/92ca210d37c6546a81285e054524be15/*

316K	/home/cjbayron/.cache/torch/NeMo/NeMo_1.0.0rc2/MatchboxNet-3x1x64-v2/92ca210d37c6546a81285e054524be15/MatchboxNet-3x1x64-v2.nemo


Since speech commands model MatchBoxNet doesn't consider non-speech scenario, 
here we use a Voice Activity Detection (VAD) model to help reduce false alarm for background noise/silence. When there is speech activity detected, the speech command inference will be activated. 


**Please note the VAD model is not perfect for various microphone input and you might need to finetune on your input and play with different parameters.**

In [4]:
vad_model = nemo_asr.models.EncDecClassificationModel.from_pretrained('MatchboxNet-VAD-3x2')

[NeMo I 2021-03-25 02:57:28 cloud:56] Found existing object /home/cjbayron/.cache/torch/NeMo/NeMo_1.0.0rc2/MatchboxNet_VAD_3x2/1375f3813383105a24acc75428ec51c4/MatchboxNet_VAD_3x2.nemo.
[NeMo I 2021-03-25 02:57:28 cloud:62] Re-using file from: /home/cjbayron/.cache/torch/NeMo/NeMo_1.0.0rc2/MatchboxNet_VAD_3x2/1375f3813383105a24acc75428ec51c4/MatchboxNet_VAD_3x2.nemo
[NeMo I 2021-03-25 02:57:28 common:615] Instantiating model from pre-trained checkpoint


[NeMo W 2021-03-25 02:57:28 modelPT:133] Please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /home/fjia/code/manifest64/balanced_background_training_manifest.json,/home/fjia/code/manifest64/balanced_speech_training_manifest.json
    sample_rate: 16000
    labels:
    - background
    - speech
    batch_size: 128
    num_workers: 20
    shuffle: true
    augmentor:
      shift:
        prob: 0.8
        min_shift_ms: -5.0
        max_shift_ms: 5.0
      white_noise:
        prob: 0.8
        min_level: -90
        max_level: -46
    
[NeMo W 2021-03-25 02:57:28 modelPT:140] Please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: /home/fjia/code/manifest64/balanced_background_validation_manifest.json,/home/

[NeMo I 2021-03-25 02:57:28 modelPT:376] Model EncDecClassificationModel was successfully restored from /home/cjbayron/.cache/torch/NeMo/NeMo_1.0.0rc2/MatchboxNet_VAD_3x2/1375f3813383105a24acc75428ec51c4/MatchboxNet_VAD_3x2.nemo.


## Observing the config of the model

In [5]:
from omegaconf import OmegaConf
import copy

In [6]:
# Preserve a copy of the full config
vad_cfg = copy.deepcopy(vad_model._cfg)
mbn_cfg = copy.deepcopy(mbn_model._cfg)
print(OmegaConf.to_yaml(mbn_cfg))

sample_rate: 16000
timesteps: 128
repeat: 1
dropout: 0.0
kernel_size_factor: 1.0
labels:
- visual
- wow
- learn
- backward
- dog
- two
- left
- happy
- nine
- go
- up
- bed
- stop
- one
- zero
- tree
- seven
- 'on'
- four
- bird
- right
- eight
- 'no'
- six
- forward
- house
- marvin
- sheila
- five
- 'off'
- three
- down
- cat
- follow
- 'yes'
train_ds:
  manifest_filepath: null
  sample_rate: 16000
  labels:
  - visual
  - wow
  - learn
  - backward
  - dog
  - two
  - left
  - happy
  - nine
  - go
  - up
  - bed
  - stop
  - one
  - zero
  - tree
  - seven
  - 'on'
  - four
  - bird
  - right
  - eight
  - 'no'
  - six
  - forward
  - house
  - marvin
  - sheila
  - five
  - 'off'
  - three
  - down
  - cat
  - follow
  - 'yes'
  batch_size: 128
  shuffle: true
  augmentor:
    shift:
      prob: 1.0
      min_shift_ms: -5.0
      max_shift_ms: 5.0
    white_noise:
      prob: 1.0
      min_level: -90
      max_level: -46
validation_ds:
  manifest_filepath: null
  sample_rate: 1600

## What classes can this model recognize?

Before we begin inference on the actual audio stream, let's look at what are the classes this model was trained to recognize.  

**MatchBoxNet model is not designed to recognize words out of vocabulary (OOV).**

In [7]:
labels = mbn_cfg.labels
for i in range(len(labels)):
    print('%-10s' % (labels[i]), end=' ')

visual     wow        learn      backward   dog        two        left       happy      nine       go         up         bed        stop       one        zero       tree       seven      on         four       bird       right      eight      no         six        forward    house      marvin     sheila     five       off        three      down       cat        follow     yes        

## Setup preprocessor with these settings

In [8]:
# Set model to inference mode
mbn_model.eval();
vad_model.eval();

## Setting up data for Streaming Inference

In [9]:
from nemo.core.classes import IterableDataset
from nemo.core.neural_types import NeuralType, AudioSignal, LengthsType
import torch
from torch.utils.data import DataLoader

In [10]:
# simple data layer to pass audio signal
class AudioDataLayer(IterableDataset):
    @property
    def output_types(self):
        return {
            'audio_signal': NeuralType(('B', 'T'), AudioSignal(freq=self._sample_rate)),
            'a_sig_length': NeuralType(tuple('B'), LengthsType()),
        }

    def __init__(self, sample_rate):
        super().__init__()
        self._sample_rate = sample_rate
        self.output = True
        
    def __iter__(self):
        return self
    
    def __next__(self):
        if not self.output:
            raise StopIteration
        self.output = False
        return torch.as_tensor(self.signal, dtype=torch.float32), \
               torch.as_tensor(self.signal_shape, dtype=torch.int64)
        
    def set_signal(self, signal):
        self.signal = signal.astype(np.float32)/32768.
        self.signal_shape = self.signal.size
        self.output = True

    def __len__(self):
        return 1

In [11]:
data_layer = AudioDataLayer(sample_rate=mbn_cfg.train_ds.sample_rate)
data_loader = DataLoader(data_layer, batch_size=1, collate_fn=data_layer.collate_fn)

## inference method for audio signal (single instance)

In [12]:
def infer_signal(model, signal):
    data_layer.set_signal(signal)
    batch = next(iter(data_loader))
    audio_signal, audio_signal_len = batch
    audio_signal, audio_signal_len = audio_signal.to(model.device), audio_signal_len.to(model.device)
    logits = model.forward(input_signal=audio_signal, input_signal_length=audio_signal_len)
    return logits

we don't include postprocessing techniques here. 

In [13]:
# class for streaming frame-based ASR
# 1) use reset() method to reset FrameASR's state
# 2) call transcribe(frame) to do ASR on
#    contiguous signal's frames
class FrameASR:
    
    def __init__(self, model_definition,
                 frame_len=2, frame_overlap=2.5, 
                 offset=0):
        '''
        Args:
          frame_len (seconds): Frame's duration
          frame_overlap (seconds): Duration of overlaps before and after current frame.
          offset: Number of symbols to drop for smooth streaming.
        '''
        self.task = model_definition['task']
        self.vocab = list(model_definition['labels'])
        
        self.sr = model_definition['sample_rate']
        self.frame_len = frame_len
        self.n_frame_len = int(frame_len * self.sr)
        self.frame_overlap = frame_overlap
        self.n_frame_overlap = int(frame_overlap * self.sr)
        timestep_duration = model_definition['AudioToMFCCPreprocessor']['window_stride']
        for block in model_definition['JasperEncoder']['jasper']:
            timestep_duration *= block['stride'][0] ** block['repeat']
        self.buffer = np.zeros(shape=2*self.n_frame_overlap + self.n_frame_len,
                               dtype=np.float32)
        self.offset = offset
        self.reset()
        
    @torch.no_grad()
    def _decode(self, frame, offset=0):
        assert len(frame)==self.n_frame_len
        self.buffer[:-self.n_frame_len] = self.buffer[self.n_frame_len:]
        self.buffer[-self.n_frame_len:] = frame

        if self.task == 'mbn':
            logits = infer_signal(mbn_model, self.buffer).to('cpu').numpy()[0]
            decoded = self._mbn_greedy_decoder(logits, self.vocab)
            
        elif self.task == 'vad':
            logits = infer_signal(vad_model, self.buffer).to('cpu').numpy()[0]
            decoded = self._vad_greedy_decoder(logits, self.vocab)
           
        else:
            raise("Task should either be of mbn or vad!")
            
        return decoded[:len(decoded)-offset]
    
    def transcribe(self, frame=None,merge=False):
        if frame is None:
            frame = np.zeros(shape=self.n_frame_len, dtype=np.float32)
        if len(frame) < self.n_frame_len:
            frame = np.pad(frame, [0, self.n_frame_len - len(frame)], 'constant')
        unmerged = self._decode(frame, self.offset)
        return unmerged
        
    
    def reset(self):
        '''
        Reset frame_history and decoder's state
        '''
        self.buffer=np.zeros(shape=self.buffer.shape, dtype=np.float32)
        self.mbn_s = []
        self.vad_s = []
        
    @staticmethod
    def _mbn_greedy_decoder(logits, vocab):
        mbn_s = []
        if logits.shape[0]:
            class_idx = np.argmax(logits)
            class_label = vocab[class_idx]
            mbn_s.append(class_label)         
        return mbn_s
    
    
    @staticmethod
    def _vad_greedy_decoder(logits, vocab):
        vad_s = []
        if logits.shape[0]:
            probs = torch.softmax(torch.as_tensor(logits), dim=-1)
            probas, preds = torch.max(probs, dim=-1)
            vad_s = [preds.item(), str(vocab[preds]), probs[0].item(), probs[1].item(), str(logits)]
        return vad_s


# Streaming Inference

## offline inference
Here we show an example of offline streaming inference. you can use your file or download the provided demo audio file. 


Streaming inference depends on a few factors, such as the frame length (STEP) and buffer size (WINDOW SIZE). Experiment with a few values to see their effects in the below cells.

In [14]:
STEP = 0.25
WINDOW_SIZE = 1.28 # input segment length for NN we used for training

In [16]:
import wave

def offline_inference(wave_file, STEP = 0.25, WINDOW_SIZE = 0.31):
    """
    Arg:
        wav_file: wave file to be performed inference on.
        STEP: infer every STEP seconds 
        WINDOW_SIZE : lenght of audio to be sent to NN.
    """
    
    FRAME_LEN = STEP 
    CHANNELS = 1 # number of audio channels (expect mono signal)
    RATE = SAMPLE_RATE # sample rate, 16000 Hz
   
    CHUNK_SIZE = int(FRAME_LEN * SAMPLE_RATE)
    
    mbn = FrameASR(model_definition = {
                       'task': 'mbn',
                       'sample_rate': SAMPLE_RATE,
                       'AudioToMFCCPreprocessor': mbn_cfg.preprocessor,
                       'JasperEncoder': mbn_cfg.encoder,
                       'labels': mbn_cfg.labels
                   },
                   frame_len=FRAME_LEN, frame_overlap = (WINDOW_SIZE - FRAME_LEN)/2,
                   offset=0)

    wf = wave.open(wave_file, 'rb')
    data = wf.readframes(CHUNK_SIZE)

    while len(data) > 0:

        data = wf.readframes(CHUNK_SIZE)
        signal = np.frombuffer(data, dtype=np.int16)
        mbn_result = mbn.transcribe(signal)
        
        if len(mbn_result):
            print(mbn_result)
            
    mbn.reset()

In [17]:
demo_wave = 'SpeechCommands_demo.wav'
if not os.path.exists(demo_wave):
    !wget "https://dldata-public.s3.us-east-2.amazonaws.com/SpeechCommands_demo.wav"

In [18]:
wave_file = demo_wave

CHANNELS = 1
audio, sample_rate = librosa.load(wave_file, sr=SAMPLE_RATE)
dur = librosa.get_duration(audio)
print(dur)

1.2331065759637188


In [19]:
ipd.Audio(audio, rate=sample_rate)

In [20]:
# Ground-truth is Yes No
offline_inference(wave_file, STEP, WINDOW_SIZE)

['yes']
['yes']
['yes']
['yes']
['no']
['no']
['no']


## Online inference through microphone

Please note MatchBoxNet and VAD model are not perfect for various microphone input and you might need to finetune on your input and play with different parameter. \
**We also recommend to use a headphone.**

In [21]:
vad_threshold = 0.8 

STEP = 0.1 
WINDOW_SIZE = 0.15
mbn_WINDOW_SIZE = 1

CHANNELS = 1 
RATE = SAMPLE_RATE
FRAME_LEN = STEP # use step of vad inference as frame len

CHUNK_SIZE = int(STEP * RATE)
vad = FrameASR(model_definition = {
                   'task': 'vad',
                   'sample_rate': SAMPLE_RATE,
                   'AudioToMFCCPreprocessor': vad_cfg.preprocessor,
                   'JasperEncoder': vad_cfg.encoder,
                   'labels': vad_cfg.labels
               },
               frame_len=FRAME_LEN, frame_overlap=(WINDOW_SIZE - FRAME_LEN) / 2, 
               offset=0)

mbn = FrameASR(model_definition = {
                       'task': 'mbn',
                       'sample_rate': SAMPLE_RATE,
                       'AudioToMFCCPreprocessor': mbn_cfg.preprocessor,
                       'JasperEncoder': mbn_cfg.encoder,
                       'labels': mbn_cfg.labels
                   },
                   frame_len=FRAME_LEN, frame_overlap = (mbn_WINDOW_SIZE-FRAME_LEN)/2,
                   offset=0)

In [23]:
vad.reset()
mbn.reset()

import sys

# Setup input device
p = pa.PyAudio()
print('Available audio input devices:')
input_devices = []
for i in range(p.get_device_count()):
    dev = p.get_device_info_by_index(i)
    if dev.get('maxInputChannels'):
        input_devices.append(i)
        print(i, dev.get('name'))

if len(input_devices):
    dev_idx = -2
    while dev_idx not in input_devices:
        print('Please type input device ID:')
        dev_idx = int(input())

    
    def callback(in_data, frame_count, time_info, status):
        """
        callback function for streaming audio and performing inference
        """
        signal = np.frombuffer(in_data, dtype=np.int16)
        vad_result = vad.transcribe(signal) 
        mbn_result = mbn.transcribe(signal) 
        
        if len(vad_result):
            # if speech prob is higher than threshold, we decide it contains speech utterance 
            # and activate MatchBoxNet 
            if vad_result[3] >= vad_threshold: 
                print(mbn_result, end='') # print mbn result when speech present
            else:
                print("no-speech")
        return (in_data, pa.paContinue)

    # streaming
    stream = p.open(format=pa.paInt16,
                    channels=CHANNELS,
                    rate=SAMPLE_RATE,
                    input=True,
                    input_device_index=dev_idx,
                    stream_callback=callback,
                    frames_per_buffer=CHUNK_SIZE)

    
    print('Listening...')
    stream.start_stream()
    
    # Interrupt kernel and then speak for a few more words to exit the pyaudio loop !
    try:
        while stream.is_active():
            time.sleep(0.1)
            sys.stdout.flush()
    finally:        
        stream.stop_stream()
        stream.close()
        p.terminate()
        print()
        print("PyAudio stopped")
    
else:
    print('ERROR: No audio input device found.')

Available audio input devices:
0 HDA Intel PCH: Generic Analog (hw:0,0)
6 sysdefault
12 pulse
14 default
Please type input device ID:
14
Listening...
['up']['off']['left']['left']['left']['sheila']['two']['two']['two']['no']['no']['no']['no']['no']['no']['no']['no']['no']['no']['no']['no']['no']['no']['no']['no']['no']['no']['go']['no']['go']['go']['two']no-speech
['eight']['eight']['nine']['no']['left']['left']['no']['no']['no']['no']['no']['no']['left']['left']['left']['left']['left']['left']['left']['left']['left']['left']['left']['left']['left']['left']['cat']['cat']['cat']['cat']['cat']['cat']['cat']['cat']['cat']['cat']['cat']['happy']['house']['happy']['happy']['happy']['happy']['happy']['eight']['happy']['eight']['tree']['tree']['tree']['tree']['tree']['tree']['tree']['tree']['bed']['bed']['tree']['tree']['tree']['tree']['tree']['cat']['tree']['cat']['cat']['marvin']['happy']['cat']['cat']['happy']['eight']['happy']['eight']['eight']['eight']['eight']['eight']['eight']['eight']

KeyboardInterrupt: 