## This code is for Training and Evaluation of Custom Multilingual KWS
Audio File should be in the format of **1-sec wav files**.


Datasets needs:
1. Background noise from Google Speech Command v2
2. Trained Multilingual Embedding Features (ref:https://github.com/harvard-edge/multilingual_kws) 
3. Target keyword 1-sec dataset for **training and testing**
4. Unknow_file as non-Target audio file for **training**
5. Unknow_file as non-Target audio file for **testing**

_training KWS code based on https://github.com/harvard-edge/multilingual_kws_

In [1]:
pwd_path = !pwd
print(pwd_path)

['/Users/nasim/Documents/Active_2023/MLEng_bootcampt/Spotify/code/multilingual_kws_v3']


In [2]:
import os
pbar = sorted(os.listdir(pwd_path[0]+ '/content/target_kw/')) # for viewing progress
print("unknow words:", pbar[1:])

unknow words: ['.git', 'README.md', 'amelia', 'android', 'athena', 'computer', 'heychatterbox', 'heycomputer', 'heysavant', 'marvin', 'recording', 'sheila']


In [3]:
# Run  this
#!pip install ipywidgets widgetsnbextension pandas-profiling
!jupyter nbextension enable --py widgetsnbextension


Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [4]:
word_list = ['amelia', 'athena', 'computer', 'heycomputer', 'heysavant','sheila', 'heychatterbox', 'marvin']
KEYWORD = str(word_list[1]) # modify this for each keyword 
KEYWORD

'athena'

In [26]:
keyword_dir = pwd_path[0]+ '/content/target_kw/'+ KEYWORD
base_model_dir = pwd_path[0]+ "/content/multilingual_context_73_0.8011"
background_noise_dir = "./content/speech_commands_v0.02/_background_noise_/"
unknown_words_dir = pwd_path[0]+ '/content/unknown_files/'

file_name = f"/savedmodel_{KEYWORD}_5shot"
model_save_path = pwd_path[0]+file_name

# to gather non-target file names for testing
non_target_examples = []
for word in os.listdir(f"./content/nontarget_mswc_microset_wav/en/clips"):
    if word == KEYWORD:
        continue
    non_target_examples.extend(Path(f"./content/nontarget_mswc_microset_wav/en/clips/{word}").glob("*.wav"))
    
print("n non_target_examples:", len(non_target_examples))

n non_target_examples: 620


#### 1. import all libraries


In [6]:

import sys

from func.embedding import transfer_learning, input_data

import tensorflow as tf
import numpy as np
import IPython
from pathlib import Path
import matplotlib.pyplot as plt
import os
import subprocess #Backport of the subprocess module from Python 3.2/3.3 for use on 2.x.
import csv
from tqdm.notebook import tqdm

# for emdedding:
#!pip install -q umap-learn
import umap # Uniform Manifold Approximation and Projection
import pandas as pd
import sklearn.preprocessing
import seaborn as sns

from sklearn.model_selection import train_test_split
from pathlib import Path
import torchaudio
import torch
import glob


In [7]:
def train(
    keyword: str,
    samples_dir: os.PathLike,
    embedding: os.PathLike,
    unknown_words: os.PathLike,
    background_noise: os.PathLike,
    output: os.PathLike,
    num_epochs: int = 4,
    num_batches: int = 1,
    primary_learning_rate: float = 0.001,
    batch_size: int = 64,
    unknown_percentage: float = 50.0,
    base_model_output: str = "dense_2",
):
    """Fine-tune few-shot model from embedding representation. The embedding
    representation and unknown words can be downloaded from
    https://github.com/harvard-edge/multilingual_kws/releases
    The background noise directory can be downloaded from:
    http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz

    Args:
      keyword: target keyword
      samples_dir: directory of 1-second 16KHz target .wav samples
      embedding: path to embedding representation
      unknown_words: path to unknown words directory
      background_noise: path to Google Speech Commands background noise directory
      output: modelname for saving the model (specified as a path)
      num_epochs: number of finetuning epochs
      num_batches: number of finetuning batches
      primary_learning_rate: finetuning LR
      batch_size: batch size
      unknown_percentage: percentage of samples to draw from unknown_words
      base_model_output: layer to use for embedding representation
    """

    assert (
        Path(background_noise).name == "_background_noise_"
    ), f"only tested with GSC _background_noise_ directory, please provide a path {background_noise}"

    for d in [samples_dir, embedding, unknown_words, background_noise]:
        assert os.path.isdir(d), f"directory {d} not found"

    if os.path.exists(output):
        print(f"Warning: overwriting {output}")

    samples = glob.glob(samples_dir + os.path.sep + "*.wav")
    assert len(samples) > 0, "no sample .wavs found"
    """for s in samples:
        cmd = f"soxi {s}"
        res = subprocess.check_output(shlex.split(cmd))
        out = res.decode("utf8")
        checks = ["75 CDDA sectors", "16000 samples", "00:00:01.00"]

        if not all([c in out for c in checks]):
            raise ValueError(
                f"{s} appears to not be a 16KHz 1-second wav file according to soxi \n{out}"
            )"""

    #print(f"{len(samples)} training samples found:\n" + "\n".join(samples))
    print(f"{len(samples)} training samples found:\n") 

    uftxt = "unknown_files.txt"
    unknown_words = Path(unknown_words)
    assert os.path.isfile(unknown_words / uftxt), f"{unknown_words/uftxt} not found"
    unknown_files = []
    with open(unknown_words / uftxt, "r") as fh:
        for w in fh.read().splitlines():
            unknown_files.append(str(unknown_words / w))

    
    #==========================================
    # In the first step we will split the data in training and remaining dataset
    # because we only want 5 shot for training:
    n_shot = 5
    train_percent = np.ceil((n_shot*100)/len(samples))
    print("train_percent:",train_percent)
    train_samples, X_rem= train_test_split(samples, train_size=train_percent/100, random_state=42)

    # Now since we want the valid and test size to be equal (10% each of overall data). 
    # we have to define valid_size=0.5 (that is 50% of remaining data)
    test_size = 0.5
    dev_samples, test_samples = train_test_split(X_rem, test_size=0.5, random_state=42)
    
    #rng = np.random.RandomState(42)
    #five_samples = rng.choice(train_samples, 5, replace=False).tolist()
    five_samples = train_samples[:5]
    
    print("size of validation and test sets:", len(dev_samples), len(test_samples))
    #==========================================
    
    print("Training model")
    print("background_noise:", background_noise)
    model_settings = input_data.standard_microspeech_model_settings(3) 
    name, model, details = transfer_learning.transfer_learn(
        target=keyword,
        train_files=five_samples,
        val_files=dev_samples,
        unknown_files=unknown_files,
        num_epochs=num_epochs,
        num_batches=num_batches,
        batch_size=batch_size,
        primary_lr=primary_learning_rate,
        backprop_into_embedding=False,
        embedding_lr=0,
        model_settings=model_settings,
        base_model_path=embedding,
        base_model_output=base_model_output,
        UNKNOWN_PERCENTAGE=unknown_percentage,
        bg_datadir=background_noise,
        csvlog_dest=None,
    )

    print(f"saving model to {output}")
    model.save(output)
    return model, five_samples, dev_samples, test_samples


#if __name__ == "__main__":
#    fire.Fire(dict(inference=inference, train=train))
#    train(keyword: str,samples_dir: os.PathLike,embedding: os.PathLike,
#          unknown_words: os.PathLike,
#          background_noise: os.PathLike,
#          output: os.PathLike)



In [8]:
model_fw_kws, five_samples, dev_samples, test_samples = train(keyword= KEYWORD,
                     samples_dir= keyword_dir,
                     embedding= base_model_dir,
                     unknown_words= unknown_words_dir,
                     background_noise= background_noise_dir,
                     output= model_save_path)


482 training samples found:

train_percent: 2.0
size of validation and test sets: 236 237
Training model
background_noise: ./content/speech_commands_v0.02/_background_noise_/




shape_train_init: 5
shape_valid_init: 236
Epoch 1/4


2023-03-08 15:38:24.521006: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/4
Epoch 3/4
Epoch 4/4
saving model to /Users/nasim/Documents/Active_2023/MLEng_bootcampt/Spotify/code/multilingual_kws_v3/savedmodel_athena_5shot


2023-03-08 15:39:04.879261: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: /Users/nasim/Documents/Active_2023/MLEng_bootcampt/Spotify/code/multilingual_kws_v3/savedmodel_athena_5shot/assets


INFO:tensorflow:Assets written to: /Users/nasim/Documents/Active_2023/MLEng_bootcampt/Spotify/code/multilingual_kws_v3/savedmodel_athena_5shot/assets


#### 7. Evaluate the model
 - Audio assessment
 - Accuracy on test set
 - FRR and FAR

(class 0: silence/background noise, class 1: unknown keyword, class 2: target)

In [14]:
# using using TensorFlow Lite Micro's speech preprocessing frontend for spectrograms
settings = input_data.standard_microspeech_model_settings(label_count=1)

In [15]:
# Test the trained FS-KWS model on test sets
test_spectrograms = np.array([input_data.file2spec(settings, f) for f in test_samples])

# fetch softmax predictions from the finetuned model:
predictions = model_fw_kws.predict(test_spectrograms)
categorical_predictions_target = np.argmax(predictions, axis=1)

# which predictions match the target class? 
accuracy_target = categorical_predictions_target[categorical_predictions_target == 2].shape[0] / predictions.shape[0]
print(f"Test accuracy on testset: {accuracy_target:0.2f}")

Test accuracy on testset: 0.96


In [29]:
# verifying the keyword spotting model correctly categorizes non-target words as "unknown" by
# running predictions on their spectrograms.
rng = np.random.RandomState(42)
n_file  =  len(test_samples) # equal to number of target-test samples
non_target_examples = rng.choice(non_target_examples, n_file, replace=False).tolist()    
print("Number of non-target examples", len(non_target_examples))
non_target_spectrograms = np.array([input_data.file2spec(settings, str(f)) for f in non_target_examples])

# fetch softmax predictions from the finetuned model:
predictions = model_fw_kws.predict(non_target_spectrograms)
categorical_predictions_nontarget = np.argmax(predictions, axis=1)

# which predictions match the non-target class? 
accuracy_nontarget = categorical_predictions_nontarget[categorical_predictions_nontarget == 1].shape[0] / predictions.shape[0]
print(f"Estimated accuracy on non-target samples: {accuracy_nontarget:0.2f}")



Number of non-target examples 237
Estimated accuracy on non-target samples: 0.97


#### False Rejection Rate(FRR) and False Acceptance Rate(FAR)

In [30]:
frr_val = categorical_predictions_target[categorical_predictions_target != 2].shape[0] / predictions.shape[0]
far_val = categorical_predictions_nontarget[categorical_predictions_nontarget == 2].shape[0] / predictions.shape[0]


print("FRR: {} %".format( np.floor(100000*frr_val)/1000))
print("FAR: {} %".format( np.floor(10000*far_val)/100))

FRR: 3.797 %
FAR: 2.95 %


### Audio Listening

In [31]:
def listen_file(filepath):
    IPython.display.display(IPython.display.Audio(filename=filepath, rate="16000"))

In [32]:
# listening to five random correctly-classified examples from the test set:
sample_idxs = np.random.choice(np.flatnonzero(categorical_predictions_target == 2), 5, replace=False)
for i in sample_idxs:
    listen_file(test_samples[i])



In [33]:
# and 5 incorrectly-classified examples. These may be a mix of 
# false negatives (true instances of the keyword which were classified incorrectly) and 
# anomalous samples (due to errors in forced alignment or the original crowdsourced data).
print(accuracy_target)
if accuracy_target<1:
    sample_idxs = np.random.choice(np.flatnonzero(categorical_predictions_target != 2), 5, replace=False)

    for i in sample_idxs:
        listen_file(test_samples[i])

0.9620253164556962
