# SpICE Classifier model (Keras Usage)
Licensed under the Apache License, Version 2.0


Paper: Speech Intelligibility Classifiers from Half-a-Million Utterances

This colab walks through how to download the SpICE wav2vec2-based speech intelligibility classifier and how to use the model on a sample audio file.

You'll first load the wav2vec2 model from HuggingFace and then use the SpICE classifier head to generate predictions.

In [None]:
#@title Installation
# Install Hugging Face transformers with its TensorFlow (CPU) extra.
!pip install transformers[tf-cpu]
# Upgrade/install PyDrive quietly (-U: upgrade, -q: quiet).
!pip install -U -q PyDrive

In [None]:
#@title Imports

import os
import numpy as np
import pickle
import soundfile as sf
import librosa
import scipy.io.wavfile as wav
import tensorflow as tf
import tensorflow_hub as hub
import transformers
from transformers import Wav2Vec2Processor, Wav2Vec2CTCTokenizer, TFWav2Vec2Model

import IPython
import matplotlib
import matplotlib.pyplot as plt

from google.colab import drive

# Remote location of the sample utterance used by this notebook.
SPEECH_URL = (
    "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/"
    "source-16k/train/sp0307/"
    "Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
)
# Local path the sample is stored under once downloaded.
SPEECH_FILE = "_assets/speech.wav"
# Sample rate (Hz) the notebook expects audio to be in.
EXP_SAMPLE_RATE = 16000

# Make sure the directory that will hold SPEECH_FILE exists. `exist_ok=True`
# makes this a no-op when the directory (and possibly the file) is already
# there, so no separate existence check is needed.
os.makedirs(os.path.dirname(SPEECH_FILE), exist_ok=True)

In [None]:
!wget "https${SPEECH_URL}"
!mv Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav _assets/speech.wav

In [None]:
#@title download wav2vec2 TF model
# The tokenizer, processor and TF model are all loaded from the same
# Hugging Face checkpoint, so name it once.
_W2V2_CHECKPOINT = "facebook/wav2vec2-base-960h"

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(_W2V2_CHECKPOINT)
processor = Wav2Vec2Processor.from_pretrained(_W2V2_CHECKPOINT)
hf_w2v2_model = TFWav2Vec2Model.from_pretrained(_W2V2_CHECKPOINT)

In [None]:
#@title Load Keras model from Hub
# Wrap the SpICE classification model published at this TF Hub handle as a
# Keras layer; it is later called on the wav2vec2 embeddings.
spice_w2v2_cls_model = hub.KerasLayer('https://tfhub.dev/google/euphonia_spice/classification/1')

In [None]:
#@title helpers for reading wav
# To read and write WAV soundfiles as np arrays of floats.
def wavread(filename):
  """Read audio data from a wav file as floating-point samples.

  Integer PCM data is scaled by its own full-scale value so that samples land
  in [-1, 1): 16-bit data is divided by 32768 (as before), other signed widths
  by `2**(bits - 1)`, and unsigned 8-bit data is re-centred around 128 first.
  Floating-point WAV data is returned unscaled.

  Args:
    filename: Path of the wav file to read.

  Returns:
    A `(data, samplerate)` tuple. `data` is a float64 numpy array with the
    same shape scipy returned for the file; `samplerate` is the rate reported
    by the file.
  """
  with open(filename, 'rb') as file_handle:
    samplerate, wave_data = wav.read(file_handle)
  wave_data = np.asarray(wave_data)

  # `np.asfarray` was removed in NumPy 2.0; `astype(np.float64)` gives the
  # same float64 result on every NumPy version.
  if wave_data.dtype == np.uint8:
    # 8-bit WAV PCM is unsigned with its zero level at 128.
    data = (wave_data.astype(np.float64) - 128.0) / 128.0
  elif np.issubdtype(wave_data.dtype, np.integer):
    # Signed PCM: full scale is max + 1 (32768 for int16).
    full_scale = float(np.iinfo(wave_data.dtype).max) + 1.0
    data = wave_data.astype(np.float64) / full_scale
  else:
    # Float WAVs are not integer-coded, so dividing by 32768 would shrink
    # them to near-silence.
    data = wave_data.astype(np.float64)
  return data, samplerate

def resample_aud(audio: np.ndarray,
                 sample_rate: int,
                 target_sr: int = 16000) -> np.ndarray:
  """Resample `audio` from `sample_rate` to `target_sr`.

  Args:
    audio: Samples to resample.
    sample_rate: Sample rate of `audio`.
    target_sr: Sample rate to convert to.

  Returns:
    Whatever `librosa.core.resample` returns for the 'kaiser_best' filter.
  """
  resample_options = dict(
      orig_sr=sample_rate,
      target_sr=target_sr,
      res_type='kaiser_best',
  )
  return librosa.core.resample(audio, **resample_options)

def read_wav_resample(filename, target_sr=16000):
  """Read a wav file and return float32 samples at `target_sr`.

  Args:
    filename: Path of the wav file to read.
    target_sr: Sample rate the returned audio should have. Defaults to 16000,
      the rate previously hard-coded here.

  Returns:
    The audio as a float32 numpy array, resampled only when the file's own
    sample rate differs from `target_sr`.
  """
  audio, sample_rate = wavread(filename)
  # Resample, if necessary.
  if sample_rate != target_sr:
    audio = resample_aud(audio, sample_rate, target_sr=target_sr)
  # NOTE(review): multi-channel files are not downmixed here -- confirm that
  # callers only pass mono recordings.
  if audio.dtype != np.float32:
    audio = audio.astype(np.float32)
  return audio

def samples_to_embedding_hfw2v2(model_input,
                                hfw2v2,
                                sample_rate=16000,
                                name=None):
  """Run inference to map audio samples to hf2v2 model embedding.

  Args:
    model_input: Audio samples, as a tensor or anything
      `tf.convert_to_tensor` accepts. Must be float32. Inputs of rank > 1
      are squeezed and must be rank 1 afterwards.
    hfw2v2: A `(processor, model)` pair; the processor prepares the samples
      and the model is called on the processor's `input_values`.
    sample_rate: Passed to the processor as `sampling_rate`.
    name: Optional label, used only in the progress message.

  Returns:
    The model's `last_hidden_state` (rank 3) converted to a numpy array.

  Raises:
    ValueError: If `model_input` is not float32.
  """
  processor, model = hfw2v2

  # `print` does not apply logging-style %-arguments, so the old call printed
  # a literal "%s"; format the label into the message instead.
  print(f'[samples_to_embedding_hfw2v2] {name}: Started inference.')
  if not tf.is_tensor(model_input):
    model_input = tf.convert_to_tensor(model_input)
  if model_input.dtype != tf.float32:
    raise ValueError(f'hfw2v2 takes floats: {model_input.dtype}')
  # NOTE: Model does normalization in tokenizer, checking input in [-1, 1]
  # is perhaps not necessary.
  if model_input.shape.rank > 1:
    # Drop size-1 dimensions (e.g. a leading batch axis of 1).
    model_input = tf.squeeze(model_input)
  model_input.shape.assert_has_rank(1)

  # Now actually run the inference.
  input_values = processor(
      model_input.numpy(), sampling_rate=sample_rate,
      return_tensors='tf').input_values
  final_ret = model(input_values).last_hidden_state
  final_ret.shape.assert_has_rank(3)
  emb = final_ret.numpy()

  return emb

def get_prediction(wav_file,
                   hfw2v2=(processor, hf_w2v2_model),
                   w2v2_model=spice_w2v2_cls_model):
  """Run the full wav-file -> classifier-output pipeline.

  The defaults are the notebook-level processor, wav2vec2 model and SpICE
  classifier as they exist when this function is defined.

  Args:
    wav_file: Path of the wav file to classify.
    hfw2v2: `(processor, model)` pair used to compute the embedding.
    w2v2_model: Callable applied to the embedding.

  Returns:
    The first element of `w2v2_model`'s output for the file's embedding.
  """
  samples = read_wav_resample(wav_file)
  embedding = samples_to_embedding_hfw2v2(samples, hfw2v2=hfw2v2)
  batch_output = w2v2_model(embedding)
  return batch_output[0]

In [None]:
#@title Run on sample speech file
# Classify the downloaded sample. The call is the cell's last expression, so
# the notebook displays its return value.
# Expected: a "[samples_to_embedding_hfw2v2] ... Started inference." status
# line, followed by:
# <tf.Tensor: shape=(5,), dtype=float32, numpy=
# array([1.0000000e+00, 6.7461668e-21, 1.3921502e-21, 6.2975914e-24,
#        0.0000000e+00], dtype=float32)>
get_prediction(SPEECH_FILE)

## Other checks

In [None]:
# Smoke test: embed 16000 constant-valued float32 samples and print the
# embedding's shape and basic statistics.
out = samples_to_embedding_hfw2v2(
    np.ones(16000, dtype=np.float32),
    hfw2v2=(processor, hf_w2v2_model),
)
print(out.shape)
print(out.mean(), out.min(), out.max())

In [None]:
# Apply the SpICE classifier directly to the embedding `out` computed in the
# previous cell; the notebook displays the result.
# Expected: array([[9.9801159e-01, 3.9059367e-05, 1.9493260e-03, 3.9188830e-09,
#        9.7403199e-20]], dtype=float32)>
spice_w2v2_cls_model(out)