Copyright 2020 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

In [None]:
!git clone https://github.com/google-research/google-research.git

In [None]:
import sys
sys.path.append('./google-research')

# Example with speech feature visualization

In [2]:
import numpy as np
from matplotlib import pylab as plt
import scipy.io.wavfile as wav
import scipy as scipy

from kws_streaming.layers import modes
from kws_streaming.layers import speech_features
from kws_streaming.layers import test_utils
from kws_streaming.layers.compat import tf
from kws_streaming.models import model_params

In [3]:
# Switch TensorFlow to eager execution through the TF1 compatibility API.
tf.compat.v1.enable_eager_execution()
# Last expression of the cell: displays whether eager execution is active.
tf.executing_eagerly()

True

In [4]:
def waveread_as_pcm16(filename):
  """Load a wav file and return its samples and sample rate.

  The file is opened in binary mode through tf.io.gfile and decoded with
  scipy.io.wavfile.read; the samples are returned exactly as decoded, no
  conversion is applied here.

  Args:
    filename: path of the wav file to read.

  Returns:
    Tuple (samples, sample_rate). Note the order is swapped with respect to
    scipy.io.wavfile.read, which returns (rate, data).
  """
  with tf.io.gfile.GFile(filename, 'rb') as wav_file:
    sample_rate, samples = wav.read(wav_file)
  return samples, sample_rate

def wavread_as_float(filename, target_sample_rate=16000):
  """Read a wav file, resample it and scale the samples to floats.

  Args:
    filename: path of the wav file; it is read with waveread_as_pcm16().
    target_sample_rate: sample rate, in Hz, of the returned data.

  Returns:
    Tuple (data, target_sample_rate): data is a np.float32 array holding the
    samples (resampled to target_sample_rate when the file uses a different
    rate) divided by 32768.0.
  """
  # `import scipy` on its own does not guarantee that the scipy.signal
  # submodule is loaded, so import it explicitly right where it is needed.
  from scipy import signal as scipy_signal  # pylint: disable=g-import-not-at-top

  wave_data, sr = waveread_as_pcm16(filename)
  # Resampling is an FFT round trip; skip it when the rate already matches.
  if sr != target_sample_rate:
    # Keep the duration unchanged: new length = duration * target rate.
    desired_length = int(round(float(len(wave_data)) / sr * target_sample_rate))
    wave_data = scipy_signal.resample(wave_data, desired_length)

  # Normalize short ints to floats in range [-1..1).
  data = np.array(wave_data, np.float32) / 32768.0
  return data, target_sample_rate

In [6]:
def speech_feature_model(input_size, p):
  """Build a Keras model made of an Input followed by a SpeechFeatures layer.

  Args:
    input_size: length of each input example; used as the Input shape
      (input_size,).
    p: parameters object; p.batch_size is read here and p itself is passed to
      speech_features.SpeechFeatures.get_params().

  Returns:
    tf.keras Model mapping the float32 input to the SpeechFeatures output.
    The layer is always created with modes.Modes.TRAINING.
  """
  feature_params = speech_features.SpeechFeatures.get_params(p)
  waveform = tf.keras.layers.Input(
      shape=(input_size,), batch_size=p.batch_size, dtype=tf.float32)
  feature_layer = speech_features.SpeechFeatures(
      feature_params, modes.Modes.TRAINING, p.batch_size)
  return tf.keras.models.Model(waveform, feature_layer(waveform))

In [189]:
# Create the parameters object and override the fields used by the speech
# feature extractor in this notebook.
params = model_params.Params()
# Window length and stride, expressed in milliseconds (see the _ms suffix).
params.window_size_ms = 25.0
params.window_stride_ms = 10.0
params.preemph = 0.97
# NOTE(review): the zero values below presumably switch off spec augmentation,
# cutout, time shifting and resampling - confirm against model_params.
params.use_spec_augment = 0
params.use_spec_cutout = 0
params.use_tf_fft = 0
params.time_shift_ms = 0.0
params.sp_time_shift_ms = 0.0
params.resample = 0.0
params.sp_resample = 0.0
params.train = 0
# speech_feature_model() reads p.batch_size for both the Input and the layer.
params.batch_size = 1
# NOTE(review): speech_feature_model() passes modes.Modes.TRAINING to the layer
# regardless of this value - confirm whether params.mode is consumed elsewhere.
params.mode = modes.Modes.NON_STREAM_INFERENCE
params.data_stride = 1
params.data_frame_padding = None
params.fft_magnitude_squared = False

In [190]:
# Convert the window size and stride from milliseconds to a whole number of
# samples at params.sample_rate, rounding to the nearest integer.
# params.sample_rate is not set in this notebook; it keeps whatever value
# model_params.Params() gave it.
frame_size = int(
    round(params.sample_rate * params.window_size_ms / 1000.0))
frame_step = int(
    round(params.sample_rate * params.window_stride_ms / 1000.0))

In [None]:
# Alternative input: uncomment the two lines below to load a recording instead
# of generating the synthetic signal.
# wave_filename = "test_speech.wav"
# waveform_data, sr = wavread_as_float(wave_filename)

# Synthetic test signal: 51200 samples at 16000 Hz (3.2 seconds).
samplerate = 16000
data_size = 51200
# Presumably seeds the random generators so the noise below is reproducible -
# TODO confirm in test_utils.
test_utils.set_seed(1)
frequency = 1000
# 1000 Hz cosine with amplitude 2 plus uniform noise in [0, 0.4).
waveform_data = np.cos(2.0*np.pi*frequency*np.arange(data_size)/samplerate) * 2 + np.random.rand(data_size) * 0.4

In [192]:
# Add a leading batch axis: signal has shape (1, number_of_samples).
signal = np.expand_dims(waveform_data, axis=0)
# Number of samples per example; passed to speech_feature_model() as input_size.
data_size = signal.shape[1]

## Speech feature extractor: Data framing + Preemphasis + Windowing + DFT + Mel + log (no DCT: dct_num_features=0)


In [230]:
# Configuration of the first model: 80 mel bins, DCT disabled.
params.mel_num_bins = 80
params.dct_num_features = 0  # no DCT
params.feature_type = 'mfcc_tf'
params.use_tf_fft = False
# Baseline setting; a later cell builds model2 with mel_non_zero_only = True.
params.mel_non_zero_only = False
params.mel_upper_edge_hertz = 4000

# data_size is the number of samples per example computed from signal.
model1 = speech_feature_model(data_size, params)

In [None]:
# Display the shape of the real DFT tensor held by the layer that
# speech_feature_model() adds after the Input (index 1 in model.layers).
model1.layers[1].mag_rdft_mel.real_dft_tensor.shape

In [None]:
# Fetch the mel weight matrix of model1's feature layer as a numpy array and
# display its shape.
mel_table1 = model1.layers[1].mag_rdft_mel.mel_weight_matrix.numpy()
mel_table1.shape

In [None]:
# Run model1 on the test signal and show the features of the first (only)
# batch element as an image, transposed before display.
out1 = model1.predict(signal)
plt.figure(figsize=(20, 5))
plt.imshow(out1[0].T)

In [None]:
# Draw every column of mel_table1 as its own curve on one wide figure.
plt.figure(figsize=(20, 5))
for mel_column in mel_table1.T:
  plt.plot(mel_column)

In [198]:
# Setting mel_non_zero_only to True is worthwhile only when
# params.mel_upper_edge_hertz is much smaller than 8000: the DFT is then
# computed only for the frequencies that are non-zero in the mel spectrum,
# which saves computation.
params.mel_non_zero_only = True

model2 = speech_feature_model(data_size, params)

In [None]:
# Display the shape of model2's real DFT tensor, to compare with the shape
# shown for model1.
model2.layers[1].mag_rdft_mel.real_dft_tensor.shape

In [None]:
# Fetch the mel weight matrix of model2's feature layer as a numpy array and
# display its shape.
mel_table2 = model2.layers[1].mag_rdft_mel.mel_weight_matrix.numpy()
mel_table2.shape

In [201]:
# Run model2 on the same input signal that was fed to model1.
out2 = model2.predict(signal)

In [None]:
# Show model2's features for the first (only) batch element as an image,
# transposed before display.
plt.figure(figsize=(20, 5))
plt.imshow(out2[0].T)

In [None]:
# Draw every column of mel_table2 as its own curve on one wide figure.
plt.figure(figsize=(20, 5))
for mel_column in mel_table2.T:
  plt.plot(mel_column)

In [None]:
# Check whether the features of model1 and model2 (mel_non_zero_only off/on)
# agree within an absolute tolerance of 1e-6.
np.allclose(out1, out2, atol=1e-06)

## Compare mfcc_tf with mfcc_op

In [223]:
# Configuration of the 'mfcc_tf' side of the comparison: 80 mel bins and
# 20 DCT features.
params.mel_num_bins = 80
params.dct_num_features = 20
params.feature_type = 'mfcc_tf'
params.use_tf_fft = False
params.mel_non_zero_only = False
params.fft_magnitude_squared = False
params.mel_upper_edge_hertz = 4000
params.preemph = 0.0  # mfcc_op does not have preemphasis

model3 = speech_feature_model(data_size, params)

In [None]:
# Run model3 ('mfcc_tf') on the test signal and show the features of the
# first (only) batch element as an image, transposed before display.
out3 = model3.predict(signal)
plt.figure(figsize=(20, 5))
plt.imshow(out3[0].T)

In [228]:
params.feature_type = 'mfcc_op'
# 'mfcc_op' calls two functions:
# 1. audio_spectrogram: applies Hann windowing, then the FFT. The magnitude
#    has to be squared because the next function (mfcc) takes the square root,
#    i.e. it assumes that the magnitude it receives is squared.
# 2. mfcc: computes the mel spectrum from the squared-magnitude FFT input by
#    taking the square root, multiplying it with the mel table, applying log
#    and computing the DCT.

params.fft_magnitude_squared = True
model4 = speech_feature_model(data_size, params)

In [None]:
# Run model4 ('mfcc_op') on the test signal and show the features of the
# first (only) batch element as an image, transposed before display.
out4 = model4.predict(signal)
plt.figure(figsize=(20, 5))
plt.imshow(out4[0].T)

In [None]:
# Features extracted with 'mfcc_op' are numerically different from 'mfcc_tf';
# this cell displays whether they still agree within an absolute tolerance
# of 1e-6.
np.allclose(out3, out4, atol=1e-6)