In [103]:
import numpy as np
import tensorflow as tf
import sounddevice as sd
import soundfile as sf

# Pin the global random seeds of both NumPy and TensorFlow to one shared value.
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)
import librosa

In [104]:
# ---- Audio front-end configuration ----
SAMPLE_RATE = 16000     # sampling rate, in Hz
FRAME_LENGTH_MS = 25    # analysis window length, in ms
FRAME_HOP_MS = 10       # hop between consecutive windows, in ms
FRAMES_BEFORE = 23      # NOTE(review): not referenced in the visible cells
FRAMES_AFTER = 8        # NOTE(review): not referenced in the visible cells
N_MELS = 40             # passed as `n_mfcc` by compute_mfcc


def _ms_to_samples(duration_ms):
    """Convert a duration in milliseconds to a whole number of samples at SAMPLE_RATE."""
    return int(SAMPLE_RATE * duration_ms / 1000)


# Window and hop sizes expressed in samples.
frame_length_samples = _ms_to_samples(FRAME_LENGTH_MS)
frame_hop_samples = _ms_to_samples(FRAME_HOP_MS)

# Class labels; the inference cell indexes this list with the model's argmax.
keywords = [
    "_silence_", "_unknown_",
    "down", "go", "left", "no", "off", "on", "right", "stop", "up", "yes",
]


In [105]:
def compute_mfcc(audio_file):
    """Compute delta-MFCC features for an audio signal.

    Despite its name, ``audio_file`` is the waveform itself (it is handed to
    librosa as ``y``), not a path.

    Pipeline: trim leading/trailing silence -> 128-band mel power spectrogram
    -> dB -> ``librosa.util.normalize`` -> MFCC (``N_MELS`` coefficients)
    -> delta.

    Args:
        audio_file: waveform samples as a NumPy array, or an object exposing
            ``.numpy()`` (e.g. an eager TensorFlow tensor). Any other
            array-like is converted with ``np.asarray``.

    Returns:
        The delta (first-order difference) of the MFCCs, *not* the MFCCs
        themselves. Shape is ``[n_mfcc, time]`` for a 1-D signal; the
        notebook's inference cell passes a ``(1, samples)`` array and relies
        on getting a 3-D result back.
    """
    # Convert up front so that every librosa call below (including the trim)
    # receives a NumPy array; previously the conversion only happened after
    # the first librosa call had already consumed the input.
    if not isinstance(audio_file, np.ndarray):
        if hasattr(audio_file, "numpy"):
            audio_file = audio_file.numpy()
        else:
            audio_file = np.asarray(audio_file)

    # Drop leading/trailing silence; the returned index interval is unused.
    trimmed, _ = librosa.effects.trim(audio_file,
                                      hop_length=frame_hop_samples,
                                      frame_length=frame_length_samples)

    mel_power = librosa.feature.melspectrogram(y=trimmed, sr=SAMPLE_RATE,
                                               hop_length=frame_hop_samples,
                                               n_fft=frame_length_samples,
                                               n_mels=128)

    # Convert to log scale (dB). No `ref` is passed, so librosa's default
    # reference is used (this is NOT peak-referenced).
    log_S = librosa.power_to_db(mel_power)
    # NOTE(review): normalize() is called with its default axis; for batched
    # (3-D) input confirm that this is the axis used when the model was trained.
    log_S_normalized = librosa.util.normalize(log_S)

    # With a precomputed `S`, librosa does not recompute a spectrogram, so the
    # sr / hop_length / n_fft arguments previously passed here had no effect.
    mfccs = librosa.feature.mfcc(S=log_S_normalized, n_mfcc=N_MELS)

    return librosa.feature.delta(mfccs)


def normalize_input_size(data, center_crop=False, target_size=101):
    """Pad or crop a batch of features along the last axis to ``target_size``.

    Args:
        data: 3-D array, unpacked as ``(batch_size, n_mels, N)``.
        center_crop: when the input is longer than ``target_size``, take the
            centred window if True; otherwise take a window at a uniformly
            random offset drawn from NumPy's global RNG.
        target_size: required length of the last axis.

    Returns:
        ``data`` itself when ``N == target_size``; a zero-padded copy when
        ``N < target_size`` (the extra pad element, if any, goes on the left);
        otherwise a slice of ``data`` of length ``target_size``.

    Raises:
        ValueError: if ``data`` is not 3-D.
    """
    if len(data.shape) != 3:
        raise ValueError("Unsupported shape: {}".format(data.shape))

    n_frames = data.shape[-1]

    if n_frames == target_size:
        return data

    if n_frames < target_size:
        tot_pads = target_size - n_frames
        left_pads = int(np.ceil(tot_pads / 2))
        right_pads = int(np.floor(tot_pads / 2))
        return np.pad(data, [(0, 0), (0, 0), (left_pads, right_pads)], mode='constant')

    # n_frames > target_size: crop.
    max_offset = n_frames - target_size
    if center_crop:
        from_ = int((n_frames / 2) - (target_size / 2))
    else:
        # randint's upper bound is exclusive, so add 1 to make the last valid
        # window (offset == max_offset) reachable.
        from_ = np.random.randint(0, max_offset + 1)
    to_ = from_ + target_size

    return data[:, :, from_:to_]


In [106]:
# Load the Keras model stored in 'CnnModel.h5'; the inference cell calls it to
# score the extracted features. The path is relative to the working directory.
model_loaded = tf.keras.models.load_model('CnnModel.h5')

In [120]:
# Record `duration` seconds of mono audio with sounddevice and save it to
# `filename` so the next cell can load it.
samplerate = SAMPLE_RATE  # Hz; reuse the feature-extraction rate instead of a second literal
duration = 1  # seconds
filename = 'output.wav'

# blocking=True: the recording is complete before it is written to disk.
mydata = sd.rec(int(samplerate * duration), samplerate=samplerate,
                channels=1, blocking=True)
sf.write(filename, mydata, samplerate)

In [121]:
# Load the recording at the model's sample rate.
audio, sr = librosa.load(filename, sr=SAMPLE_RATE)

# Add a leading batch axis, extract features, then pad/crop the last axis to
# the fixed length expected by normalize_input_size's default target_size.
delta_mfcc = compute_mfcc(np.expand_dims(audio, 0))
delta_mfcc_fixed = normalize_input_size(delta_mfcc)

# Swap the last two axes and append a trailing singleton axis for the model.
# Named `model_input` rather than `input` so the builtin is not shadowed.
model_input = np.expand_dims(delta_mfcc_fixed.transpose(0, 2, 1), -1)

output = model_loaded(model_input)
keywords[tf.argmax(output, axis=-1).numpy()[0]]

'stop'