In [None]:
import os
from common import CV_DATA_DIR

# Root directory of the audio data used throughout this notebook
# (sub-directories read below: backgrounds, positives, negatives, examples, XY_train, XY_dev, dev)
WWD_PATH = os.path.join(CV_DATA_DIR, "playground", "audio", "wwd")

In [None]:
# Set the TensorFlow C++ log level before the library is imported so the setting
# is already in place when TensorFlow loads ("1" is expected to hide INFO messages)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"

import tensorflow as tf

# Report the TensorFlow version and the device types (e.g. CPU/GPU) visible to it
print(f"TF Version: {tf.__version__}")
print(f"TF Devices: {[d.device_type for d in tf.config.list_physical_devices()]}")

In [None]:
import matplotlib.pyplot as plt
from scipy.io import wavfile
from pydub import AudioSegment
import numpy as np
import IPython

%matplotlib inline

# Parameters

In [None]:
# Length of one generated audio sample in ms (upper bound for clip placement)
SAMPLE_LEN = 10000
# Number of spectrogram time steps fed to the model
Tx = 5511
# Number of time steps in the label vector / model output
Ty = 1375
# Number of frequency bins of the spectrogram at each time step
n_freq = 101

# Samples

In [None]:
def load_raw_audio(path):
    """
    Load every WAV clip found in the "backgrounds", "positives" and "negatives"
    sub-directories of the given directory.
    :param path: the directory containing the three sub-directories
    :return: a tuple of lists (positives, negatives, backgrounds) of AudioSegment objects
    """
    def load_clips(subdir):
        # os.listdir() returns entries in arbitrary order; sorting keeps the clip
        # order (and therefore indices such as backgrounds[0]) stable between runs.
        clips_dir = os.path.join(path, subdir)
        return [
            AudioSegment.from_wav(os.path.join(clips_dir, filename))
            for filename in sorted(os.listdir(clips_dir))
            if filename.endswith(".wav")
        ]

    backgrounds = load_clips("backgrounds")
    positives = load_clips("positives")
    negatives = load_clips("negatives")

    return positives, negatives, backgrounds

In [None]:
# Load the raw audio clips (lists of pydub AudioSegment objects) from the data directory
positives, negatives, backgrounds = load_raw_audio(WWD_PATH)

In [None]:
# The number of background samples
BACKGROUNDS_NUM = len(backgrounds)
# The number of positive samples
POSITIVES_NUM = len(positives)
# The number of negative samples
NEGATIVES_NUM = len(negatives)

In [None]:
# len() of an audio clip is used as its duration in ms elsewhere in this notebook,
# so the lists printed here are the clip durations in ms
print(f"Background lengths (total {len(backgrounds)}): \n...{[len(b) for b in backgrounds]}")
print(f"Positive lengths (total {len(positives)}): \n...{[len(a) for a in positives]}")
print(f"Negative lengths (total {len(negatives)}): \n...{[len(n) for n in negatives]}")

# Spectrogram

* Audio recording is a long list of numbers measuring the little air pressure changes detected by the microphone
* An audio sample rate of 44100 Hz means 44100 pressure values are recorded per second
* Spectrogram tells how much different frequencies are present in an audio clip at any moment in time
* Spectrogram is computed by sliding a window over the raw audio signal and calculating the most active frequencies in each window

In [None]:
def get_wav_info(wav_file):
    """
    Read a WAV file.
    :param wav_file: the path of wav file
    :return: a tuple of (sample rate, sample data) as produced by wavfile.read
    """
    return wavfile.read(wav_file)

In [None]:
def match_target_amplitude(sound, target_dBFS):
    """
    Shift the loudness of a sound so that it reaches the requested level.
    :param sound: the audio segment to adjust (must expose dBFS and apply_gain)
    :param target_dBFS: the desired loudness in dBFS
    :return: the result of applying the required gain to the sound
    """
    return sound.apply_gain(target_dBFS - sound.dBFS)

In [None]:
def get_spectrogram(wav_file, nfft=200, fs=8000, noverlap=120):
    """
    Compute the spectrogram of a WAV file. Because the computation goes through
    plt.specgram, the spectrogram is also drawn on the current matplotlib axes.
    :param wav_file: the path of wav file
    :param nfft: the length of each window segment
    :param fs: the sampling frequency handed to specgram (the rate stored in the file is read but not used)
    :param noverlap: the overlap between window segments
    :return: the spectrogram values, the first item returned by plt.specgram
    """
    _, samples = get_wav_info(wav_file)
    # Two-dimensional data means several channels: keep only the first one
    if samples.ndim == 2:
        samples = samples[:, 0]
    # Only the spectrum is needed; frequencies, bins and the image are discarded
    spectrum = plt.specgram(samples, NFFT=nfft, Fs=fs, noverlap=noverlap)[0]
    return spectrum

In [None]:
# Example clip used below to check the spectrogram dimensions
EXAMPLE1 = os.path.join(WWD_PATH, "examples", "example_train.wav")

# Compute spectrogram of audio file (the call also draws it on the current figure)
Pxx = get_spectrogram(EXAMPLE1)

The colors in the spectrogram show the degree to which different frequencies are present (active) at each point in time
 * green - denotes more active frequencies
 * blue - denotes less active frequencies

In [None]:
# Sanity check: the example spectrogram must have the configured dimensions (n_freq, Tx)
assert Pxx.shape[0] == n_freq, \
    "Invalid number of frequencies"
assert Pxx.shape[1] == Tx, \
    "Invalid number of spectrogram time steps"

# Sampling

* Recording and labelling a large number of audio clips by hand is slow and inefficient
* It is easier to synthesize training clips by overlaying recorded positive and negative words on top of background noise
* To generate audio clip we need:
    * Pick a random background audio clip
    * Randomly insert 0-n audio clips of positive word
    * Randomly insert 0-m audio clips of negative words
* Generating audio clips provides easy way to generate labels $y^{\langle t \rangle}$

In [None]:
def get_random_time_segment(segment_ms, total_ms=None):
    """
    Gets a random time segment of duration segment_ms inside an audio clip.
    :param segment_ms: the duration of the segment in ms
    :param total_ms: the duration of the whole clip in ms; defaults to SAMPLE_LEN
    :return: a tuple of (segment_start, segment_end) in ms, both bounds inclusive
    :raises ValueError: if the segment is not shorter than the clip
    """
    if total_ms is None:
        total_ms = SAMPLE_LEN
    if segment_ms >= total_ms:
        # np.random.randint would fail here anyway ("low >= high"); say why instead
        raise ValueError(
            f"Segment of {segment_ms} ms does not fit into a clip of {total_ms} ms"
        )
    # randint excludes `high`, so the segment always ends before the end of the clip
    segment_start = np.random.randint(low=0, high=total_ms - segment_ms)
    segment_end = segment_start + segment_ms - 1
    return segment_start, segment_end

In [None]:
def is_overlapping(segment_time, previous_segments):
    """
    Tells whether a candidate segment collides with any segment placed earlier.
    :param segment_time: a tuple of (segment_start, segment_end) for the candidate segment
    :param previous_segments: a list of tuples of (segment_start, segment_end) already in use
    :return: True if the candidate shares at least one time point with an existing segment, False otherwise
    """
    new_start, new_end = segment_time
    # Two inclusive intervals intersect exactly when each starts no later than the other ends
    return any(
        new_start <= old_end and old_start <= new_end
        for old_start, old_end in previous_segments
    )

In [None]:
def insert_audio_clip(background, audio_clip, previous_segments, attempts=5):
    """
    Insert a new audio segment over the background noise at a random time step, ensuring that the
    audio segment does not overlap with existing segments.
    :param background: the background audio recording.
    :param audio_clip: the audio clip to be inserted/overlaid.
    :param previous_segments: times when audio segments have already been placed; the chosen
        segment is appended to this list when the clip is inserted
    :param attempts: the number of re-draws allowed after the first random position overlaps
    :return: a tuple of (updated background audio, segment_time). When no free position is found
        the background is returned unchanged and segment_time is (SAMPLE_LEN, SAMPLE_LEN).
    """
    segment_ms = len(audio_clip)

    # One initial draw plus at most `attempts` re-draws
    for _ in range(max(attempts, 0) + 1):
        segment_time = get_random_time_segment(segment_ms)
        if not is_overlapping(segment_time, previous_segments):
            previous_segments.append(segment_time)
            new_background = background.overlay(audio_clip, position=segment_time[0])
            return new_background, segment_time

    # No free slot: signal "not inserted" with a segment positioned at the very end of the sample
    return background, (SAMPLE_LEN, SAMPLE_LEN)

In [None]:
def insert_ones(y, segment_end_ms, count=50):
    """
    Mark a detection in the label vector y: the `count` output steps that come strictly after
    the end of the segment are set to 1. The step the segment ends on keeps its current value.
    :param y: the labels to modify, indexed as [0, step]
    :param segment_end_ms: the end of the segment in ms
    :param count: the number of labels to set 1
    :return: the modified labels (y is changed in place and returned)
    """
    _, n_steps = y.shape
    # Convert the end time from ms to an index into the label vector
    end_step = int(segment_end_ms * n_steps / float(SAMPLE_LEN))
    if end_step >= n_steps:
        # The segment ends at or beyond the last label step: nothing to mark
        return y
    first_one = end_step + 1
    y[0, first_one:first_one + count] = 1
    return y

In [None]:
def create_training_example(background, positives, negatives, max_positives=5, max_negatives=2):
    """
    Creates a training example by overlaying random positive and negative clips on a background.
    The resulting audio is also written to data/train.wav.
    :param background: a background audio recording
    :param positives: a list of audio segments of the positive (trigger) word
    :param negatives: a list of audio segments of words that are not the trigger word
    :param max_positives: the maximum number of positive clips to insert (inclusive)
    :param max_negatives: the maximum number of negative clips to insert (inclusive)
    :return:
    x -- the spectrogram of the training example
    y -- the label at each time step of the spectrogram
    """
    def pick_random_clips(clips, max_count):
        # Nothing to choose from: np.random.randint(0, ...) would raise on an empty list
        if not clips:
            return []
        # randint excludes its upper bound, hence the +1 so that max_count itself can be drawn
        how_many = np.random.randint(0, max_count + 1)
        chosen = np.random.randint(len(clips), size=how_many)
        return [clips[i] for i in chosen]

    # Make background quieter
    background = background - 20

    y = np.zeros((1, Ty))
    previous_segments = []

    # Select 0-max_positives random positive audio clips and label the steps after each one
    for positive in pick_random_clips(positives, max_positives):
        background, segment_time = insert_audio_clip(background, positive, previous_segments)
        _, segment_end = segment_time
        y = insert_ones(y, segment_end)

    # Select 0-max_negatives random negative audio clips; they do not change the labels
    for negative in pick_random_clips(negatives, max_negatives):
        background, _ = insert_audio_clip(background, negative, previous_segments)

    train_file = os.path.join("data", "train.wav")
    # The export below fails if the target directory is missing
    os.makedirs(os.path.dirname(train_file), exist_ok=True)
    background = match_target_amplitude(background, -20.0)
    # export() hands back the open output file; close it instead of leaking the handle
    background.export(train_file, format="wav").close()
    x = get_spectrogram(train_file)

    return x, y

In [None]:
# Generate one example from the first background (this also writes data/train.wav)
x, y = create_training_example(backgrounds[0], positives, negatives)

In [None]:
# Audio after overlaying positive and negative audio clips (the file written by the example above)
IPython.display.Audio(os.path.join("data", "train.wav"))

In [None]:
# Original example audio clip (examples/example_train.wav) for comparison
IPython.display.Audio(os.path.join(WWD_PATH, "examples", "example_train.wav"))

In [None]:
# The Y labels of the generated audio clip (one value per output time step)
plt.plot(y[0])

__Generating samples (optional)__

In [None]:
# The number of samples to generate
n_samples = 1000

X_train = []
Y_train = []
for i in range(n_samples):
    if i % 10 == 0:
        print(i)
    # Cycle through every loaded background instead of assuming there are exactly two
    x, y = create_training_example(backgrounds[i % len(backgrounds)], positives, negatives)
    # get_spectrogram() draws on the current figure; close it so that the
    # spectrogram images of all samples do not pile up in memory
    plt.close()
    # Spectrogram outputs (freqs, Tx) and we want (Tx, freqs) to input into the model
    X_train.append(x.swapaxes(0,1))
    Y_train.append(y.swapaxes(0,1))
X_train = np.array(X_train)
Y_train = np.array(Y_train)

In [None]:
# Save the data for further uses
train_dir = os.path.join(WWD_PATH, "XY_train")
# np.save does not create missing directories
os.makedirs(train_dir, exist_ok=True)
np.save(os.path.join(train_dir, "X_train.npy"), X_train)
np.save(os.path.join(train_dir, "Y_train.npy"), Y_train)

__Load previously generated samples__

In [None]:
# Training set (replaces any X_train / Y_train generated in the optional section above)
X_train = np.load(os.path.join(WWD_PATH, "XY_train", "X_train.npy"))
Y_train = np.load(os.path.join(WWD_PATH, "XY_train", "Y_train.npy"))

In [None]:
# Shapes of the loaded training arrays
print(f"X (training) shape: {X_train.shape}")
print(f"Y (training) shape: {Y_train.shape}")

In [None]:
# Validation (dev) set, used below to evaluate the trained model
X_dev = np.load(os.path.join(WWD_PATH, "XY_dev", "X_dev.npy"))
Y_dev = np.load(os.path.join(WWD_PATH, "XY_dev", "Y_dev.npy"))

In [None]:
# Shapes of the loaded dev arrays
print(f"X (dev) shape: {X_dev.shape}")
print(f"Y (dev) shape: {Y_dev.shape}")

# Model

In [None]:
def create_model(input_shape):
    """
    Build the sequential Keras network: one strided 1-D convolution, two GRU layers that
    return full sequences, and a sigmoid unit applied at every remaining time step.
    :param input_shape: shape of the model's input data (using Keras conventions)
    :return: Keras model instance (not compiled)
    """
    layers = tf.keras.layers

    # Convolutional front end
    conv_stage = [
        layers.Conv1D(filters=196,kernel_size=15,strides=4),
        layers.BatchNormalization(),
        layers.Activation(activation="relu"),
        layers.Dropout(rate=0.8),
    ]
    # Two recurrent stages, each followed by dropout and batch normalization
    first_gru_stage = [
        layers.GRU(units=128, return_sequences=True),
        layers.Dropout(rate=0.8),
        layers.BatchNormalization(),
    ]
    second_gru_stage = [
        layers.GRU(units=128, return_sequences=True),
        layers.Dropout(rate=0.8),
        layers.BatchNormalization(),
        layers.Dropout(rate=0.8),
    ]
    # One probability per time step
    output_stage = [
        layers.TimeDistributed(layers.Dense(1, activation="sigmoid")),
    ]

    stack = [tf.keras.Input(shape=input_shape)]
    stack += conv_stage + first_gru_stage + second_gru_stage + output_stage
    return tf.keras.Sequential(stack)

__Create model__

In [None]:
# Input is (Tx, n_freq). With kernel_size=15 and strides=4 (and no padding, presumably the
# Conv1D default), Tx = 5511 steps become (5511 - 15) // 4 + 1 = 1375 = Ty output steps
model = create_model(input_shape = (Tx, n_freq))

In [None]:
# Per-step binary classification: binary cross-entropy loss, Adam optimizer, accuracy metric
# NOTE(review): learning_rate=1e-6 is very small for a model trained from scratch - confirm it is intended
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-6, beta_1=0.9, beta_2=0.999),
    metrics=["accuracy"])

__Fit model__

In [None]:
# Fit the neural network on the training set; `history` keeps the per-epoch loss plotted below
history = model.fit(X_train, Y_train, batch_size=4, epochs=64)

In [None]:
# Per-epoch training loss recorded by model.fit
loss = history.history["loss"]
epochs = len(loss)

plt.plot(range(epochs), loss, 'r', label='Training loss')
plt.title('Training loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(loc=0)
plt.show()

In [None]:
# Evaluate on the dev set: the loss comes first, then the accuracy metric set at compile time
loss, acc = model.evaluate(X_dev, Y_dev)
print("Dev set accuracy = ", acc)

__Predicting__

In [None]:
# Chime sound that chime_on_activate() overlays on the audio when it detects the trigger word
chime_file = os.path.join(WWD_PATH, "examples", "chime.wav")

In [None]:
def detect_triggerword(model, filename):
    """
    Run the model on an audio file and plot its spectrogram above the predicted probabilities.
    A loudness-corrected copy of the file is written to data/temp.wav.
    :param model: the trained Keras model
    :param filename: the path of the wav file to analyse
    :return: the model predictions, indexed as [0, time step, 0]
    """
    plt.subplot(2, 1, 1)

    # Correct the amplitude of the input file before prediction
    audio_clip = AudioSegment.from_wav(filename)
    audio_clip = match_target_amplitude(audio_clip, -20.0)
    temp_file = os.path.join("data", "temp.wav")
    # The export below fails if the target directory is missing
    os.makedirs(os.path.dirname(temp_file), exist_ok=True)
    # export() hands back the open output file; close it instead of leaking the handle
    audio_clip.export(temp_file, format="wav").close()

    # get_spectrogram() also draws the spectrogram into the upper subplot selected above
    x = get_spectrogram(temp_file)
    # Spectrogram outputs (freqs, Tx) and we want (Tx, freqs) to input into the model
    x = x.swapaxes(0,1)
    # Add the batch dimension expected by model.predict
    x = np.expand_dims(x, axis=0)
    predictions = model.predict(x)

    plt.subplot(2, 1, 2)
    plt.plot(predictions[0,:,0])
    plt.ylabel('probability')
    plt.show()
    return predictions

In [None]:
def chime_on_activate(filename, predictions, threshold):
    """
    Overlay the chime sound on an audio file wherever the predictions stay at or above the
    threshold for long enough, and save the result to "chime_output.wav".
    :param filename: the path of the wav file the predictions were computed for
    :param predictions: the model output, indexed as [0, time step, 0]
    :param threshold: the probability below which a time step resets the detection counter
    """
    # More than this many counted steps in a row triggers a chime
    min_consecutive_steps = 20
    # After a chime, scanning resumes at the next multiple of this many steps
    skip_block_steps = 75

    audio_clip = AudioSegment.from_wav(filename)
    chime = AudioSegment.from_wav(chime_file)
    # Scan exactly as many steps as there are predictions
    n_steps = predictions.shape[1]

    consecutive_timesteps = 0
    i = 0
    while i < n_steps:
        consecutive_timesteps += 1
        if consecutive_timesteps > min_consecutive_steps:
            # Map the step index to a position in ms within the clip
            audio_clip = audio_clip.overlay(chime, position=((i / n_steps) * audio_clip.duration_seconds) * 1000)
            consecutive_timesteps = 0
            i = skip_block_steps * (i // skip_block_steps + 1)
            continue
        if predictions[0, i, 0] < threshold:
            consecutive_timesteps = 0
        i += 1

    # export() hands back the open output file; close it instead of leaking the handle
    audio_clip.export("chime_output.wav", format='wav').close()

In [None]:
# Run detection on one dev clip, add the chime where the trigger word was detected,
# and play back the result written by chime_on_activate()
FILE1 = os.path.join(WWD_PATH, "dev", "2.wav")
prediction = detect_triggerword(model, FILE1)
chime_on_activate(FILE1, prediction, 0.5)
IPython.display.Audio("./chime_output.wav")