In [1]:
# Only needed when the notebook is opened in Colab.
# Brings the project files into the current working directory: create an empty
# git repository here, register the GitHub project as remote `origin`, then
# pull its `speech-commands` branch.
# NOTE(review): `git remote add origin` presumably fails if this cell is run a
# second time in the same directory (remote already exists) — confirm before re-running.
!git init .
!git remote add origin https://github.com/coyote-bergstein/compression-using-neural-networks.git
!git pull origin speech-commands

[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/.git/
remote: Enumerating objects: 41, done.[K
remote: Counting objects: 100% (41/41), done.[K
remote: Compressing objects: 100% (33/33), done.[K
remote: Total 41 (delta 14), reused 23 (delta 6), pack-reused 0[K
Unpacking objects: 100% (41/41), 4.96 MiB | 3.51 MiB/s, done.
From https://github.com/coyote-bergstein/compression-using-neural-networks
 * branch            speech-commands -> FETCH_HEAD
 * [new branch]      speech-commands -> origin/speech-commands


In [2]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [3]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import librosa

In [4]:
# Project-local helpers. The explicit reloads only matter in a long-lived
# session (e.g. Colab) where the files under `source/` changed after the first
# import; on a fresh kernel they are harmless no-ops.
import importlib
import source
import source.constants
import source.utils

# Reload BEFORE the `from ... import` lines: a from-import copies the module's
# current objects into this namespace, so reloading afterwards would leave the
# notebook holding the stale, pre-reload functions and constants.
# `source.constants` is reloaded first, as in the original cell order
# (presumably `source.utils` reads values from it — confirm).
importlib.reload(source.constants)
importlib.reload(source.utils)

from source.utils import generate_spectrogram, reconstruct_audio
from source.constants import HOP_LENGTH, N_FFT, AUDIO_SAMPLE_RATE

<module 'source.utils' from '/content/source/utils.py'>

## Dataset

In [5]:
# Training split of the TFDS `speech_commands` dataset; the first call downloads
# and prepares it (see the log output of this cell).
ds = tfds.load(name='speech_commands', split='train')


Downloading and preparing dataset 2.37 GiB (download: 2.37 GiB, generated: 8.17 GiB, total: 10.53 GiB) to /root/tensorflow_datasets/speech_commands/0.0.3...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/85511 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/speech_commands/0.0.3.incomplete5K5XXT/speech_commands-train.tfrecord*...:…

Generating validation examples...:   0%|          | 0/10102 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/speech_commands/0.0.3.incomplete5K5XXT/speech_commands-validation.tfrecord…

Generating test examples...:   0%|          | 0/4890 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/speech_commands/0.0.3.incomplete5K5XXT/speech_commands-test.tfrecord*...: …



Dataset speech_commands downloaded and prepared to /root/tensorflow_datasets/speech_commands/0.0.3. Subsequent calls will reuse this data.


## Audio processing testing

In [13]:
import IPython.display as ipd
from source.utils import generate_spectrogram, reconstruct_audio
from source.constants import HOP_LENGTH, N_FFT, AUDIO_SAMPLE_RATE, TRACK_DURATION


# Peak-normalise a few clips so each can be played back next to its reconstruction.
signals = []

for example in ds.take(5):
  audio_np = example['audio'].numpy()
  # Divide by the largest *absolute* sample: the plain max() ignores a dominant
  # negative peak, and a silent (all-zero) clip would otherwise be divided by
  # zero and turn into NaNs. Silent or empty clips are left unscaled.
  peak = np.abs(audio_np).max() if audio_np.size else 0
  audio_np = audio_np / (peak if peak > 0 else 1)
  signals.append(audio_np)


# Round trip: signal -> spectrogram -> reconstructed signal, with both versions
# rendered as audio widgets for a listening comparison.
for signal in signals:
  S = generate_spectrogram(signal, HOP_LENGTH, N_FFT)
  print(S.shape)
  reconstructed_signal = reconstruct_audio(S, HOP_LENGTH, N_FFT)

  print("Original Audio:")
  ipd.display(ipd.Audio(signal, rate=AUDIO_SAMPLE_RATE))

  print("Reconstructed Audio:")
  ipd.display(ipd.Audio(reconstructed_signal, rate=AUDIO_SAMPLE_RATE))

print(N_FFT, HOP_LENGTH)

(128, 126)
Original Audio:


Reconstructed Audio:


(128, 126)
Original Audio:


Reconstructed Audio:


(128, 126)
Original Audio:


Reconstructed Audio:


(128, 126)
Original Audio:


Reconstructed Audio:


(128, 126)
Original Audio:


Reconstructed Audio:


512 128


## Define model architecture
### TODO model architecture must be changed

In [9]:
# First axis (height) of the model's `input_shape` defined further down.
# Presumably the number of mel bands produced by `generate_spectrogram` — confirm
# against `source.utils`.
n_mels = 128

In [67]:
def pad_vectors(audio_np):
  """Force a clip to exactly ``AUDIO_SAMPLE_RATE * TRACK_DURATION`` samples.

  Inputs shorter than that get zeros appended at the end (``np.pad`` with its
  default mode); inputs of that length or longer are cut to their leading samples.
  """
  n_wanted = AUDIO_SAMPLE_RATE * TRACK_DURATION
  n_missing = n_wanted - audio_np.shape[0]
  if n_missing > 0:
    return np.pad(audio_np, (0, n_missing))
  return audio_np[:n_wanted]

def preprocess(example):
    """Turn the raw samples of one clip into a float32 spectrogram with a channel axis.

    Args:
        example: object exposing ``.numpy()`` that yields the clip's samples
            (an eager tensor when called through ``tf.py_function``).

    Returns:
        ``np.float32`` array: the result of ``generate_spectrogram`` on the
        peak-normalised, length-fixed clip, with a trailing axis of size 1 added.
    """
    audio = example.numpy()

    # Peak-normalise by the largest *absolute* sample. The plain max() ignores a
    # dominant negative peak, divides by zero on a silent (all-zero) clip — which
    # fills the spectrogram with NaN/inf — and raises on an empty clip.
    # Silent or empty clips are only converted to float, not scaled.
    peak = np.abs(audio).max() if audio.size else 0
    audio = audio / (peak if peak > 0 else 1)
    audio = pad_vectors(audio)

    S = generate_spectrogram(audio, HOP_LENGTH, N_FFT)
    S = np.expand_dims(S, axis=-1)  # add the channel axis used by the conv layers
    return S.astype(np.float32)

def preprocess_with_py_function(example):
    """``tf.data`` map function that runs the NumPy-based ``preprocess`` on one element.

    Args:
        example: dataset element (mapping) whose ``'audio'`` entry holds the clip.

    Returns:
        One-element list containing the float32 spectrogram tensor, i.e. the
        same structure as before; only static shape information is added.
    """
    outputs = tf.py_function(preprocess, [example['audio']], [tf.float32])
    # tf.py_function outputs carry no static shape. `Tout` is a list here, so the
    # result is a list and the shape has to be set on its single element.
    # `preprocess` appends a size-1 axis to the spectrogram, which is assumed to
    # be 2-D (the audio test cell prints shapes like (128, 126)) — confirm.
    outputs[0].set_shape([None, None, 1])
    return outputs

In [68]:
# Training pipeline for the autoencoder: spectrogram per clip, duplicated into an
# (input, target) pair, cached after the first pass, batched by 32 and prefetched.
ds_train = (
    ds.map(preprocess_with_py_function, num_parallel_calls=tf.data.AUTOTUNE)
    .map(lambda spectrogram: (spectrogram, spectrogram))
    .cache()
    .batch(32)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

In [69]:
from tensorflow.keras import layers, models
from tensorflow.keras import backend

# Convolutional autoencoder pieces: an encoder that squeezes a spectrogram into a
# small latent vector, and a decoder that expands it back to the input shape.
# 126 matches the frame count printed by the audio test cell (shape (128, 126)).
input_shape = (n_mels, 126, 1)
latent_dim = 20  # bottleneck size, shared by the encoder output and decoder input

# Encoder
encoder_input = layers.Input(shape=input_shape)
x = layers.Conv2D(16, (3, 3), activation='relu', padding='same')(encoder_input)
x = layers.Conv2D(8, (3, 3), activation='relu', padding='same')(x)
x = layers.Conv2D(8, (3, 3), activation='relu', padding='same')(x)
# Remember the feature-map shape (without the batch axis) so the decoder can undo the Flatten.
shape_before_dense = backend.int_shape(x)[1:]
x = layers.Flatten()(x)
encoded = layers.Dense(latent_dim)(x)

encoder = tf.keras.Model(encoder_input, encoded, name="encoder")

# Decoder
# `shape` must be a tuple; a bare int is not a valid shape argument.
decoder_input = layers.Input(shape=(latent_dim,), name="decoder_input")
# np.prod returns a NumPy integer; Dense units are passed as a plain Python int.
x = layers.Dense(int(np.prod(shape_before_dense)), name="decoder_dense")(decoder_input)
x = layers.Reshape(target_shape=shape_before_dense)(x)
x = layers.Conv2DTranspose(8, (3, 3), activation='relu', padding='same')(x)
x = layers.Conv2DTranspose(8, (3, 3), activation='relu', padding='same')(x)
x = layers.Conv2DTranspose(16, (3, 3), activation='relu', padding='same')(x)
# NOTE(review): the ReLU output can only produce non-negative values — confirm the
# spectrograms from `generate_spectrogram` are non-negative (e.g. not in dB).
decoded = layers.Conv2DTranspose(1, (3, 3), activation='relu', padding='same')(x)

decoder = tf.keras.Model(decoder_input, decoded, name="decoder")

In [70]:
from source.constants import OPTIMIZER, LOSS

# End-to-end model: encoder followed by decoder, wired from the encoder's own input.
latent = encoder(encoder_input)
reconstruction = decoder(latent)
autoencoder = tf.keras.Model(inputs=encoder_input, outputs=reconstruction, name="autoencoder")
autoencoder.compile(loss=LOSS, optimizer=OPTIMIZER)
autoencoder.summary()

Model: "autoencoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_17 (InputLayer)       [(None, 128, 126, 1)]     0         
                                                                 
 encoder (Functional)        (None, 20)                2582404   
                                                                 
 decoder (Functional)        (None, 128, 126, 1)       2711985   
                                                                 
Total params: 5294389 (20.20 MB)
Trainable params: 5294389 (20.20 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Train on the (input, target) spectrogram pairs; `history` keeps the object
# returned by `fit` for later inspection.
history = autoencoder.fit(
    x=ds_train,
    epochs=20,
    verbose=1,
)

Epoch 1/20