In [2]:
# only when open in colab
!git init .
!git remote add origin https://github.com/coyote-bergstein/compression-using-neural-networks.git
!git pull origin speech-commands-prev

Reinitialized existing Git repository in /content/.git/
error: remote origin already exists.
From https://github.com/coyote-bergstein/compression-using-neural-networks
 * branch            speech-commands-prev -> FETCH_HEAD
Already up to date.


In [3]:
!pip install pydub

!pip install numba==0.59.1 # fixes compability issue



In [4]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import librosa

In [13]:
# only when open in colab
import importlib
import source
import source.constants
import source.utils

# Reload BEFORE binding names with `from ... import`: reloading a module does
# not rebind objects that were already imported from it, so importing first
# would leave the names below pointing at the stale, pre-reload objects.
# `source.constants` is reloaded first, same order as before.
importlib.reload(source.constants)
importlib.reload(source.utils)

from source.utils import generate_spectrogram, reconstruct_audio
from source.constants import HOP_LENGTH, N_FFT, AUDIO_SAMPLE_RATE

<module 'source.utils' from '/content/source/utils.py'>

## Dataset

In [6]:
# Training split of the `speech_commands` dataset; every later cell reads from `ds`.
ds = tfds.load(name='speech_commands', split='train')


## Audio processing testing

In [14]:
import IPython.display as ipd
from source.utils import generate_spectrogram, reconstruct_audio
from source.constants import HOP_LENGTH, N_FFT, AUDIO_SAMPLE_RATE, TRACK_DURATION


# Collect a handful of peak-normalised clips to listen to.
signals = []

for example in ds.take(5):
  # Convert to float first so taking the absolute value cannot overflow in
  # case the clip is stored with an integer dtype.
  audio_np = example['audio'].numpy().astype(np.float64)
  # Normalise by the largest magnitude. Dividing by `max()` alone yields
  # NaN/inf for an all-zero clip and is not the true peak when the negative
  # excursion is larger than the positive one.
  peak = np.abs(audio_np).max() if audio_np.size else 0.0
  if peak > 0:
    audio_np = audio_np / peak
  signals.append(audio_np)


# Round-trip every clip through the spectrogram and play both versions.
for signal in signals:
  S = generate_spectrogram(signal, HOP_LENGTH, N_FFT)
  print(S.shape)
  reconstructed_signal = reconstruct_audio(S, HOP_LENGTH, N_FFT)

  print("Original Audio:")
  ipd.display(ipd.Audio(signal, rate=AUDIO_SAMPLE_RATE))

  print("Reconstructed Audio:")
  ipd.display(ipd.Audio(reconstructed_signal, rate=AUDIO_SAMPLE_RATE))

print(N_FFT, HOP_LENGTH)

(128, 126)
Original Audio:


Reconstructed Audio:


(128, 126)
Original Audio:


Reconstructed Audio:


(128, 126)
Original Audio:


Reconstructed Audio:


(128, 126)
Original Audio:


Reconstructed Audio:


(128, 126)
Original Audio:


Reconstructed Audio:


512 128


## Define model architecture
### TODO: the model architecture must be changed

In [15]:
# Size of the first spectrogram axis; the "Previous network" cell uses it as
# the first entry of its `input_shape`.
# NOTE(review): presumably the number of mel bands - confirm against source.utils.
n_mels = 128

In [16]:
def pad_vectors(audio_np):
  """Force a clip to exactly ``AUDIO_SAMPLE_RATE * TRACK_DURATION`` samples.

  Clips that are too short are padded at the end (``np.pad`` default, i.e.
  zeros); clips that are long enough are truncated to the target length.

  Args:
    audio_np: NumPy array holding the samples of one clip.

  Returns:
    A NumPy array whose first axis has the target length.
  """
  # The product is a float when TRACK_DURATION is fractional (e.g. 0.5 s);
  # np.pad and slicing both require an integer sample count.
  target_length = int(round(AUDIO_SAMPLE_RATE * TRACK_DURATION))
  missing = target_length - audio_np.shape[0]
  if missing > 0:
    return np.pad(audio_np, (0, missing))
  return audio_np[:target_length]

def preprocess(example):
    """Turn one raw audio tensor into a float32 spectrogram with a channel axis.

    Steps: peak-normalise, pad/truncate with ``pad_vectors``, compute the
    spectrogram with ``generate_spectrogram`` and append a trailing axis.

    Args:
        example: Eager tensor (anything exposing ``.numpy()``) with the samples
            of one clip.

    Returns:
        ``np.float32`` array: the spectrogram with one extra trailing axis.
    """
    # Convert to float first so the absolute value cannot overflow in case the
    # clip is stored with an integer dtype.
    audio = example.numpy().astype(np.float64)

    # Normalise by the largest magnitude. Dividing by `max()` alone produces
    # NaN/inf for an all-zero clip (which would then poison the training loss)
    # and is not the true peak when the negative excursion dominates.
    peak = np.abs(audio).max() if audio.size else 0.0
    if peak > 0:
        audio = audio / peak

    audio = pad_vectors(audio)

    S = generate_spectrogram(audio, HOP_LENGTH, N_FFT)
    S = np.expand_dims(S, axis=-1)
    return S.astype(np.float32)

def preprocess_with_py_function(example):
    """Run ``preprocess`` on the ``'audio'`` entry of a dataset element.

    The NumPy-based ``preprocess`` is wrapped in ``tf.py_function`` so it can
    be used inside ``Dataset.map``.

    Args:
        example: Mapping with an ``'audio'`` entry.

    Returns:
        Whatever ``tf.py_function`` returns for ``Tout=[tf.float32]``. Because
        ``Tout`` is passed as a list, this is a one-element list rather than a
        bare tensor; no static shape is set on it here.
    """
    spectrogram = tf.py_function(
        func=preprocess,
        inp=[example['audio']],
        Tout=[tf.float32],
    )
    return spectrogram

In [17]:
# Training pipeline: spectrogram preprocessing, (input, target) pairs with the
# target equal to the input (autoencoder), then cache, batches of 32, prefetch.
ds_train = (
    ds.map(preprocess_with_py_function, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .map(lambda spectrogram: (spectrogram, spectrogram))
    .cache()
    .batch(32)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

### Current network

In [23]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Define the Encoder
def build_encoder(input_shape):
    """Build the convolutional encoder.

    Three Conv2D(3x3, relu, same) + MaxPooling2D(2x2, same) stages with 128,
    64 and 32 filters, followed by Flatten and a Dense(128, relu) bottleneck.

    Args:
        input_shape: Shape of one input sample, passed to ``layers.Input``.

    Returns:
        A Keras model named ``'encoder'`` mapping the input to the 128-unit code.
    """
    encoder_input = layers.Input(shape=input_shape, name='encoder_input')

    features = encoder_input
    for filters in (128, 64, 32):
        features = layers.Conv2D(filters, (3, 3), activation='relu', padding='same')(features)
        features = layers.MaxPooling2D((2, 2), padding='same')(features)

    flattened = layers.Flatten()(features)
    latent = layers.Dense(128, activation='relu')(flattened)

    return models.Model(encoder_input, latent, name='encoder')

# Define the Decoder
def build_decoder(encoded_shape, output_shape=(128, 126, 1)):
    """Build the decoder that maps a latent vector back to a spectrogram.

    The latent vector is projected to a small feature map, passed through
    three Conv2DTranspose(3x3, relu, same) + UpSampling2D(2x2) stages with 32,
    64 and 128 filters, cropped to the requested size and reduced to the
    requested channel count by a final linear Conv2DTranspose.

    Args:
        encoded_shape: Length of the latent vector.
        output_shape: ``(height, width, channels)`` the decoder must produce.
            The default reproduces the previously hard-coded geometry
            (16x16 seed, crop of 2 columns, 1 channel).

    Returns:
        A Keras model named ``'decoder'``.
    """
    height, width, channels = output_shape

    # Three 2x upsamplings enlarge the seed by a factor of 8, so the seed must
    # be ceil(dim / 8); the overshoot is cropped away afterwards.
    upscale = 2 ** 3
    seed_height = -(-height // upscale)
    seed_width = -(-width // upscale)

    decoder_input = layers.Input(shape=(encoded_shape,), name='decoder_input')

    x = layers.Dense(seed_height * seed_width * 32, activation='relu')(decoder_input)
    x = layers.Reshape((seed_height, seed_width, 32))(x)
    x = layers.Conv2DTranspose(32, (3, 3), activation='relu', padding='same')(x)
    x = layers.UpSampling2D((2, 2))(x)
    x = layers.Conv2DTranspose(64, (3, 3), activation='relu', padding='same')(x)
    x = layers.UpSampling2D((2, 2))(x)
    x = layers.Conv2DTranspose(128, (3, 3), activation='relu', padding='same')(x)
    x = layers.UpSampling2D((2, 2))(x)

    # Add a Cropping layer to match the exact input shape
    crop_rows = seed_height * upscale - height
    crop_cols = seed_width * upscale - width
    x = layers.Cropping2D(cropping=((0, crop_rows), (0, crop_cols)))(x)

    decoder_output = layers.Conv2DTranspose(channels, (3, 3), activation='linear', padding='same')(x)

    return models.Model(decoder_input, decoder_output, name='decoder')

# Define the Autoencoder
def build_autoencoder(input_shape):
    """Chain encoder and decoder into one model named ``'autoencoder'``.

    Args:
        input_shape: ``(height, width, channels)`` of one spectrogram. The
            decoder is sized from this shape so the reconstruction has the
            same shape as the input.

    Returns:
        A Keras model mapping an input spectrogram to its reconstruction.
    """
    encoder = build_encoder(input_shape)
    # Pass the input shape on so the decoder output matches the model input
    # instead of being fixed to (128, 126, 1).
    decoder = build_decoder(encoder.output_shape[1], input_shape)

    autoencoder_input = layers.Input(shape=input_shape, name='autoencoder_input')
    encoded = encoder(autoencoder_input)
    decoded = decoder(encoded)

    return models.Model(autoencoder_input, decoded, name='autoencoder')

# Spectrogram dimensions plus a trailing channel axis; matches the (128, 126)
# shapes printed by the audio processing test.
input_shape = (128, 126, 1)

# Assemble the model and train it with Adam on mean squared error
autoencoder = build_autoencoder(input_shape)
autoencoder.compile(loss='mse', optimizer='adam')

# Print the layer / parameter overview
autoencoder.summary()

Model: "autoencoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 autoencoder_input (InputLa  [(None, 128, 126, 1)]     0         
 yer)                                                            
                                                                 
 encoder (Functional)        (None, 128)               1142240   
                                                                 
 decoder (Functional)        (None, 128, 126, 1)       1159521   
                                                                 
Total params: 2301761 (8.78 MB)
Trainable params: 2301761 (8.78 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### Previous network

In [None]:
from tensorflow.keras import layers, models
from tensorflow.keras import backend

input_shape = (n_mels, 126, 1)
# Length of the bottleneck vector; shared by the encoder output and the
# decoder input so the two cannot drift apart.
latent_dim = 20

# Encoder: three same-padded 3x3 convolutions (no pooling), then a dense bottleneck.
encoder_input = layers.Input(shape=input_shape)
x = layers.Conv2D(16, (3, 3), activation='relu', padding='same')(encoder_input)
x = layers.Conv2D(8, (3, 3), activation='relu', padding='same')(x)
x = layers.Conv2D(8, (3, 3), activation='relu', padding='same')(x)
# Remember the feature-map shape (without the batch axis) so the decoder can
# rebuild it from the flat latent vector.
shape_before_dense = backend.int_shape(x)[1:]
x = layers.Flatten()(x)
encoded = layers.Dense(latent_dim)(x)

encoder = tf.keras.Model(encoder_input, encoded, name="encoder")

# Decoder: mirror of the encoder.
# `shape` is documented as a tuple, so pass (latent_dim,) rather than a bare int.
decoder_input = layers.Input(shape=(latent_dim,), name="decoder_input")
# np.prod returns a NumPy integer; hand Dense a plain int for its `units` argument.
x = layers.Dense(int(np.prod(shape_before_dense)), name="decoder_dense")(decoder_input)
x = layers.Reshape(target_shape=shape_before_dense)(x)
x = layers.Conv2DTranspose(8, (3, 3), activation='relu', padding='same')(x)
x = layers.Conv2DTranspose(8, (3, 3), activation='relu', padding='same')(x)
x = layers.Conv2DTranspose(16, (3, 3), activation='relu', padding='same')(x)
decoded = layers.Conv2DTranspose(1, (3, 3), activation='relu', padding='same')(x)

decoder = tf.keras.Model(decoder_input, decoded, name="decoder")

In [None]:
from source.constants import OPTIMIZER, LOSS

# Chain the previous encoder and decoder end to end on the encoder's input tensor.
latent_code = encoder(encoder_input)
reconstruction = decoder(latent_code)
autoencoder = tf.keras.Model(inputs=encoder_input, outputs=reconstruction, name="autoencoder")

# Optimizer and loss come from the project constants.
autoencoder.compile(loss=LOSS, optimizer=OPTIMIZER)
autoencoder.summary()

Model: "autoencoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_17 (InputLayer)       [(None, 128, 126, 1)]     0         
                                                                 
 encoder (Functional)        (None, 20)                2582404   
                                                                 
 decoder (Functional)        (None, 128, 126, 1)       2711985   
                                                                 
Total params: 5294389 (20.20 MB)
Trainable params: 5294389 (20.20 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### Training

In [24]:
# Fit whichever `autoencoder` was built last on the (input, input) pairs for 5 epochs.
history = autoencoder.fit(x=ds_train, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [25]:
# Persist the trained model next to the notebook.
# NOTE(review): the recorded output shows a warning raised from
# `saving_api.save_model` - presumably the legacy-HDF5 notice; confirm and
# consider the native `.keras` format.
autoencoder.save(filepath='autoencoder_model.h5')

  saving_api.save_model(


In [27]:
# Unbatched spectrograms (no target pairing) for the reconstruction demo below.
ds_predict = ds.map(
    map_func=preprocess_with_py_function,
    num_parallel_calls=tf.data.experimental.AUTOTUNE,
)

In [48]:
# Test the autoencoder reconstruction
for example in ds_predict.take(5):
    # Baseline: invert the spectrogram without passing it through the model.
    # (The label used to be printed here as well as before the player, which
    # duplicated "Original Audio:" in the output.)
    reconstructed_signal_uncompressed = reconstruct_audio(np.squeeze(example[0]), HOP_LENGTH, N_FFT)

    # Use autoencoder to compress and decompress the audio
    S_compressed = autoencoder.predict(np.array(example))
    reconstructed_signal_compressed = reconstruct_audio(np.squeeze(S_compressed), HOP_LENGTH, N_FFT)

    print("Original Audio:")
    ipd.display(ipd.Audio(reconstructed_signal_uncompressed, rate=AUDIO_SAMPLE_RATE))

    print("Reconstructed Audio:")
    ipd.display(ipd.Audio(reconstructed_signal_compressed, rate=AUDIO_SAMPLE_RATE))


Original Audio:
Original Audio:


Reconstructed Audio:


Original Audio:
Original Audio:


Reconstructed Audio:


Original Audio:
Original Audio:


Reconstructed Audio:


Original Audio:
Original Audio:


Reconstructed Audio:


Original Audio:
Original Audio:


Reconstructed Audio:
