In [None]:
import pickle

# import_ipynb must stay ahead of the audio_model imports below
# (presumably it is what makes the project's .ipynb modules importable).
import import_ipynb
import IPython.display as ipd
import librosa
import lws
import numpy as np

import audio_model.preprocess as preprocess
from audio_model.autoencoder import VAE

In [None]:
# Audio settings -- keep these in sync with the values used in preprocess.ipynb.
FILE_PATH = "random_audio/sugar.wav"  # clip that will be encoded
SAMPLE_RATE = 22050                   # passed to the Loader; samples per second
DURATION = 1.48                       # clip length; SAMPLE_RATE * DURATION gives the sample count
MONO = True                           # passed to the Loader
FRAME_SIZE = 512                      # passed to the log-spectrogram extractor
HOP_LENGTH = 256                      # passed to the log-spectrogram extractor

In [None]:
# Load the audio clip at FILE_PATH through the project Loader, which is
# configured with SAMPLE_RATE, DURATION and MONO.
# NOTE(review): presumably the Loader resamples/truncates to these settings --
# confirm against preprocess.ipynb.
loader = preprocess.Loader(SAMPLE_RATE, DURATION, MONO)
signal = loader.load(FILE_PATH)

In [None]:
# Right-pad the clip with zeros when it is shorter than the expected
# SAMPLE_RATE * DURATION samples; longer or exact-length clips are left as-is.
num_expected_samples = int(SAMPLE_RATE * DURATION)
if num_expected_samples > len(signal):
    num_missing_samples = num_expected_samples - len(signal)
    signal = np.pad(
        signal,
        pad_width=(0, num_missing_samples),
        mode="constant",
    )

In [None]:
# Extract the log spectrogram of the (padded) signal with the project
# extractor, configured with FRAME_SIZE and HOP_LENGTH.
log_spectrogram_extractor = preprocess.LogSpectrogramExtractor(FRAME_SIZE, HOP_LENGTH)
log_spectrogram = log_spectrogram_extractor.extract(signal)

In [None]:
# Read the dataset-wide min/max values used to normalise the spectrogram.
# NOTE: pickle can execute arbitrary code on load -- only use a file you trust.
with open("audio_model/minmax/min_max_values.pkl", "rb") as min_max_file:
    min_max = pickle.load(min_max_file)

In [None]:
# Min-max normalise the log spectrogram into [MIN, MAX] using the dataset
# statistics, then add a leading and a trailing axis of length 1.
MIN, MAX = 0, 1
normalized_array = (log_spectrogram - min_max['min']) / (min_max['max'] - min_max['min'])
normalized_array = MIN + normalized_array * (MAX - MIN)
# one indexing step adds both new axes: (...) -> (1, ..., 1)
normalized_array = normalized_array[np.newaxis, ..., np.newaxis]

In [None]:
# Load the saved VAE from "audio_model/model" and run the normalised
# spectrogram through it to get the reconstructed spectrogram and the
# latent representation.
vae = VAE.load("audio_model/model")
generated_spectrogram, audio_representation = vae.reconstruct(normalized_array)

In [None]:
# Save the latent representation for the image model; the image part of this
# notebook reads it back from 'audio_representation.npy'.
np.save('audio_representation',audio_representation)

### Listen to reconstructed audio from the generated spectrogram

In [None]:
# Convert the generated spectrogram back into an audio signal.
#
# The intermediate is named `generated_log_spectrogram` (not `log_spectrogram`)
# so the input spectrogram extracted from the clip is not overwritten; the
# normalisation cell can then be re-run safely after this one.

# take the first item of the batch and its first channel -> 2-D array
generated_log_spectrogram = generated_spectrogram[0][:, :, 0]

# undo the [MIN, MAX] min-max normalisation using the dataset statistics
denormalized_log_spectrogram = (generated_log_spectrogram - MIN) / (MAX - MIN)
denormalized_log_spectrogram = (
    denormalized_log_spectrogram * (min_max["max"] - min_max["min"]) + min_max["min"]
)

# log spectrogram -> amplitude spectrogram
spectrogram = librosa.db_to_amplitude(denormalized_log_spectrogram)

# append one zero row and one zero column so the frequency axis reaches the
# 257 bins lws expects for a 512-sample frame
# NOTE(review): lws documents its input as (frames, fft_size/2 + 1); confirm
# the axis order of `spectrogram` matches, or whether a transpose is needed.
spectrogram_padded = np.pad(spectrogram, ((0, 1), (0, 1)))

# apply lws phase reconstruction, with the same STFT settings that were used
# for extraction instead of repeating the literals 512 / 256
lws_processor = lws.lws(FRAME_SIZE, HOP_LENGTH, mode="speech")
spectrogram_with_phase = lws_processor.run_lws(spectrogram_padded)

# inverse STFT -> time-domain signal
reconstructed_signal = lws_processor.istft(spectrogram_with_phase)

In [None]:
# Render an inline audio player for the reconstructed signal.
# `ipd` (IPython.display) is imported in the notebook's top import cell so all
# dependencies of this part are declared in one place.
ipd.Audio(reconstructed_signal, rate=SAMPLE_RATE)

### Generate an image

Restart the kernel before running this part.

In [None]:
import tensorflow as tf
import numpy as np
from PIL import Image

In [None]:
# Load the saved image generator from 'image_model/generator_model'.
generator = tf.saved_model.load('image_model/generator_model')

In [None]:
# Read back the latent representation written to disk by the audio part.
audio_representation = np.load('audio_representation.npy')

In [None]:
# Feed the latent representation to the generator.
# NOTE(review): the result is indexed with [0] and converted with .numpy()
# below, so it is presumably a batch of image tensors -- confirm.
generated_image = generator(audio_representation)

In [None]:
# Scale the generator output by 255 ahead of the 8-bit conversion.
# NOTE: this rebinds `generated_image`, so running the cell twice scales twice;
# re-run the generator cell first if this cell needs to be repeated.
# NOTE(review): presumably the generator emits values in [0, 1] -- confirm.
generated_image = generated_image * 255

In [None]:
# Convert the first generated image to 8-bit pixels and write it to disk.
# Values are clipped to [0, 255] before the cast: converting out-of-range
# floats straight to uint8 does not saturate, so any overshoot from the
# generator would otherwise end up as corrupted pixel values.
image_array = np.clip(generated_image[0].numpy(), 0, 255).astype(np.uint8)
Image.fromarray(image_array).save('generated_image.png')