In [None]:
import import_ipynb
import pickle
import lws
import numpy as np
import librosa
import audio_model.preprocess as preprocess
from audio_model.autoencoder import VAE

In [None]:
# Settings for this run. Keep them identical to the values used in
# preprocess.ipynb.
file_path = "random_audio/sugar.wav"  # audio clip to encode
mono = True
sample_rate = 22050
duration = 1.48  # multiplied by sample_rate later to get the expected sample count
frame_size, hop_length = 512, 256  # handed to the log-spectrogram extractor

In [None]:
# Load the audio clip at `file_path` through the project's Loader, which is
# constructed with the configured sample rate, duration and mono flag.
# NOTE(review): presumably the loader resamples/truncates to these settings —
# confirm against preprocess.ipynb.
loader = preprocess.Loader(sample_rate, duration, mono)
signal = loader.load(file_path)

In [None]:
# Right-pad the signal with zeros when the clip is shorter than the
# sample_rate * duration samples the pipeline expects; longer or exact-length
# signals are left untouched.
num_expected_samples = int(sample_rate * duration)
num_missing_samples = num_expected_samples - len(signal)
if num_missing_samples > 0:
    signal = np.pad(signal, (0, num_missing_samples), mode="constant")

In [None]:
# Extract the log spectrogram of the (possibly padded) signal with the
# project's LogSpectrogramExtractor, built from the configured frame size and
# hop length.
log_spectrogram_extractor = preprocess.LogSpectrogramExtractor(frame_size, hop_length)
log_spectrogram = log_spectrogram_extractor.extract(signal)

In [None]:
# Read the stored min/max values used to normalise the spectrogram.
# pickle.load can execute arbitrary code, so only point this at a file you
# produced yourself.
with open("audio_model/minmax/min_max_values.pkl", "rb") as min_max_file:
    min_max = pickle.load(min_max_file)

In [None]:
# Min-max normalise the log spectrogram into [norm_min, norm_max] using the
# stored min/max values, then shape it as a one-item batch for the model.
norm_min = 0
norm_max = 1
norm_array = (log_spectrogram - min_max["min"]) / (min_max["max"] - min_max["min"])
norm_array = norm_array * (norm_max - norm_min) + norm_min
# add a leading batch axis and a trailing channel axis in one indexing step
norm_array_addaxis = norm_array[np.newaxis, ..., np.newaxis]

In [None]:
# Load the saved VAE from "audio_model/model" and run the normalised
# spectrogram batch through it. `reconstruct` returns two values, unpacked
# here as the reconstructed spectrograms and the latent representations.
vae = VAE.load("audio_model/model")
generated_spectrograms, audio_representations = vae.reconstruct(norm_array_addaxis)

In [None]:
# Save the latent representation for the image model. np.save appends the
# ".npy" extension, so this writes "audio_representations.npy", which the
# image-generation section reloads after the kernel restart.
np.save('audio_representations',audio_representations)

### Listen to reconstructed audio from the generated spectrogram

In [None]:
# Turn the VAE's reconstructed spectrogram back into a waveform:
# denormalise -> dB to amplitude -> pad -> LWS phase estimation -> inverse STFT.

# Take the first item of the batch and drop the trailing channel axis.
# A dedicated name is used so the `log_spectrogram` produced by the extraction
# cell is not overwritten (re-running the normalisation cell stays correct).
generated_log_spectrogram = generated_spectrograms[0][:, :, 0]

# undo the min-max normalisation applied before the VAE
denorm_log_spec = (generated_log_spectrogram - norm_min) / (norm_max - norm_min)
denorm_log_spec = denorm_log_spec * (min_max["max"] - min_max["min"]) + min_max["min"]

# log spectrogram (dB) -> amplitude spectrogram
spec = librosa.db_to_amplitude(denorm_log_spec)

# Append one zero at the end of BOTH axes (np.pad defaults to constant zeros).
# NOTE(review): the intent is to bring the frequency axis to 257 bins
# (frame_size // 2 + 1) for lws; confirm that the extra padding on the other
# axis and the axis order match what lws expects.
spec_pad = np.pad(spec, ((0, 1), (0, 1)))

# LWS phase reconstruction, configured with the same frame size and hop length
# as the spectrogram extractor instead of repeating the literals 512 / 256
lws_processor = lws.lws(frame_size, hop_length, mode="speech")
spec_phase = lws_processor.run_lws(spec_pad)

# inverse STFT back to a time-domain signal
reconstruct_signal = lws_processor.istft(spec_phase)

In [None]:
# Render an inline audio player for the reconstructed waveform, played back at
# the configured sample rate.
# NOTE(review): this import sits mid-notebook; consider moving it to the first
# import cell so all dependencies are visible up front.
import IPython.display as ipd
ipd.Audio(reconstruct_signal, rate=sample_rate)

### Generate an image

Restart the kernel before running this part.

In [None]:
import tensorflow as tf
import numpy as np
from PIL import Image

In [None]:
# Load the saved image generator model from 'image_model/gen_model'.
generator = tf.saved_model.load('image_model/gen_model')

In [None]:
# Reload the latent audio representation written with np.save in the audio section.
audio_representations = np.load('audio_representations.npy')

In [None]:
# Run the generator on the latent audio representation; the result is indexed
# as a batch when the image is saved below.
generated_image = generator(audio_representations)

In [None]:
# Scale the generator output by 255 ahead of the uint8 conversion.
# NOTE(review): assumes the generator emits values in [0, 1] — confirm.
# This rebinds `generated_image`, so running the cell twice scales twice;
# re-run the generator cell first if this step has to be repeated.
generated_image = generated_image * 255

In [None]:
# Convert the first image of the batch to 8-bit pixels and write it as a PNG.
# Values are clipped to [0, 255] before the cast: converting out-of-range
# floats to uint8 wraps around and shows up as speckle artefacts. In-range
# values are unaffected.
image_array = np.clip(generated_image[0].numpy(), 0, 255).astype(np.uint8)
Image.fromarray(image_array).save('generated_image.png')