### Restore the latent representation from the image

In [None]:
import tensorflow as tf
import numpy as np
from PIL import Image

In [None]:
# Function for finding the latent representation of an image.
def find_closest_latent_vector(initial_vector, num_optimization_steps,
                               steps_per_image):
    """Optimise a latent vector so the generator output matches the target.

    Relies on two module-level names: ``generator`` (called with the latent
    vector; the first element of its output is used as the image) and
    ``target_image`` (indexed as ``[:, :, :3]``, so it must be at least
    3-dimensional with three or more channels).

    Args:
        initial_vector: Starting latent vector; wrapped in a ``tf.Variable``.
            Its last axis is taken as the latent dimensionality.
        num_optimization_steps: Number of Adam update steps to run.
        steps_per_image: A snapshot of the generated image is stored every
            ``steps_per_image`` steps (must be non-zero).

    Returns:
        A tuple ``(images, losses, vector)``: the list of image snapshots as
        NumPy arrays, the list of per-step loss values, and the optimised
        ``tf.Variable``.
    """
    images = []
    losses = []

    vector = tf.Variable(initial_vector)
    optimizer = tf.optimizers.Adam(learning_rate=0.01)
    loss_fn = tf.losses.MeanAbsoluteError(reduction="sum")

    # The latent vectors were sampled from a normal distribution. We can get
    # more realistic images if we regularize the length of the latent vector
    # to the average length of a vector from this distribution. The
    # dimensionality is read from the vector itself so the function does not
    # depend on an externally defined `latent_dim` name.
    expected_norm = np.sqrt(int(vector.shape[-1]))

    # Only the first three channels of the target are compared; the slice does
    # not change between steps, so compute it once.
    target_rgb = target_image[:, :, :3]

    for step in range(num_optimization_steps):
        # Progress output: one dot per step, a new line every 100 steps.
        if (step % 100) == 0:
            print()
        print('.', end='')

        with tf.GradientTape() as tape:
            image = generator(vector.read_value())[0]
            image = tf.cast(image, dtype=tf.double)
            target_image_difference = loss_fn(image, target_rgb)
            regularizer = tf.cast(
                tf.abs(tf.norm(vector) - expected_norm), dtype=tf.float64)
            loss = target_image_difference + regularizer

        if (step % steps_per_image) == 0:
            images.append(image.numpy())
        losses.append(loss.numpy())

        # Here we update the optimized vector.
        grads = tape.gradient(loss, [vector])
        optimizer.apply_gradients(zip(grads, [vector]))

    # Return the variable itself so the result is defined even when no
    # optimisation step was executed.
    return images, losses, vector

In [None]:
# Load the exported image generator; it is used as a module-level callable
# by find_closest_latent_vector.
generator = tf.saved_model.load(export_dir='image_model/generator_model')

In [None]:
# Read the generated image from disk and rescale its pixel values by 1/255.
target_image = np.asarray(Image.open('generated_image.png')) / 255

In [None]:
# Settings for the latent-vector search.
LATENT_DIM = 400
NUM_OPTIMIZATION_STEPS = 1
STEPS_PER_IMAGE = 5

# Random starting point drawn from a normal distribution, batch size 1.
INITIAL_VECTOR = tf.random.normal(shape=[1, LATENT_DIM])

images, loss, vector = find_closest_latent_vector(
    initial_vector=INITIAL_VECTOR,
    num_optimization_steps=NUM_OPTIMIZATION_STEPS,
    steps_per_image=STEPS_PER_IMAGE,
)

In [None]:
# Materialise the optimised latent vector as a NumPy array and write it to
# disk (np.save appends the ".npy" extension to the file name).
representation_from_image = vector.numpy()
np.save(file='representation_from_image', arr=representation_from_image)

### Reconstruct with the latent representation restored from the generated image

Restart the kernel before running this part.

In [None]:
import import_ipynb
import numpy as np
import pickle
import librosa
import lws
from audio_model.autoencoder import VAE

In [None]:
# Read back the latent representation that was recovered from the image.
representation_from_image = np.load(file="representation_from_image.npy")

# Load the audio model and let its decoder turn the latent representation
# into a spectrogram.
vae = VAE.load("audio_model/model")
spectrogram_from_image = vae.decoder.predict(representation_from_image)

In [None]:
# Convert the decoded spectrogram back into an audio signal.

# Min/max values stored with the audio model; they are needed to undo the
# normalisation of the spectrogram.
with open("audio_model/minmax/min_max_values.pkl", "rb") as f:
    min_max = pickle.load(f)

# Bounds of the normalised value range.
MIN, MAX = 0, 1

# Take the first item of the decoder output and index 0 of its last axis,
# which leaves a 2-D log spectrogram.
log_spectrogram = spectrogram_from_image[0][:, :, 0]

# Undo the normalisation: map back to the unit range first, then rescale to
# the stored min/max values.
denormalized_log_spectrogram = (log_spectrogram - MIN) / (MAX - MIN)
denormalized_log_spectrogram = (
    denormalized_log_spectrogram * (min_max["max"] - min_max["min"])
    + min_max["min"]
)

# Log spectrogram (dB) -> amplitude spectrogram.
spectrogram = librosa.db_to_amplitude(denormalized_log_spectrogram)

# Append one zero row and one zero column so the number of frequency bins
# becomes 257, the input shape lws expects.
spectrogram_padded = np.pad(spectrogram, pad_width=((0, 1), (0, 1)))

# Reconstruct the phase with lws, then invert the STFT to obtain the signal.
lws_processor = lws.lws(512, 256, mode="speech")
spectrogram_with_phase = lws_processor.run_lws(spectrogram_padded)
reconstructed_signal = lws_processor.istft(spectrogram_with_phase)

In [None]:
# Play the reconstructed signal inside the notebook.
import IPython.display as ipd

SAMPLE_RATE = 22050

ipd.Audio(data=reconstructed_signal, rate=SAMPLE_RATE)