## Autoencoder for Song Orders:

In [1]:
import os

import librosa
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from IPython.display import display, Audio
from tensorflow.keras.layers import Masking

from Autoencoder_alla_Valerio import Autoencoder
from LSTM_Autoencoder import LSTM_Autoencoder
from Snippets import Snippets
from Variational_Autoencoder_alla_Valerio import VAE

### Input Data:
We load the array containing all song orders. To use them as training data, we first need to bring them all to the same length.

In [2]:
# The song orders live in data_and_models/<subfolder>/<model_name>song_orders.npy
subfolder = "0.25_16"
model_name = "Valerio_23927"

# Build the path with os.path.join instead of hard-coded "\\" separators, so the
# notebook also runs on Linux/macOS (on Windows the resulting path is unchanged).
song_orders_path = os.path.join("data_and_models", subfolder, model_name + "song_orders.npy")

# NOTE(review): allow_pickle=True unpickles arbitrary objects while loading;
# only load files from a trusted source.
song_orders = np.load(song_orders_path, allow_pickle=True)

We can normalise the data and denormalise it later

In [3]:
# flattened_orders = np.asarray([item for order in song_orders for item in order])
# old_min = flattened_orders.min()
# old_max = flattened_orders.max()

For the training of the autoencoder, all song orders need to have the same length. Additionally, that length has to be an even number. We therefore bring each song to the length of the longest song (rounded up to the next even number) by appending rows filled with the value 0.

In [4]:
# Determine the common sequence length: the longest song, rounded up to an even
# number. An even length is crucial for the shape of the autoencoder input data.
max_song_length = 0

for order in song_orders:
    max_song_length = max(max_song_length, len(order))
    # Adds 1 only when the running maximum is odd; a no-op once it is even.
    max_song_length += max_song_length % 2
print("The longest song had: " + str(max_song_length) +"snippets")

# Pad every song order with rows of zeros at the end until it has
# max_song_length rows, then stack all songs into one 3D training array.
#
# Every song is kept, including one that already has max_song_length rows
# (np.pad with a pad width of 0 simply returns a copy). Previously such a song
# was silently dropped, which lost training data and shifted the index of
# every following song. As a consequence, the last row of a song is only
# padding if that song was shorter than max_song_length.
x_train = []
for order in song_orders:
    missing_rows = max_song_length - order.shape[0]
    padded_order = np.pad(order, ((0, missing_rows), (0, 0)), constant_values=0)
    x_train.append(padded_order)
x_train = np.asarray(x_train)
#x_train = x_train.reshape(x_train.shape[0],x_train.shape[1],x_train.shape[2], 1)

print("Our trainingsdata has the shape: " + str(x_train.shape))

The longest song had: 2120snippets
Our trainingsdata has the shape: (22, 2120, 128)


In [5]:
# Remember the value range of the padded training data; it is needed both to
# normalise now and to denormalise the reconstructions later.
old_min, old_max = x_train.min(), x_train.max()

def normalise(array, new_min, new_max, old_min, old_max):
    """Linearly rescale values from [old_min, old_max] to [new_min, new_max].

    Args:
        array: Values to rescale (scalar or numpy array).
        new_min: Lower bound of the target range.
        new_max: Upper bound of the target range.
        old_min: Lower bound of the source range.
        old_max: Upper bound of the source range; must differ from old_min,
            otherwise the rescaling divides by zero.

    Returns:
        The rescaled values, with the same shape as ``array``.
    """
    # First map onto [0, 1], then stretch/shift onto the target range.
    unit_scaled = (array - old_min) / (old_max - old_min)
    return unit_scaled * (new_max - new_min) + new_min

# The songs were padded with the constant 0. The masking value handed to the
# model must be exactly what that 0 becomes after normalisation. Compute it
# from the pad constant itself (same dtype and same arithmetic as the training
# data) instead of reading x_train[0][-1][-1], which is only a padded entry
# when the first song happens to be shorter than the padded length.
pad_probe = np.zeros(1, dtype=x_train.dtype)
mask_value = normalise(pad_probe, 0, 1, old_min, old_max)[0]

# Scale the training data to the range [0, 1].
x_train = normalise(x_train, 0, 1, old_min, old_max)


### Build the model
We can build a new model

In [6]:
# Each training sample is one padded song order; the model input shape is the
# shape of a single sample, i.e. x_train without its leading (song) axis.
sequence_shape = (x_train.shape[1], x_train.shape[2])

autoencoder = LSTM_Autoencoder(
    input_shape=sequence_shape,
    lstm_dims=[],
    latent_space_dim=32,
    mask_value=mask_value,
)
#autoencoder.summary()

In [7]:
# Training hyperparameters.
LEARNING_RATE = 1e-4
BATCH_SIZE = 3
EPOCHS = 100

# Compile with the chosen learning rate, then fit the autoencoder on the
# normalised, padded song orders.
autoencoder.compile_model(LEARNING_RATE)
autoencoder.train(x_train, BATCH_SIZE, EPOCHS)

Train on 22 samples
Epoch 1/100
Epoch 2/100

KeyboardInterrupt: 

In [None]:
# Save the trained model under a name that encodes the latent dimension and
# the data subfolder, then print the model summary.
save_name = f"Autoencoder_SongOrders{autoencoder.latent_space_dim!s}D_{subfolder}"
autoencoder.save(save_name)
autoencoder.summary()

# Check Results
Check how well the autoencoder reconstructs a song order.

In [None]:
# Encode every (normalised) song order into the latent space ...
latent_representation = autoencoder.encoder.predict(x_train)

# ... decode it again, and map the decoder output from the [0, 1] range back
# to the value range the training data had before normalisation.
normalised_reconstruction = autoencoder.decoder.predict(latent_representation)
reconstructed_data = Snippets._denormalise(normalised_reconstruction, 0, 1, old_min, old_max)

In [None]:
# Map the training data from [0, 1] back to its pre-normalisation range
# (old_min..old_max) so it can be compared with the denormalised reconstruction.
# NOTE(review): this overwrites x_train in place, so the cell is not idempotent:
# running it twice, or re-running the prediction cell after it, operates on
# already denormalised data. Use "Restart & Run All" to get consistent results.
x_train = Snippets._denormalise(x_train, 0, 1, old_min, old_max)

In [None]:
# Index of the song to inspect; the following cells use it as well.
song_num = 0

print("This is the original")
# Plot the first two components of every snippet of the song as a path.
fig, ax = plt.subplots(figsize=(20, 20))
original_order = x_train[song_num]
ax.plot(original_order[:, 0], original_order[:, 1], '-.o', markersize=5, markerfacecolor='red')
plt.show()

In [None]:
print("This is the reconstruction")
# Same view as for the original: first two components of every snippet.
fig, ax = plt.subplots(figsize=(20, 20))
reconstructed_song = reconstructed_data[song_num]
ax.plot(reconstructed_song[:, 0], reconstructed_song[:, 1], '-.o', markersize=5, markerfacecolor='red')
plt.show()

In [None]:
# Spectrogram parameters handed to the snippet decoder.
# NOTE(review): presumably these must match the settings used when the snippet
# spectrograms were created - confirm against the preprocessing code.
WIN_LENGTH = 690*2
HOP_LENGTH = 690
N_FFT = 690*2

# Load the snippet-level VAE that turns latent vectors back into audio.
# os.path.join replaces the hard-coded "\\" separators so the path also works
# on Linux/macOS (on Windows the resulting path is unchanged).
snippet_model_name = "VAE_Vocals_128D_23927samples_20Epochs"
snippet_model_path = os.path.join("data_and_models", subfolder, snippet_model_name)
snippet_autoencoder = VAE.load(snippet_model_path)

# Reshape to the first three dimensions: a no-op for 3D data, and it drops a
# trailing singleton axis if one is present.
reconstructed_data = reconstructed_data.reshape(reconstructed_data.shape[0], reconstructed_data.shape[1], reconstructed_data.shape[2])
reconstructed_order = reconstructed_data[song_num]
print(reconstructed_order.shape)

# Decode the reconstructed song order (one latent vector per snippet).
reconstructed_signal, reconstructed_spectos = Snippets.latent_representation_to_pca(
    latent_representations=reconstructed_order,
    model=snippet_autoencoder,
    hop_length=HOP_LENGTH,
    n_fft=N_FFT,
    win_length=WIN_LENGTH,
)

In [None]:
# Render an audio player for the reconstructed signal (44.1 kHz sample rate).
reconstructed_audio = Audio(reconstructed_signal, rate=44100)
display(reconstructed_audio)

In [None]:
# Folder with the original stems. os.path.join replaces the hard-coded "\\"
# separator so the path also works on Linux/macOS (unchanged on Windows).
folder_path = os.path.join('demo_data', 'stems')
paths = librosa.util.find_files(folder_path, ext=['wav'])

# NOTE(review): this assumes the files found here are in the same order as the
# entries of song_orders, so that song_num refers to the same song - confirm.
original_song, _ = librosa.load(paths[song_num],sr=44100, mono=True)

print("This is the original song: \n")
display(Audio(original_song, rate=44100))


Further reading on LSTM autoencoders: https://machinelearningmastery.com/lstm-autoencoders/