In [1]:
import numpy as np
import pandas as pd

import keras
import tensorflow as tf

from sklearn import preprocessing
from keras import layers

# Load the pre-extracted song dataset from a local pickle file.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
dataset = pd.read_pickle("DATA_diz.pkl")


2022-05-19 11:43:30.711136: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-19 11:43:30.711153: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Show which fields the loaded dataset provides.
dataset.keys()

dict_keys(['song_id', 'song_timeSeries', 'MFCCs', 'Mel_spectrograms', 'sampling_rate', 'label'])

In [3]:
# Lookup table from integer genre id (0-9) to genre name; the id is the
# position of the name in the sequence below.
genre_dict = dict(
    enumerate(
        (
            "jazz",
            "rock",
            "hiphop",
            "metal",
            "pop",
            "disco",
            "blues",
            "classical",
            "country",
            "reggae",
        )
    )
)

### Select all the songs of a genre in order to generate music of that genre
Keep only the MFCCs column and use it as the input to the variational autoencoder.

In [24]:
genre = 0  # jazz

songs = pd.DataFrame(dataset)

# The genre filter is currently disabled, so every song in the dataset is kept:
# songs = songs.loc[songs['label'] == genre_dict[genre]]

# Only the MFCCs are used as model input. Each entry is transposed and the
# results are stacked into a single array.
df = np.array([mfcc.T for mfcc in songs['MFCCs']])

print(df.shape)

(5991, 216, 13, 1)


### Define a Sampling layer for the latent variables

In [25]:
class Sampling(layers.Layer):
    """Reparameterization layer: samples a latent vector z from (z_mean, z_log_var).

    The sample is ``z = z_mean + exp(0.5 * z_log_var) * epsilon`` where
    ``epsilon`` is drawn from a standard normal distribution with the same
    shape as ``z_mean``.
    """

    def call(self, inputs):
        """Return one sample of z for every row of the batch.

        Args:
            inputs: a pair ``(z_mean, z_log_var)`` of equally shaped tensors.

        Returns:
            A tensor with the shape of ``z_mean`` holding the sampled latents.
        """
        z_mean, z_log_var = inputs
        # tf.shape gives the runtime shape, so an unknown batch size is handled.
        # The noise dtype follows z_mean so the arithmetic below never mixes dtypes.
        epsilon = tf.random.normal(shape=tf.shape(z_mean), dtype=z_mean.dtype)
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

### Define the Encoder Network

In [81]:
latent_dim = 2
filters = (16, 32)

# Input: one MFCC matrix per song, shape (216, 13, 1).
encoder_inputs = keras.Input(shape=(216, 13, 1))

# Two strided convolutions shrink the input; per the recorded model summary
# the shapes go (216, 13, 1) -> (108, 7, 16) -> (54, 3, 32).
x = layers.Conv2D(
    filters=filters[0],
    kernel_size=3,
    strides=2,
    padding="same",
    activation="relu",
)(encoder_inputs)
x = layers.Conv2D(
    filters=filters[1],
    kernel_size=(2, 3),
    strides=2,
    padding="valid",
    activation="relu",
)(x)

# Flatten (5184 features per the summary) and compress before the latent heads.
x = layers.Flatten()(x)
x = layers.Dense(units=16, activation="relu")(x)

# Two parallel heads parameterize the latent distribution; Sampling draws z.
z_mean = layers.Dense(units=latent_dim, name="z_mean")(x)
z_log_var = layers.Dense(units=latent_dim, name="z_log_var")(x)
z = Sampling()([z_mean, z_log_var])

encoder = keras.Model(
    inputs=encoder_inputs, outputs=[z_mean, z_log_var, z], name="encoder"
)
encoder.summary()

Model: "encoder"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_31 (InputLayer)          [(None, 216, 13, 1)  0           []                               
                                ]                                                                 
                                                                                                  
 conv2d_32 (Conv2D)             (None, 108, 7, 16)   160         ['input_31[0][0]']               
                                                                                                  
 conv2d_33 (Conv2D)             (None, 54, 3, 32)    3104        ['conv2d_32[0][0]']              
                                                                                                  
 flatten_16 (Flatten)           (None, 5184)         0           ['conv2d_33[0][0]']        

### Define the Decoder

In [82]:
# The decoder mirrors the encoder: it expands a latent vector back to the
# (216, 13, 1) input shape.
latent_inputs = keras.Input(shape=(latent_dim,))

# Project the latent vector to the encoder's last feature-map size (54 x 3).
x = layers.Dense(units=54 * 3 * filters[1], activation="relu")(latent_inputs)
x = layers.Reshape(target_shape=(54, 3, filters[1]))(x)

# Per the recorded model summary: (54, 3, 32) -> (108, 6, 32) -> (216, 13, 16).
x = layers.Conv2DTranspose(
    filters=filters[1],
    kernel_size=3,
    strides=2,
    padding="same",
    activation="relu",
)(x)
x = layers.Conv2DTranspose(
    filters=filters[0],
    kernel_size=(2, 3),
    strides=2,
    padding="valid",
    activation="relu",
)(x)

# Single-channel output; the sigmoid bounds every reconstructed value to (0, 1).
decoder_outputs = layers.Conv2DTranspose(
    filters=1, kernel_size=3, padding="same", activation="sigmoid"
)(x)

decoder = keras.Model(inputs=latent_inputs, outputs=decoder_outputs, name="decoder")
decoder.summary()

Model: "decoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_32 (InputLayer)       [(None, 2)]               0         
                                                                 
 dense_31 (Dense)            (None, 5184)              15552     
                                                                 
 reshape_14 (Reshape)        (None, 54, 3, 32)         0         
                                                                 
 conv2d_transpose_42 (Conv2D  (None, 108, 6, 32)       9248      
 Transpose)                                                      
                                                                 
 conv2d_transpose_43 (Conv2D  (None, 216, 13, 16)      3088      
 Transpose)                                                      
                                                                 
 conv2d_transpose_44 (Conv2D  (None, 216, 13, 1)       145 

### Put everything together in the Model

In [83]:
class VAE(keras.Model):
    """Variational autoencoder that trains an encoder/decoder pair jointly.

    Args:
        encoder: model mapping an input batch to ``(z_mean, z_log_var, z)``.
        decoder: model mapping a latent batch ``z`` to a reconstruction with
            the same shape as the input batch.
        reconstruction_weight: factor applied to the reconstruction loss
            before the KL term is added. Defaults to 10, the value that was
            previously hard-coded.
        **kwargs: forwarded unchanged to ``keras.Model``.
    """

    def __init__(self, encoder, decoder, *, reconstruction_weight=10, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.reconstruction_weight = reconstruction_weight
        # Running means of each loss term, reported in the progress bar.
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(
            name="reconstruction_loss"
        )
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        """Trackers exposed to Keras so it can manage them as model metrics."""
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def train_step(self, data):
        """Run one optimization step on a batch and return the running losses.

        Args:
            data: the batch of inputs; if a tuple is received (e.g. from
                ``fit(x, y)``), only its first element is used.

        Returns:
            Dict with the running means of the total, reconstruction and KL
            losses under the keys ``loss``, ``reconstruction_loss``, ``kl_loss``.
        """
        # The VAE is unsupervised: drop targets/sample weights if present.
        if isinstance(data, tuple):
            data = data[0]

        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)
            # mean_squared_error averages over the last (channel) axis; the
            # sum over axes (1, 2) then accumulates the error of each sample,
            # and reduce_mean averages over the batch.
            reconstruction_loss = tf.reduce_mean(
                tf.reduce_sum(
                    keras.losses.mean_squared_error(data, reconstruction), axis=(1, 2)
                )
            )
            # KL divergence between N(z_mean, exp(z_log_var)) and N(0, I),
            # summed over latent dimensions and averaged over the batch.
            kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
            total_loss = self.reconstruction_weight * reconstruction_loss + kl_loss
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

### Train the VAE

In [84]:
# NOTE(review): the MNIST arrays are not used for training in this cell and the
# call downloads the dataset; remove these three lines if no other cell needs them.
(x_train, _), (x_test, _) = keras.datasets.mnist.load_data()
mnist_digits = np.concatenate([x_train, x_test], axis=0)
mnist_digits = np.expand_dims(mnist_digits, -1).astype("float32") / 255

# Normalize every song: flatten each (216, 13, 1) sample to one row, scale the
# row with sklearn's normalize (unit L2 norm per row by default), then restore
# the image-like shape expected by the encoder.
print(df.shape)
df_flat = df.reshape(-1, 216 * 13)
df_flat = preprocessing.normalize(df_flat)
print(df_flat.shape)
df_flat = df_flat.reshape(-1, 216, 13, 1)
print(df_flat.shape)

vae = VAE(encoder, decoder)
# NOTE(review): 0.1 is 100x Adam's default learning rate (0.001) -- confirm it
# is intentional, as it may make training unstable.
vae.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.1))
# Train on the normalized array (previously the raw `df` was passed, so the
# normalization above had no effect).
# NOTE(review): L2-normalized rows can contain negative values, while the
# decoder's sigmoid output is limited to (0, 1) -- consider a [0, 1] scaling.
vae.fit(df_flat, epochs=30, batch_size=256)

(5991, 216, 13, 1)
(5991, 2808)
(5991, 216, 13, 1)
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30

KeyboardInterrupt: 