In [189]:
import numpy as np
import pandas as pd
import pretty_midi
import matplotlib.pyplot as plt

from utils import plot_piano_roll

In [190]:
# TODO:
# - group notes according to bar length
# - decide whether to concatenate the hidden state in the decoder's bottom LSTM cell

In [191]:
# variable declarations

TEMPO = 120.  # beats per minute, i.e. 1 sec = 2 beats
SIXTEENTH_NOTE_BEATS = 0.25  # 1 16th note = 0.25 beats
SIXTEENTH_NOTE_LEN = SIXTEENTH_NOTE_BEATS / (TEMPO / 60.)  # 1 16th note = 0.125 sec
ONE_BAR_LEN = SIXTEENTH_NOTE_LEN * 16  # 16 16th notes = 1 bar = 2 sec
FOUR_BAR_LEN = ONE_BAR_LEN * 4  # 4 bars = 8 sec

EVENT_SIZE = 64  # number of note events per sequence (chunk length used when splitting a note table)

In [192]:
# helper function definitions

def midi_notes2notes_df(notes):
    """Convert note objects into a table of (pitch, duration, step) events.

    Args:
        notes: iterable of objects exposing ``pitch``, ``start`` and ``end``
            attributes (e.g. ``pretty_midi.Note``). May be empty and need not
            be ordered.

    Returns:
        pandas.DataFrame with columns ``pitch``, ``duration`` and ``step``, one
        row per note in onset order. ``duration`` is ``end - start`` and
        ``step`` is the onset difference to the previous note (0 for the first
        note), so it is never negative. An empty input yields an empty frame
        with the same three columns.
    """
    columns = ['pitch', 'duration', 'step']

    # Note lists are not guaranteed to be ordered by onset; sorting keeps every
    # `step` non-negative. sorted() is stable, so simultaneous notes keep their
    # original relative order.
    ordered_notes = sorted(notes, key=lambda note: note.start)

    processed_notes = []
    prev_start = ordered_notes[0].start if ordered_notes else 0.0
    for note in ordered_notes:
        processed_notes.append({
            'pitch': note.pitch,
            'duration': note.end - note.start,
            'step': note.start - prev_start,
        })
        prev_start = note.start

    # Passing `columns` explicitly keeps the schema stable even with zero rows.
    return pd.DataFrame(processed_notes, columns=columns)


def notesdf2midi_notes(notes_df):
    """Rebuild ``pretty_midi.Note`` objects from a (pitch, duration, step) table.

    Onsets are recovered by accumulating the ``step`` column starting from 0.0,
    each note ends ``duration`` after its onset, and every note is created with
    velocity 100.

    Args:
        notes_df: DataFrame with ``pitch``, ``duration`` and ``step`` columns.

    Returns:
        List of ``pretty_midi.Note`` in row order.
    """
    midi_notes = []
    onset = 0.0
    for row_idx in range(len(notes_df)):
        event = notes_df.iloc[row_idx]
        onset = onset + event['step']  # steps are relative, onsets are absolute
        midi_notes.append(
            pretty_midi.Note(
                velocity=100,
                pitch=int(event['pitch']),
                start=onset,
                end=onset + event['duration'],
            )
        )

    return midi_notes


# reconstruction of recovered notes
def pred_df2midi_file(notes_df, output_path='../data/processed/mini_guitar.mid'):
    """Write the first ``EVENT_SIZE`` events of ``notes_df`` to a MIDI file.

    The events are converted with ``notesdf2midi_notes`` and placed on a single
    non-drum instrument (program 0) in a 4/4 file whose initial tempo is
    ``TEMPO``.

    Args:
        notes_df: DataFrame with ``pitch``, ``duration`` and ``step`` columns.
        output_path: destination of the MIDI file. Defaults to the path this
            helper has always written to.

    Returns:
        None. The only effect is the file written to ``output_path``.
    """
    # Use the shared constants instead of repeating 64 / 120. so that changing
    # the configuration cell keeps the export in sync.
    recovered_notes = notesdf2midi_notes(notes_df.iloc[:EVENT_SIZE])

    instrument = pretty_midi.Instrument(program=0, is_drum=False)
    instrument.notes = recovered_notes

    recovered_midi_file = pretty_midi.PrettyMIDI(initial_tempo=TEMPO)
    recovered_midi_file.instruments = [instrument]
    recovered_midi_file.time_signature_changes = [pretty_midi.TimeSignature(4, 4, 0.0)]
    recovered_midi_file.write(output_path)

In [193]:
# Parse one reference MIDI file (relative path) into a PrettyMIDI object.
midi_data = pretty_midi.PrettyMIDI('../data/midi_dump/turkish.mid')

In [194]:
def midi_data2tensor(midi_data):
    """Cut the first instrument's notes into fixed-length event sequences.

    Args:
        midi_data: object with an ``instruments`` list whose items expose
            ``notes`` (e.g. ``pretty_midi.PrettyMIDI``). Only the first
            instrument is used.

    Returns:
        numpy array of shape ``(n_chunks, EVENT_SIZE, 3)`` where the last axis
        is ``(pitch, duration, step)`` as produced by ``midi_notes2notes_df``.
        Only complete chunks are kept, so ``n_chunks == n_notes // EVENT_SIZE``;
        a file with fewer than ``EVENT_SIZE`` notes (or with no instrument)
        yields an empty ``(0, EVENT_SIZE, 3)`` array that can still be
        concatenated with other batches along axis 0.
    """
    n_features = 3  # pitch, duration, step

    # Guard before converting: there is nothing to chunk without notes.
    if not midi_data.instruments or not midi_data.instruments[0].notes:
        return np.empty((0, EVENT_SIZE, n_features))

    notes_df = midi_notes2notes_df(midi_data.instruments[0].notes)
    events = notes_df.values

    # Floor division keeps every *complete* chunk (including the last one when
    # the note count is an exact multiple of EVENT_SIZE) and only drops a
    # ragged remainder.
    n_chunks = len(events) // EVENT_SIZE
    usable = events[:n_chunks * EVENT_SIZE]
    return usable.reshape(n_chunks, EVENT_SIZE, events.shape[1])

In [195]:
import glob

# glob returns paths in arbitrary order; sorting makes the resulting dataset
# order reproducible across runs and machines.
midi_list = sorted(glob.glob("../data/maestro-v3.0.0/2018/*.midi"))
if not midi_list:
    # Fail here with a clear message instead of later on `None.shape`.
    raise FileNotFoundError("no MIDI files matched ../data/maestro-v3.0.0/2018/*.midi")

# Collect the per-file batches and concatenate once at the end; concatenating
# inside the loop re-copies the whole accumulated array on every iteration.
per_file_batches = []
for midi_file in midi_list:
    midi_data = pretty_midi.PrettyMIDI(midi_file)
    single_batch_of_events = midi_data2tensor(midi_data)
    per_file_batches.append(single_batch_of_events)

# Shape: (total number of sequences, events per sequence, features per event).
complete_batch_of_events = np.concatenate(per_file_batches, axis=0)

print(complete_batch_of_events.shape)

KeyboardInterrupt: 

In [None]:
import tensorflow as tf

# Input pipeline: one dataset element per event sequence, shuffled with a
# 1000-element buffer, grouped into batches of 6 and prefetched.
train_ds = (
    tf.data.Dataset.from_tensor_slices(complete_batch_of_events)
    .shuffle(1000)
    .batch(6)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Grab a single batch to experiment with. Unlike a for/break loop, next()
# raises StopIteration right here if the dataset is empty, instead of leaving
# `temp_ds` undefined (or stale from an earlier run of this cell).
temp_ds = next(iter(train_ds))

In [None]:
# TODO: make the feature dimension of x 128 + 2 (presumably 128 pitch values + duration + step — confirm)

In [303]:
from tensorflow.keras.layers import Bidirectional,LSTM,Dense,LSTMCell
from tensorflow.keras import Input

# Number of conductor time steps; MusicVAE_Decoder.call also splits the input
# sequence into this many sub-sequences, one per conductor output.
CONDUCTOR_LEN = 4


class MusicVAE_Decoder(tf.keras.Model):
    """Hierarchical decoder: a conductor RNN seeds a step-by-step bottom LSTM cell.

    The conductor turns the latent vector into ``CONDUCTOR_LEN`` embeddings.
    Each embedding initialises the bottom LSTM cell for one sub-sequence, and
    the cell then emits one ``(pitch, duration, step)`` event per time step.
    """

    def __init__(self, latent_dim):
        """Create the conductor and bottom-level layers.

        Args:
            latent_dim: size of the latent vector. It is only stored on the
                instance; no layer size depends on it.
        """
        super().__init__()
        self.latent_dim = latent_dim
        self.conductor_dense = Dense(units=512, activation='tanh')
        self.conductor_lstm_1 = LSTM(units=512, return_sequences=True)
        self.conductor_lstm_2 = LSTM(units=256, return_sequences=True)
        self.bottom_lstm_input_dense = Dense(units=32, activation='tanh')
        self.bottom_lstm_cell = LSTMCell(units=32)
        self.bottom_lstm_pitch_dense = Dense(units=128)
        self.bottom_lstm_duration_dense = Dense(units=1)
        self.bottom_lstm_step_dense = Dense(units=1)

    def _event_from_hidden(self, hidden):
        """Map a bottom-LSTM hidden state to a ``(batch, 3)`` event tensor."""
        pitch_logits = self.bottom_lstm_pitch_dense(hidden)
        duration = self.bottom_lstm_duration_dense(hidden)
        step = self.bottom_lstm_step_dense(hidden)
        # think about normalization more here
        # NOTE(review): argmax has no gradient, so the pitch channel of the
        # returned event cannot train the pitch head through a loss on `pred`.
        pitch_index = tf.expand_dims(tf.argmax(pitch_logits, axis=1), axis=1)
        pitch = tf.cast(pitch_index, dtype=tf.float32) / 128
        return tf.concat([pitch, duration, step], axis=1)

    def call(self, z, x, teacher_forcing=True):  # teacher_forcing = False when prediction
        """Decode ``z`` into a sequence of events aligned with ``x``.

        Args:
            z: latent vectors, one row per sequence (2-D tensor).
            x: target events of shape ``(batch, seq_len, features)``.
                ``seq_len`` must be statically known, non-zero and divisible by
                ``CONDUCTOR_LEN``. The feature width must match the 3-wide
                predicted events because both are fed to the same LSTM cell.
            teacher_forcing: when True, step ``t`` receives the ground-truth
                event ``x[:, t-1]``; when False it receives the model's own
                previous prediction. In both modes the very first step
                receives zeros, and the previous event is carried across
                sub-sequence boundaries.

        Returns:
            Tensor of shape ``(batch, seq_len, 3)`` holding
            ``(argmax pitch / 128, duration, step)`` for every time step.

        Raises:
            ValueError: if ``seq_len`` is unknown, zero or not divisible by
                ``CONDUCTOR_LEN``.
        """
        # CONDUCTOR RNN
        conductor_rnn_h0 = self.conductor_dense(z)
        # Use the dynamic batch size: the static shape can be None when the
        # model is traced (e.g. batches coming from a tf.data pipeline).
        batch_size = tf.shape(conductor_rnn_h0)[0]
        conductor_input = tf.zeros(shape=tf.stack([batch_size, CONDUCTOR_LEN, 1]))
        conductor_output = self.conductor_lstm_1(
            inputs=conductor_input,
            initial_state=[conductor_rnn_h0, conductor_rnn_h0],
        )
        conductor_output = self.conductor_lstm_2(conductor_output)
        bottom_input = self.bottom_lstm_input_dense(conductor_output)

        # BOTTOM RNN
        total_seq_len = x.shape[1]
        if not total_seq_len or total_seq_len % CONDUCTOR_LEN:
            # Without this check the trailing events would silently be left
            # undecoded and `pred` would no longer line up with `x`.
            raise ValueError(
                f"sequence length {total_seq_len!r} must be a non-zero multiple "
                f"of CONDUCTOR_LEN={CONDUCTOR_LEN}"
            )
        subseq_len = total_seq_len // CONDUCTOR_LEN

        # NOTE(review): teacher forcing feeds `x` as-is while sampling feeds
        # pitch / 128; if `x` holds raw MIDI pitches the two modes see
        # differently scaled pitch inputs — confirm how `x` is normalised.
        prev_event = tf.zeros_like(x[:, 0, :])  # all-zero start-of-sequence input
        events = []
        for subseq_idx in range(CONDUCTOR_LEN):
            # The conductor embedding seeds both the hidden and the cell state.
            h_state = c_state = bottom_input[:, subseq_idx, :]
            for j in range(subseq_len):
                t = subseq_idx * subseq_len + j
                if teacher_forcing and t > 0:
                    # Ground truth of the previous step, also at the first step
                    # of a later sub-sequence, mirroring what sampling does
                    # with its own previous prediction.
                    step_input = x[:, t - 1, :]
                else:
                    step_input = prev_event
                _, (h_state, c_state) = self.bottom_lstm_cell(
                    inputs=step_input, states=[h_state, c_state]
                )
                prev_event = self._event_from_hidden(h_state)
                events.append(prev_event)

        # Stack once instead of growing the tensor with tf.concat every step.
        return tf.stack(events, axis=1)



In [304]:
# Smoke-test the decoder on the sample batch with random latent vectors.
# The batch size follows `temp_ds`, and `latent_dim` is taken from `z` so the
# value recorded on the decoder matches the latent vectors it is given.
z = tf.random.normal(shape=(temp_ds.shape[0], 256))
decoder = MusicVAE_Decoder(latent_dim=z.shape[-1])

decoder(z, temp_ds)

<tf.Tensor: shape=(6, 64, 3), dtype=float32, numpy=
array([[[ 0.6015625 , -0.02405372,  0.02968621],
        [ 0.1640625 , -0.50718033,  0.2909191 ],
        [ 0.1640625 , -0.65849984,  0.31725612],
        ...,
        [ 0.1640625 , -0.93025535,  0.14259909],
        [ 0.1640625 , -0.9576364 ,  0.13737299],
        [ 0.1640625 , -0.9365851 ,  0.12681685]],

       [[ 0.984375  ,  0.01835872, -0.02815661],
        [ 0.1640625 , -0.4081475 ,  0.22805421],
        [ 0.1640625 , -0.61617005,  0.2970775 ],
        ...,
        [ 0.1640625 , -0.8472149 ,  0.07395652],
        [ 0.1640625 , -0.957511  ,  0.083541  ],
        [ 0.1640625 , -0.9403447 ,  0.07815933]],

       [[ 0.3203125 , -0.02009549,  0.04322419],
        [ 0.1640625 , -0.40743282,  0.21402963],
        [ 0.1640625 , -0.6472487 ,  0.28843862],
        ...,
        [ 0.1640625 , -0.90947396,  0.10162848],
        [ 0.1640625 , -0.9552187 ,  0.10269921],
        [ 0.1640625 , -0.95545286,  0.10035466]],

       [[ 0.453125  ,

In [259]:
def unroll_lstm_cell(batch, teacher_forcing, units=36):
    """Unroll a fresh, untrained LSTMCell over ``batch`` one step at a time.

    Args:
        batch: tensor of shape ``(batch, seq_len, features)``.
        teacher_forcing: if True, step ``j > 0`` is fed ``batch[:, j-1]``;
            if False it is fed the previous prediction. Step 0 is always fed
            zeros.
        units: size of the LSTM state.

    Returns:
        Tensor of shape ``(batch, seq_len, 3)`` with one 3-wide prediction per
        time step.
    """
    lstm_cell = LSTMCell(units=units)
    dense_last = Dense(units=3)  # 3-wide output so it can be fed back as the next input

    # Random initial hidden / cell state, as this is only a shape experiment.
    h_next = tf.random.normal(shape=(batch.shape[0], units))
    c_next = tf.random.normal(shape=(batch.shape[0], units))

    step_input = tf.zeros_like(batch[:, 0, :])
    step_outputs = []
    for j in range(0, batch.shape[1]):
        if teacher_forcing and j > 0:
            step_input = batch[:, j - 1, :]
        _, (h_next, c_next) = lstm_cell(inputs=step_input, states=[h_next, c_next])
        step_pred = dense_last(h_next)
        step_outputs.append(step_pred)
        if not teacher_forcing:
            step_input = step_pred

    return tf.stack(step_outputs, axis=1)


# RNN SAMPLING
sampled_out = unroll_lstm_cell(temp_ds, teacher_forcing=False)

# TEACHER FORCING
out = unroll_lstm_cell(temp_ds, teacher_forcing=True)

out.shape

(6, 64, 3)


In [186]:
from tensorflow.keras.layers import Bidirectional,LSTM,Dense
from tensorflow.keras import Input

def get_encoder(latent_dim):
    """Build the encoder: two stacked bidirectional LSTMs with two linear heads.

    Args:
        latent_dim: number of units in each of the two output heads.

    Returns:
        A ``tf.keras.Model`` mapping ``(EVENT_SIZE, 3)`` event sequences to
        the pair ``[mu, rho]``, each of width ``latent_dim``.
    """
    event_seq = tf.keras.Input(shape=(EVENT_SIZE, 3))

    # The first layer keeps the per-step outputs; the second reduces the
    # sequence to a single vector.
    per_step_features = Bidirectional(LSTM(units=1024, return_sequences=True))(event_seq)
    sequence_summary = Bidirectional(LSTM(units=1024, return_sequences=False))(per_step_features)

    mu_head = Dense(units=latent_dim)
    rho_head = Dense(units=latent_dim)

    return tf.keras.Model(
        inputs=event_seq,
        outputs=[mu_head(sequence_summary), rho_head(sequence_summary)],
    )

def get_decoder(latent_dim):
    """Build a dense decoder: latent vector -> 120 -> 500 -> 784 units.

    NOTE(review): the 784-unit output (and the ``decoded_img`` naming used by
    the VAE class) looks like a leftover from an image VAE rather than
    something shaped like the ``(EVENT_SIZE, 3)`` encoder input — confirm.

    Args:
        latent_dim: width of the latent input vector.

    Returns:
        A ``tf.keras.Model`` mapping a ``(latent_dim,)`` vector to a single
        784-wide linear output.
    """
    latent_in = tf.keras.Input(shape=(latent_dim,))

    hidden = latent_in
    for width in (120, 500):
        hidden = tf.keras.layers.Dense(units=width, activation='relu')(hidden)

    reconstruction = tf.keras.layers.Dense(units=784)(hidden)

    return tf.keras.Model(inputs=latent_in, outputs=[reconstruction])

class VAE(tf.keras.Model):
    """Variational auto-encoder wiring ``get_encoder`` to ``get_decoder``."""

    def __init__(self, latent_dim):
        """Build the encoder and decoder sub-models for ``latent_dim`` latents."""
        super().__init__()
        self.latent_dim = latent_dim
        self.encoder_block = get_encoder(latent_dim)
        self.decoder_block = get_decoder(latent_dim)

    def call(self, img):
        """Encode ``img``, sample a latent vector and decode it.

        Args:
            img: batch of inputs accepted by the encoder.

        Returns:
            Tuple ``(z_mu, z_rho, decoded_img)``: the two encoder heads and
            the decoder output for the sampled latent vector.
        """
        z_mu, z_rho = self.encoder_block(img)

        # Reparameterisation: z = mu + softplus(rho) * eps with eps ~ N(0, 1).
        # tf.shape gives the runtime shape; the static `z_mu.shape` has a None
        # batch dimension when the model is traced, which tf.random.normal
        # cannot accept.
        epsilon = tf.random.normal(shape=tf.shape(z_mu), mean=0.0, stddev=1.0)
        z = z_mu + tf.math.softplus(z_rho) * epsilon

        decoded_img = self.decoder_block(z)

        return z_mu, z_rho, decoded_img