Deep artificial neural network for expressive timing and dynamics predictions in musical pieces
---------------

This notebook loads the data generated by the note-level processing notebook and uses it to train and test a long-sequence artificial neural network that predicts the onset timing deviation and peak loudness level of notes in pieces from the MusicNet dataset.


In [None]:
### Parameters to set:

runLocal = True  # False for using Google Colab
BATCH_SIZE = 128  # windows per mini-batch (default batch size of DataGenerator)
seq_length = 200  # rows per input/output window fed to the model
decoder_units = 128  # state size of the decoder LSTM
output_cols = ['peakLevel']  # target column(s) selected from each piece's Y frame
lr = 3e-3  # learning rate passed to the Adam optimizer

#### Load and preprocess training data

In [None]:
import numpy as np
import pandas as pd
import pickle

#  Locate the dataset: local folder, or the mounted Drive folder on Colab
pathRoot = 'data/' if runLocal else '/content/drive/My Drive/colab_data/'

#  NOTE: unpickling can execute arbitrary code -- only load files you produced yourself

#  Note sequences, later split into training and validation sets
with open(pathRoot + 'note_sequences.data', 'rb') as seq_file:
    sequences = pickle.load(seq_file)

#  Lexicon -> index dictionary, plus its inverse for decoding indices
with open(pathRoot + 'note_sequences_dict.data', 'rb') as dict_file:
    lex_to_ix = pickle.load(dict_file)
ix_to_lex = {index: token for token, index in lex_to_ix.items()}

#  Normalisation moments, re-keyed by column name
with open(pathRoot + 'normalizer.data', 'rb') as norm_file:
    moments, cols = pickle.load(norm_file)
moments = dict(zip(cols, list(moments)))

In [None]:
### Preparing training/validation split

np.random.seed(1728)  # seed numpy's RNG before shuffling

#  Shuffle the pieces, then keep the first 90 % for training
np.random.shuffle(sequences)
val_split_ix = int(0.9*len(sequences))

#  Training set: every entry of every training piece
train = []
for (s, _) in sequences[:val_split_ix]:
    train.extend(s)

#  Validation set: only the entries with tr == 0 (no transposition); the third
#  tuple field is replaced by the piece-level value p
val = []
for (sv, p) in sequences[val_split_ix:]:
    for (x, y, tr, i, mm) in sv:
        if tr != 0:
            continue
        val.append((x, y, p, i, mm))

# sequences = None  # if you need a bit more memory, to allow garbage collection

# NOTE: the two lines below are ACTIVE and shrink the dataset for quick testing;
# comment them out to train and validate on all of the data
train = train[:10]
val = val[:3]

#### Define the neural network

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Input, Model
from tensorflow.keras.utils import Sequence

class Attention(layers.Layer):
    """Additive attention producing one context vector from a sequence.

    The layer is called on a two-element list ``[query, values]``:

    * ``query``  -- 2-D tensor ``(batch, units)``; it is repeated ``tx`` times
      so that it can be concatenated with every step of ``values``.
    * ``values`` -- 3-D tensor ``(batch, tx, features)``.

    A small feed-forward network scores every time step, the scores are
    normalised over the time axis, and the weighted sum of ``values`` is
    returned with shape ``(batch, 1, features)``.

    Args:
        tx: number of time steps in ``values``.
    """

    def __init__(self, tx):
        super(Attention, self).__init__()
        self.repeat = layers.RepeatVector(tx)
        self.concat = layers.Concatenate(axis=-1)
        self.ffn = tf.keras.Sequential(
            [layers.Dense(10, activation="tanh"), layers.Dense(1, activation="relu"),
             # The scores have shape (batch, tx, 1).  The softmax must therefore
             # run over the time axis (axis=1): with the default axis=-1 it
             # normalises a single element and every weight becomes exactly 1.
             layers.Softmax(axis=1)]
        )
        self.project = layers.Dot(axes=1)

    def call(self, inputs):
        query = self.repeat(inputs[0])           # (batch, tx, units)
        act = self.concat([query, inputs[1]])    # (batch, tx, units + features)
        act = self.ffn(act)                      # (batch, tx, 1) attention weights
        ctx = self.project([act, inputs[1]])     # (batch, 1, features)
        return ctx

class Decoder(layers.Layer):
    """Attention decoder emitting ``tsteps`` output vectors of size ``n_y``.

    At every step the current hidden state queries the encoder activations
    through ``Attention``, the resulting context is fed to the LSTM for one
    step, and the new hidden state is mapped to ``n_y`` values by a small
    feed-forward network.  The per-step outputs are concatenated along the
    time axis, giving a tensor of shape ``(batch, tsteps, n_y)``.

    Args:
        tsteps: number of decoding steps; also passed to ``Attention`` as the
            length of the encoder sequence.
        decoder_units: size of the LSTM state.
        n_y: number of values predicted per step.
    """

    def __init__(self, tsteps, decoder_units, n_y):
        super().__init__()
        self.tsteps = tsteps
        self.decoder_units = decoder_units
        self.att = Attention(tsteps)
        self.rnn = layers.LSTM(decoder_units, return_state=True)
        self.ff = tf.keras.Sequential(
            [layers.Dense(10*n_y, activation="tanh"), layers.Dense(n_y)]
        )
        self.reshaper = layers.Reshape((1, n_y))
        self.cat = layers.Concatenate(axis=1)

    def call(self, inputs):
        # Hidden and cell states start at zero for every sequence of the batch
        n_batch = tf.keras.backend.shape(inputs)[0]
        h = tf.zeros((n_batch, self.decoder_units))
        c = tf.zeros((n_batch, self.decoder_units))

        step_outputs = []
        for _ in range(self.tsteps):
            ctx = self.att([h, inputs])
            # With return_state=True the LSTM yields (output, state_h, state_c);
            # the output is carried on as the next hidden state
            lstm_out = self.rnn(ctx, initial_state=[h, c])
            h, c = lstm_out[0], lstm_out[2]
            step_outputs.append(self.reshaper(self.ff(h)))
        return self.cat(step_outputs)
    
def my_model(tx, ty, n_x, n_y, vocab_col, vocab_size, decoder_units):
    """Build the encoder / attention-decoder network.

    Args:
        tx: length of the input sequences.
        ty: number of decoding steps (length of the output sequences).
        n_x: number of input columns, including the vocabulary-index column.
        n_y: number of values predicted per step.
        vocab_col: position of the column holding vocabulary indices.
        vocab_size: ``input_dim`` of the embedding layer.
        decoder_units: size of the decoder LSTM state.

    Returns:
        A Keras ``Model`` taking ``(batch, tx, n_x)`` inputs and producing
        ``(batch, ty, n_y)`` outputs.
    """
    #  Every column except the vocabulary one is treated as numeric
    numeric_cols = list(range(n_x))
    numeric_cols.remove(vocab_col)

    def take_vocab_column(x):
        return x[:, :, vocab_col]

    def take_numeric_columns(x):
        return tf.gather(x, numeric_cols, axis=2)

    X = Input((tx, n_x))

    #  Split the input into the vocabulary-index column and the numeric features
    vocab_ix = layers.Lambda(take_vocab_column)(X)
    numeric = layers.Lambda(take_numeric_columns)(X)

    #  Embed the indices and put the embedding next to the numeric features
    embedded = layers.Embedding(input_dim=vocab_size, output_dim=64, mask_zero=True)(vocab_ix)
    encoder_in = layers.Concatenate(axis=2)([embedded, numeric])

    #  Encode
    encoded = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(encoder_in)
    encoded = layers.Dropout(0.15)(encoded)
    encoded = layers.BatchNormalization()(encoded)

    #  Decode with Bahdanau-style attention
    Y = Decoder(ty, decoder_units, n_y)(encoded)

    return Model(inputs=X, outputs=Y)

In [None]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` serving fixed-length windows cut from whole pieces.

    Every element of ``data`` is a 5-tuple whose first two fields are pandas
    DataFrames ``X`` (inputs) and ``Y`` (targets) with one row per step.
    Each piece is cut into windows of ``sequence_length`` rows starting every
    ``sequence_stride`` rows; a window reaching past the end of its piece is
    padded with ``pad_value`` (0).

    Args:
        data: list of ``(X, Y, _, _, _)`` tuples.
        sequence_length: number of rows per window.
        batch_size: number of windows per mini-batch.
        sequence_stride: distance in rows between consecutive window starts.
        shuffle: reshuffle the windows at the end of every epoch.  The windows
            are always shuffled once at construction, whatever this flag says.
        output_sequence: if True the target is the whole window
            ``(batch, sequence_length, n_cols)``; otherwise only its last row
            ``(batch, n_cols)``.
        output_cols: target columns to select from ``Y``; all columns of the
            first piece when None.
        mini_batch_limit: upper bound on the number of batches per epoch.
    """

    def __init__(self, data, sequence_length, batch_size=BATCH_SIZE, sequence_stride=1,
                 shuffle=True, output_sequence=True, output_cols=None,
                 mini_batch_limit=np.inf):
        super().__init__()
        self.data = data
        self.sequence_length = sequence_length
        self.batch_size = batch_size
        self.sequence_stride = sequence_stride
        self.shuffle = shuffle
        self.output_sequence = output_sequence
        self.pad_value = 0.
        self.mini_batch_limit = mini_batch_limit
        self.indexes = []  # (piece index, first row of the window) pairs
        if output_cols is None:
            self.output_cols = data[0][1].columns
        else:
            self.output_cols = output_cols
        for si, s in enumerate(data):
            remaining = s[0].shape[0]
            start = 0
            # Windows that fit entirely in what is left of the piece...
            while remaining > sequence_length:
                self.indexes.append((si, start))
                start += sequence_stride
                remaining -= sequence_stride
            # ...followed by one last window, padded later if it is too short
            self.indexes.append((si, start))
        np.random.shuffle(self.indexes)  # always shuffle once

    def __len__(self):
        # Number of full batches (a trailing partial batch is not served),
        # capped by mini_batch_limit
        return int(np.min([len(self.indexes) / self.batch_size, self.mini_batch_limit]))

    def __getitem__(self, index):
        first = index * self.batch_size
        this_size = min(self.batch_size, len(self.indexes) - first)
        X = np.zeros((this_size, self.sequence_length, self.data[0][0].shape[1]))
        Y = np.zeros((this_size, self.sequence_length, len(self.output_cols)))
        for i in range(this_size):
            # A single-row target (output_sequence=False) is broadcast over
            # the window here; only its last row is returned below
            X[i, :, :], Y[i, :, :] = self.__getsingleitem(first + i)
        if self.output_sequence:
            return X, Y
        return X, Y[:, -1, :]

    def __getsingleitem(self, index):
        """Return the ``(X, Y)`` arrays of window number ``index``."""
        (seq, start) = self.indexes[index]
        (X, Y, _, _, _) = self.data[seq]
        Y = Y.loc[:, self.output_cols]
        stop = start + self.sequence_length

        if stop <= X.shape[0]:
            # The window lies entirely inside the piece
            X = X.iloc[start:stop, :].to_numpy(dtype='float64')
            if self.output_sequence:
                Y = Y.iloc[start:stop, :].to_numpy(dtype='float64')
            else:
                Y = Y.iloc[stop - 1, :].to_numpy(dtype='float64').reshape((1, len(self.output_cols)))
            return X, Y

        # The window runs past the end of the piece: pad at the end
        X = X.iloc[start:, :].to_numpy(dtype='float64')
        padX = np.full((self.sequence_length - X.shape[0], X.shape[1]), self.pad_value)
        X = np.concatenate((X, padX), axis=0)
        if self.output_sequence:
            Y = Y.iloc[start:, :].to_numpy(dtype='float64')
            padY = np.full((self.sequence_length - Y.shape[0], Y.shape[1]), self.pad_value)
            return X, np.concatenate((Y, padY), axis=0)
        # The last row of the window is padding, so the single-step target is
        # the pad value.  (This branch used to fall through and return None.)
        padY = np.full((1, Y.shape[1]), self.pad_value)
        return X, padY

    def on_epoch_end(self):
        """Updates indexes after each epoch
        """
        if self.shuffle:
            np.random.shuffle(self.indexes)


In [None]:
#  Settings shared by the training and validation generators; shuffle=False
#  disables the per-epoch reshuffle
generator_options = dict(batch_size=BATCH_SIZE, output_sequence=True,
                         output_cols=output_cols, shuffle=False)

#  At most 20 training and 10 validation mini-batches per epoch
generator = DataGenerator(train, seq_length, mini_batch_limit=20, **generator_options)
val_gen = DataGenerator(val, seq_length, mini_batch_limit=10, **generator_options)

In [None]:
#  Input and output sequences have the same length; the vocabulary indices
#  live in the "melody" column of the feature frames
model = my_model(
    tx=seq_length,
    ty=seq_length,
    n_x=train[0][0].shape[1],
    n_y=len(output_cols),
    vocab_col=train[0][0].columns.get_loc("melody"),
    vocab_size=len(ix_to_lex) + 3,
    decoder_units=decoder_units,
)

#  Mean squared error loss, optimised with Adam
opt = tf.keras.optimizers.Adam(learning_rate=lr)
model.compile(optimizer=opt, loss="mse")

model.summary()

#### Train the model

In [None]:
#  Train for a single epoch on `generator`, using `val_gen` as validation data
model.fit(generator, epochs=1, validation_data=val_gen)

#  Save model
# model.save_weights(pathRoot + '2020-08-31_timing.h5')

#### Results

In [None]:
# Load model (uncomment to evaluate previously saved weights instead of the
# model trained above)
# model.load_weights(pathRoot + '2020-09-03_huge.h5')

#  Compute note-level error

# validation data: evaluate on the pieces held out by the train/validation split
test_sequences = val

# test data: alternative to the validation set -- uncomment to evaluate on the
# separately pickled test pieces, keeping only the entries with tr == 0
# with open(pathRoot + 'note_sequences_test.data', 'rb') as seq_path:
#     test_sequences = pickle.load(seq_path)
#     ts = []
#     for (sv, p) in test_sequences:
#         for (x, y, tr, i, mm) in sv:
#             if tr == 0:
#                 ts.append((x, y, p, i, mm))
#     test_sequences = ts

def evaluation(sequences, sequence_length, model, pad_value=0.):
    """Predict every piece by cutting it into consecutive, non-overlapping windows.

    Args:
        sequences: iterable of tuples whose first element is a DataFrame of
            input features (one row per step).
        sequence_length: number of rows per window.
        model: object exposing ``predict``; it is called once per piece on an
            array of shape ``(n_windows, sequence_length, n_features)``.
        pad_value: filler for the unused tail of the last window.

    Returns:
        One 2-D array per piece, with ``n_windows * sequence_length`` rows, so
        it may be longer than the piece when the last window was padded.
    """
    predictions = []
    for piece in sequences:
        features = piece[0].to_numpy()
        n_rows, n_features = features.shape

        # Number of windows needed to cover the piece (rounded up)
        n_windows, leftover = divmod(n_rows, sequence_length)
        if leftover:
            n_windows += 1

        batch = np.full((n_windows, sequence_length, n_features), pad_value)
        # All windows but the last are completely filled...
        for w in range(n_windows - 1):
            first = w * sequence_length
            batch[w, :, :] = features[first:first + sequence_length, :]
        # ...and the last one holds whatever is left, followed by padding
        tail_start = (n_windows - 1) * sequence_length
        batch[n_windows - 1, :n_rows - tail_start, :] = features[tail_start:, :]

        y = model.predict(batch)
        print(y.shape)
        predictions.append(y.reshape((-1, y.shape[2])))
    return predictions

def sliding_evaluation(sequences, sequence_length, model, pad_value=0., pad_start=True):
    """Predict every piece with a window sliding one row at a time.

    With ``pad_start=True`` there is one window per row: window ``i`` ends at
    row ``i`` and is left-padded with ``pad_value`` while fewer than
    ``sequence_length`` rows are available.  With ``pad_start=False`` only
    complete windows are used (``tx - sequence_length + 1`` of them).

    Args:
        sequences: iterable of tuples whose first element is a DataFrame of
            input features (one row per step).
        sequence_length: number of rows per window.
        model: object exposing ``predict``; called once per piece on an array
            of shape ``(n_windows, sequence_length, n_features)``.
        pad_value: filler used for the left padding.
        pad_start: see above.

    Returns:
        One 2-D array per piece.  For sequence outputs it has one row per
        input row: the last step of every window, preceded -- when
        ``pad_start`` is False -- by the remaining steps of the first window.
        2-D (single-step) model outputs are returned unchanged.

    Raises:
        ValueError: if ``pad_start`` is False and a piece is shorter than
            ``sequence_length``.
    """
    Yhat = []
    for S in sequences:
        X = S[0].to_numpy()
        tx = X.shape[0]
        if pad_start:
            n_x = tx
        else:
            if tx < sequence_length:
                raise ValueError(
                    'piece has %d rows, fewer than sequence_length=%d; '
                    'use pad_start=True' % (tx, sequence_length))
            n_x = tx - sequence_length + 1
        x = np.full((n_x, sequence_length, X.shape[1]), pad_value)

        if pad_start:
            # Left-padded warm-up windows; a piece shorter than
            # sequence_length only has warm-up windows
            n_warmup = min(sequence_length, tx)
            for i in range(n_warmup):
                x[i, sequence_length - i - 1:, :] = X[:i + 1, :]
            idx = n_warmup
        else:
            x[0, :, :] = X[:sequence_length, :]
            idx = 1

        # Remaining complete windows.  The upper bound is inclusive of the
        # window starting at tx - sequence_length, i.e. the one ending on the
        # last row (it used to be skipped, leaving an all-padding window).
        for i in range(1, tx - sequence_length + 1):
            x[idx, :, :] = X[i:i + sequence_length, :]
            idx += 1

        y = model.predict(x)
        if y.ndim < 3:  # single timestep prediction
            Yhat.append(y)
        elif pad_start:
            Yhat.append(y[:, -1, :])
        else:
            Yhat.append(np.concatenate((y[0, :, :], y[1:, -1, :])))
    return Yhat

#  Predict every test piece, then compute per-piece, per-output-column errors
Yhat = evaluation(test_sequences, seq_length, model)
mse = np.zeros((len(test_sequences), Yhat[0].shape[1]))  # mean squared error
ms = np.zeros((len(test_sequences), Yhat[0].shape[1]))   # mean square of the targets
for i, (_, Y, _, _, _) in enumerate(test_sequences):
    # Work on plain arrays and average over rows explicitly (axis=0):
    # np.mean on a DataFrame returns per-column means or a single scalar
    # depending on the pandas version.
    Y = Y.loc[:, output_cols].to_numpy(dtype='float64')
    # Predictions are padded to a whole number of windows: drop the extra rows
    mse[i, :] = np.mean((Yhat[i][:Y.shape[0], :] - Y) ** 2, axis=0)
    ms[i, :] = np.mean(Y ** 2, axis=0)

print('Validation set MSE for y_0: ' + str(np.mean(mse[:,0])) + '     mean square val: ' + str(np.mean(ms[:,0])))
print('Minimum y_0 MSE among pieces: ' + str(mse[:,0].min()))

In [None]:
import matplotlib.pyplot as plt

#  Per-piece MSE of the first output column (first curve) against the
#  per-piece mean square of its target values (second curve)
plt.plot(mse[:,0])
plt.plot(ms[:,0])

In [None]:
import matplotlib.pyplot as plt

#  Overlay the prediction and the recorded values for one piece
piece = 0  # index of the piece in test_sequences / Yhat
attr = ['peakLevel']  # target column plotted as the reference curve
plt.figure(figsize=(21, 5))
#  Prediction for the first output column; it is padded to a whole number of
#  windows by evaluation(), so it can be longer than the reference curve
plt.plot(Yhat[piece][:,0])
plt.plot(test_sequences[piece][1].loc[:,attr].to_numpy())
# print(test_sequences[piece][1].columns[attr])
plt.show()

#### Listen to a synthesized predicted expression

In [None]:
import pretty_midi
import IPython.display

test_sequences = val

# piece to synthesize:
# NOTE(review): if `val` is still truncated to 3 pieces by the split cell,
# index 27 is out of range -- confirm before running
pieceNum = 27
pieceId = test_sequences[pieceNum][2]
print(pieceId)

# Element [4] of a piece is indexed as a 2-D array [row, col]; values are
# de-normalised as value * [row, 1] + [row, 0].  Row 2 is used for the timing
# deviations and row 0 for the local tempo -- presumably (offset, scale) pairs
# per target column; TODO confirm against the preprocessing notebook.
# NOTE(review): deviations_pred rescales column 0 of Yhat with the row-2
# moments; with output_cols = ['peakLevel'] that column is a loudness
# prediction, not a timing one -- verify before using it.
deviations_pred = Yhat[pieceNum][:,0] * test_sequences[pieceNum][4][2,1] + test_sequences[pieceNum][4][2,0]
deviations_perf = test_sequences[pieceNum][1].ioiRatio * test_sequences[pieceNum][4][2,1] + test_sequences[pieceNum][4][2,0]
tempo = test_sequences[pieceNum][1].localTempo.iloc[0] * test_sequences[pieceNum][4][0,1] + test_sequences[pieceNum][4][0,0]
# Alternative deviation vectors (constant and random); like deviations_pred
# they are not used below -- only deviations_perf is passed to zip()
no_dev = [test_sequences[pieceNum][4][2,0]] * test_sequences[pieceNum][1].shape[0]
dev_rand = np.random.normal(size=test_sequences[pieceNum][1].shape[0]) * test_sequences[pieceNum][4][2,1] + test_sequences[pieceNum][4][2,0]
# presumably `tempo` is in beats per second, hence the factor 60 -- TODO confirm
pm = pretty_midi.PrettyMIDI(initial_tempo=60 * tempo)
# Element [3] of the piece is used as the MIDI program number
inst = pretty_midi.Instrument(program=test_sequences[pieceNum][3], is_drum=False, name='melody_inst')
pm.instruments.append(inst)
start = 0.  # onset time of the next note
lastNote = None
# One note per row; `y` (the target row) is unpacked but not used in the loop
for x, y, dev in zip(test_sequences[pieceNum][0].itertuples(), test_sequences[pieceNum][1].itertuples(), deviations_perf):
    # The first element of the lexicon entry is used as the MIDI pitch
    (pitch, _) = ix_to_lex[x.melody]
    if lastNote:
        # Cut the previous note short if it would overlap this onset
        if start < lastNote.end:
            lastNote.end = start
    # De-normalised duration, scaled by the deviation factor
    end = start + (x.duration * moments['duration'][1] + moments['duration'][0]) * dev
    # Fixed velocity of 100 for every note
    lastNote = pretty_midi.Note(100, pitch, start, end)
    inst.notes.append(lastNote)
    # Advance by the de-normalised inter-onset interval, scaled the same way
    start += (x.ioi * moments['ioi'][1] + moments['ioi'][0]) * dev
# Render with fluidsynth at 44.1 kHz and show an audio player
IPython.display.Audio(pm.fluidsynth(fs=44100), rate=44100)

### Building conductive input from generated performance

This step uses the predicted timing information to build a local tempo signal which can be used as input in a virtual conductor. That signal is compared to the local tempo vector obtained from the chosen reference performance from the dataset.