Deep artificial neural network for expressive timing and dynamics predictions in musical pieces
---------------

This notebook loads a sequential dataset with score and performance information and uses it to train and test a deep artificial neural network that predicts the onset timing deviation and the peak loudness level of each note in a musical piece.


#### Preparing to install XLA (for training on TPUs) and pytorch-lightning (skip if not using Google Colab):

In [None]:
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py

!python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 libopenblas-dev

In [None]:
%%capture
! pip install pytorch_lightning --upgrade

In [None]:
# Colab only: mount Google Drive at /content/drive so that the dataset folder
# used below when runLocal is False ('/content/drive/My Drive/colab_data/')
# is reachable.
from google.colab import drive
drive.mount('/content/drive')

#### Setting path and loading dataset

In [1]:
import os
import numpy as np
import pandas as pd
import pickle

### Parameters to set:

runLocal = True  # set to False for using Google Colab

# Dataset root: a local folder, or the mounted Google Drive folder on Colab.
pathRoot = 'data/' if runLocal else '/content/drive/My Drive/colab_data/'

# The train and validation sequences are stored as pickle files under pathRoot.
# NOTE: pickle executes arbitrary code on load; only open files you trust.
with open(os.path.join(pathRoot, 'LvB_train_sequences.data'), 'rb') as seq_path:
    train = pickle.load(seq_path)
with open(os.path.join(pathRoot, 'LvB_val_sequences.data'), 'rb') as seq_path:
    val = pickle.load(seq_path)

#### Defining the neural network

In [2]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pytorch_lightning as pl

pl.seed_everything(1728)

class Encoder(nn.Module):
    """Bidirectional GRU encoder for the score features.

    The pitch token of every note is embedded, the remaining ``n_x - 1``
    features are linearly projected to the same width, and the concatenation
    of both is fed to a single-layer bidirectional GRU.

    Args:
        n_x: total number of input features per note, including the pitch
            token (so the projector receives ``n_x - 1`` features).
        vocab_size: number of entries in the pitch embedding table; index 0
            is the padding index.
        embed_size: width of the pitch embedding and of the feature
            projection.
        dropout: dropout probability applied before each layer norm.
    """

    def __init__(self, n_x, vocab_size, embed_size, dropout):
        super().__init__()

        # NOTE: submodules are created in the same order as before so that a
        # seeded parameter initialisation yields the same weights.
        self.pitchEmbedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size, padding_idx=0)
        self.harmonyRhythmProjector = nn.Linear(in_features=n_x - 1, out_features=embed_size)
        self.drop1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(2 * embed_size)
        self.rnn = nn.GRU(2 * embed_size, 2 * embed_size, num_layers=1, bidirectional=True)
        self.drop2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(4 * embed_size)

    def forward(self, pitch, harmRhythm, lengths):
        """Encode a padded, time-major batch of score sequences.

        Args:
            pitch: integer tensor of pitch tokens, (seq_len, batch).
            harmRhythm: float tensor with the other features,
                (seq_len, batch, n_x - 1).
            lengths: valid length of every sequence in the batch, as accepted
                by ``pack_padded_sequence``.

        Returns:
            Tensor of shape (max(lengths), batch, 4 * embed_size).
        """
        pitch = self.pitchEmbedding(pitch)
        harmRhythm = self.harmonyRhythmProjector(harmRhythm)
        # Both halves come out of this module's own layers, so they already
        # share dtype and device. The former `src_vec.type_as(harmRhythm)`
        # call discarded its result and had no effect; it has been removed.
        src_vec = torch.cat([pitch, harmRhythm], dim=2)
        src_vec = self.norm1(self.drop1(src_vec))
        # Packing makes the GRU skip the padded time steps of short sequences.
        sequence = nn.utils.rnn.pack_padded_sequence(src_vec, lengths, enforce_sorted=False)
        output, _ = self.rnn(sequence)
        output, _ = nn.utils.rnn.pad_packed_sequence(output)
        return self.norm2(self.drop2(output))
    

class Decoder(nn.Module):
    """Attention-based GRU decoder that emits one note's outputs per call.

    Args:
        n_y: number of values predicted per note.
        hidden_size: width of the decoder GRU state and of the attention query.
        enc_hidden_size: feature width of the encoder output (attention
            keys and values).
        dropout: dropout probability shared by the three dropout layers.
    """

    def __init__(self, n_y, hidden_size, enc_hidden_size, dropout=0.1):
        super().__init__()

        self.hidden_size = hidden_size
        # Layers are instantiated in a fixed order so that seeded weight
        # initialisation is reproducible.
        self.y_proj = nn.Linear(n_y, hidden_size)
        self.drop1 = nn.Dropout(dropout)
        self.attention = nn.MultiheadAttention(
            hidden_size, 1, kdim=enc_hidden_size, vdim=enc_hidden_size
        )
        self.rnn = nn.GRU(2 * hidden_size, hidden_size, num_layers=1)
        self.drop2 = nn.Dropout(dropout)
        self.norm = nn.LayerNorm(hidden_size)
        self.ff1 = nn.Linear(3 * hidden_size, 2 * hidden_size)
        self.drop3 = nn.Dropout(dropout)
        self.ff2 = nn.Linear(2 * hidden_size, n_y, bias=False)

    def forward(self, y_prev, encoder_out, dec_hidden):
        """Run a single decoding step (one note).

        Args:
            y_prev: output of the previous step, (1, batch, n_y).
            encoder_out: encoded score, (len_seq, batch, enc_hidden_size).
            dec_hidden: current GRU state, (1, batch, hidden_size).

        Returns:
            Tuple ``(out, new_dec_hidden)`` with shapes (1, batch, n_y) and
            (1, batch, hidden_size).
        """
        prev_features = self.drop1(self.y_proj(y_prev))  # (1, batch, hidden)

        # The decoder state queries the whole encoded score.
        attended, _weights = self.attention(
            query=dec_hidden, key=encoder_out, value=encoder_out
        )

        gru_input = torch.cat((prev_features, attended), dim=2)
        gru_out, next_hidden = self.rnn(gru_input, dec_hidden)
        gru_out = self.norm(self.drop2(gru_out))

        # The read-out sees the GRU output together with both of its inputs.
        readout_input = torch.cat((gru_out, prev_features, attended), dim=2)
        activated = F.relu(self.ff1(readout_input))
        prediction = self.ff2(self.drop3(activated))
        return prediction, next_hidden

class Net(pl.LightningModule):
    """Encoder-decoder network mapping a score to per-note expressive outputs.

    Args:
        n_x: number of input features per note, including the pitch token.
        n_y: number of values predicted per note.
        vocab_size: size of the pitch vocabulary used by the encoder embedding.
        hidden_size: decoder state width; must be even because the encoder
            embedding width is ``hidden_size // 2``.
        dropout_rate: dropout probability used in encoder and decoder.
        lr: initial learning rate for Adam.
    """

    def __init__(self, n_x, n_y, vocab_size, hidden_size=64, dropout_rate=0.1, lr=1e-4):
        super().__init__()

        assert hidden_size % 2 == 0, "hidden_size must be multiple of 2"

        self.n_y = n_y
        self.hidden_size = hidden_size
        self.lr = lr
        # Seed the teacher-forcing RNG from torch's seed so that
        # pl.seed_everything() also makes the forcing decisions reproducible
        # (an unseeded default_rng() draws fresh OS entropy on every run).
        self.rng = np.random.default_rng(torch.initial_seed())

        # The bidirectional encoder outputs 4 * (hidden_size // 2) = 2 * hidden_size features.
        self.encoder = Encoder(n_x, vocab_size, hidden_size // 2, dropout_rate)
        self.decoder = Decoder(self.n_y, hidden_size, 2 * hidden_size, dropout_rate)

    def _decode(self, encoded_score, like, n_steps, y_true=None, teacher_forcing_ratio=0.0):
        """Autoregressively decode ``n_steps`` notes from an encoded score.

        Args:
            encoded_score: encoder output, (len_seq, batch, 2 * hidden_size).
            like: tensor whose dtype and device are used for every buffer.
            n_steps: number of decoding steps.
            y_true: optional ground truth (n_steps, batch, n_y) used for
                teacher forcing.
            teacher_forcing_ratio: probability of feeding the ground truth
                (instead of the prediction) to the next step.

        Returns:
            Predictions of shape (n_steps, batch, n_y).
        """
        batch_size = encoded_score.shape[1]
        # new_zeros() allocates on the same device/dtype as `like`. The old
        # `torch.zeros(...)` followed by a discarded `.type_as(...)` left the
        # buffers on CPU, which breaks on TPU/GPU.
        hidden = like.new_zeros((1, batch_size, self.hidden_size))
        prev_y = like.new_zeros((1, batch_size, self.n_y))
        y_hat = like.new_zeros((n_steps, batch_size, self.n_y))
        for i in range(n_steps):
            prev_y, hidden = self.decoder(prev_y, encoded_score, hidden)
            y_hat[i] = prev_y[0]
            if y_true is not None and self.rng.random() < teacher_forcing_ratio:
                prev_y = y_true[i].unsqueeze(0)
        return y_hat

    def forward(self, pitch, harmRhythm, lengths):
        """
        Generate the entire sequence.

        Args:
            pitch: pitch tokens, (seq_len, batch).
            harmRhythm: remaining score features, (seq_len, batch, n_x - 1).
            lengths: valid length of each sequence in the batch.

        Returns:
            Predictions of shape (seq_len, batch, n_y).
        """
        encoded_score = self.encoder(pitch, harmRhythm, lengths)
        return self._decode(encoded_score, harmRhythm, pitch.shape[0])

    def training_step(self, batch, batch_idx):
        """
        This method doesn't use self.forward directly so we can apply teacher forcing
        on a fraction of the steps.
        """
        pitch, harmRhythm, y, lengths = batch
        # encode x (score)
        encoded_score = self.encoder(pitch, harmRhythm, lengths)

        # Fraction of steps whose next input is the ground truth. The previous
        # test (`rng.random() > ratio`) forced with probability 1 - ratio.
        teacher_forcing_ratio = 0.5

        y_hat = self._decode(encoded_score, harmRhythm, pitch.shape[0],
                             y_true=y, teacher_forcing_ratio=teacher_forcing_ratio)
        loss = F.mse_loss(y_hat, y)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        return {'loss': loss}

    def validation_step(self, batch, batch_idx):
        """Compute the MSE of a free-running (no teacher forcing) prediction."""
        pitch, harmRhythm, y, lengths = batch

        y_hat = self.forward(pitch, harmRhythm, lengths)
        loss = F.mse_loss(y_hat, y)
        # Log the metric so it shows up in the progress bar and in TensorBoard;
        # returning the dict alone does not record it.
        self.log('val_loss', loss, prog_bar=True)
        return {'val_loss': loss}

    def configure_optimizers(self):
        """Adam, with the learning rate multiplied by 0.25 at every scheduler step."""
        optimizer = optim.Adam(self.parameters(), lr=self.lr)
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.25)
        return [optimizer], [scheduler]


In [3]:
# NOTE: pickle executes arbitrary code on load; only open files you trust.
# Paths are built with os.path.join (as for the sequence files) so they do not
# depend on pathRoot ending with a slash.

# Pitch vocabulary (token -> index) and its inverse.
with open(os.path.join(pathRoot, 'LvB_pitch_dict.data'), 'rb') as filehandle:
    lex_to_ix = pickle.load(filehandle)
    ix_to_lex = {v: k for k, v in lex_to_ix.items()}
# Normalisation moments, re-keyed by column name.
with open(os.path.join(pathRoot, 'LvB_normalizer.data'), 'rb') as filehandle:
    moments, cols = pickle.load(filehandle)
    moments = dict(zip(cols, moments))
with open(os.path.join(pathRoot, 'LvB_test_sequences.data'), 'rb') as seq_path:
    test = pickle.load(seq_path)

output_cols = ['ioiRatio', 'peakLevel']

# The input width is taken from the first test piece's score table.
# The "+ 3" presumably reserves ids for special tokens (padding etc.);
# TODO confirm against dataloader.py.
model = Net(test[0][0][0].shape[1],
            len(output_cols),
            vocab_size=len(ix_to_lex) + 3,
            hidden_size=32,
            dropout_rate=0.1,
            lr=3e-3)

#### Train the model

In [4]:
if not runLocal:
    !wget "https://raw.githubusercontent.com/fabiozeh/deep-expression/master/dataloader.py"

from torch.utils.data import DataLoader
import dataloader as dl

if runLocal:
    # fast_dev_run pushes a single batch through train/val as a smoke test.
    trainer = pl.Trainer(max_epochs=1, fast_dev_run=True)
    workers = 4
else:
    trainer = pl.Trainer(tpu_cores=8, progress_bar_refresh_rate=20, max_epochs=1)
    workers = 8

# Position of the pitch token column inside a score table.
pitch_col = test[0][0][0].columns.get_loc("pitch")


def _make_loader(sequences, shuffle=False):
    """Wrap a list of sequences in a DataGenerator-backed DataLoader."""
    dataset = dl.DataGenerator(sequences,
                               vocab_col=pitch_col,
                               sequence_length=200,
                               output_cols=output_cols)
    return DataLoader(dataset,
                      batch_size=32,
                      shuffle=shuffle,
                      num_workers=workers,
                      collate_fn=dl.DataGenerator.collate_fn)


# Only the training data is shuffled.
trainer.fit(model, _make_loader(train, shuffle=True), _make_loader(val))

#  Save model
torch.save(model.state_dict(), os.path.join(pathRoot, '2021-01-29-test0.pth'))

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
Running in fast_dev_run mode: will run a full train, val and test loop using a single batch

  | Name    | Type    | Params
------------------------------------
0 | encoder | Encoder | 14 K  
1 | decoder | Decoder | 22 K  


HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…




In [None]:
# Start tensorboard.
%reload_ext tensorboard
%tensorboard --logdir lightning_logs/

#### Results

In [None]:
# Optionally restore previously saved weights instead of using the model
# trained above:
# model.load_state_dict(torch.load(pathRoot + '2021-01-11-test0.pth'))

# Switch the model to evaluation mode before running inference.
model.eval()

#  Compute note-level error

def evaluation(sequences, sequence_length, model, pad_value=0., vocab_col=None):
    """Predict every piece by splitting it into fixed-length chunks.

    Each piece is cut into consecutive chunks of ``sequence_length`` notes
    (the last one padded with ``pad_value``), the chunks are stacked as a
    time-major batch and passed to ``model(pitch, harmRhythm, lengths)``.

    Args:
        sequences: iterable of pieces; ``S[0][0]`` must be the score DataFrame.
        sequence_length: number of notes per chunk.
        model: callable taking ``(pitch, harmRhythm, lengths)`` and returning
            a tensor of shape (sequence_length, n_chunks, n_outputs).
        pad_value: filler for the unused tail of the last chunk.
        vocab_col: positional index of the pitch token column. Defaults to
            the position of the column named "pitch".

    Returns:
        List with one array per piece, of shape
        (n_chunks * sequence_length, n_outputs), rows in note order. Rows
        beyond the piece length belong to padding.

    Raises:
        ValueError: if a piece contains no notes.
    """
    Yhat = []
    for S in sequences:
        X = S[0][0]
        tx = X.shape[0]
        if tx == 0:
            raise ValueError("cannot evaluate an empty sequence")
        n_x = -(-tx // sequence_length)  # ceil(tx / sequence_length)
        pitch_col = X.columns.get_loc("pitch") if vocab_col is None else vocab_col

        # Pad to a whole number of chunks, then view as (time, chunk, feature).
        padded = np.full((n_x * sequence_length, X.shape[1]), pad_value)
        padded[:tx, :] = X.to_numpy()
        x = padded.reshape(n_x, sequence_length, X.shape[1]).transpose(1, 0, 2)

        # The model takes the pitch tokens separately from the other features.
        # NOTE(review): assumes the remaining columns keep their original
        # order, as dataloader.DataGenerator presumably does - confirm.
        pitch = torch.as_tensor(np.ascontiguousarray(x[:, :, pitch_col]), dtype=torch.long)
        harmRhythm = torch.as_tensor(np.delete(x, pitch_col, axis=2), dtype=torch.float32)
        # Report the true length of the last chunk so the encoder can skip its padding.
        lengths = (n_x - 1) * [sequence_length] + [tx - (n_x - 1) * sequence_length]

        with torch.no_grad():
            y = model(pitch, harmRhythm, lengths)
        y = y.detach().cpu().numpy()
        # The output is time-major; put chunks first before flattening,
        # otherwise notes of different chunks end up interleaved.
        Yhat.append(y.transpose(1, 0, 2).reshape((-1, y.shape[2])))
    return Yhat

def sliding_evaluation(sequences, sequence_length, model, pad_value=0., pad_start=True):
    """Predict every piece with a sliding window of ``sequence_length`` notes.

    NOTE(review): unlike ``evaluation``, this helper reads the score from
    ``S[0]`` and calls ``model(x)`` with one batch-first array of shape
    (n_windows, sequence_length, n_features). It therefore targets a model
    with a different interface than ``Net.forward`` - confirm before use.

    Args:
        sequences: iterable of pieces; ``S[0]`` must be the score DataFrame.
        sequence_length: window size in notes.
        model: callable mapping the window batch to an array of shape
            (n_windows, sequence_length, n_outputs).
        pad_value: filler used in front of the first notes when ``pad_start``.
        pad_start: if True, one left-padded window ends at each note and only
            the last step of each window is kept. If False, only full windows
            are used: the first window contributes all of its steps and every
            later window its last step.

    Returns:
        List with one (n_notes, n_outputs) array per piece.

    Raises:
        ValueError: if ``pad_start`` is False and a piece is shorter than
            ``sequence_length``.
    """
    Yhat = []
    for S in sequences:
        X = S[0]
        values = X.to_numpy()
        tx = X.shape[0]
        if pad_start:
            # Window i ends at note i; early windows are left-padded. This
            # also covers pieces shorter than one window.
            x = np.full((tx, sequence_length, X.shape[1]), pad_value)
            for i in range(tx):
                window = values[max(0, i - sequence_length + 1):i + 1]
                x[i, sequence_length - window.shape[0]:, :] = window
        else:
            n_x = tx - sequence_length + 1
            if n_x < 1:
                raise ValueError("sequence shorter than sequence_length; use pad_start=True")
            x = np.full((n_x, sequence_length, X.shape[1]), pad_value)
            # Include the final window too: the previous loop stopped one
            # window early and left the last row as pure padding.
            for i in range(n_x):
                x[i, :, :] = values[i:i + sequence_length]
        y = model(x)
        if pad_start:
            Yhat.append(y[:, -1, :])
        else:
            Yhat.append(np.concatenate((y[0, :, :], y[1:, -1, :])))
    return Yhat

Yhat = evaluation(test, 200, model)
n_outputs = Yhat[0].shape[1]
mse = np.zeros((len(test), n_outputs))  # per piece, per output: mean squared error
ms = np.zeros((len(test), n_outputs))   # per piece, per output: mean square of the target
for i, S in enumerate(test):
    Y = S[0][1].loc[:, output_cols].to_numpy()
    # Predictions may carry padded rows past the end of the piece; drop them.
    err = Yhat[i][:Y.shape[0], :] - Y
    # Reduce over notes explicitly: np.mean() on a DataFrame returns
    # per-column means or one scalar depending on the pandas version.
    mse[i, :] = np.mean(err ** 2, axis=0)
    ms[i, :] = np.mean(Y ** 2, axis=0)

# y_0 is the first entry of output_cols. The data evaluated here is the test set.
print('Test set MSE for y_0: ' + str(np.mean(mse[:,0])) + '     mean square val: ' + str(np.mean(ms[:,0])))
print('Minimum y_0 MSE among pieces: ' + str(mse[:,0].min()))

In [None]:
import matplotlib.pyplot as plt

# Per-piece curves for the first output: the model's MSE, then the mean
# square of the target itself.
for per_piece in (mse, ms):
    plt.plot(per_piece[:, 0])

In [None]:
import matplotlib.pyplot as plt

piece = 0
attr = ['peakLevel']
# Pick the prediction column that matches `attr`. Column 0 is 'ioiRatio', so
# the previous hard-coded 0 compared predicted timing with performed loudness.
attr_col = output_cols.index(attr[0])
target = test[piece][0][1].loc[:, attr].to_numpy()
plt.figure(figsize=(21, 5))
# Predictions may carry padded rows past the end of the piece; drop them.
plt.plot(Yhat[piece][:target.shape[0], attr_col], label='predicted')
plt.plot(target, label='performed')
plt.legend()
plt.show()

#### Listen to a synthesized predicted expression

In [None]:
import pretty_midi
import IPython.display

# Yhat was computed on the test split, so the score and the performance must
# come from the same split (this used to point at `val`).
test_sequences = test

# piece to synthesize:
pieceNum = 27
pieceId = test_sequences[pieceNum][2]
print(pieceId)

piece_data = test_sequences[pieceNum][0]
score, performance, piece_moments = piece_data[0], piece_data[1], piece_data[2]

# Each row of piece_moments is used as (offset, scale) to undo normalisation.
# NOTE(review): row 0 is applied to localTempo and row 2 to ioiRatio, as in
# the original cell - confirm against the code that builds the dataset.
ioi_offset, ioi_scale = piece_moments[2, 0], piece_moments[2, 1]
ioi_col = output_cols.index('ioiRatio')

deviations_pred = Yhat[pieceNum][:, ioi_col] * ioi_scale + ioi_offset
deviations_perf = performance.ioiRatio * ioi_scale + ioi_offset
tempo = performance.localTempo.iloc[0] * piece_moments[0, 1] + piece_moments[0, 0]
no_dev = [ioi_offset] * performance.shape[0]
dev_rand = np.random.normal(size=performance.shape[0]) * ioi_scale + ioi_offset

# Timing curve to render. This section is about the *predicted* expression;
# swap in deviations_perf, no_dev or dev_rand to listen to the alternatives.
deviations = deviations_pred

pm = pretty_midi.PrettyMIDI(initial_tempo=60 * tempo)
# pretty_midi program numbers are 0-based, so resolve them by name
# (41 is Viola, not Violin).
piano = pretty_midi.Instrument(pretty_midi.instrument_name_to_program('Acoustic Grand Piano'),
                               is_drum=False, name='piano')
violin = pretty_midi.Instrument(pretty_midi.instrument_name_to_program('Violin'),
                                is_drum=False, name='violin')
pm.instruments.append(piano)
pm.instruments.append(violin)
start = 0.
prev_note = None
# zip() stops at the shorter input, which drops padded prediction rows.
for x, dev in zip(score.itertuples(), deviations):
    (pitch, _) = ix_to_lex[x.pitch]
    # The normalisation moments live in the global `moments` dict loaded with
    # the normaliser file; Net has no `moments` attribute.
    start += (x.beatDiff * moments['beatDiff'][1] + moments['beatDiff'][0]) * dev
    end = start + (x.duration * moments['duration'][1] + moments['duration'][0]) * dev
    prev_note = pretty_midi.Note(100, pitch, start, end)
    if x.instrument == 'instrument_1':
        piano.notes.append(prev_note)
    else:
        violin.notes.append(prev_note)

IPython.display.Audio(pm.fluidsynth(fs=44100), rate=44100)

### Building conductive input from generated performance

This step uses the predicted timing information to build a local tempo signal that can be used as input to a virtual conductor. That signal is then compared with the local tempo vector obtained from the chosen reference performance in the dataset.