Deep artificial neural network for expressive timing and dynamics predictions in musical pieces
---------------

This notebook loads a sequential dataset containing score and performance information and uses it to train and test a deep artificial neural network that predicts the onset timing deviation and the peak loudness level of each note in a musical piece.


#### Preparing to install XLA (for training on TPUs) and pytorch-lightning (skip if not using Google Colab):

In [None]:
# Download the PyTorch/XLA environment setup script from the pytorch/xla repository.
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py

# Run the downloaded script, requesting the nightly version and the listed apt packages.
!python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 libopenblas-dev

In [None]:
%%capture
# Upgrade pytorch_lightning to the latest release; %%capture suppresses the installer output.
! pip install pytorch_lightning --upgrade

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive; the Colab data path used later in this notebook
# ('/content/drive/My Drive/colab_data/') lives inside this mount point.
drive.mount('/content/drive')

#### Parameters to set:

In [None]:
runLocal = True  # set to False for using Google Colab

output_cols = ["peakLevel"]  # target column(s) the network is trained to predict
DEV_RUN = False  # passed as fast_dev_run to the Trainer and as dummy to the training dataset
SCHEDULER_STEP_SIZE = 4  # step_size of the StepLR scheduler built in Net.configure_optimizers
SCHEDULER_GAMMA = 0.25  # gamma of the StepLR scheduler
LR = 1e-6  # learning rate given to the Adam optimizer
SEQ_LEN = 200  # sequence_length of the windowed datasets; 0 selects FullPieceDataset instead
HIDDEN_SIZE = 128  # hidden_size of Net (must be even)
DROPOUT = 0.1  # dropout_rate of Net
EVAL_STRIDE = 160  # score notes sliding window stride; an alternative choice is int(SEQ_LEN / 2)
EVAL_CTX = 20  # no. of note predictions to ignore in sequence start; an alternative choice is int(EVAL_STRIDE / 2)
PAD_END = True  # passed as pad_both_ends to the validation dataset; also enables loss cropping (context/window) in Net
BATCH_SIZE = 64  # batch size of the training DataLoader
NUM_EPOCHS = 8  # max_epochs of the Trainer
STATE_DICT_NAME = 'hpc_logs/version_2137133/2021-03-06-hp200-128-lvl.pth'  # checkpoint file used when saving and loading the model weights


#### Setting path and loading dataset

In [None]:
import os
import numpy as np
import pandas as pd
import pickle


if runLocal:
    pathRoot = 'data/'
else:
    pathRoot = '/content/drive/My Drive/colab_data/'
    !wget "https://raw.githubusercontent.com/fabiozeh/deep-expression/master/dataloader.py"

    
with open(os.path.join(pathRoot, 'LvB_train_sequences.data'), 'rb') as seq_path:
    train = pickle.load(seq_path)
with open(os.path.join(pathRoot, 'LvB_val_sequences.data'), 'rb') as seq_path:
    val = pickle.load(seq_path)

#### Defining the neural network

In [None]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pytorch_lightning as pl

pl.seed_everything(1728)

class Encoder(nn.Module):
    """Encode a batch of score sequences with a bidirectional GRU.

    The pitch token of every note is embedded, the remaining ``n_x - 1`` score
    features are linearly projected to the same width, and the concatenation of
    both is normalised and fed to the recurrent layer.

    Args:
        n_x: number of score feature columns including the pitch column.
        vocab_size: number of entries of the pitch embedding (index 0 is padding).
        embed_size: width of the pitch embedding and of the feature projection.
        dropout: dropout probability applied before each layer normalisation.
    """

    def __init__(self, n_x, vocab_size, embed_size, dropout):
        super().__init__()

        # Pitch embedding and feature projection are concatenated, hence twice the width.
        joint_size = 2 * embed_size

        self.pitchEmbedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.harmonyRhythmProjector = nn.Linear(n_x - 1, embed_size)
        self.drop1 = nn.Dropout(p=dropout)
        self.norm1 = nn.LayerNorm(joint_size)
        self.rnn = nn.GRU(input_size=joint_size, hidden_size=joint_size,
                          num_layers=1, bidirectional=True)
        self.drop2 = nn.Dropout(p=dropout)
        # Forward and backward GRU states are concatenated on the feature axis.
        self.norm2 = nn.LayerNorm(2 * joint_size)

    def forward(self, pitch, score_feats, lengths):
        """Return the per-note input vectors and their recurrent encoding.

        Args:
            pitch: pitch indices, sequence-first (seq_len, batch).
            score_feats: remaining score features (seq_len, batch, n_x - 1).
            lengths: unpadded length of every sequence in the batch.

        Returns:
            Tuple ``(src_vec, encoded)`` where ``src_vec`` has ``2 * embed_size``
            features per note and ``encoded`` has ``4 * embed_size``.
        """
        pitch_vec = self.pitchEmbedding(pitch)
        feat_vec = self.harmonyRhythmProjector(score_feats)
        src_vec = self.norm1(self.drop1(torch.cat((pitch_vec, feat_vec), dim=2)))

        # Packing lets the GRU skip the padded tail of the shorter sequences;
        # the lengths must live on the CPU for pack_padded_sequence.
        packed_in = nn.utils.rnn.pack_padded_sequence(src_vec, lengths.cpu(), enforce_sorted=False)
        packed_out, _ = self.rnn(packed_in)
        padded_out, _ = nn.utils.rnn.pad_packed_sequence(packed_out)
        return src_vec, self.norm2(self.drop2(padded_out))
    

class Decoder(nn.Module):
    """Attention-based GRU decoder that emits the expressive targets of one note per call.

    Args:
        n_y: number of output values per note.
        hidden_size: width of the decoder state (also the expected width of ``x_vec``).
        enc_hidden_size: feature width of the encoder output used as attention keys/values.
        dropout: dropout probability used by the three dropout layers.
    """

    def __init__(self, n_y, hidden_size, enc_hidden_size, dropout=0.1):
        super().__init__()

        self.hidden_size = hidden_size
        self.y_proj = nn.Linear(in_features=n_y, out_features=hidden_size)
        self.drop1 = nn.Dropout(p=dropout)
        self.attention = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=1,
                                               kdim=enc_hidden_size, vdim=enc_hidden_size)
        self.rnn = nn.GRU(input_size=2 * hidden_size, hidden_size=hidden_size, num_layers=1)
        self.drop2 = nn.Dropout(p=dropout)
        self.norm = nn.LayerNorm(hidden_size)
        self.ff1 = nn.Linear(in_features=4 * hidden_size, out_features=2 * hidden_size)
        self.drop3 = nn.Dropout(p=dropout)
        self.ff2 = nn.Linear(in_features=2 * hidden_size, out_features=n_y, bias=False)

    def forward(self, x_vec, y_prev, encoder_out, dec_hidden):
        """
        Generates outputs for a single step in the sequence (one note)

        Args:
            x_vec: encoder input vector of the current note (1, batch, hidden_size).
            y_prev: output of the previous step (1, batch, n_y).
            encoder_out: encoded score (len_seq, batch, enc_hidden_size).
            dec_hidden: current decoder state (1, batch, hidden_size).

        Returns:
            Tuple ``(out, new_dec_hidden)`` with the prediction for this note and
            the updated decoder state.
        """
        prev_embedded = self.drop1(self.y_proj(y_prev))  # (1, batch_size, hidden_size)

        # The decoder state queries the whole encoded score:
        # (1, b, h) <- ( (1, b, h), (len_seq, b, enc_hidden), (len_seq, b, enc_hidden) )
        context, _ = self.attention(dec_hidden, encoder_out, encoder_out)

        rnn_input = torch.cat((prev_embedded, context), dim=2)
        rnn_out, new_dec_hidden = self.rnn(rnn_input, dec_hidden)
        rnn_out = self.norm(self.drop2(rnn_out))

        # The output head sees the recurrent output together with all of its inputs.
        head_input = torch.cat((rnn_out, prev_embedded, context, x_vec), dim=2)
        activated = F.relu(self.ff1(head_input))
        out = self.ff2(self.drop3(activated))
        return out, new_dec_hidden

class Net(pl.LightningModule):
    """Encoder/decoder network predicting expressive parameters for every note of a score.

    Args:
        n_x: number of score feature columns (including the pitch column).
        n_y: number of predicted values per note.
        vocab_size: size of the pitch vocabulary used by the encoder embedding.
        hidden_size: decoder state width; must be even because the encoder
            embedding uses half of it.
        dropout_rate: dropout probability for encoder and decoder.
        lr: learning rate of the Adam optimizer.
        context: index of the first prediction included in the loss when
            ``window`` is set; when 0, the last ``window`` predictions are used.
        window: number of predictions included in the loss; 0 uses all of them.
        teacher_forcing_ratio: probability, per decoding step during training,
            of feeding the ground-truth value (instead of the model's own
            prediction) to the next step.
        seed: optional seed for the generator that draws the teacher-forcing
            decisions. ``None`` keeps the original behaviour (seeded from OS
            entropy, so not controlled by ``pl.seed_everything``).
    """

    def __init__(self, n_x, n_y, vocab_size, hidden_size=64, dropout_rate=0.1, lr=1e-4, context=0, window=0,
                 teacher_forcing_ratio=0.5, seed=None):
        super().__init__()

        assert hidden_size % 2 == 0, "hidden_size must be multiple of 2"

        self.n_y = n_y
        self.hidden_size = hidden_size
        self.lr = lr
        self.window = window
        self.context = context
        self.teacher_forcing_ratio = teacher_forcing_ratio
        self.rng = np.random.default_rng(seed)

        self.encoder = Encoder(n_x, vocab_size, int(hidden_size/2), dropout_rate)
        self.decoder = Decoder(self.n_y, hidden_size, 2*hidden_size, dropout_rate)

    @staticmethod
    def _ensure_batch_dim(pitch, score_feats, y):
        """Insert a batch axis of size 1 when a single unbatched sequence is given."""
        if len(pitch.shape) < 2:
            pitch = pitch.unsqueeze(1)
            score_feats = score_feats.unsqueeze(1)
            y = y.unsqueeze(1)
        return pitch, score_feats, y

    def _crop_to_window(self, y_hat, y):
        """Restrict predictions and targets to the steps that count towards the loss."""
        if not self.window:
            return y_hat, y
        start = self.context
        if not start:
            # Without an explicit context, score the last `window` steps.
            start = y_hat.shape[0] - self.window
        stop = start + self.window
        return y_hat[start:stop, :, :], y[start:stop, :, :]

    def forward(self, pitch, score_feats, lengths):
        """
        Generate the entire sequence

        Each step receives the model's own previous prediction (no teacher forcing).
        Returns a tensor of shape (seq_len, batch, n_y).
        """
        src_vec, encoded_score = self.encoder(pitch, score_feats, lengths)
        hidden = torch.zeros((1, pitch.shape[1], self.hidden_size), device=self.device)
        y = torch.zeros((pitch.shape[0], pitch.shape[1], self.n_y), device=self.device)
        prev_y = torch.zeros((1, pitch.shape[1], self.n_y), device=self.device)
        for i in range(pitch.shape[0]):
            prev_y, hidden = self.decoder(src_vec[i,:,:].unsqueeze(0), prev_y, encoded_score, hidden)
            y[i,:,:] = prev_y
        return y

    def training_step(self, batch, batch_idx):
        """
        This method doesn't use self.forward directly so we can apply teacher forcing
        on a fraction of the steps.
        """
        pitch, score_feats, y, lengths = batch
        pitch, score_feats, y = self._ensure_batch_dim(pitch, score_feats, y)

        # encode x (score)
        src_vec, encoded_score = self.encoder(pitch, score_feats, lengths)

        # iterate generating y
        hidden = torch.zeros((1, score_feats.shape[1], self.hidden_size), device=self.device)
        y_hat = torch.zeros((y.shape[0], y.shape[1], self.n_y), device=self.device)
        prev_y = torch.zeros((1, score_feats.shape[1], self.n_y), device=self.device)
        for i in range(pitch.shape[0]):
            prev_y, hidden = self.decoder(src_vec[i,:,:].unsqueeze(0), prev_y, encoded_score, hidden)
            y_hat[i,:,:] = prev_y
            # Teacher forcing: with probability `teacher_forcing_ratio` the next step
            # sees the ground truth. (The comparison used to be `>`, i.e. probability
            # 1 - ratio, which only coincides with the intent at the default 0.5.)
            if self.rng.random() < self.teacher_forcing_ratio:
                prev_y = y[i,:,:].view(1, -1, self.n_y)

        y_hat, y = self._crop_to_window(y_hat, y)

        loss = F.mse_loss(y_hat, y)
        return {'loss': loss}

    def validation_step(self, batch, batch_idx):
        """Compute the MSE of a free-running prediction on one validation batch."""
        pitch, score_feats, y, lengths = batch
        pitch, score_feats, y = self._ensure_batch_dim(pitch, score_feats, y)

        y_hat = self.forward(pitch, score_feats, lengths)
        y_hat, y = self._crop_to_window(y_hat, y)

        return {'val_loss': F.mse_loss(y_hat, y)}

    def configure_optimizers(self):
        """Adam with a step-wise learning-rate decay configured by the notebook constants."""
        optimizer = optim.Adam(self.parameters(), lr=self.lr)
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=SCHEDULER_STEP_SIZE, gamma=SCHEDULER_GAMMA)
        return [optimizer], [scheduler]


In [None]:
# NOTE: pickle can execute arbitrary code while loading, so only open trusted data files.

# Pitch vocabulary: token -> index, plus the inverse mapping.
pitch_dict_file = os.path.join(pathRoot, 'LvB_pitch_dict.data')
with open(pitch_dict_file, 'rb') as filehandle:
    lex_to_ix = pickle.load(filehandle)
ix_to_lex = {ix: lex for lex, ix in lex_to_ix.items()}

# Normalisation moments, re-keyed by column name.
normalizer_file = os.path.join(pathRoot, 'LvB_normalizer.data')
with open(normalizer_file, 'rb') as filehandle:
    moment_rows, cols = pickle.load(filehandle)
moments = dict(zip(cols, moment_rows))

test_file = os.path.join(pathRoot, 'LvB_test_sequences.data')
with open(test_file, 'rb') as seq_path:
    test = pickle.load(seq_path)

# Loss cropping is only active when the sequences are padded at both ends.
if PAD_END:
    loss_context, loss_window = EVAL_CTX, EVAL_STRIDE
else:
    loss_context, loss_window = 0, 0

n_score_features = test[0][0][0].shape[1]
n_pitch_tokens = len(ix_to_lex) + 4  # 0 = pad, len+1 = UKN, len+2 = END, len+3 = SOS

model = Net(n_score_features,
            len(output_cols),
            vocab_size=n_pitch_tokens,
            hidden_size=HIDDEN_SIZE,
            dropout_rate=DROPOUT,
            lr=LR,
            context=loss_context,
            window=loss_window)

## Train the model

In [None]:
from torch.utils.data import DataLoader
import dataloader as dl

# Trainer configuration depends on where the notebook runs.
if runLocal:
    trainer = pl.Trainer(max_epochs=NUM_EPOCHS, fast_dev_run=DEV_RUN, val_check_interval=0.25)
    workers = 4
else:
    trainer = pl.Trainer(gpus=1, accelerator='dp', fast_dev_run=DEV_RUN,
                         progress_bar_refresh_rate=20, max_epochs=NUM_EPOCHS,
                         val_check_interval=0.25)
    workers = 0

# Create the checkpoint directory before training: STATE_DICT_NAME contains
# sub-directories, and a missing directory would otherwise only surface as a
# failed torch.save after the whole training run has finished.
state_dict_path = os.path.join(pathRoot, STATE_DICT_NAME)
state_dict_dir = os.path.dirname(state_dict_path)
if state_dict_dir:
    os.makedirs(state_dict_dir, exist_ok=True)

# Index of the pitch column; looked up once (from the test split, as before)
# instead of repeating the lookup for every dataset.
vocab_col = test[0][0][0].columns.get_loc("pitch")

if SEQ_LEN == 0:
    # Train and validate on whole pieces.
    train_ds = dl.FullPieceDataset(train,
                                   vocab_col=vocab_col,
                                   output_cols=output_cols)
    val_ds = dl.FullPieceDataset(val,
                                 vocab_col=vocab_col,
                                 output_cols=output_cols)
else:
    # Train and validate on fixed-length excerpts.
    train_ds = dl.TrainDataset(train,
                               vocab_col=vocab_col,
                               sequence_length=SEQ_LEN,
                               output_cols=output_cols,
                               context=EVAL_CTX,
                               dummy=DEV_RUN)
    val_ds = dl.ValidationDataset(val,
                                  vocab_col=vocab_col,
                                  sequence_length=SEQ_LEN,
                                  output_cols=output_cols,
                                  stride=EVAL_STRIDE,
                                  context=EVAL_CTX,
                                  pad_both_ends=PAD_END,
                                  device=model.device)
trainer.fit(model,
            DataLoader(train_ds,
                       batch_size=BATCH_SIZE,
                       num_workers=workers,
                       shuffle=True,
                       collate_fn=dl.TrainDataset.collate_fn),
            # batch_size=None disables automatic batching for the validation set.
            DataLoader(val_ds,
                       batch_size=None,
                       num_workers=workers))

#  Save model
torch.save(model.state_dict(), state_dict_path)

In [None]:
# Stand-alone cell that writes the current model weights to disk again.
# The target directory is created first because STATE_DICT_NAME contains
# sub-directories that may not exist yet.
state_dict_path = os.path.join(pathRoot, STATE_DICT_NAME)
state_dict_dir = os.path.dirname(state_dict_path)
if state_dict_dir:
    os.makedirs(state_dict_dir, exist_ok=True)
torch.save(model.state_dict(), state_dict_path)

In [None]:
# Start tensorboard: (re)load the notebook extension, then open it on the
# lightning_logs/ directory (presumably where the Trainer writes its logs — confirm
# if a custom logger or log directory is configured).
%reload_ext tensorboard
%tensorboard --logdir lightning_logs/

#### Results

In [None]:
from torch.utils.data import DataLoader
import dataloader as dl


# Load model
# The training cell saves the weights under pathRoot, so look there first; fall
# back to STATE_DICT_NAME relative to the working directory (the location this
# cell used to read from) when no such file exists under pathRoot.
state_dict_path = os.path.join(pathRoot, STATE_DICT_NAME)
if not os.path.exists(state_dict_path):
    state_dict_path = STATE_DICT_NAME
# map_location lets a checkpoint saved on a different device (e.g. a GPU) be
# loaded onto the device the model currently lives on.
model.load_state_dict(torch.load(state_dict_path, map_location=model.device))

# Switch to inference mode (disables dropout).
model.eval()

#  Compute note-level error

def evaluation(sequences, sequence_length, model, stride=0, context=0):
    """Predict every piece in ``sequences`` and stitch the windowed outputs together.

    Each piece is split into overlapping windows by ``dl.ValidationDataset``;
    the model output holds one window per column of its second axis. From every
    window but the last, ``stride`` predictions starting at offset ``context``
    are copied; the last window supplies whatever is still missing.

    Args:
        sequences: list of pieces as stored in the dataset files.
        sequence_length: window length passed to the validation dataset.
        model: the network used for prediction.
        stride: number of notes between consecutive windows.
        context: no. of note predictions to ignore in sequence start.

    Returns:
        List with one ``(n_notes, len(output_cols))`` NumPy array per piece.
    """
    loader = dl.ValidationDataset(sequences,
                                  vocab_col=sequences[0][0][0].columns.get_loc("pitch"),
                                  sequence_length=sequence_length,
                                  output_cols=output_cols,
                                  stride=stride,
                                  context=context,
                                  pad_both_ends=PAD_END,
                                  device=model.device)
    Y_hat = []
    # Inference only: without no_grad every decoding step would keep its
    # autograd graph alive for the whole piece.
    with torch.no_grad():
        for piece in range(len(loader)):
            pch, s_f, _, lth = loader[piece]
            # .cpu() is required before .numpy() whenever the model runs on an accelerator.
            out = model(pch, s_f, lth).detach().cpu().numpy()
            y_hat_p = np.zeros((sequences[piece][0][1].shape[0], len(output_cols)))
            ind = 0
            for s in range(out.shape[1] - 1):
                y_hat_p[ind:ind + stride, :] = out[context:context + stride, s, :]
                ind += stride
            # The last window fills the remaining notes of the piece.
            y_hat_p[ind:, :] = out[context:context + y_hat_p.shape[0] - ind, -1, :]
            Y_hat.append(y_hat_p)
    return Y_hat

Yhat = evaluation(val, SEQ_LEN, model, stride=EVAL_STRIDE, context=EVAL_CTX)

# One row per piece, one column per predicted output.
mse = np.zeros((len(val), Yhat[0].shape[1]))
ms = np.zeros((len(val), Yhat[0].shape[1]))
for i, S in enumerate(val):
    # Work on a plain float array and reduce explicitly over the note axis:
    # np.mean on a DataFrame reduces per column in older pandas releases but over
    # all values in newer ones, which would mix the outputs together.
    Y = S[0][1].loc[:, output_cols].to_numpy(dtype='float64')
    mse[i, :] = np.mean((Yhat[i][:Y.shape[0], :] - Y) ** 2, axis=0)
    # Mean square of the targets, printed next to the MSE for scale.
    ms[i, :] = np.mean(Y ** 2, axis=0)

print('Validation set MSE for y_0: ' + str(np.mean(mse[:,0])) + '     mean square val: ' + str(np.mean(ms[:,0])))
print('Minimum y_0 MSE among pieces: ' + str(mse[:,0].min()))

In [None]:
import matplotlib.pyplot as plt

# Plot the first 200 predicted values of one piece against the recorded performance.
piece = 0
attr = ['peakLevel']
plt.figure(figsize=(21, 5))
plt.plot(Yhat[piece][:200,0], label='predicted')
# Yhat was computed from the validation set, so the reference curve must come
# from `val` as well (it was previously read from `test`, i.e. a different piece).
plt.plot(val[piece][0][1].loc[:,attr].to_numpy()[:200], label='performed')
plt.legend()
plt.show()

#### Comparison of dynamics of different performances of same piece for context:

In [None]:
# Each entry pairs two training-set indices with the note range to display;
# one figure is drawn per entry, with one curve per index.
comparisons = [
    ((35, 70), slice(2500, None)),
    ((42, 77), slice(1000, 1300)),
]
for piece_pair, note_range in comparisons:
    plt.figure(figsize=(21, 5))
    for piece_idx in piece_pair:
        plt.plot(train[piece_idx][0][1].loc[:,attr].to_numpy()[note_range])

In [None]:
# Peak levels of the first pair of performances, limited to their first 3120 notes.
levels_70 = train[70][0][1].loc[:, 'peakLevel'].iloc[:3120].to_numpy('float64')
levels_35 = train[35][0][1].loc[:, 'peakLevel'].iloc[:3120].to_numpy('float64')
mse_human1 = np.mean((levels_70 - levels_35) ** 2)

# Peak levels of the second pair, compared over their full length.
levels_42 = train[42][0][1].loc[:, 'peakLevel'].to_numpy('float64')
levels_77 = train[77][0][1].loc[:, 'peakLevel'].to_numpy('float64')
mse_human2 = np.mean((levels_42 - levels_77) ** 2)

print(f"MSE between two performances of sonata 7, 2nd mvmt.: {mse_human1!s}")
print(f"MSE between two performances of sonata 7, 3rd mvmt.: {mse_human2!s}")

#### Listen to a piece synthesized with the generated expression

In [None]:
import pretty_midi
import IPython.display
import expression_modeling as m

# piece to synthesize:
pieceNum = 0
piece_entry = val[pieceNum]
piece_data = piece_entry[0]
pieceId = piece_entry[1]
print(pieceId)

performance_table = piece_data[1]
piece_stats = piece_data[2]
n_notes = performance_table.shape[0]

# Candidate per-note value sequences; only `pred` is passed to the synthesis below.
pred = Yhat[pieceNum][:,0]
ref = performance_table.ioiRatio
# Constant sequence repeating piece_stats[2, 0], and a normal sample scaled by
# piece_stats[2, 1] and shifted by piece_stats[2, 0] (presumably mean and standard
# deviation of one attribute — confirm against the dataset layout).
no_dev = np.asarray([piece_stats[2,0]] * n_notes)
dev_rand = np.random.normal(size=n_notes) * piece_stats[2,1] + piece_stats[2,0]

# NOTE(review): `pred` holds predictions for output_cols[0] while method='ioiRatio'
# is requested here — confirm this matches what expression_modeling expects.
pm = m.midi_performance(piece_data, pred, moments, ix_to_lex, method='ioiRatio')
IPython.display.Audio(pm.fluidsynth(fs=44100), rate=44100)