Deep artificial neural network for expressive timing and dynamics predictions in musical pieces
---------------

This notebook loads a sequential dataset with score and performance information and uses it to train and test a deep artificial neural network for generating onset timing deviation and peak loudness level of notes from musical pieces.


#### Preparing to install XLA (for training on TPUs) and pytorch-lightning (skip if not using Google Colab):

In [None]:
# Download the PyTorch/XLA environment setup script from the pytorch/xla repository.
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py

# Run the downloaded script, requesting the nightly version and the listed apt packages.
!python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 libopenblas-dev

In [None]:
%%capture
# Install (or upgrade) pytorch-lightning; the %%capture cell magic on the first
# line of this cell captures pip's output instead of displaying it.
! pip install pytorch_lightning --upgrade

In [None]:
# Google Colab only: mount Google Drive at /content/drive. The Colab data path
# used later in this notebook ('/content/drive/My Drive/colab_data/') lives under it.
from google.colab import drive
drive.mount('/content/drive')

#### Parameters to set:

In [None]:
# Execution environment switch: True reads everything from the local 'data/'
# folder; False uses the Google Drive folder and downloads the helper modules.
runLocal = True  # set to False for using Google Colab

# Column(s) the network is trained to predict; one MSE figure is printed per entry.
output_cols = ["peakLevel"]
# Forwarded to pl.Trainer as fast_dev_run and to the training dataset as `dummy`.
DEV_RUN = False
# NOTE(review): SCHEDULER_STEP_SIZE and SCHEDULER_GAMMA are not referenced by any
# cell visible in this notebook -- confirm whether they are still needed.
SCHEDULER_STEP_SIZE = 4
SCHEDULER_GAMMA = 0.25
LR = 1e-6  # passed to seq2seq.Net as `lr`
SEQ_LEN = 200  # `sequence_length` of the datasets; 0 selects FullPieceDataset instead
HIDDEN_SIZE = 128  # passed to seq2seq.Net as `hidden_size`
DROPOUT = 0.1  # passed to seq2seq.Net as `dropout_rate`
EVAL_STRIDE = 160 #int(SEQ_LEN / 2)  # score notes sliding window
EVAL_CTX = 20 #int(EVAL_STRIDE / 2)  # no. of note predictions to ignore in sequence start
PAD_END = True  # passed as `pad_both_ends`; when False the model gets context=0 and window=0
BATCH_SIZE = 64  # batch size of the training DataLoader
NUM_EPOCHS = 8  # `max_epochs` of the trainer
# Checkpoint file name. The training cell writes it under pathRoot, while the
# Results cell reads it relative to the current working directory.
STATE_DICT_NAME = 'hpc_logs/version_2137133/2021-03-06-hp200-128-lvl.pth'


#### Setting path and loading dataset

In [None]:
import os
import numpy as np
import pandas as pd
import pickle


if runLocal:
    pathRoot = 'data/'
else:
    pathRoot = '/content/drive/My Drive/colab_data/'
    !wget "https://raw.githubusercontent.com/fabiozeh/deep-expression/master/dataloader.py"
    !wget "https://raw.githubusercontent.com/fabiozeh/deep-expression/master/seq2seq.py"

    
with open(os.path.join(pathRoot, 'LvB_train_sequences.data'), 'rb') as seq_path:
    train = pickle.load(seq_path)
with open(os.path.join(pathRoot, 'LvB_val_sequences.data'), 'rb') as seq_path:
    val = pickle.load(seq_path)

#### Defining the neural network

In [None]:
import pytorch_lightning as pl
import seq2seq

# All data files are located with os.path.join so the paths stay valid even if
# pathRoot is configured without a trailing separator.
# NOTE: pickle.load can execute arbitrary code -- only open files you trust.

# Pitch vocabulary (symbol -> index) and its inverse (index -> symbol).
with open(os.path.join(pathRoot, 'LvB_pitch_dict.data'), 'rb') as filehandle:
    lex_to_ix = pickle.load(filehandle)
ix_to_lex = {v: k for k, v in lex_to_ix.items()}

# The normalizer file holds a (moments, cols) pair; re-key the moments by column name.
with open(os.path.join(pathRoot, 'LvB_normalizer.data'), 'rb') as filehandle:
    moments, cols = pickle.load(filehandle)
moments = dict(zip(cols, moments))

# Test sequences; the first piece also supplies the column count given to the network.
with open(os.path.join(pathRoot, 'LvB_test_sequences.data'), 'rb') as seq_path:
    test = pickle.load(seq_path)

# First positional argument: number of columns of the first test piece's first frame.
# Second: one output per predicted column.
model = seq2seq.Net(test[0][0][0].shape[1],
            len(output_cols),
            vocab_size=len(ix_to_lex) + 4,  # 0 = pad, len+1 = UKN, len+2 = END, len+3 = SOS
            hidden_size=HIDDEN_SIZE,
            dropout_rate=DROPOUT,
            lr=LR,
            # Context/window are only forwarded when sequences are padded at both ends.
            context=(EVAL_CTX if PAD_END else 0),
            window=(EVAL_STRIDE if PAD_END else 0))

## Train the model

In [None]:
import os

import torch  # needed by torch.save below; previously only imported in a later cell
from torch.utils.data import DataLoader
import dataloader as dl

# Trainer configuration: default device with 4 loader workers locally, a single
# GPU with in-process loading (0 workers) otherwise. Validation runs four times
# per epoch in both cases (val_check_interval=0.25).
if runLocal:
    trainer = pl.Trainer(max_epochs=NUM_EPOCHS, fast_dev_run=DEV_RUN, val_check_interval=0.25)
    workers = 4
else:
    trainer = pl.Trainer(gpus=1, accelerator='dp', fast_dev_run=DEV_RUN,
                         progress_bar_refresh_rate=20, max_epochs=NUM_EPOCHS,
                         val_check_interval=0.25)
    workers = 0

# Position of the "pitch" column, looked up once and handed to every dataset as vocab_col.
pitch_col = test[0][0][0].columns.get_loc("pitch")

if SEQ_LEN == 0:
    # SEQ_LEN == 0 selects the full-piece datasets (no sequence_length/stride/context).
    train_ds = dl.FullPieceDataset(train,
                                   vocab_col=pitch_col,
                                   output_cols=output_cols)
    val_ds = dl.FullPieceDataset(val,
                                 vocab_col=pitch_col,
                                 output_cols=output_cols)
else:
    train_ds = dl.TrainDataset(train,
                               vocab_col=pitch_col,
                               sequence_length=SEQ_LEN,
                               output_cols=output_cols,
                               context=EVAL_CTX,
                               dummy=DEV_RUN)
    val_ds = dl.ValidationDataset(val,
                                  vocab_col=pitch_col,
                                  sequence_length=SEQ_LEN,
                                  output_cols=output_cols,
                                  stride=EVAL_STRIDE,
                                  context=EVAL_CTX,
                                  pad_both_ends=PAD_END,
                                  device=model.device)

# Training batches are shuffled and collated by TrainDataset.collate_fn; the
# validation loader uses batch_size=None, i.e. automatic batching is disabled.
trainer.fit(model,
            DataLoader(train_ds,
                       batch_size=BATCH_SIZE,
                       num_workers=workers,
                       shuffle=True,
                       collate_fn=dl.TrainDataset.collate_fn),
            DataLoader(val_ds,
                       batch_size=None,
                       num_workers=workers))

#  Save model
# STATE_DICT_NAME contains sub-directories, so create them first: otherwise
# torch.save would fail only after the whole training run has completed.
state_dict_path = os.path.join(pathRoot, STATE_DICT_NAME)
state_dict_dir = os.path.dirname(state_dict_path)
if state_dict_dir:
    os.makedirs(state_dict_dir, exist_ok=True)
torch.save(model.state_dict(), state_dict_path)

#### Results

In [None]:
import torch
from torch.utils.data import DataLoader
import dataloader as dl


# Load model
# map_location='cpu' lets a checkpoint that was written on a GPU machine be read
# on a CPU-only one; load_state_dict then copies the values into the model's own
# parameters, wherever they live.
# NOTE(review): the training cell writes its checkpoint under pathRoot, whereas
# this reads STATE_DICT_NAME relative to the working directory -- confirm which
# file is meant to be evaluated here.
model.load_state_dict(torch.load(STATE_DICT_NAME, map_location='cpu'))

# Switch the model to evaluation mode before predicting.
model.eval()

# Evaluate on the validation pieces; Yhat is indexed per piece and mse is indexed
# as [piece, output column] below.
Yhat, mse = seq2seq.evaluation(val, SEQ_LEN, model, stride=EVAL_STRIDE, output_cols=output_cols,
                               context=EVAL_CTX, pad_both_ends=PAD_END)

# Report the mean and the best (lowest) per-piece MSE for every predicted column.
for i, col in enumerate(output_cols):
    print('Validation set MSE for ' + col + ': ' + str(np.mean(mse[:, i])))
    print('Minimum MSE among pieces for ' + col + ': ' + str(mse[:, i].min()))

In [None]:
import matplotlib.pyplot as plt

# Index of the validation piece to inspect. Yhat was produced from `val`, so the
# reference curve must come from `val` as well (not from the test set) for the
# two lines to describe the same piece.
piece = 0
attr = ['peakLevel']
plt.figure(figsize=(21, 5))
# First 200 notes: model output (column 0 of Yhat) against the reference values.
plt.plot(Yhat[piece][:200, 0], label='predicted')
plt.plot(val[piece][0][1].loc[:, attr].to_numpy()[:200], label='reference')
plt.xlabel('note index')
plt.ylabel(attr[0])
plt.legend()
plt.show()

#### Comparison of dynamics of different performances of same piece for context:

In [None]:
# Each entry pairs two training-set indices with the note range to display;
# one figure is drawn per pair, overlaying both performances.
comparisons = (
    ((35, 70), slice(2500, None)),
    ((42, 77), slice(1000, 1300)),
)
for pair, note_range in comparisons:
    plt.figure(figsize=(21, 5))
    for seq_idx in pair:
        plt.plot(train[seq_idx][0][1].loc[:, attr].to_numpy()[note_range])

In [None]:
# Mean squared difference in peakLevel between two performances, pair by pair.
# The first pair is compared over its first 3120 notes only.
levels_70 = train[70][0][1].loc[:, 'peakLevel'].iloc[:3120].to_numpy('float64')
levels_35 = train[35][0][1].loc[:, 'peakLevel'].iloc[:3120].to_numpy('float64')
mse_human1 = np.mean((levels_70 - levels_35) ** 2)

# The second pair is compared over the full sequences.
levels_42 = train[42][0][1].loc[:, 'peakLevel'].to_numpy('float64')
levels_77 = train[77][0][1].loc[:, 'peakLevel'].to_numpy('float64')
mse_human2 = np.mean((levels_42 - levels_77) ** 2)

print("MSE between two performances of sonata 7, 2nd mvmt.: " + str(mse_human1))
print("MSE between two performances of sonata 7, 3rd mvmt.: " + str(mse_human2))

#### Listen to a piece synthesized with the generated expression

In [None]:
import pretty_midi
import IPython.display
import expression_modeling as m

# Validation piece to render as audio.
pieceNum = 0
piece_entry = val[pieceNum]
pieceId = piece_entry[1]
print(pieceId)

# Shared pieces of the selected entry, looked up once.
piece_data = piece_entry[0]
note_count = piece_data[1].shape[0]
offset = piece_data[2][2, 0]
scale = piece_data[2][2, 1]

# Model output for this piece, plus three alternative per-note series:
# the recorded ioiRatio, a constant series, and normally distributed noise
# scaled and shifted by the two values above.
pred = Yhat[pieceNum][:, 0]
ref = piece_data[1].ioiRatio
no_dev = np.asarray([offset] * note_count)
dev_rand = np.random.normal(size=note_count) * scale + offset

# Build the MIDI performance from the predictions and play it back; the
# synthesis and the audio widget use the same 44100 Hz rate.
pm = m.midi_performance(piece_data, pred, moments, ix_to_lex, method='ioiRatio')
IPython.display.Audio(pm.fluidsynth(fs=44100), rate=44100)