### This notebook is to build modified MIDI-VAE with right and left rolls. 

In [1]:
from music21 import *
import numpy as np
from fractions import Fraction
import os
import random
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

## Declare three index dictionary 

In [2]:
tdict = {0.25: 0,
 0.5: 1,
 0.75: 2,
 1.0: 3,
 1.25: 4,
 1.5: 5,
 1.75: 6,
 2.0: 7,
 2.25: 8,
 2.5: 9,
 2.75: 10,
 3.0: 11,
 3.25: 12,
 4.0: 13,
 5.0: 14,
 6.0: 15,
 3.5: 16,
 3.75: 17,
 4.25: 18,
 4.5: 19,
 4.75: 20,
 5.25: 21,
 5.5: 22,
 7.0: 23,
 Fraction(1, 6): 24,
 Fraction(5, 12): 25,
 Fraction(2, 3): 26,
 Fraction(5, 3): 27,
 Fraction(23, 12): 28,
 Fraction(8, 3): 29,
 Fraction(11, 3): 30,
 Fraction(1, 3): 31,
 Fraction(1, 12): 32,
 Fraction(5, 6): 33,
 Fraction(4, 3): 34,
 Fraction(7, 3): 35,
 Fraction(10, 3): 36}

pdict = {24: 0,
 25: 1,
 26: 2,
 27: 3,
 28: 4,
 29: 5,
 30: 6,
 31: 7,
 32: 8,
 33: 9,
 34: 10,
 35: 11,
 36: 12,
 37: 13,
 38: 14,
 39: 15,
 40: 16,
 41: 17,
 42: 18,
 43: 19,
 44: 20,
 45: 21,
 46: 22,
 47: 23,
 48: 24,
 49: 25,
 50: 26,
 51: 27,
 52: 28,
 53: 29,
 54: 30,
 55: 31,
 56: 32,
 57: 33,
 58: 34,
 59: 35,
 60: 36,
 61: 37,
 62: 38,
 63: 39,
 64: 40,
 65: 41,
 66: 42,
 67: 43,
 68: 44,
 69: 45,
 70: 46,
 71: 47,
 72: 48,
 73: 49,
 74: 50,
 75: 51,
 76: 52,
 77: 53,
 78: 54,
 79: 55,
 80: 56,
 81: 57,
 82: 58,
 83: 59,
 84: 60,
 85: 61,
 86: 62,
 87: 63,
 88: 64,
 89: 65,
 90: 66,
 91: 67,
 92: 68,
 93: 69,
 94: 70,
 95: 71,
 96: 72,
 97: 73,
 98: 74,
 99: 75,
 100: 76,
 101: 77}

vdict = {20: 0,
 21: 1,
 22: 2,
 23: 3,
 24: 4,
 25: 5,
 26: 6,
 27: 7,
 28: 8,
 29: 9,
 30: 10,
 31: 11,
 32: 12,
 33: 13,
 34: 14,
 35: 15,
 36: 16,
 37: 17,
 38: 18,
 39: 19,
 40: 20,
 41: 21,
 42: 22,
 43: 23,
 44: 24,
 45: 25,
 46: 26,
 47: 27,
 48: 28,
 49: 29,
 50: 30,
 51: 31,
 52: 32,
 53: 33,
 54: 34,
 55: 35,
 56: 36,
 57: 37,
 58: 38,
 59: 39,
 60: 40,
 61: 41,
 62: 42,
 63: 43,
 64: 44,
 65: 45,
 66: 46,
 67: 47,
 68: 48,
 69: 49,
 70: 50,
 71: 51,
 72: 52,
 73: 53,
 74: 54,
 75: 55,
 76: 56,
 77: 57,
 78: 58,
 79: 59,
 80: 60,
 81: 61,
 82: 62,
 83: 63,
 84: 64,
 85: 65,
 86: 66,
 87: 67,
 88: 68,
 89: 69,
 90: 70,
 91: 71,
 92: 72,
 93: 73,
 94: 74,
 95: 75}

id2tdict = {i:t for t,i in tdict.items()}
id2pdict = {i:p for p,i in pdict.items()}
id2vdict = {i:v for v,i in vdict.items()}

## Generate the Dataset

In [3]:
class MusicDataset(torch.utils.data.Dataset):

    def __init__(self, right, left, Y, transform = None):
        self.right = right
        self.left = left
        self.Y = Y
        self.transform = transform

    def __len__(self):
        return len(self.Y)

    def __getitem__(self, idx):
        return self.right[idx], self.left[idx], self.Y[idx]
    

def generate_data_loader(right_data, left_data, labels, batch_size):
    
    X = np.arange(right_data.shape[0])
    y = labels
    X_trainval, X_test, y_trainval, y_test = train_test_split(
            X, y, test_size=0.2, stratify=y, random_state=23)
    
    # Split train into train-val
    X_train, X_val, y_train, y_val = train_test_split(
        X_trainval, y_trainval, test_size=0.2, stratify=y_trainval, random_state=42)

    right_train = right_data[X_train]
    right_val = right_data[X_val]
    right_test = right_data[X_test]
    
    left_train = left_data[X_train]
    left_val = left_data[X_val]
    left_test = left_data[X_test]    
    
    train_data = MusicDataset(torch.from_numpy(right_train).float(), 
                              torch.from_numpy(left_train).float(), 
                             torch.from_numpy(y_train).float())
    val_data = MusicDataset(torch.from_numpy(right_val).float(), 
                            torch.from_numpy(left_val).float(),
                           torch.from_numpy(y_val).float())
    test_data = MusicDataset(torch.from_numpy(right_test).float(),
                             torch.from_numpy(left_test).float(),
                            torch.from_numpy(y_test).float())
        
    train_loader = torch.utils.data.DataLoader(
        train_data, batch_size=batch_size, shuffle=False)
    val_loader = torch.utils.data.DataLoader(
        val_data, batch_size=batch_size, shuffle=False)
    test_loader = torch.utils.data.DataLoader(
        test_data, batch_size=batch_size, shuffle=False)
    
    return train_loader, val_loader, test_loader

## Build MIDI-VAE

In [4]:
class MidiVAE(nn.Module):
    def __init__(self, input_size, lstm_hidden, enc_hidden, z_dim):
        super(MidiVAE, self).__init__()
        
        self.z_dim = z_dim
        
        self.lstm_enc_right = torch.nn.LSTM(
                batch_first = True,
                input_size = input_size,
                hidden_size = lstm_hidden,
                num_layers = 1,
                bidirectional = False)
        
        self.lstm_enc_left = torch.nn.LSTM(
                batch_first = True,
                input_size = input_size,
                hidden_size = lstm_hidden,
                num_layers = 1,
                bidirectional = False)
        
        self.fc_enc1 = nn.Linear(lstm_hidden*2*note_num, enc_hidden*note_num)
        self.fc_enc2 = nn.Linear(enc_hidden*note_num, enc_hidden*note_num)
        self.dropout_enc = nn.Dropout(p=0.5)
        
        self.fc_mu = nn.Linear(enc_hidden*(note_num//2), 3*self.z_dim)
        self.fc_var = nn.Linear(enc_hidden*(note_num//2), 3*self.z_dim)
        
        self.fc_dec1 = nn.Linear(self.z_dim, 3*self.z_dim)
        self.fc_dec2 = nn.Linear(3*self.z_dim, enc_hidden*(note_num))
        
        self.fc_r = nn.Linear(enc_hidden*(note_num//2), lstm_hidden*note_num)
        self.fc_l = nn.Linear(enc_hidden*(note_num//2), lstm_hidden*note_num)
        self.dropout_dec = nn.Dropout(p=0.3)
        
        self.lstm_dec_right = torch.nn.LSTM(
                batch_first = True,
                input_size = lstm_hidden,
                hidden_size = input_size,
                num_layers = 1,
                bidirectional = False)
        
        self.lstm_dec_left = torch.nn.LSTM(
                batch_first = True,
                input_size = lstm_hidden,
                hidden_size = input_size,
                num_layers = 1,
                bidirectional = False)
        
        
    def encoder(self, right, left):
        x1,_ = self.lstm_enc_right(right)
        x2,_ = self.lstm_enc_left(left)
        del _
        
        # concat
        x = torch.cat((x1, x2), dim=2)
        x = torch.flatten(x, start_dim=1)
        x = F.relu(self.fc_enc1(x))
        x = F.relu(self.fc_enc2(x))

        x = self.dropout_enc(x)
        x1, x2 = torch.split(x, [x.shape[1]//2, x.shape[1]//2], dim=1)
        
        return self.fc_mu(x1), self.fc_var(x2)
    
    
    def reparameterize(self, mu, logvar, labels):

#         base = self.z_dim//3
#         new_mu = torch.empty(mu.size()).to(device)
#         new_var = torch.empty(logvar.size()).to(device)
#         for i in range(3):
#             new_mu[:,base*i:base*(i+1)] =  torch.mul(mu[:,base*i:base*(i+1)],labels[:,i].reshape(-1,1))
#             new_var[:,base*i:base*(i+1)] =  torch.mul(logvar[:,base*i:base*(i+1)],labels[:,i].reshape(-1,1))
        
        msplit = torch.split(mu, [self.z_dim, self.z_dim, self.z_dim], dim=1)
        vsplit = torch.split(logvar, [self.z_dim, self.z_dim, self.z_dim], dim=1)
        new_z = torch.zeros(mu.shape[0], self.z_dim).to(device)
        new_mu = torch.zeros(mu.shape[0], self.z_dim).to(device)
        new_var = torch.zeros(logvar.shape[0], self.z_dim).to(device)
        
        # one hot 
        for i in range(3):
            mi = msplit[i]
            vi = vsplit[i]
            new_mu += mi * labels[:,i].reshape(-1,1)
            new_var += vi * labels[:,i].reshape(-1,1)
            stdev = torch.exp(0.5*vi)
            eps = 0.1 * torch.randn_like(stdev)
            z = mi + eps*stdev
            new_z += z * labels[:,i].reshape(-1,1)

        return new_z, new_mu, new_var
    
    
    def decoder(self, z):
        x = F.relu(self.fc_dec1(z))
        x = F.relu(self.fc_dec2(x))

        x = self.dropout_dec(x)
        
        x1, x2 = torch.split(x, [x.shape[1]//2, x.shape[1]//2], dim=1)
        x1 = F.relu(self.fc_r(x1))
        x2 = F.relu(self.fc_l(x2))
        
        x1 = x1.reshape(x1.shape[0],note_num,-1)
        x2 = x2.reshape(x1.shape[0],note_num,-1)
        right,_ = self.lstm_dec_right(x1)
        left,_ = self.lstm_dec_left(x2)
        del _
        
        right_p = torch.softmax(right[:,:,:len(pdict)], dim=2)
        right_t = torch.softmax(right[:,:,len(pdict):len(pdict)+len(tdict)], dim=2)
        right_v = torch.softmax(right[:,:,len(pdict)+len(tdict):], dim=2)
        
        right_new = torch.cat((right_p,right_t,right_v), dim=2).to(device)
        
        
        left_p = torch.softmax(left[:,:,:len(pdict)], dim=2)
        left_t = torch.softmax(left[:,:,len(pdict):len(pdict)+len(tdict)], dim=2)
        left_v = torch.softmax(left[:,:,len(pdict)+len(tdict):], dim=2)
        
        left_new = torch.cat((left_p,left_t,left_v), dim=2).to(device)
        
        
        return right_new, left_new
    
    
    def forward(self, right, left, labels):
        mu, logvar = self.encoder(right, left)
        z, new_mu, new_var = self.reparameterize(mu, logvar, labels)
        right, left = self.decoder(z)

        return right, left, new_mu, new_var
    
    
    def loss_func(self, right_new, left_new, right, left, mu, logvar):
        CE_right = F.cross_entropy(right_new, right) # BCE = -Negative Log-likelihood
        CE_left = F.cross_entropy(left_new, left) # BCE = -Negative Log-likelihood
        KLD = -1 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) # KL Divergence b/w q_\phi(z|x) || p(z)
        return CE_right + CE_left + KLD

## Training

In [5]:
right_data = np.load('right_data_small.npy')
left_data = np.load('left_data_small.npy')
labels = np.load('labels_small.npy')
batch_size = 5
note_num = 50
train_loader, val_loader, test_loader = generate_data_loader(right_data, left_data, labels, batch_size)
del right_data
del left_data
del labels


lr = 0.001
input_size = len(pdict) + len(vdict) + len(tdict)
lstm_hidden = 80
enc_hidden = 50
z_dim = 256
num_epochs = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'


vae = MidiVAE(input_size, lstm_hidden, enc_hidden, z_dim).to(device)
print(vae)
optimizer = torch.optim.Adam(vae.parameters(), lr=lr)


for epoch in range(1, num_epochs + 1):
    train_epoch_loss = 0
    vae.train()
    for right, left, label in train_loader:
        
        right = right.to(device)
        left = left.to(device)
        label = label.to(device)
        optimizer.zero_grad()

        right_new, left_new, mu, logvar = vae(right, left, label)

        train_loss = vae.loss_func(right_new, left_new, right, left, mu, logvar)
        train_loss.backward()
        optimizer.step()
        train_epoch_loss += train_loss.item()
    print('\n EPOCH {}/{} \t train loss {}'\
                  .format(epoch, num_epochs,train_epoch_loss/len(train_loader)))

MidiVAE(
  (lstm_enc_right): LSTM(166, 80, batch_first=True)
  (lstm_enc_left): LSTM(166, 80, batch_first=True)
  (fc_enc1): Linear(in_features=8000, out_features=2500, bias=True)
  (fc_enc2): Linear(in_features=2500, out_features=2500, bias=True)
  (dropout_enc): Dropout(p=0.5, inplace=False)
  (fc_mu): Linear(in_features=1250, out_features=768, bias=True)
  (fc_var): Linear(in_features=1250, out_features=768, bias=True)
  (fc_dec1): Linear(in_features=256, out_features=768, bias=True)
  (fc_dec2): Linear(in_features=768, out_features=2500, bias=True)
  (fc_r): Linear(in_features=1250, out_features=4000, bias=True)
  (fc_l): Linear(in_features=1250, out_features=4000, bias=True)
  (dropout_dec): Dropout(p=0.3, inplace=False)
  (lstm_dec_right): LSTM(80, 166, batch_first=True)
  (lstm_dec_left): LSTM(80, 166, batch_first=True)
)

 EPOCH 1/10 	 train loss 5.580884815281273

 EPOCH 2/10 	 train loss 5.5214312192041985

 EPOCH 3/10 	 train loss 5.519790085632237

 EPOCH 4/10 	 train loss 

## Check the distribution N(0,1) and N(5,0.5) for latent space

In [6]:
# N(0,1)

vi = torch.zeros(1,z_dim)
mi = torch.zeros(1,z_dim)
stdev = torch.exp(0.5*vi)
eps = torch.randn_like(stdev)
z = mi + eps*stdev
new_p, new_t = vae.decoder(z.to(device))
# right_out = out.squeeze()
# right_p = right_out[:,:len(pdict)]
# right_v = right_out[:,len(pdict):len(pdict)+len(vdict)]

rp = torch.argmax(new_p, dim=2).cpu().detach()
rt = torch.argmax(new_t, dim=2).cpu().detach()

print(rp, rt)

tensor([[80, 82, 82, 82, 81, 85, 78, 87, 89, 89, 89, 89, 84, 87, 80, 80, 89, 82,
         82, 88, 88, 87, 78, 88, 83, 80, 89, 89, 89, 86, 89, 89, 89, 88, 84, 78,
         89, 79, 82, 88, 87, 88, 82, 87, 85, 78, 87, 83, 81, 81]]) tensor([[83, 85, 78, 83, 85, 78, 78, 83, 78, 88, 87, 78, 83, 78, 78, 83, 78, 79,
         78, 78, 81, 89, 80, 79, 88, 79, 83, 81, 85, 79, 85, 78, 78, 79, 78, 78,
         78, 78, 80, 88, 80, 79, 89, 86, 86, 86, 85, 78, 81, 85]])


In [7]:
# N~(5,0.5)
vi = torch.ones(1,z_dim)
mi = torch.ones(1,z_dim)*5
stdev = torch.exp(-0.7*vi)
eps = torch.randn_like(stdev)
z = mi + eps*stdev
new_p, new_t = vae.decoder(z.to(device))

# right_out = out.squeeze()
# right_p = right_out[:,:len(pdict)]
# right_v = right_out[:,len(pdict):len(pdict)+len(vdict)]
# right_t = right_out[:,len(pdict)+len(vdict):]

rp = torch.argmax(new_p, dim=2).cpu().detach()
rt = torch.argmax(new_t, dim=2).cpu().detach()

print(rp, rt)

tensor([[80, 78, 82, 82, 81, 85, 78, 87, 89, 89, 89, 89, 84, 81, 80, 80, 89, 82,
         82, 88, 88, 87, 78, 88, 83, 80, 89, 89, 89, 78, 89, 89, 89, 88, 84, 78,
         89, 79, 82, 88, 87, 88, 82, 87, 85, 78, 87, 83, 81, 81]]) tensor([[83, 85, 78, 83, 85, 78, 78, 83, 78, 88, 87, 78, 83, 78, 78, 83, 78, 79,
         78, 78, 81, 89, 80, 79, 88, 79, 83, 81, 85, 79, 85, 78, 78, 79, 78, 78,
         78, 78, 80, 88, 80, 79, 89, 82, 82, 82, 82, 78, 81, 85]])


## Generate new music (Change the probabilities here)

In [10]:
right_data = np.load('right_data_small.npy')
left_data = np.load('left_data_small.npy')
labels = np.load('labels_small.npy')

####### Change this variable to alter the mixed percentage for three composers #####
prob = torch.tensor([0.6, 0.2, 0.2]).to(device)

for i in range(3):
    idx = random.choices(np.where(labels[:,i] == 1)[0])[0]
    right = np.expand_dims(right_data[idx], axis=0)
    right = torch.from_numpy(right).float()
    left = np.expand_dims(left_data[idx], axis=0)
    left = torch.from_numpy(left).float()
    right = right.to(device)
    left = left.to(device)
    
    vae.eval()
    mu, logvar = vae.encoder(right, left)
    mi = torch.split(mu, [z_dim, z_dim, z_dim], dim=1)[i]
    vi = torch.split(logvar, [z_dim, z_dim, z_dim], dim=1)[i]
    new_z = torch.zeros(mu.shape[0], z_dim).to(device)

    stdev = torch.exp(0.5*vi)
    eps = torch.randn_like(stdev)
    z = mi + eps*stdev
    new_z += z * prob[i].reshape(-1,1)

right_out, left_out = vae.decoder(new_z)

In [11]:
right_out = right_out.squeeze()

right_p = right_out[:,:len(pdict)]
right_t = right_out[:,len(pdict):len(pdict)+len(tdict)]
right_v = right_out[:,len(pdict)+len(tdict):]

rp = torch.argmax(right_p, dim=1)
rt = torch.argmax(right_t, dim=1)
rv = torch.argmax(right_v, dim=1)


left_out = left_out.squeeze()

left_p = left_out[:,:len(pdict)]
left_t = left_out[:,len(pdict):len(pdict)+len(tdict)]
left_v = left_out[:,len(pdict)+len(tdict):]

lp = torch.argmax(left_p, dim=1)
lt = torch.argmax(left_t, dim=1)
lv = torch.argmax(left_v, dim=1)

print(rp, rt, rv)

tensor([49,  9, 39, 48, 74, 50, 46, 46, 58, 46, 51, 46, 51, 24, 46, 52, 74, 45,
        45, 45, 46, 46, 46, 60, 42, 51, 51, 51, 51, 51, 51, 51, 51, 51, 41, 41,
        31, 33, 31, 31, 31, 31, 31, 22, 22, 22, 22, 22, 22, 21],
       device='cuda:0') tensor([ 7,  9,  4,  4,  3,  7,  2,  9, 11, 11, 11, 11,  6,  9,  2,  2, 11,  4,
         4, 10, 10,  9,  6, 10,  5,  9, 11, 11, 11,  8, 11, 11, 11, 10,  6, 11,
        11,  1,  4, 10,  9, 10,  4,  9,  7, 10,  9,  8,  4,  3],
       device='cuda:0') tensor([ 4, 30, 43, 30, 43, 28, 14, 43, 28, 43, 43, 28, 43, 43, 57, 73,  7, 48,
        54, 21, 62, 54, 20,  8, 19, 24, 21, 19, 24, 24, 24, 43, 23, 60, 42,  8,
        21, 60, 21,  8, 22,  6,  8, 21, 21, 21,  6, 28, 57,  6],
       device='cuda:0')


## Store the music into midi file 

In [13]:
def convert2notes(pid, tid, vid, offset,right=True):

    result = note.Note(id2pdict[pid])
    result.offset = offset
    result.storedInstrument = instrument.Piano()
    result.duration.quarterLength = id2tdict[tid]
    print(id2tdict[tid])
    result.volume.velocity = id2vdict[vid]

    return result


right_notes = []
left_notes = []
offset = 0
    
for i in range(len(rp)):
    rnote = convert2notes(rp[i].item(), rt[i].item(), rv[i].item(), offset)
    lnote = convert2notes(lp[i].item(), lt[i].item(), lv[i].item(), offset)

    offset += rnote.duration.quarterLength
    right_notes.append(rnote)
    left_notes.append(lnote)

rightpart = stream.Part(right_notes, id='Piano Right')
leftpart = stream.Part(left_notes, id='Piano Left')
midi_stream = stream.Stream([rightpart, leftpart])
midi_stream.write('midi', fp='music_right_left.mid')

2
2
2/3
2
1.25
1.5
1.25
2
1.0
2
2
0.5
0.75
0.5
2/3
1.5
1/12
1/3
1/12
1/3
1/12
2/3
1/12
2/3
1.75
1.5
2/3
0.5
0.75
0.25
0.75
1.5
1/12
0.25
1.25
0.5
1.25
0.25
1/3
1.75
1/3
1.0
2/3
1/12
1.75
1/3
1/3
0.5
1.5
1/3
2/3
0.5
1/12
1.5
1/12
1.0
1/12
2
1/6
0.5
1/12
2
1/12
0.25
1/12
0.25
1/3
0.5
1.75
0.25
1/12
1.75
1/12
0.75
0.5
0.25
1.25
0.75
1/3
1/3
2/3
0.75
1/3
0.5
1.25
1/12
2/3
1/6
2
1/6
1/3
1/6
2/3
2
1/6
0.5
1.25
1/3
1.0
2


'music_right_left.mid'