In [1]:
from model import *
from torch.utils.tensorboard import SummaryWriter

# Keyword arguments shared by the training and validation DataLoaders.
params = {
    'batch_size' : 16,
    'shuffle' : True,
    'num_workers' : 4,
    'drop_last' : True
}

# NOTE: keep max_trg_len <= max_src_len, otherwise a device-side assert error is triggered
max_trg_len = 500 # length of all target note sequences, holds 99 notes max
max_src_len = 500
pad_idx = 434     # padding index handed to the dataset and ignored by the loss below

# Define data loaders
train_path = Path(r'X:\Training Data\Model 1 Training\train')
train_data = LazierDataset(train_path, max_src_len, max_trg_len, pad_idx)
train_loader = torch.utils.data.DataLoader(train_data, **params)
val_path = Path(r'X:\Training Data\Model 1 Training\val')
val_data = LazierDataset(val_path, max_src_len, max_trg_len, pad_idx)
# NOTE(review): the validation loader inherits shuffle=True / drop_last=True from
# `params`; a validation pass normally wants neither - confirm before re-enabling it.
val_loader = torch.utils.data.DataLoader(val_data, **params)

# Fall back to the CPU when no CUDA device is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training hyperparameters
learning_rate = 3e-4
batch_size = params['batch_size']

# Model hyperparameters
trg_vocab_size = 435  # <output length>; pad_idx (434) is the last index of this vocabulary
embedding_size = 512
num_heads = 8
num_encoder_layers = 2
num_decoder_layers = 2
dropout = 0.1
max_len = max_src_len
forward_expansion = 2048

# Tensorboard for nice plots
writer = SummaryWriter('runs/model5')
step = 0  # global_step for writer.add_scalar; incremented once per training batch

# Define model (positional arguments follow the Transformer signature in model.py)
model = Transformer(
    embedding_size,
    trg_vocab_size,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Multi-class loss. Padding positions are excluded via ignore_index; reuse pad_idx
# rather than repeating the literal so the dataset and the loss cannot drift apart.
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

num_epochs = 30
for epoch in range(num_epochs):
    # Enter training mode at the start of every epoch (not once before the loop) so the
    # model is back in training mode even if the disabled validation pass below is
    # re-enabled and switches it to eval mode.
    model.train()
    for batch_idx, batch in enumerate(train_loader):
        # Batches come through as a tuple defined in the return statement of __getitem__ in the Dataset
        spec, notes = batch[0].to(device), batch[1].to(device)

        # Forward prop: the decoder input drops the last element, the target drops the
        # first, so each position is trained to predict the next element.
        output = model(spec, notes[..., :-1])
        notes = notes[..., 1:]

        optimizer.zero_grad()  # Zero out the gradients so they don't accumulate across batches

        # CrossEntropyLoss wants the class dimension second, hence the permute.
        # Assumes output is (batch, seq, vocab) - TODO confirm against model.py.
        loss = criterion(output.permute(0,2,1), notes)
        loss.backward()  # Backpropagate: compute gradients of the loss w.r.t. the parameters

        # Clip the gradient norm to avoid the exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        optimizer.step()    # Update model parameters

        # Log a plain Python float rather than the graph-attached loss tensor.
        loss_value = loss.item()
        writer.add_scalar("Training Loss", loss_value, global_step=step)
        step += 1

        if batch_idx%25 == 0:
            print('\nEpoch {}, Batch {}'.format(epoch+1,batch_idx))
            print('Training Loss: {}'.format(loss_value))

    # End-of-epoch report. Note this is the loss of the LAST batch, not an epoch average.
    print(f'\nEpoch {epoch+1}/{num_epochs}')
    print(f'Training Loss: {loss_value}')
    torch.save(model.state_dict(), 'model5.pt')
    writer.flush()  # persist pending TensorBoard events together with the checkpoint
    print('model saved')

    # --- Validation pass (currently disabled) ---------------------------------
    # NOTE(review): if re-enabled, consider wrapping it in torch.no_grad(); it also
    # advances the same `step` counter that the training loss uses.
    # model.eval()
    # for batch_idx, batch in enumerate(val_loader):
        # spec, notes = batch[0].to(device), batch[1].to(device)
        # output = model(spec, notes[..., :-1])
        # notes = notes[..., 1:]
        # loss = criterion(output.permute(0,2,1), notes)
        # writer.add_scalar("Validation Loss", loss, global_step=step)
        # step += 1
    # print('Validation Loss: {}'.format(loss.item()))






Epoch 1, Batch 0
Training Loss: 6.309049129486084

Epoch 1, Batch 25
Training Loss: 4.7210588455200195

Epoch 1, Batch 50
Training Loss: 4.010907173156738

Epoch 1, Batch 75
Training Loss: 3.8372321128845215

Epoch 1, Batch 100
Training Loss: 3.91545033454895

Epoch 1, Batch 125
Training Loss: 3.8134384155273438

Epoch 1, Batch 150
Training Loss: 3.718294382095337

Epoch 1, Batch 175
Training Loss: 3.78564453125

Epoch 1, Batch 200
Training Loss: 3.572188138961792

Epoch 1, Batch 225
Training Loss: 3.5495753288269043

Epoch 1, Batch 250
Training Loss: 3.4048354625701904

Epoch 1, Batch 275
Training Loss: 3.539491891860962

Epoch 1, Batch 300
Training Loss: 3.363840103149414

Epoch 1, Batch 325
Training Loss: 3.253021001815796

Epoch 1, Batch 350
Training Loss: 3.4574532508850098

Epoch 1, Batch 375
Training Loss: 3.4201650619506836

Epoch 1, Batch 400
Training Loss: 3.3002572059631348

Epoch 1, Batch 425
Training Loss: 3.300520896911621

Epoch 1, Batch 450
Training Loss: 3.11541724205

In [1]:
from model import *
from torch.utils.tensorboard import SummaryWriter

# Keyword arguments shared by the training and validation DataLoaders.
params = {
    'batch_size' : 16,
    'shuffle' : True,
    'num_workers' : 0,
    'drop_last' : True
}

# NOTE: keep max_trg_len <= max_src_len, otherwise a device-side assert error is triggered
max_trg_len = 500 # length of all target note sequences, holds 99 notes max
max_src_len = 500
pad_idx = 434     # padding index handed to the dataset and ignored by the loss below

# Define data loaders
train_path = Path(r'X:\Training Data\Model 1 Training\train')
train_data = LazierDataset(train_path, max_src_len, max_trg_len, pad_idx)
train_loader = torch.utils.data.DataLoader(train_data, **params)
val_path = Path(r'X:\Training Data\Model 1 Training\val')
val_data = LazierDataset(val_path, max_src_len, max_trg_len, pad_idx)
# NOTE(review): the validation loader inherits shuffle=True / drop_last=True from
# `params`; a validation pass normally wants neither - confirm before re-enabling it.
val_loader = torch.utils.data.DataLoader(val_data, **params)

# Fall back to the CPU when no CUDA device is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training hyperparameters
learning_rate = 3e-4
batch_size = params['batch_size']

# Model hyperparameters (must match the run that produced the checkpoint loaded below)
trg_vocab_size = 435  # <output length>; pad_idx (434) is the last index of this vocabulary
embedding_size = 512
num_heads = 8
num_encoder_layers = 2
num_decoder_layers = 2
dropout = 0.1
max_len = max_src_len
forward_expansion = 2048

# Tensorboard for nice plots; same log directory as the initial run so the curves line up
writer = SummaryWriter('runs/model5')
# global_step for writer.add_scalar; incremented once per training batch.
# NOTE(review): hard-coded - presumably the step count reached before this resume; confirm.
step = 8885

# Define model (positional arguments follow the Transformer signature in model.py)
model = Transformer(
    embedding_size,
    trg_vocab_size,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
).to(device)

# Resume from the saved weights. map_location remaps the stored tensors onto the
# device selected above, so a checkpoint written on a GPU still loads when this
# cell runs on a CPU-only machine.
checkpoint_path = r'C:\Users\ewais\Documents\GitHub\tensor-hero\Model_1\model5.pt'
model.load_state_dict(torch.load(checkpoint_path, map_location=device))

# NOTE(review): only model weights are checkpointed; the Adam state is created fresh
# here, so its moment estimates restart from zero on every resume.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Multi-class loss. Padding positions are excluded via ignore_index; reuse pad_idx
# rather than repeating the literal so the dataset and the loss cannot drift apart.
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# Epoch numbering continues from the earlier run: the loop covers
# start_epoch .. start_epoch + num_extra_epochs - 1 (printed 1-based).
start_epoch = 3
num_extra_epochs = 17

for epoch in range(start_epoch, start_epoch+num_extra_epochs):
    model.train() # Put the model in training mode at the start of every epoch
    for batch_idx, batch in enumerate(train_loader):
        # Batches come through as a tuple defined in the return statement of __getitem__ in the Dataset
        spec, notes = batch[0].to(device), batch[1].to(device)

        # Forward prop: the decoder input drops the last element, the target drops the
        # first, so each position is trained to predict the next element.
        output = model(spec, notes[..., :-1])
        notes = notes[..., 1:]

        optimizer.zero_grad()  # Zero out the gradients so they don't accumulate across batches

        # CrossEntropyLoss wants the class dimension second, hence the permute.
        # Assumes output is (batch, seq, vocab) - TODO confirm against model.py.
        loss = criterion(output.permute(0,2,1), notes)
        loss.backward()  # Backpropagate: compute gradients of the loss w.r.t. the parameters

        # Clip the gradient norm to avoid the exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        optimizer.step()    # Update model parameters

        # Log a plain Python float rather than the graph-attached loss tensor.
        loss_value = loss.item()
        writer.add_scalar("Training Loss", loss_value, global_step=step)
        step += 1

        if batch_idx%25 == 0:
            print('\nEpoch {}, Batch {}'.format(epoch+1,batch_idx))
            print('Training Loss: {}'.format(loss_value))

    # End-of-epoch report. Note this is the loss of the LAST batch, not an epoch average.
    print(f'\nEpoch {epoch+1}/{start_epoch+num_extra_epochs}')
    print(f'Training Loss: {loss_value}')
    torch.save(model.state_dict(), 'model5.pt')
    writer.flush()  # persist pending TensorBoard events together with the checkpoint

    # --- Validation pass (currently disabled) ---------------------------------
    # NOTE(review): if re-enabled, consider wrapping it in torch.no_grad(); it also
    # advances the same `step` counter that the training loss uses.
    # model.eval()
    # for batch_idx, batch in enumerate(val_loader):
        # spec, notes = batch[0].to(device), batch[1].to(device)
        # output = model(spec, notes[..., :-1])
        # notes = notes[..., 1:]
        # loss = criterion(output.permute(0,2,1), notes)
        # writer.add_scalar("Validation Loss", loss, global_step=step)
        # step += 1
    # print('Validation Loss: {}'.format(loss.item()))






Epoch 4, Batch 0
Training Loss: 1.7855188846588135

Epoch 4, Batch 25
Training Loss: 2.198181629180908

Epoch 4, Batch 50
Training Loss: 1.9903161525726318

Epoch 4, Batch 75
Training Loss: 2.1484198570251465

Epoch 4, Batch 100
Training Loss: 2.0231077671051025

Epoch 4, Batch 125
Training Loss: 1.8063194751739502

Epoch 4, Batch 150
Training Loss: 2.307825803756714

Epoch 4, Batch 175
Training Loss: 2.0214645862579346

Epoch 4, Batch 200
Training Loss: 2.2710390090942383

Epoch 4, Batch 225
Training Loss: 1.6788023710250854

Epoch 4, Batch 250
Training Loss: 2.0528149604797363

Epoch 4, Batch 275
Training Loss: 1.8540153503417969

Epoch 4, Batch 300
Training Loss: 2.053788900375366

Epoch 4, Batch 325
Training Loss: 2.0915684700012207

Epoch 4, Batch 350
Training Loss: 2.138932943344116

Epoch 4, Batch 375
Training Loss: 2.070681571960449

Epoch 4, Batch 400
Training Loss: 1.9695371389389038

Epoch 4, Batch 425
Training Loss: 1.7261240482330322

Epoch 4, Batch 450
Training Loss: 1.8

In [None]:
# Manual snapshot of the current model weights.
# NOTE(review): the training cells checkpoint to 'model5.pt'; confirm that writing
# to 'model4.pt' here is intentional and does not clobber an older run's weights.
torch.save(obj=model.state_dict(), f='model4.pt')