In [1]:
from model import *
from torch.utils.tensorboard import SummaryWriter

# DataLoader settings used for the training set.
params = {
    'batch_size' : 16,
    'shuffle' : True,
    'num_workers' : 2,
    'drop_last' : True
}
# Validation keeps the same batch size / workers, but must not shuffle or
# drop the final partial batch: every validation sample should be scored,
# in a fixed order, so losses are comparable between epochs.
val_params = {**params, 'shuffle': False, 'drop_last': False}

max_trg_len = 450 # length of all target note sequences, holds 99 notes max
pad_idx = 434

# Define data loaders
train_path = Path(r'X:\Training Data\Model 1 Training\train')
train_data = LazierDataset(train_path, max_trg_len, pad_idx)
train_loader = torch.utils.data.DataLoader(train_data, **params)
val_path = Path(r'X:\Training Data\Model 1 Training\val')
val_data = LazierDataset(val_path, max_trg_len, pad_idx)
val_loader = torch.utils.data.DataLoader(val_data, **val_params)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training hyperparameters
num_epochs = 10
learning_rate = 3e-4
batch_size = params['batch_size']

# Model hyperparameters
trg_vocab_size = 435  # <output length>
embedding_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.1
# NOTE(review): assumes Transformer treats max_len as an upper bound on
# sequence positions (e.g. the size of its positional-embedding tables) --
# confirm in model.py. Target sequences are max_trg_len long, so a bound of
# 400 would be exceeded by target positions, which is a typical trigger for
# "CUDA error: device-side assert triggered". Keep the bound at least as
# large as the longest sequence fed to the model.
max_len = max(400, max_trg_len)
forward_expansion = 2048

# Tensorboard for nice plots
writer = SummaryWriter('runs/full_train')
step = 0  # how many times the model has gone through some input

# Define model
model = Transformer(
    embedding_size,
    trg_vocab_size,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Multi-class loss over the target vocabulary.
# NOTE(review): padding positions (pad_idx) currently contribute to the loss;
# if padded targets should be ignored, pass ignore_index=pad_idx -- confirm
# the intended behaviour before changing, since it alters reported losses.
criterion = nn.CrossEntropyLoss()

# Main loop: one optimisation pass over train_loader followed by one
# gradient-free evaluation pass over val_loader, per epoch.
for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()  # enable train-time behaviour such as dropout
    train_loss_sum, train_batches = 0.0, 0
    for batch_idx, batch in enumerate(train_loader):
        # Batches come through as a tuple defined by the Dataset's __getitem__
        spec, notes = batch[0].to(device), batch[1].to(device)

        optimizer.zero_grad()  # clear gradients left over from the previous step

        # Teacher forcing: the decoder sees every target token but the last
        # and is trained to predict the sequence shifted by one position.
        output = model(spec, notes[..., :-1])

        # Flatten so each row is one prediction over the vocabulary.
        # NOTE(review): assumes the model output and `notes` flatten in the
        # same (batch, position) order -- confirm against model.py.
        output = output.reshape(-1, output.shape[2])
        targets = notes[..., 1:].reshape(-1)

        loss = criterion(output, targets)  # prediction vs. ground truth
        loss.backward()                    # back-propagate gradients

        # Clip the global gradient norm to avoid exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        optimizer.step()  # update model parameters

        loss_value = loss.item()
        train_loss_sum += loss_value
        train_batches += 1
        writer.add_scalar("Training Loss", loss_value, global_step=step)
        step += 1

        if batch_idx % 25 == 0:
            print('\nEpoch {}, Batch {}'.format(epoch+1, batch_idx))
            print('Training Loss: {}'.format(loss_value))

    # Mean over the epoch; NaN if the loader yielded no batches.
    train_loss = train_loss_sum / train_batches if train_batches else float('nan')
    print(f'\nEpoch {epoch+1}/{num_epochs}')
    print(f'Training Loss: {train_loss}')

    # ---- Validation pass ----
    model.eval()  # disable dropout etc. for evaluation
    val_loss_sum, val_batches = 0.0, 0
    with torch.no_grad():  # no autograd graph needed: saves memory and time
        for batch in val_loader:
            spec, notes = batch[0].to(device), batch[1].to(device)

            output = model(spec, notes[..., :-1])
            output = output.reshape(-1, output.shape[2])
            targets = notes[..., 1:].reshape(-1)

            val_loss_sum += criterion(output, targets).item()
            val_batches += 1

    # Log one validation point per epoch at the current training step, so
    # the training-loss x-axis is not advanced by validation batches.
    val_loss = val_loss_sum / val_batches if val_batches else float('nan')
    if val_batches:
        writer.add_scalar("Validation Loss", val_loss, global_step=step)
    print('Validation Loss: {}'.format(val_loss))

writer.flush()  # make sure all logged scalars reach disk





src_seq_len : 400
trg_seq_len : 450
N : 16


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.