In [2]:
from model import *
from torch.utils.tensorboard import SummaryWriter

# Folder holding the preprocessed toy examples, resolved against the working directory.
train_path = Path.cwd() / 'toy training data' / 'preprocessed'

# Wrap the dataset in a loader that yields one example per batch, in stored order.
train_data = Dataset(train_path)
train_loader = torch.utils.data.DataLoader(
    train_data, batch_size=1, shuffle=False, num_workers=0, drop_last=True
)

# Sanity check: walk the loader once and show what every batch contains.
for batch_idx, batch in enumerate(train_loader):
    # batch[0] is the model input (spec), batch[1] is the expected output (notes).
    spec, notes = batch[0], batch[1]
    # One item per line: both shapes first, then the full note tensor.
    print(spec.shape, notes.shape, notes, sep='\n')

torch.Size([1, 512, 400])
torch.Size([1, 52])


In [3]:
from model import *
from torch.utils.tensorboard import SummaryWriter

# Training hyperparameters (declared first so that everything below reads from them)
num_epochs = 100
learning_rate = 1e-4
batch_size = 1
random_seed = 0

# Model hyperparameters
trg_vocab_size = 434  # presumably the number of distinct target token ids -- TODO confirm against model.py
embedding_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.1
max_len = 400
forward_expansion = 2048

# Seed PyTorch's RNG so that weight initialisation and dropout are repeatable between runs
torch.manual_seed(random_seed)

# Define dataset
train_path = Path.cwd() / 'toy training data' / 'preprocessed'
train_data = Dataset(train_path)
train_loader = torch.utils.data.DataLoader(
    train_data,
    batch_size=batch_size,  # was hard-coded to 1, which silently ignored the hyperparameter above
    shuffle=False,
    num_workers=0,
    drop_last=True,
)

# Use the GPU when one is available, otherwise fall back to the CPU.
# To force CPU execution, replace the next line with: device = torch.device('cpu')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tensorboard for nice plots
writer = SummaryWriter('runs/loss_plot')
step = 0  # global counter of processed batches, used as the x-axis of the loss plot

# Define model (arguments are positional, so their order must match Transformer.__init__ in model.py)
model = Transformer(
    embedding_size,
    trg_vocab_size,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Multi-class loss: each predicted position is scored against one target class index
criterion = nn.CrossEntropyLoss()

# Epochs (0-based) during which the prediction is printed next to the ground truth.
report_epochs = (0, 5, 10, 15, 20)

for epoch in range(num_epochs):
    # Training mode switches layers such as dropout to their train-time behaviour.
    model.train()
    for batch_idx, batch in enumerate(train_loader):
        # Batches come through as a tuple defined in the return statement of __getitem__ in the Dataset
        spec, notes = batch[0].to(device), batch[1].to(device)

        # Forward pass: the model receives the target sequence without its last element ...
        output = model(spec, notes[..., :-1])

        # ... and is scored against the same sequence without its first element,
        # so every position is trained to predict the next token.
        # `output` and `notes` are rebound to their flattened forms on purpose:
        # a later cell prints these exact names after training.
        output = output.reshape(-1, output.shape[2])  # (positions, classes) as expected by criterion
        notes = notes[..., 1:].reshape(-1)            # (positions,) of target class indices

        optimizer.zero_grad()  # Clear gradients left over from the previous step

        loss = criterion(output, notes)  # Prediction vs. ground truth
        loss.backward()                  # Back-propagate: compute gradients for all parameters

        # Clip the gradient norm to avoid the exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        optimizer.step()  # Update model parameters

        # Log a plain Python float rather than a tensor that is still attached to the graph
        writer.add_scalar("Training Loss", loss.item(), global_step=step)
        step += 1

        # Print the output vs the ground truth for every batch of the selected epochs
        if epoch in report_epochs:
            print('\nEpoch {}/{}'.format(epoch, num_epochs))
            print('Loss : {}'.format(loss.item()))
            print('Ground Truth : {}'.format(notes))
            print('Model Output : {}'.format(torch.argmax(output, dim=1)))

# Flush any buffered events to disk and release the event file once training is done.
writer.close()


In [2]:
# Final check: predicted token ids (first line) against the ground truth (second line).
# Both `output` and `notes` are the flattened tensors left over from the last training iteration.
print(output.argmax(dim=1), notes, sep='\n')

tensor([  0,  47,   2,  60,   2,  99,   1, 110,   0, 122,   3, 133,   1, 145,
          2, 157,   4, 171,   3, 183,   1, 195,   3, 221,   1, 233,   0, 245,
          2, 259,   2, 297,   1, 309,   0, 322,   3, 334,   1, 346,   2, 359,
          4, 371,   3, 395,   2, 419,   3, 431, 433], device='cuda:0')
tensor([  0,  47,   2,  60,   2,  99,   1, 110,   0, 122,   3, 133,   1, 145,
          2, 157,   4, 171,   3, 183,   1, 195,   3, 221,   1, 233,   0, 245,
          2, 259,   2, 297,   1, 309,   0, 322,   3, 334,   1, 346,   2, 359,
          4, 371,   3, 395,   2, 419,   3, 431, 433], device='cuda:0')
