In [None]:
from model import *
from torch.utils.tensorboard import SummaryWriter

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training hyperparameters.
# These are declared before the DataLoader so that the loader reads
# `batch_size` instead of repeating the literal value.
num_epochs = 10
learning_rate = 1e-4
batch_size = 1

# Model hyperparameters.
# There is no src_vocab_size: the source is spectrogram frames, not tokens.
# NOTE(review): trg_vocab_size is still a placeholder (original note:
# "<output length>") -- presumably it must be set to the real output size
# before training; confirm against the Transformer definition.
trg_vocab_size = 0
embedding_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.1
max_len = 400
forward_expansion = 2048
# NOTE(review): the source padding index was marked "???" by the author;
# confirm what value (if any) the model expects here.
src_pad_idx = 0

# Define dataset: preprocessed toy data located under the current working
# directory.
train_path = Path.cwd() / 'toy training data' / 'preprocessed'
train_data = Dataset(train_path)
train_loader = torch.utils.data.DataLoader(
    train_data,
    batch_size=batch_size,
    # NOTE(review): shuffling is disabled for the training set -- confirm
    # this is intentional.
    shuffle=False,
    # 0 workers: samples are loaded in the main process.
    num_workers=0,
)

# TensorBoard writer for the loss curve. `step` is the global step value
# handed to `writer.add_scalar` by the training loop.
writer = SummaryWriter(log_dir='runs/loss_plot')
step = 0

# Build the model from the hyperparameters above (arguments are positional,
# in the order the Transformer constructor is called with in this script),
# then move it to the selected device.
model = Transformer(
    embedding_size, trg_vocab_size, src_pad_idx, num_heads,
    num_encoder_layers, num_decoder_layers, forward_expansion,
    dropout, max_len, device,
)
model = model.to(device)

# Loss function.
# NOTE(review): `pad_idx` is defined but not passed to the loss, so every
# target position currently contributes to it. Presumably it was meant for
# `ignore_index` -- confirm once the padding value is decided.
pad_idx = 0 # TODO
criterion = nn.CrossEntropyLoss()

# Adam over all model parameters with the configured learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

# Training loop: for every epoch, run one optimisation step per batch and
# log the batch loss to TensorBoard under "Training Loss".
for epoch in range(num_epochs):

    model.train()
    for batch_idx, batch in enumerate(train_loader):
        # Move each element of the batch to the device individually. The
        # default DataLoader collate returns a list for (spec, notes)
        # samples, and a list has no `.to()` method, so calling
        # `batch.to(device)` on the batch itself fails.
        spec, notes = (tensor.to(device) for tensor in batch)

        # forward prop: the model sees all but the last entry of `notes`
        # along dim 0 and is scored against the entries shifted by one.
        # NOTE(review): slicing dim 0 assumes that axis is the sequence
        # axis; the default collate stacks samples on dim 0 -- confirm the
        # layout against Dataset / Transformer.
        output = model(spec, notes[:-1])

        # Flatten to (N, output.shape[2]) predictions and (N,) targets, the
        # layout the criterion is called with.
        output = output.reshape(-1, output.shape[2])
        notes = notes[1:].reshape(-1)
        optimizer.zero_grad()

        loss = criterion(output, notes)
        loss.backward()

        # Clip the global gradient norm to 1 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        optimizer.step()
        # Log a plain Python float rather than the graph-attached tensor.
        writer.add_scalar("Training Loss", loss.item(), global_step=step)
        step += 1

# Push any buffered events to disk so the final points of the loss curve are
# not lost; the writer stays open for further use.
writer.flush()

