In [1]:
import random

import numpy as np
from torch.utils.data import DataLoader
import torch
from src.model.model import MusicTransformer
from pathlib import Path

from src.utils.create_dataset import GuitarDataset
from src.utils.hyperparameters import BATCH_SIZE, BLOCK_SIZE, EMBEDDING_DIM, N_LAYER, N_HEAD, DROPOUT, VOCAB_SIZE, \
    LEARNING_RATE

In [2]:
# Directory that is searched for tokenized "*.json" files when the datasets are built.
# The path is relative, so it resolves against the notebook's working directory.
tokenized_folder = Path("../data/tokenized/")

In [3]:
# hyper-parameters: lower-case notebook aliases for the project-wide constants

# batching and context window
batch_size, block_size = BATCH_SIZE, BLOCK_SIZE

# model architecture
vocab_size, n_embd = VOCAB_SIZE, EMBEDDING_DIM
n_layer, n_head = N_LAYER, N_HEAD
dropout = DROPOUT

# optimisation and data split
learning_rate = LEARNING_RATE
training_split = 0.8  # fraction of the tokenized files assigned to the training set

In [4]:
# Select the compute device, in priority order: Apple MPS, then CUDA, then CPU.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

In [5]:
# AMP setup, works only on CUDA
# `device` is a torch.device, so compare its `.type` string. Comparing the
# torch.device object itself against the str "cuda" does not report a match,
# which left mixed precision (and the GradScaler) disabled even on CUDA.
use_amp = (device.type == "cuda")

# Prefer bfloat16 when the GPU supports it; otherwise fall back to float16.
if use_amp and hasattr(torch.cuda, "is_bf16_supported") and torch.cuda.is_bf16_supported():
    amp_dtype = torch.bfloat16
else:
    amp_dtype = torch.float16

# Loss scaler used by the training loop; it is a pass-through when `use_amp` is False.
scaler = torch.amp.GradScaler('cuda', enabled=use_amp)

In [6]:
# prepare datasets
# A fixed seed keeps the train/validation file split identical across kernel
# restarts; an unseeded shuffle would reassign files on every run, so
# validation numbers from different runs would not be comparable.
split_seed = 42

all_files = sorted(tokenized_folder.glob("*.json"))
if not all_files:
    raise FileNotFoundError(f"No tokenized *.json files found in {tokenized_folder}")
random.Random(split_seed).shuffle(all_files)

# The first `training_split` fraction of the shuffled files is used for training,
# the remainder for validation.
split = int(training_split * len(all_files))

# Both datasets share the same windowing settings: block_size tokens per sample and a
# stride of half a block (presumably the step between windows - confirm in GuitarDataset).
window_kwargs = dict(block_size=block_size, stride=block_size // 2)
train_ds = GuitarDataset(file_list=all_files[:split], **window_kwargs)
val_ds = GuitarDataset(file_list=all_files[split:], **window_kwargs)

In [7]:
# Training loader: reshuffled every epoch; the trailing partial batch is dropped.
# Uses the `batch_size` hyper-parameter instead of a hard-coded 32 so the configured
# value actually takes effect.
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True)

# Validation loader: no shuffling and no dropped batch, so every validation sample
# contributes to the reported loss. The validation loop weights each batch by its
# size, so a smaller final batch is averaged correctly.
val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False, drop_last=False)

In [8]:
# Report the `tokens` attribute of the dataset behind each loader.
for caption, loader in (("Training tokens count", train_dl), ("Validation tokens count", val_dl)):
    print(caption, loader.dataset.tokens)

Training tokens count 56874
Validation tokens count 11013


In [9]:
# Build the transformer from the notebook-level hyper-parameters,
# then move it to the selected device.
model_config = dict(
    vocab_size=vocab_size,
    n_embd=n_embd,
    n_head=n_head,
    n_layer=n_layer,
    block_size=block_size,
    dropout=dropout,
)
model = MusicTransformer(**model_config).to(device)

In [10]:
# AdamW optimiser over all model parameters, with weight decay 0.1 and betas (0.9, 0.95).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    betas=(0.9, 0.95),
    weight_decay=0.1,
)

In [11]:
# Summarise the run configuration: model size, device and AMP status.
n_params = sum(p.numel() for p in model.parameters())
print("Parameter count: ", n_params)
print("Training on ", device)
print("Using amp?", use_amp)

Parameter count:  7293248
Training on  mps
Using amp? False


In [12]:
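# Disabled sanity check, kept for reference: run a single training sample through
# the model and compare its loss with -ln(1/8000) (about 8.99), presumably the loss
# of a uniform guess over roughly 8000 tokens.
# from src.model.model import Head
# import numpy as np
# x_test, y_test = train_ds.__getitem__(56)
# x_test, y_test = x_test.view(1,-1).to(device), y_test.view(1,-1).to(device)
#
# print(-np.log(1/8000))
#
# logits, loss = model(x_test, y_test)
# loss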

In [13]:
# Train for a fixed number of epochs; after each epoch, evaluate on the validation
# loader and print one summary line.
epochs = 40
for epoch in range(epochs):
    # ---- train -----
    model.train()
    for x, y in train_dl:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad(set_to_none=True)

        # Autocast is gated by the same `use_amp` flag that enables the GradScaler,
        # and uses the dtype selected in the AMP setup cell. Gating it on
        # torch.cuda.is_available() instead could run float16 autocast while the
        # unscaled (non-AMP) backward path below is taken.
        with torch.amp.autocast('cuda', dtype=amp_dtype, enabled=use_amp):
            _, loss = model(x, y)

        if use_amp:
            scaler.scale(loss).backward()
            # Gradients are still multiplied by the loss scale at this point; unscale
            # them first so the clipping threshold of 1.0 applies to the true norm.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

    # ----- validate -----
    model.eval()
    val_loss, n = 0.0, 0
    with torch.no_grad():
        for x, y in val_dl:
            x, y = x.to(device), y.to(device)
            _, batch_loss = model(x, y)
            # Weight each batch loss by its size so the final figure is a per-sample mean.
            val_loss += batch_loss.item() * x.size(0)
            n += x.size(0)
    # Note: the "train" value is the loss of the last training batch of the epoch,
    # while "val_loss" is averaged over all validation samples seen.
    print(f"echo {epoch:03d} train {loss.item():.4f} val_loss {val_loss / max(n, 1):.4f}")

echo 000 train 8.3482 val_loss 8.8177
echo 001 train 8.0376 val_loss 9.0495
echo 002 train 7.9794 val_loss 9.1665
echo 003 train 7.7498 val_loss 9.1924
echo 004 train 7.5435 val_loss 9.1260
echo 005 train 7.2530 val_loss 9.1353
echo 006 train 6.9629 val_loss 9.1272
echo 007 train 6.6261 val_loss 9.1576
echo 008 train 6.4796 val_loss 9.0593
echo 009 train 5.9986 val_loss 9.0749
echo 010 train 5.8222 val_loss 8.9571
echo 011 train 5.4572 val_loss 8.9254
echo 012 train 5.4504 val_loss 8.9660
echo 013 train 4.9782 val_loss 8.9374
echo 014 train 4.7423 val_loss 8.9618
echo 015 train 4.6196 val_loss 8.8332
echo 016 train 4.2551 val_loss 8.8128
echo 017 train 4.1415 val_loss 8.8350
echo 018 train 3.9037 val_loss 8.7589
echo 019 train 3.8274 val_loss 8.7422
echo 020 train 3.4322 val_loss 8.8496
echo 021 train 3.1950 val_loss 8.7374
echo 022 train 3.1859 val_loss 8.7140
echo 023 train 2.7347 val_loss 8.8068
echo 024 train 2.6953 val_loss 8.7132
echo 025 train 2.6162 val_loss 8.8279
echo 026 tra

In [23]:
# Generate a continuation from a validation prompt.
# Take validation item 5 and keep the first 50 tokens of its input as the prompt.
x, y = val_dl.dataset[5]
prompt = x[:50].view(1, -1).to(device)  # reshape to a single-row batch

# Switch to eval mode and disable autograd explicitly. If training was interrupted
# mid-epoch the model is still in train mode, and sampling would otherwise run
# with train-mode layers active and build an unnecessary graph.
model.eval()
with torch.no_grad():
    out = model.generate(prompt, max_new_tokens=100).cpu()
out

tensor([[ 446, 2392, 3269,   45, 3049,   43,  658, 6527, 4100, 3049, 7612, 7613,
          704,   38,  106,  120, 2490, 1646, 2097,  658, 2041, 2399, 7692, 7698,
          496, 6527, 5731, 3049,   47, 6148, 2182,  704,   45, 1646,  283, 2490,
         1646, 2097,  658, 2041, 2399, 7692, 7698,  496, 6527, 4100, 3049, 7612,
         7613, 1523, 3250, 5876, 3588, 3738,   35, 1659, 6171, 5164, 3049,   47,
         1211,   61,  107, 7671, 3189,  555, 2093,  681, 2273,  581, 3900, 4932,
         7607, 3900, 4932, 7607, 6150, 1896, 1868, 4989, 4802, 3680, 6016, 3755,
         7513, 2017, 1447, 3865, 7866, 7875,   52,  635, 7728, 4347, 4910, 3409,
           69,  642,  486, 1699, 2475,   32, 2105, 2251,   42, 7903, 5157, 4114,
           30, 2477,   47,  496, 2678,  111,  493,   66,  623, 1420,   31,  681,
         1494, 6081, 3605, 1030, 7308, 1422,  296, 6889, 2459,  995,   37,  544,
          463,   15, 1047,  970, 5795,  466, 3461, 5493,  484,  623, 1912,  919,
          995, 1572, 4394, 6