In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Put the archived tablature dataset in the following drive location: /content/drive/MyDrive/cs1430/chord-classifier/tablature_dataset.7z

In [2]:
%cd /content
!cp /content/drive/MyDrive/cs1430/chord-classifier/tablature_dataset.7z /content/tablature_dataset.7z
!7z x tablature_dataset.7z


/content

7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,8 CPUs Intel(R) Xeon(R) CPU @ 2.00GHz (50653),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan         1 file, 3678389168 bytes (3508 MiB)

Extracting archive: tablature_dataset.7z
--
Path = tablature_dataset.7z
Type = 7z
Physical Size = 3678389168
Headers Size = 16228
Method = Delta LZMA2:24
Solid = +
Blocks = 2

  0%      0% 5        0% 7 - tablature_dataset/tablature_frames/0_10.png                                                      0% 9 - tablature_dataset/tablature_frames/0_101.png                                                       0% 11 - tablature_dataset/tablature_frames/0_103.

Compute std and mean of the dataset for standardization.

Load dataset into memory.

In [4]:
import os
from PIL import Image
import numpy as np
from tqdm import tqdm

image_folder = 'tablature_dataset/tablature_frames'
image_files = [f for f in os.listdir(image_folder) if f.endswith('.png')]
image_files.sort(key=lambda x: (int(x.split('_')[0]), int(x.split('_')[1].split('.')[0])))
image_files = np.stack(image_files)

# Load all images into memory for fast access.
global_image_arr = []

for i, image_file in enumerate(tqdm(image_files)):
  img_path = os.path.join(image_folder, image_file)
  image = Image.open(img_path).convert('RGB')
  image = Image.open(img_path).convert('L')
  image = image.crop((image.size[0]/3, image.size[1]/2, image.size[0], image.size[1]))
  image = image.resize((320, 270))
  image = np.array(image)
  image = image / 255.0
  global_image_arr.append(image)

global_image_arr = np.stack(global_image_arr)

100%|██████████| 1995/1995 [05:28<00:00,  6.07it/s]


In [5]:
mean = np.mean(global_image_arr, axis=0)
std = np.std(global_image_arr, axis=0)

np.save('mean_std.npy', (mean, std))

In [6]:
import os
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from sklearn.model_selection import train_test_split

class GuitarTabDataset(Dataset):
    def __init__(self, image_folder, label_folder, indices, transform=None):
        self.image_folder = image_folder
        self.label_folder = label_folder
        self.transform = transform
        if self.transform == None:
          self.transform = transforms.Compose([
              transforms.ToTensor(),
              transforms.RandomResizedCrop(size=(270, 320), scale=(0.9, 1.0), ratio=(0.9, 1.2)),
              transforms.ColorJitter(brightness=0.1, contrast=0.1),
              transforms.Normalize(mean, std)
          ])

        # Image files are loaded and sorted above in global_image_arr

        # Load all labels
        self.labels = []
        for i in range(25):  # Assuming 24 npy files corresponding to batches
            self.labels.extend(np.load(os.path.join(label_folder, f"{i}.npy")))
        self.labels = np.stack(self.labels).astype(np.float32)

        # Filter files and labels based on provided indices
        self.image_files = global_image_arr[indices]
        self.labels = self.labels[indices]

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        image = self.image_files[idx]
        if self.transform:
          image = self.transform(image)
        image = image.type(torch.float32)
        label = self.labels[idx]

        # Hack to make open
        open_strings = (np.sum(label, axis=1) == 0)
        label[open_strings, 0] = 1
        return image, label

In [7]:
from sklearn.model_selection import train_test_split

# Assuming all files are loaded to calculate indices
all_image_files = [f for f in os.listdir('tablature_dataset/tablature_frames') if f.endswith('.png')]
all_image_files.sort(key=lambda x: (int(x.split('_')[0]), int(x.split('_')[1].split('.')[0])))
all_indices = list(range(len(all_image_files)))

# Perform the split
train_indices, val_indices = train_test_split(all_indices, test_size=0.2, random_state=42)
transform = None

# Create dataset instances
train_dataset = GuitarTabDataset(image_folder='tablature_dataset/tablature_frames', label_folder='tablature_dataset/tablature_labels_converted', indices=train_indices, transform=transform)
val_dataset = GuitarTabDataset(image_folder='tablature_dataset/tablature_frames', label_folder='tablature_dataset/tablature_labels_converted', indices=val_indices, transform=transform)

Inspect the dataset.

In [8]:
import torchvision

image, label = train_dataset[8]
# Unnormalize image
image = (image * std) + mean
image = torchvision.transforms.functional.to_pil_image(image)

#image
label


array([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.]], dtype=float32)

In [None]:
import torch.optim as optim

class GuitarTabCNN(nn.Module):
    def __init__(self):
        super(GuitarTabCNN, self).__init__()
        self.sequential = nn.Sequential(
          nn.Conv2d(1, 64, kernel_size=11, stride=3),
          nn.BatchNorm2d(64),
          nn.ReLU(),
          nn.MaxPool2d(kernel_size=3, stride=2, padding=0),
          nn.Conv2d(64, 128, kernel_size=7, stride=1, padding=2),
          nn.BatchNorm2d(128),
          nn.ReLU(),
          nn.MaxPool2d(kernel_size=3, stride=2, padding=0),
          nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
          nn.BatchNorm2d(256),
          nn.ReLU(),
          nn.Conv2d(256, 384, kernel_size=3, stride=1, padding=1),
          nn.BatchNorm2d(384),
          nn.ReLU(),
          nn.MaxPool2d(kernel_size=3, stride=2, padding=0),
          nn.Conv2d(384, 384, kernel_size=3, stride=1, padding=1),
          nn.BatchNorm2d(384),
          nn.ReLU(),
          nn.Conv2d(384, 384, kernel_size=3, stride=1, padding=1),
          nn.BatchNorm2d(384),
          nn.ReLU(),
          nn.Conv2d(384, 256, kernel_size=3, stride=1, padding=1),
          nn.BatchNorm2d(256),
          nn.ReLU(),
          nn.MaxPool2d(kernel_size=3, stride=2, padding=0),
          nn.Flatten(),
          nn.Linear(5120, 2000),
          nn.BatchNorm1d(2000),
          nn.ReLU(),
          nn.Linear(2000, 1000),
          nn.BatchNorm1d(1000),
          nn.ReLU(),
          nn.Linear(1000, 1000),
          nn.BatchNorm1d(1000),
          nn.ReLU(),
          nn.Linear(1000, 6 * 21)
        )


    def forward(self, x):
        x = self.sequential(x)

        # Softmax each set of 21 fret logits. This represents the probability that a particular fret is pressed for that string.
        x = torch.nn.functional.softmax(x.view(-1, 21), dim=1).view(-1, 6*21)
        return x

model = GuitarTabCNN()

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=2e-5)
# optimizer = optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)

# Loss function
# Weight the loss higher on non-open strings, since most strings are open which leads to a degenerate solution.
weight_vec = torch.ones(21, dtype=torch.float32)
weight_vec[0] = 0.04
criterion = nn.CrossEntropyLoss(weight=weight_vec)

In [None]:
!pip install torcheval

import torch.optim as optim
from tqdm import tqdm
import torcheval.metrics as metrics

# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Total trainable parameters: {total_params}')

acc = metrics.MulticlassAccuracy()
pressed_acc = metrics.MulticlassAccuracy()
open_acc = metrics.MulticlassAccuracy()

# Training loop with validation
for epoch in range(30):  # number of epochs
    print(f'Epoch {epoch+1}:')
    model.train()
    images_total = 0.0
    running_loss = 0.0
    progress_bar = tqdm(train_loader)
    for i, (images, labels) in enumerate(progress_bar):
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs.view(-1, 21), labels.view(-1, 21))
        loss.backward()
        optimizer.step()

        string_outputs = outputs.view(-1, 21)
        string_labels = labels.view(-1, 21)
        pressed_labels = (string_labels[:, 0] != 1)
        open_labels = (string_labels[:, 0] == 1.0)

        acc.update(torch.argmax(string_outputs, dim=1), torch.argmax(string_labels, dim=1))
        open_acc.update(torch.argmax(string_outputs[open_labels], dim=1), torch.argmax(string_labels[open_labels], dim=1))
        pressed_acc.update(torch.argmax(string_outputs[pressed_labels], dim=1), torch.argmax(string_labels[pressed_labels], dim=1))
        running_loss += loss.item() * images.shape[0]
        images_total += images.shape[0]
        progress_bar.set_description(f"Loss: {(running_loss / images_total):.4f}, Acc: {(acc.compute()):.4f}, Pressed Acc: {(pressed_acc.compute()):.4f}, Open Acc: {(open_acc.compute()):.4f}")

    acc.reset()
    pressed_acc.reset()
    open_acc.reset()

    # Validation
    model.eval()
    val_loss = 0.0
    val_images_total = 0.0
    with torch.no_grad():
        for images, labels in val_loader:
            outputs = model(images)
            loss = criterion(outputs.view(-1, 21), labels.view(-1, 21))
            val_loss += loss.item() * images.shape[0]
            val_images_total += images.shape[0]

            string_outputs = outputs.view(-1, 21)
            string_labels = labels.view(-1, 21)
            pressed_labels = (string_labels[:, 0] != 1)
            open_labels = (string_labels[:, 0] == 1)

            acc.update(torch.argmax(string_outputs, dim=1), torch.argmax(string_labels, dim=1))
            pressed_acc.update(torch.argmax(string_outputs[pressed_labels], dim=1), torch.argmax(string_labels[pressed_labels], dim=1))
            open_acc.update(torch.argmax(string_outputs[open_labels], dim=1), torch.argmax(string_labels[open_labels], dim=1))
    print(f"Epoch {epoch+1}, Training Loss: {(running_loss / images_total):.4f}, Validation Loss: {(val_loss / val_images_total):.4f}, Acc: {(acc.compute()):.4f}, Pressed Acc: {(pressed_acc.compute()):.4f}, Open Acc: {(open_acc.compute()):.4f}")
    acc.reset()
    pressed_acc.reset()
    open_acc.reset()

    torch.save(model.state_dict(), 'model.pt')

Total trainable parameters: 18511438
Epoch 1:


Loss: 0.6894, Acc: 0.8178, Pressed Acc: 0.9039, Open Acc: 0.7846: 100%|██████████| 200/200 [00:38<00:00,  5.25it/s]


Epoch 1, Training Loss: 0.6894, Validation Loss: 0.6773, Acc: 0.7853, Pressed Acc: 0.8643, Open Acc: 0.7564
Epoch 2:


Loss: 0.6888, Acc: 0.8190, Pressed Acc: 0.9043, Open Acc: 0.7862: 100%|██████████| 200/200 [00:39<00:00,  5.06it/s]


Epoch 2, Training Loss: 0.6888, Validation Loss: 0.6761, Acc: 0.7924, Pressed Acc: 0.8674, Open Acc: 0.7650
Epoch 3:


Loss: 0.6892, Acc: 0.8217, Pressed Acc: 0.9054, Open Acc: 0.7895: 100%|██████████| 200/200 [00:37<00:00,  5.26it/s]


Epoch 3, Training Loss: 0.6892, Validation Loss: 0.6743, Acc: 0.7924, Pressed Acc: 0.8705, Open Acc: 0.7638
Epoch 4:


Loss: 0.6868, Acc: 0.8258, Pressed Acc: 0.9125, Open Acc: 0.7924: 100%|██████████| 200/200 [00:38<00:00,  5.22it/s]


Epoch 4, Training Loss: 0.6868, Validation Loss: 0.6815, Acc: 0.7749, Pressed Acc: 0.8456, Open Acc: 0.7490
Epoch 5:


Loss: 0.6895, Acc: 0.8215, Pressed Acc: 0.9020, Open Acc: 0.7905: 100%|██████████| 200/200 [00:37<00:00,  5.28it/s]


Epoch 5, Training Loss: 0.6895, Validation Loss: 0.6767, Acc: 0.7949, Pressed Acc: 0.8612, Open Acc: 0.7707
Epoch 6:


Loss: 0.6890, Acc: 0.8213, Pressed Acc: 0.9077, Open Acc: 0.7880: 100%|██████████| 200/200 [00:38<00:00,  5.15it/s]


Epoch 6, Training Loss: 0.6890, Validation Loss: 0.6777, Acc: 0.7903, Pressed Acc: 0.8612, Open Acc: 0.7644
Epoch 7:


Loss: 0.6873, Acc: 0.8208, Pressed Acc: 0.9099, Open Acc: 0.7865: 100%|██████████| 200/200 [00:37<00:00,  5.28it/s]


Epoch 7, Training Loss: 0.6873, Validation Loss: 0.6723, Acc: 0.8108, Pressed Acc: 0.8814, Open Acc: 0.7849
Epoch 8:


Loss: 0.6853, Acc: 0.8259, Pressed Acc: 0.9170, Open Acc: 0.7908: 100%|██████████| 200/200 [00:38<00:00,  5.20it/s]


Epoch 8, Training Loss: 0.6853, Validation Loss: 0.6705, Acc: 0.7995, Pressed Acc: 0.8799, Open Acc: 0.7701
Epoch 9:


Loss: 0.6868, Acc: 0.8262, Pressed Acc: 0.9122, Open Acc: 0.7931: 100%|██████████| 200/200 [00:37<00:00,  5.29it/s]


Epoch 9, Training Loss: 0.6868, Validation Loss: 0.6825, Acc: 0.7920, Pressed Acc: 0.8424, Open Acc: 0.7735
Epoch 10:


Loss: 0.6878, Acc: 0.8252, Pressed Acc: 0.9099, Open Acc: 0.7925: 100%|██████████| 200/200 [00:38<00:00,  5.22it/s]


Epoch 10, Training Loss: 0.6878, Validation Loss: 0.6715, Acc: 0.7945, Pressed Acc: 0.8783, Open Acc: 0.7638
Epoch 11:


Loss: 0.6879, Acc: 0.8229, Pressed Acc: 0.9080, Open Acc: 0.7901: 100%|██████████| 200/200 [00:38<00:00,  5.26it/s]


Epoch 11, Training Loss: 0.6879, Validation Loss: 0.6787, Acc: 0.7874, Pressed Acc: 0.8534, Open Acc: 0.7633
Epoch 12:


Loss: 0.6872, Acc: 0.8277, Pressed Acc: 0.9103, Open Acc: 0.7959: 100%|██████████| 200/200 [00:38<00:00,  5.24it/s]


Epoch 12, Training Loss: 0.6872, Validation Loss: 0.6712, Acc: 0.8028, Pressed Acc: 0.8768, Open Acc: 0.7758
Epoch 13:


Loss: 0.6867, Acc: 0.8299, Pressed Acc: 0.9099, Open Acc: 0.7990: 100%|██████████| 200/200 [00:38<00:00,  5.20it/s]


Epoch 13, Training Loss: 0.6867, Validation Loss: 0.6717, Acc: 0.8045, Pressed Acc: 0.8783, Open Acc: 0.7775
Epoch 14:


Loss: 0.6837, Acc: 0.8392, Pressed Acc: 0.9185, Open Acc: 0.8086: 100%|██████████| 200/200 [00:38<00:00,  5.21it/s]


Epoch 14, Training Loss: 0.6837, Validation Loss: 0.6699, Acc: 0.8049, Pressed Acc: 0.8846, Open Acc: 0.7758
Epoch 15:


Loss: 0.6845, Acc: 0.8375, Pressed Acc: 0.9182, Open Acc: 0.8064: 100%|██████████| 200/200 [00:38<00:00,  5.14it/s]


Epoch 15, Training Loss: 0.6845, Validation Loss: 0.6711, Acc: 0.8095, Pressed Acc: 0.8814, Open Acc: 0.7832
Epoch 16:


Loss: 0.6846, Acc: 0.8367, Pressed Acc: 0.9148, Open Acc: 0.8066: 100%|██████████| 200/200 [00:38<00:00,  5.23it/s]


Epoch 16, Training Loss: 0.6846, Validation Loss: 0.6720, Acc: 0.8037, Pressed Acc: 0.8736, Open Acc: 0.7781
Epoch 17:


Loss: 0.6827, Acc: 0.8382, Pressed Acc: 0.9227, Open Acc: 0.8057: 100%|██████████| 200/200 [00:38<00:00,  5.24it/s]


Epoch 17, Training Loss: 0.6827, Validation Loss: 0.6698, Acc: 0.8141, Pressed Acc: 0.8846, Open Acc: 0.7884
Epoch 18:


Loss: 0.6828, Acc: 0.8381, Pressed Acc: 0.9245, Open Acc: 0.8048: 100%|██████████| 200/200 [00:37<00:00,  5.28it/s]


Epoch 18, Training Loss: 0.6828, Validation Loss: 0.6676, Acc: 0.8191, Pressed Acc: 0.8939, Open Acc: 0.7918
Epoch 19:


Loss: 0.6820, Acc: 0.8347, Pressed Acc: 0.9219, Open Acc: 0.8011: 100%|██████████| 200/200 [00:38<00:00,  5.14it/s]


Epoch 19, Training Loss: 0.6820, Validation Loss: 0.6687, Acc: 0.8120, Pressed Acc: 0.8877, Open Acc: 0.7844
Epoch 20:


Loss: 0.6829, Acc: 0.8404, Pressed Acc: 0.9234, Open Acc: 0.8084: 100%|██████████| 200/200 [00:37<00:00,  5.27it/s]


Epoch 20, Training Loss: 0.6829, Validation Loss: 0.6708, Acc: 0.8116, Pressed Acc: 0.8799, Open Acc: 0.7867
Epoch 21:


Loss: 0.6800, Acc: 0.8465, Pressed Acc: 0.9328, Open Acc: 0.8132: 100%|██████████| 200/200 [00:38<00:00,  5.23it/s]


Epoch 21, Training Loss: 0.6800, Validation Loss: 0.6689, Acc: 0.8133, Pressed Acc: 0.8846, Open Acc: 0.7872
Epoch 22:


Loss: 0.6825, Acc: 0.8449, Pressed Acc: 0.9242, Open Acc: 0.8144: 100%|██████████| 200/200 [00:38<00:00,  5.23it/s]


Epoch 22, Training Loss: 0.6825, Validation Loss: 0.6725, Acc: 0.8091, Pressed Acc: 0.8705, Open Acc: 0.7867
Epoch 23:


Loss: 0.6813, Acc: 0.8444, Pressed Acc: 0.9268, Open Acc: 0.8126: 100%|██████████| 200/200 [00:37<00:00,  5.27it/s]


Epoch 23, Training Loss: 0.6813, Validation Loss: 0.6663, Acc: 0.8158, Pressed Acc: 0.8970, Open Acc: 0.7861
Epoch 24:


Loss: 0.6824, Acc: 0.8453, Pressed Acc: 0.9208, Open Acc: 0.8163: 100%|██████████| 200/200 [00:38<00:00,  5.19it/s]


Epoch 24, Training Loss: 0.6824, Validation Loss: 0.6773, Acc: 0.7982, Pressed Acc: 0.8549, Open Acc: 0.7775
Epoch 25:


Loss: 0.6844, Acc: 0.8381, Pressed Acc: 0.9163, Open Acc: 0.8080: 100%|██████████| 200/200 [00:37<00:00,  5.29it/s]


Epoch 25, Training Loss: 0.6844, Validation Loss: 0.6706, Acc: 0.8204, Pressed Acc: 0.8908, Open Acc: 0.7946
Epoch 26:


Loss: 0.6792, Acc: 0.8464, Pressed Acc: 0.9328, Open Acc: 0.8131: 100%|██████████| 200/200 [00:38<00:00,  5.23it/s]


Epoch 26, Training Loss: 0.6792, Validation Loss: 0.6675, Acc: 0.8124, Pressed Acc: 0.8924, Open Acc: 0.7832
Epoch 27:


Loss: 0.6807, Acc: 0.8482, Pressed Acc: 0.9279, Open Acc: 0.8174: 100%|██████████| 200/200 [00:38<00:00,  5.25it/s]


Epoch 27, Training Loss: 0.6807, Validation Loss: 0.6708, Acc: 0.8145, Pressed Acc: 0.8846, Open Acc: 0.7889
Epoch 28:


Loss: 0.6799, Acc: 0.8507, Pressed Acc: 0.9298, Open Acc: 0.8202: 100%|██████████| 200/200 [00:38<00:00,  5.20it/s]


Epoch 28, Training Loss: 0.6799, Validation Loss: 0.6737, Acc: 0.8012, Pressed Acc: 0.8690, Open Acc: 0.7764
Epoch 29:


Loss: 0.6819, Acc: 0.8495, Pressed Acc: 0.9249, Open Acc: 0.8205: 100%|██████████| 200/200 [00:37<00:00,  5.28it/s]


Epoch 29, Training Loss: 0.6819, Validation Loss: 0.6676, Acc: 0.8145, Pressed Acc: 0.8939, Open Acc: 0.7855
Epoch 30:


Loss: 0.6791, Acc: 0.8517, Pressed Acc: 0.9358, Open Acc: 0.8193: 100%|██████████| 200/200 [00:37<00:00,  5.26it/s]


Epoch 30, Training Loss: 0.6791, Validation Loss: 0.6629, Acc: 0.8283, Pressed Acc: 0.9080, Open Acc: 0.7992
