In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision.transforms import ToTensor
import torchvision.transforms as transforms
import numpy as np
import os
from PIL import Image
import matplotlib.pyplot as plt


In [2]:
class FramesDataset(Dataset):
    """Dataset of image frames whose class label is encoded in the file name.

    A directory entry becomes a sample when it is a regular file and the part
    of its name before the first ``.`` is a non-negative integer
    (``"17.png"`` -> label 17). Everything else (for example a
    ``.ipynb_checkpoints`` folder or a ``README.txt``) is skipped instead of
    crashing later in ``int()``.

    Args:
        frames_dir: Directory that holds the frame images.
        transforms: Optional callable applied to the loaded PIL image. When
            ``None`` the RGB PIL image is returned unchanged.
        num_classes: Width of the one-hot label vector. Defaults to
            ``max(label) + 1`` over the discovered frames, so the label width
            no longer depends on how many files happen to be in the folder.

    Attributes:
        frames: List of ``[path, label_string]`` pairs, sorted by label so the
            index -> sample mapping does not depend on ``os.listdir`` order.
        num_classes: Length of every label vector returned by ``get_label``.
    """

    def __init__(self, frames_dir, transforms=None, num_classes=None):
        self.frames_dir = frames_dir
        self.transforms = transforms

        samples = []
        for file in os.listdir(frames_dir):
            stem = file.split('.')[0]
            path = os.path.join(frames_dir, file)
            # Only ASCII digit stems are guaranteed to be parseable by int().
            if stem.isascii() and stem.isdigit() and os.path.isfile(path):
                samples.append((int(stem), file, path))
        samples.sort()
        self.frames = [[path, file.split('.')[0]] for _, file, path in samples]

        if num_classes is None:
            num_classes = max((label for label, _, _ in samples), default=-1) + 1
        self.num_classes = num_classes

    def __len__(self):
        """Return the number of usable frames found in ``frames_dir``."""
        return len(self.frames)

    def __getitem__(self, idx):
        """Return ``[frame, one_hot_label]`` for the sample at ``idx``."""
        frame_path, stem = self.frames[idx]
        frame = self.load_frame(frame_path)
        if self.transforms is not None:
            frame = self.transforms(frame)
        label_tensor = self.get_label(int(stem))
        return [frame, label_tensor]

    def load_frame(self, frame_path):
        """Load ``frame_path`` as an RGB PIL image and close the file handle.

        Converting to RGB keeps the channel count at three even for
        grayscale or RGBA files; ``convert`` returns a fully loaded copy, so
        the image stays usable after the file is closed.
        """
        with Image.open(frame_path) as frame:
            return frame.convert('RGB')

    def get_label(self, label):
        """Return a float one-hot vector of length ``num_classes`` for ``label``.

        Raises:
            IndexError: If ``label`` is outside ``[0, num_classes)``.
        """
        if not 0 <= label < self.num_classes:
            raise IndexError(
                f'label {label} is out of range for {self.num_classes} classes'
            )
        tensor_label = torch.zeros(self.num_classes)
        tensor_label[label] = 1
        return tensor_label


In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

  return torch._C._cuda_getDeviceCount() > 0


device(type='cpu')

In [4]:
# Preprocessing applied to every frame: resize to 224x224, convert to a
# tensor, then normalise each of the three channels as (x - 0.5) / 0.5.
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
    ]
)

# The whole folder is used for training; batches of 4, reshuffled each epoch,
# loaded by two worker processes.
dataset = FramesDataset('note_frames/', transforms=transform)
train_loader = DataLoader(
    dataset,
    batch_size=4,
    shuffle=True,
    num_workers=2,
)

# Optional 80/20 hold-out split (currently disabled):
# train_size = int(0.8 * len(dataset))
# test_size = len(dataset) - train_size
# train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=2)
# test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, num_workers=2)

In [5]:
# Sanity check: walk the loader once and show the one-hot label batch of
# every step (the image batch is unpacked but not displayed).
for inputs, labels in train_loader:
    print(labels)

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,


tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,


In [6]:
class SimpleCNN(nn.Module):
    """Small CNN classifier: two conv + max-pool stages, then three linear layers.

    Args:
        num_classes: Number of output logits.
        input_size: ``(height, width)`` of the images the network will see.
            It is used to derive the flattened feature size in front of
            ``fc1``; the default ``(224, 224)`` yields ``16 * 53 * 53``.

    Raises:
        ValueError: If ``input_size`` is too small to survive both
            convolution/pooling stages.
    """

    def __init__(self, num_classes=91, input_size=(224, 224)):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, kernel_size=5)  # RGB in, 6 feature maps out
        self.pool = nn.MaxPool2d(2, 2)  # halves height and width (floor)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5)  # 6 -> 16 feature maps

        height, width = input_size
        feat_h = self._spatial_after_convs(height)
        feat_w = self._spatial_after_convs(width)
        if feat_h < 1 or feat_w < 1:
            raise ValueError(
                f'input_size {tuple(input_size)} is too small for two 5x5 '
                'convolutions with 2x2 pooling'
            )
        # Single source of truth for the flattened size used by fc1.
        self.flat_features = 16 * feat_h * feat_w

        self.fc1 = nn.Linear(self.flat_features, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, num_classes)  # raw logits, no activation

    @staticmethod
    def _spatial_after_convs(size):
        """Return one spatial extent after (5x5 conv, 2x2 pool) applied twice."""
        # An unpadded 5x5 convolution removes 4 pixels; the pool floor-halves.
        return ((size - 4) // 2 - 4) // 2

    def forward(self, x):
        """Map images to class logits.

        Args:
            x: Image batch shaped ``(N, 3, H, W)``, or a single image shaped
                ``(3, H, W)``, which is treated as a batch of one.

        Returns:
            Tensor of shape ``(N, num_classes)`` holding unnormalised logits.
        """
        if x.dim() == 3:
            x = x.unsqueeze(0)
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        # Flatten per sample so a wrong image size fails loudly in fc1
        # instead of silently changing the batch dimension.
        x = x.reshape(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [7]:
# Create an instance of the model
model = SimpleCNN().to(device)

# Loss function and optimizer. The dataset yields float one-hot label rows,
# which are passed to CrossEntropyLoss as per-class target probabilities.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Train the model
epochs = 70
model.train()  # make the training mode explicit
for epoch in range(epochs):  # Loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Clear gradients once per step, before the forward/backward pass.
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        print(f'Epoch [{epoch+1}/{epochs}], Batch [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

    # Per-epoch mean is easier to follow than the noisy per-batch values;
    # max(..., 1) guards against an empty loader.
    print(f'Epoch [{epoch+1}/{epochs}], Average loss: {running_loss / max(len(train_loader), 1):.4f}')


print('Finished Training')

Epoch [1/70], Batch [1/23], Loss: 4.5263
Epoch [1/70], Batch [2/23], Loss: 4.5813
Epoch [1/70], Batch [3/23], Loss: 4.5680
Epoch [1/70], Batch [4/23], Loss: 4.3612
Epoch [1/70], Batch [5/23], Loss: 4.5101
Epoch [1/70], Batch [6/23], Loss: 4.7522
Epoch [1/70], Batch [7/23], Loss: 4.6027
Epoch [1/70], Batch [8/23], Loss: 4.5977
Epoch [1/70], Batch [9/23], Loss: 4.7653
Epoch [1/70], Batch [10/23], Loss: 4.7401
Epoch [1/70], Batch [11/23], Loss: 4.4692
Epoch [1/70], Batch [12/23], Loss: 4.5122
Epoch [1/70], Batch [13/23], Loss: 4.5056
Epoch [1/70], Batch [14/23], Loss: 4.5944
Epoch [1/70], Batch [15/23], Loss: 4.6104
Epoch [1/70], Batch [16/23], Loss: 4.5901
Epoch [1/70], Batch [17/23], Loss: 4.4919
Epoch [1/70], Batch [18/23], Loss: 4.7238
Epoch [1/70], Batch [19/23], Loss: 4.5707
Epoch [1/70], Batch [20/23], Loss: 4.6367
Epoch [1/70], Batch [21/23], Loss: 4.6111
Epoch [1/70], Batch [22/23], Loss: 4.5718
Epoch [1/70], Batch [23/23], Loss: 4.6990
Epoch [2/70], Batch [1/23], Loss: 4.4629
Ep

KeyboardInterrupt: 

In [None]:
# Check every frame: print (true class - predicted class); 0 means correct.
# The model is left in eval mode afterwards.
model.eval()
with torch.no_grad():  # inference only, no autograd graph needed
    for i in range(91):
        # Close the file promptly and force three channels to match conv1.
        with Image.open(f'note_frames/{i}.png') as image:
            frame = transform(image.convert('RGB'))
        # Add the batch dimension the network expects: (1, 3, H, W).
        frame = frame.unsqueeze(0).to(device)
        predicted = model(frame).argmax(dim=1).item()
        print(i - predicted)

RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
