In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision.transforms import ToTensor
import torchvision.transforms as transforms
import numpy as np
import os
from PIL import Image
import matplotlib.pyplot as plt


In [11]:
class FramesDataset(Dataset):
    def __init__(self, frames_dir, class_num, transforms=None):
        self.frames_dir = frames_dir
        self.transforms = transforms
        self.class_num = class_num

        self.frames = [[os.path.join(frames_dir, file), file.split('.')[0].split('_')] for file in os.listdir(frames_dir)]
        
    def __len__(self):
        return len(self.frames)

    def __getitem__(self, idx):
        frame_path = self.frames[idx][0]
        frame_tensor = self.transforms(self.load_frame(frame_path))
        labels = self.frames[idx][1]
        label_tensor = self.get_label(labels)
        return [frame_tensor, label_tensor]

    def load_frame(self, frame_path):
        frame = (Image.open(frame_path))
        return frame
    
    def get_label(self, label):
        tensor_label = torch.zeros(self.class_num)
        for i in range(len(label)-1):
            tensor_label[int(label[i])] = 1
        return tensor_label


In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [13]:
# Define transforms for the images
transform = transforms.Compose([
    transforms.Resize((224, 224)), 
    transforms.ToTensor(),  
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

dataset = FramesDataset('note_frames_multi/', 91, transforms=transform)
# train_loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2)

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, num_workers=2)

In [14]:
for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        print(labels.shape)
        break

torch.Size([4, 91])


In [15]:
class SimpleCNN(nn.Module):
    def __init__(self, num_classes=91):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, kernel_size=5)  # 3 input channels (RGB), 6 output channels, 5x5 kernel
        self.pool = nn.MaxPool2d(2, 2)  # Max pooling layer with kernel size 2x2 and stride 2
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5)  # 6 input channels (output of previous conv layer), 16 output channels, 5x5 kernel
        self.fc1 = nn.Linear(16 * 53 * 53, 120)  # Fully connected layer with 16*53*53 input features (after convolutions and pooling), 120 output features
        self.fc2 = nn.Linear(120, 84)  # Fully connected layer with 120 input features, 84 output features
        self.fc3 = nn.Linear(84, num_classes)  # Fully connected layer with 84 input features, num_classes output features

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))  # Apply first convolution, ReLU activation, and max pooling
        x = self.pool(torch.relu(self.conv2(x)))  # Apply second convolution, ReLU activation, and max pooling
        x = x.view(-1, 16 * 53 * 53)  # Flatten the tensor for input to fully connected layers
        x = torch.relu(self.fc1(x))  # Apply first fully connected layer and ReLU activation
        x = torch.relu(self.fc2(x))  # Apply second fully connected layer and ReLU activation
        x = self.fc3(x) # Apply third fully connected layer (no activation)
        return x

In [19]:
# Create an instance of the model
model = SimpleCNN().to(device)

# Initialize the model, loss function, and optimizer
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Train the model
epochs = 70
for epoch in range(epochs):  # Loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        # print(inputs)

        outputs = model(inputs)
        # print(outputs)
        # print(outputs, labels)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    avg_loss = loss.item()/i
    print(f'Epoch [{epoch+1}/{epochs}],Avg Loss: {avg_loss:.8f}')


print('Finished Training')

Epoch [1/70],Avg Loss: 0.00055267
Epoch [2/70],Avg Loss: 0.00029477
Epoch [3/70],Avg Loss: 0.00037859
Epoch [4/70],Avg Loss: 0.00028705
Epoch [5/70],Avg Loss: 0.00027784
Epoch [6/70],Avg Loss: 0.00022659
Epoch [7/70],Avg Loss: 0.00023617
Epoch [8/70],Avg Loss: 0.00026318
Epoch [9/70],Avg Loss: 0.00026187
Epoch [10/70],Avg Loss: 0.00013574
Epoch [11/70],Avg Loss: 0.00022198
Epoch [12/70],Avg Loss: 0.00011401
Epoch [13/70],Avg Loss: 0.00003610
Epoch [14/70],Avg Loss: 0.00008755
Epoch [15/70],Avg Loss: 0.00001261
Epoch [16/70],Avg Loss: 0.00008508
Epoch [17/70],Avg Loss: 0.00016111
Epoch [18/70],Avg Loss: 0.00005245
Epoch [19/70],Avg Loss: 0.00004245
Epoch [20/70],Avg Loss: 0.00006847
Epoch [21/70],Avg Loss: 0.00006431
Epoch [22/70],Avg Loss: 0.00008177
Epoch [23/70],Avg Loss: 0.00004188
Epoch [24/70],Avg Loss: 0.00002007
Epoch [25/70],Avg Loss: 0.00001795
Epoch [26/70],Avg Loss: 0.00007992
Epoch [27/70],Avg Loss: 0.00003170
Epoch [28/70],Avg Loss: 0.00000312
Epoch [29/70],Avg Loss: 0.000

KeyboardInterrupt: 

In [9]:
for i in range(91):
    frame = transform(Image.open(f'note_frames/{i}.png')).to(device)
    if i - torch.argmax(model(frame)) != 0:
        print(i)

11
