In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision.transforms import ToTensor
import torchvision.transforms as transforms
import numpy as np
import os
from PIL import Image
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
class FramesDataset(Dataset):
    def __init__(self, frames_dir, class_num, transforms=None):
        self.frames_dir = frames_dir
        self.transforms = transforms
        self.class_num = class_num

        self.frames = [[os.path.join(frames_dir, file), file.split('.')[0].split('_')] for file in os.listdir(frames_dir)]
        
    def __len__(self):
        return len(self.frames)

    def __getitem__(self, idx):
        frame_path = self.frames[idx][0]
        frame_tensor = self.transforms(self.load_frame(frame_path))
        labels = self.frames[idx][1]
        label_tensor = self.get_label(labels)
        return [frame_tensor, label_tensor]

    def load_frame(self, frame_path):
        frame = (Image.open(frame_path))
        return frame
    
    def get_label(self, label):
        tensor_label = torch.zeros(self.class_num)
        for i in range(len(label)):
            tensor_label[int(label[i])] = 1
        return tensor_label


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [5]:
# Define transforms for the images
transform = transforms.Compose([
    transforms.Resize((224, 224)), 
    transforms.ToTensor(),  
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

dataset = FramesDataset('note_frames_multi/', 91, transforms=transform)
# train_loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2)

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, num_workers=2)

In [6]:
for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        print(labels.shape)
        break

torch.Size([4, 91])


In [7]:
class SimpleCNN(nn.Module):
    def __init__(self, num_classes=91):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, kernel_size=5)  # 3 input channels (RGB), 6 output channels, 5x5 kernel
        self.pool = nn.MaxPool2d(2, 2)  # Max pooling layer with kernel size 2x2 and stride 2
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5)  # 6 input channels (output of previous conv layer), 16 output channels, 5x5 kernel
        self.fc1 = nn.Linear(16 * 53 * 53, 120)  # Fully connected layer with 16*53*53 input features (after convolutions and pooling), 120 output features
        self.fc2 = nn.Linear(120, 84)  # Fully connected layer with 120 input features, 84 output features
        self.fc3 = nn.Linear(84, num_classes)  # Fully connected layer with 84 input features, num_classes output features

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))  # Apply first convolution, ReLU activation, and max pooling
        x = self.pool(torch.relu(self.conv2(x)))  # Apply second convolution, ReLU activation, and max pooling
        x = x.view(-1, 16 * 53 * 53)  # Flatten the tensor for input to fully connected layers
        x = torch.relu(self.fc1(x))  # Apply first fully connected layer and ReLU activation
        x = torch.relu(self.fc2(x))  # Apply second fully connected layer and ReLU activation
        x = self.fc3(x) # Apply third fully connected layer (no activation)
        return x

In [21]:
# Create an instance of the model
model = SimpleCNN().to(device)
best_model = model

# Initialize the model, loss function, and optimizer
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Train the model
epochs = 100
best_val_loss = 1000
for epoch in tqdm(range(epochs), unit='epoch'):  # Loop over the dataset multiple times

    model.train()
    train_loss = []
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        train_loss.append(loss.item())

    model.eval()
    test_loss = []
    for i, data in enumerate(test_loader, 0):
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            output = model(inputs)
        
        test_loss.append(criterion(output, labels).cpu().item())

    train_loss = np.mean(train_loss)
    test_loss = np.mean(test_loss)
    # print(f'Epoch [{epoch+1}/{epochs}],Test Loss: {train_loss:.8f}')
    # print(f'Epoch [{epoch+1}/{epochs}],Test Loss: {test_loss:.8f}')
    if best_val_loss > test_loss:
        best_val_loss = test_loss
        best_model = model
        best_epoch = epoch
        print(f'Epoch [{epoch+1}/{epochs}],Test Loss: {train_loss:.8f}')
        print(f'Epoch [{epoch+1}/{epochs}],Test Loss: {test_loss:.8f}')


print(f'Finished Training, best epoch: {best_epoch}')

  0%|          | 0/100 [00:00<?, ?epoch/s]

  1%|          | 1/100 [00:02<03:21,  2.03s/epoch]

Epoch [1/100],Test Loss: 0.21423651
Epoch [1/100],Test Loss: 0.15071224


  2%|▏         | 2/100 [00:04<03:20,  2.05s/epoch]

Epoch [2/100],Test Loss: 0.13096102
Epoch [2/100],Test Loss: 0.12141052


  3%|▎         | 3/100 [00:06<03:17,  2.04s/epoch]

Epoch [3/100],Test Loss: 0.10931609
Epoch [3/100],Test Loss: 0.10897175


  4%|▍         | 4/100 [00:08<03:15,  2.04s/epoch]

Epoch [4/100],Test Loss: 0.09845803
Epoch [4/100],Test Loss: 0.10409175


  5%|▌         | 5/100 [00:10<03:12,  2.02s/epoch]

Epoch [5/100],Test Loss: 0.09154177
Epoch [5/100],Test Loss: 0.09812926


  6%|▌         | 6/100 [00:12<03:10,  2.03s/epoch]

Epoch [6/100],Test Loss: 0.08526246
Epoch [6/100],Test Loss: 0.09137511


  7%|▋         | 7/100 [00:14<03:07,  2.02s/epoch]

Epoch [7/100],Test Loss: 0.07829693
Epoch [7/100],Test Loss: 0.08751514


  8%|▊         | 8/100 [00:16<03:06,  2.02s/epoch]

Epoch [8/100],Test Loss: 0.07119254
Epoch [8/100],Test Loss: 0.08047289


  9%|▉         | 9/100 [00:18<03:03,  2.01s/epoch]

Epoch [9/100],Test Loss: 0.06394909
Epoch [9/100],Test Loss: 0.07525681


 10%|█         | 10/100 [00:20<03:01,  2.02s/epoch]

Epoch [10/100],Test Loss: 0.05799600
Epoch [10/100],Test Loss: 0.07092502


 11%|█         | 11/100 [00:22<03:00,  2.03s/epoch]

Epoch [11/100],Test Loss: 0.05289765
Epoch [11/100],Test Loss: 0.06705735


 12%|█▏        | 12/100 [00:24<02:59,  2.04s/epoch]

Epoch [12/100],Test Loss: 0.04795921
Epoch [12/100],Test Loss: 0.06568427


 13%|█▎        | 13/100 [00:26<02:57,  2.04s/epoch]

Epoch [13/100],Test Loss: 0.04443617
Epoch [13/100],Test Loss: 0.06377606


 14%|█▍        | 14/100 [00:28<02:54,  2.03s/epoch]

Epoch [14/100],Test Loss: 0.04127703
Epoch [14/100],Test Loss: 0.06358628


 15%|█▌        | 15/100 [00:30<02:52,  2.03s/epoch]

Epoch [15/100],Test Loss: 0.03779469
Epoch [15/100],Test Loss: 0.06187627


 17%|█▋        | 17/100 [00:34<02:48,  2.03s/epoch]

Epoch [17/100],Test Loss: 0.03314985
Epoch [17/100],Test Loss: 0.05900122


100%|██████████| 100/100 [03:22<00:00,  2.02s/epoch]

Finished Training, best epoch: 16



