In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision.transforms import ToTensor
import torchvision.transforms as transforms
import torchvision.models as models

import numpy as np
import os
from PIL import Image
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
class FramesDataset(Dataset):
    def __init__(self, frames_dir, class_num, transforms=None):
        self.frames_dir = frames_dir
        self.transforms = transforms
        self.class_num = class_num

        self.frames = [[os.path.join(frames_dir, file), file.split('.')[0].split('_')] for file in os.listdir(frames_dir)]
        
    def __len__(self):
        return len(self.frames)

    def __getitem__(self, idx):
        frame_path = self.frames[idx][0]
        frame_tensor = self.transforms(self.load_frame(frame_path))
        labels = self.frames[idx][1]
        label_tensor = self.get_label(labels)
        return [frame_tensor, label_tensor]

    def load_frame(self, frame_path):
        frame = (Image.open(frame_path))
        return frame
    
    def get_label(self, label):
        tensor_label = torch.zeros(self.class_num)
        for i in range(len(label)):
            tensor_label[int(label[i])] = 1
        return tensor_label


In [3]:
def save(model_name, best_model, train_loss, test_loss, accuracy):
    path = os.path.join('results', model_name)
    if not os.path.exists(path):
        os.makedirs(path)

    np.save(os.path.join(path, 'train_loss.npy'), np.array(train_loss))
    np.save(os.path.join(path, 'test_loss.npy'), np.array(test_loss))
    np.save(os.path.join(path, 'accuracy.npy'), np.array(accuracy))
    torch.save(best_model, os.path.join(path, 'model.pt'))
    return

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [5]:
# Define transforms for the images
transform = transforms.Compose([
    transforms.Resize((224, 224)), 
    transforms.ToTensor(),  
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

dataset = FramesDataset('note_frames_multi/', 91, transforms=transform)
# train_loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2)

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, num_workers=4)

In [12]:
class Classifier(nn.Module):
    def __init__(self, num_classes=91, encoder='simple', pretrained=False):
        super(Classifier, self).__init__()

        self.encoder_type = encoder

        if encoder == 'simple':
            self.encoder = nn.Sequential(
                nn.Conv2d(3, 6, kernel_size=5),
                nn.ReLU(),
                nn.MaxPool2d(2, 2), 
                nn.Conv2d(6, 16, kernel_size=5),
                nn.ReLU(),
                nn.MaxPool2d(2, 2), 
                nn.Flatten()
            )            

            self. decoder = nn.Sequential(
                nn.Linear(16 * 53 * 53, 120),
                nn.ReLU(),
                nn.Linear(120, num_classes)
            )

        else:
            if encoder == 'resnet50':
                self.encoder = models.resnet50(weights='IMAGENET1K_V1')
            elif encoder == 'resnet18':
                self.encoder = models.resnet18(weights='IMAGENET1K_V1')
            elif encoder == 'vit16':
                self.encoder = models.vit_b_16(weights='IMAGENET1K_V1')

            self.decoder = nn.Sequential(
                nn.Linear(1000, 120),
                nn.ReLU(),
                nn.Linear(120, num_classes)
            )
        
        

    def forward(self, x):
        x = self.encoder(x)  # Apply first convolution, ReLU activation, and max pooling
        x = self.decoder(x) # Apply third fully connected layer (no activation)
        return x

In [7]:
def train(model, criterion, optimizer, epochs, train_loader, test_loader):
    # Train the model
    best_model = model
    criterion = criterion
    optimizer = optimizer

    epochs = epochs
    best_val_loss = 1000
    test_loss_all = []
    train_loss_all = []
    accuracy_all = []
    for epoch in tqdm(range(epochs), unit='epoch'):  # Loop over the dataset multiple times

        model.train()
        train_loss = []
        for i, data in enumerate(train_loader, 0):
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            output = model(inputs)
            loss = criterion(output, labels)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        model.eval()
        test_loss = []
        correct = 0
        total = 0
        with torch.no_grad():
            for i, data in enumerate(test_loader, 0):
                inputs, labels = data
                inputs = inputs.to(device)
                labels = labels.to(device)

                output = model(inputs)
                test_loss.append(criterion(output, labels).cpu().item())

                outputs = torch.sigmoid(output).cpu()
                predicted = np.round(outputs)
                total += labels.size(0)*labels.size(1)
                correct += (predicted == labels.cpu()).sum().item()
                
        accuracy = 100*correct/total
        accuracy_all.append(accuracy)
        print(f'Epoch [{epoch+1}/{epochs}],Accuracy: {accuracy:.4f}%')

        train_loss = np.mean(train_loss)
        test_loss = np.mean(test_loss)
        train_loss_all.append(train_loss)
        test_loss_all.append(test_loss)

        if best_val_loss > test_loss:
            best_val_loss = test_loss
            best_model = model
            best_epoch = epoch
            print(f'Epoch [{epoch+1}/{epochs}],Test Loss: {train_loss:.8f}')
            print(f'Epoch [{epoch+1}/{epochs}],Test Loss: {test_loss:.8f}')

        
    print(f'Finished Training, best epoch: {best_epoch}')
    return best_model, train_loss_all, test_loss_all, accuracy_all

## Train simple CNN model

In [14]:
# Create an instance of the model
model = Classifier().to(device)

# Initialize the model, loss function, and optimizer
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

best_model, train_loss, test_loss, accuracy = train(model, criterion, optimizer, 50, train_loader, test_loader)
save('simpleCNN', best_model, train_loss, test_loss, accuracy)

(torch.Size([64, 91]),
 torch.Size([32, 91]),
 torch.Size([32, 91]),
 torch.Size([64, 3, 224, 224]))

## Train Resnet18

In [15]:
# Create an instance of the model
model = Classifier(encoder='resnet18').to(device)

# Initialize the model, loss function, and optimizer
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

best_model, train_loss, test_loss, accuracy = train(model, criterion, optimizer, 20, train_loader, test_loader)
save('resnet18', best_model, train_loss, test_loss, accuracy)

  5%|▌         | 1/20 [01:26<27:24, 86.57s/epoch]

Epoch [1/20],Accuracy: 98.3157%
Epoch [1/20],Test Loss: 0.13312550
Epoch [1/20],Test Loss: 0.05470844


  5%|▌         | 1/20 [02:28<46:53, 148.05s/epoch]


KeyboardInterrupt: 

## Train Resnet50

In [None]:
# Create an instance of the model
model = Classifier(encoder='resnet50').to(device)

# Initialize the model, loss function, and optimizer
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

best_model, train_loss, test_loss, accuracy = train(model, criterion, optimizer, 20, train_loader, test_loader)
save('resnet50', best_model, train_loss, test_loss, accuracy)

 10%|█         | 1/10 [01:25<12:45, 85.00s/epoch]

Epoch [1/10],Accuracy: 98.3054%
Epoch [1/10],Test Loss: 0.13042646
Epoch [1/10],Test Loss: 0.05699795


 20%|██        | 2/10 [02:51<11:25, 85.68s/epoch]

Epoch [2/10],Accuracy: 99.2680%
Epoch [2/10],Test Loss: 0.05774771
Epoch [2/10],Test Loss: 0.02750220


 30%|███       | 3/10 [04:17<10:02, 86.04s/epoch]

Epoch [3/10],Accuracy: 99.4688%
Epoch [3/10],Test Loss: 0.03750972
Epoch [3/10],Test Loss: 0.02021180


 40%|████      | 4/10 [05:47<08:46, 87.73s/epoch]

Epoch [4/10],Accuracy: 99.6237%
Epoch [4/10],Test Loss: 0.02793906
Epoch [4/10],Test Loss: 0.01460292


 50%|█████     | 5/10 [07:13<07:14, 86.87s/epoch]

Epoch [5/10],Accuracy: 99.6611%
Epoch [5/10],Test Loss: 0.02266348
Epoch [5/10],Test Loss: 0.01164601


 60%|██████    | 6/10 [08:38<05:45, 86.26s/epoch]

Epoch [6/10],Accuracy: 99.6910%
Epoch [6/10],Test Loss: 0.01922264
Epoch [6/10],Test Loss: 0.01085459


 70%|███████   | 7/10 [10:03<04:17, 85.78s/epoch]

Epoch [7/10],Accuracy: 99.6910%
Epoch [7/10],Test Loss: 0.01708371
Epoch [7/10],Test Loss: 0.01055596


 80%|████████  | 8/10 [11:31<02:53, 86.59s/epoch]

Epoch [8/10],Accuracy: 99.7638%
Epoch [8/10],Test Loss: 0.01505772
Epoch [8/10],Test Loss: 0.00751005


 90%|█████████ | 9/10 [12:57<01:26, 86.48s/epoch]

Epoch [9/10],Accuracy: 99.7797%
Epoch [9/10],Test Loss: 0.01330870
Epoch [9/10],Test Loss: 0.00706917


100%|██████████| 10/10 [14:26<00:00, 86.70s/epoch]

Epoch [10/10],Accuracy: 99.7685%
Finished Training, best epoch: 8





## Train ViT16

In [None]:
# Create an instance of the model
model = Classifier(encoder='vit16').to(device)

# Initialize the model, loss function, and optimizer
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

best_model, train_loss, test_loss, accuracy = train(model, criterion, optimizer, 10, train_loader, test_loader)
save('vit16', best_model, train_loss, test_loss, accuracy)

  0%|          | 0/10 [08:13<?, ?epoch/s]


KeyboardInterrupt: 