In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision.transforms import ToTensor
import torchvision.transforms as transforms
from torchvision.models import resnet50, ResNet50_Weights

import numpy as np
import os
from PIL import Image
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
class FramesDataset(Dataset):
    def __init__(self, frames_dir, class_num, transforms=None):
        self.frames_dir = frames_dir
        self.transforms = transforms
        self.class_num = class_num

        self.frames = [[os.path.join(frames_dir, file), file.split('.')[0].split('_')] for file in os.listdir(frames_dir)]
        
    def __len__(self):
        return len(self.frames)

    def __getitem__(self, idx):
        frame_path = self.frames[idx][0]
        frame_tensor = self.transforms(self.load_frame(frame_path))
        labels = self.frames[idx][1]
        label_tensor = self.get_label(labels)
        return [frame_tensor, label_tensor]

    def load_frame(self, frame_path):
        frame = (Image.open(frame_path))
        return frame
    
    def get_label(self, label):
        tensor_label = torch.zeros(self.class_num)
        for i in range(len(label)):
            tensor_label[int(label[i])] = 1
        return tensor_label


In [70]:
def save(model_name, best_model, train_loss, test_loss, accuracy):
    path = os.path.join('results', model_name)
    if not os.path.exists(path):
        os.makedirs(path)

    np.save(os.path.join(path, 'train_loss.npy'), np.array(train_loss))
    np.save(os.path.join(path, 'test_loss.npy'), np.array(test_loss))
    np.save(os.path.join(path, 'accuracy.npy'), np.array(accuracy))
    torch.save(best_model, os.path.join(path, 'model.pt'))
    return

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [21]:
# Define transforms for the images
transform = transforms.Compose([
    transforms.Resize((224, 224)), 
    transforms.ToTensor(),  
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

dataset = FramesDataset('note_frames_multi/', 91, transforms=transform)
# train_loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2)

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, num_workers=2)

In [71]:
class Classifier(nn.Module):
    def __init__(self, num_classes=91, encoder='simple'):
        super(Classifier, self).__init__()

        self.encoder_type = encoder

        if encoder == 'simple':
            self.encoder = nn.Sequential(
                nn.Conv2d(3, 6, kernel_size=5),
                nn.ReLU(),
                nn.MaxPool2d(2, 2), 
                nn.Conv2d(6, 16, kernel_size=5),
                nn.ReLU(),
                nn.MaxPool2d(2, 2), 
            )            

            self. decoder = nn.Sequential(
                nn.Linear(16 * 53 * 53, 120),
                nn.ReLU(),
                nn.Linear(120, 84),
                nn.ReLU(),
                nn.Linear(84, num_classes)
            )

        elif encoder == 'resnet50':
            pass
        
        

    def forward(self, x):
        x = self.encoder(x)  # Apply first convolution, ReLU activation, and max pooling
        x = x.view(-1, 16 * 53 * 53)  # Flatten the tensor for input to fully connected layers
        x = self.decoder(x) # Apply third fully connected layer (no activation)
        return x

In [59]:
# Create an instance of the model
model = Classifier().to(device)
best_model = model

# Initialize the model, loss function, and optimizer
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Train the model
epochs = 100
best_val_loss = 1000
test_loss_all = []
train_loss_all = []
accuracy_all = []
for epoch in tqdm(range(epochs), unit='epoch'):  # Loop over the dataset multiple times

    model.train()
    train_loss = []
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        train_loss.append(loss.item())

    model.eval()
    test_loss = []
    correct = 0
    total = 0
    with torch.no_grad():
        for i, data in enumerate(test_loader, 0):
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)

            output = model(inputs)
            test_loss.append(criterion(output, labels).cpu().item())

            outputs = torch.sigmoid(outputs).cpu()
            predicted = np.round(outputs)
            total += labels.size(0)*labels.size(1)
            correct += (predicted == labels.cpu()).sum().item()
            
    accuracy = 100*correct/total
    accuracy_all.append(accuracy)
    print(f'Epoch [{epoch+1}/{epochs}],Accuracy: {accuracy:.4f}%')

    train_loss = np.mean(train_loss)
    test_loss = np.mean(test_loss)
    train_loss_all.append(train_loss)
    test_loss_all.append(test_loss)

    if best_val_loss > test_loss:
        best_val_loss = test_loss
        best_model = model
        best_epoch = epoch
        print(f'Epoch [{epoch+1}/{epochs}],Test Loss: {train_loss:.8f}')
        print(f'Epoch [{epoch+1}/{epochs}],Test Loss: {test_loss:.8f}')

    
print(f'Finished Training, best epoch: {best_epoch}')

  0%|          | 0/100 [00:00<?, ?epoch/s]

  1%|          | 1/100 [00:52<1:25:51, 52.03s/epoch]

Epoch [1/100],Accuracy: 4.4899%
Epoch [1/100],Test Loss: 0.15883846
Epoch [1/100],Test Loss: 0.11083203


  1%|          | 1/100 [01:25<2:21:52, 85.98s/epoch]


KeyboardInterrupt: 