In [None]:
import copy
import os
import re
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
from torch.utils.data import DataLoader, Dataset, ConcatDataset, SubsetRandomSampler
from torchvision.transforms import ToTensor, Compose
from torchvision.transforms import ToTensor, Compose, Normalize, Resize, ConvertImageDtype
from torchvision.transforms import RandomHorizontalFlip, RandomPerspective, \
    ColorJitter, GaussianBlur, RandomAffine
from transformers import BeitForImageClassification

In [None]:
# TO FIX DOWNLOADING ERRORS
# NOTE(review): this turns off HTTPS certificate verification for every default
# HTTPS context created through the ssl module in this process. Keep it only
# while the download errors cannot be solved by fixing the local certificates.
import requests
import ssl

# Silence the warnings urllib3 would otherwise emit.
requests.packages.urllib3.disable_warnings()

# Interpreters that expose ssl._create_unverified_context get it installed as
# the default HTTPS context factory; older ones without it are left untouched.
if hasattr(ssl, "_create_unverified_context"):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

In [None]:
def extract_labels(df):
    """Build the multi-hot label array from the ``Labels`` column of ``df``.

    Every entry of ``df["Labels"]`` is a whitespace-separated list of class
    ids in the range 1-19. Id 12 is not used by the dataset, so ids from 12
    upwards are shifted down by one, giving 18 contiguous class slots.

    Args:
        df: DataFrame with a ``Labels`` column.

    Returns:
        ``numpy.ndarray`` of shape ``(len(df), 1, 1, 18)`` containing 1.0 in
        every slot whose class is listed for that row and 0.0 elsewhere.
    """
    num_classes = 18
    labels = df["Labels"]
    num_samples = len(labels)
    Y_data = np.zeros((num_samples, num_classes))

    # enumerate() walks the rows by position, so a frame whose index is not
    # 0..n-1 (e.g. after filtering) is handled as well.
    for row, label in enumerate(labels):
        # str() covers a column pandas parsed as integers (single-label rows only).
        for token in str(label).split():
            idx = int(token)
            if idx >= 12:
                idx -= 1
            Y_data[row, idx - 1] = 1

    # Shape follows the actual number of rows instead of a hard-coded 29996.
    return Y_data.reshape((num_samples, 1, 1, num_classes))

In [None]:
def get_train_labels():
    """Read the training CSV and return its labels as a multi-hot array.

    Returns:
        Whatever ``extract_labels`` produces for the parsed training frame.

    Raises:
        FileNotFoundError: if the training CSV does not exist. (Previously the
            function called ``exit()``, which in a notebook takes the kernel
            down instead of reporting a normal, catchable error.)
    """
    FILENAME = "./COMP5329S1A2Dataset/train.csv"
    try:
        with open(FILENAME) as file:
            # A double quote that is not preceded by a comma and is followed by
            # more text on the line gets a "/" prefix; read_csv below treats
            # "/" as the escape character, so such quotes stay literal.
            lines = [re.sub(r'([^,])"(\s*[^\n])', r'\1/"\2', line) for line in file]
    except FileNotFoundError as err:
        raise FileNotFoundError(f"File {FILENAME} not found") from err

    df = pd.read_csv(StringIO(''.join(lines)), escapechar="/")
    return extract_labels(df)

In [None]:
def get_img_ids(train):
    """Return the ``ImageID`` column of the train or test CSV.

    Args:
        train: truthy to read ``train.csv``, falsy to read ``test.csv``.

    Returns:
        The ``ImageID`` column of the parsed CSV as a pandas Series.

    Raises:
        FileNotFoundError: if the selected CSV does not exist. (Previously the
            function called ``exit()``, which in a notebook takes the kernel
            down instead of reporting a normal, catchable error.)
    """
    split_name = "train" if train else "test"
    FILENAME = f"./COMP5329S1A2Dataset/{split_name}.csv"
    try:
        with open(FILENAME) as file:
            # A double quote that is not preceded by a comma and is followed by
            # more text on the line gets a "/" prefix; read_csv below treats
            # "/" as the escape character, so such quotes stay literal.
            lines = [re.sub(r'([^,])"(\s*[^\n])', r'\1/"\2', line) for line in file]
    except FileNotFoundError as err:
        raise FileNotFoundError(f"File {FILENAME} not found") from err

    df = pd.read_csv(StringIO(''.join(lines)), escapechar="/")

    # There are actually 30000 images in the train folder
    # but we only take a select number of them due to
    # train_df only having 29996 entries
    return df["ImageID"]

In [None]:
class MultimodalDataset(Dataset):
    """Image dataset backed by the COMP5329S1A2Dataset CSV files and image folders.

    Items are ``(image, label)`` pairs. When labels are available the label is
    a flat float32 tensor; otherwise the placeholder ``-1`` is returned.

    Args:
        train: truthy to use the training split (CSV, image folder and
            labels), falsy to use the test split, which has no labels.
        img_transform: optional callable applied to the loaded PIL image.
        target_transform: optional callable applied to the raw label array
            before it is flattened.
    """

    def __init__(self, train, img_transform=None, target_transform=None):
        self.train = train
        self.img_ids = get_img_ids(train=train)
        # Only the training split comes with labels.
        self.labels = get_train_labels() if train else None
        self.img_transform = img_transform
        self.target_transform = target_transform

    def __len__(self):
        """Number of images listed in the split's CSV."""
        return len(self.img_ids)

    def __getitem__(self, idx):
        """Load image ``idx`` and its label (or ``-1`` when there are no labels)."""
        split_dir = "train" if self.train else "test"
        file_path = f"./COMP5329S1A2Dataset/data/{split_dir}/" + self.img_ids[idx]
        # Force three channels: a greyscale or RGBA file would otherwise reach
        # a 3-channel Normalize / the model with the wrong channel count.
        img = Image.open(file_path).convert("RGB")

        if self.img_transform is not None:
            img = self.img_transform(img)

        if self.labels is not None:
            label = self.labels[idx]
            # target_transform defaults to None, so it must be optional here too.
            if self.target_transform is not None:
                label = self.target_transform(label)
            label = torch.as_tensor(label).reshape(-1).type(torch.float32)
        else:
            label = -1

        return img, label

In [None]:
def get_pos_weight(dataset, device):
    """Compute per-class ``pos_weight`` values from inverse label frequency.

    The inverse frequency of every class (total positives / positives of that
    class) is rescaled linearly so the most frequent class gets weight 1 and
    the rarest gets weight 10.

    Args:
        dataset: object with a ``datasets`` sequence of exactly two entries;
            the first entry must expose a ``labels`` numpy array whose last
            axis indexes the classes.
        device: device the resulting tensor is moved to.

    Returns:
        1-D float32 tensor with one weight per class, located on ``device``.

    Raises:
        AssertionError: if ``dataset.datasets`` does not hold two datasets.
    """
    assert len(dataset.datasets) == 2
    # The labels used to be duplicated with repeat(2) before counting. Doubling
    # scales the per-class counts and the total alike, so the ratios below are
    # identical without the extra copy.
    train_labels = dataset.datasets[0].labels

    class_counts = train_labels.sum(axis=0)
    total_positives = train_labels.sum()
    # A class without positives would divide by zero and turn every weight
    # into NaN after rescaling. pos_weight only scales the positive-target
    # term of the loss, so clamping such a count to 1 is harmless.
    inv_freq = total_positives / np.maximum(class_counts, 1)

    lowest, highest = np.min(inv_freq), np.max(inv_freq)
    lower_bound, upper_bound = 1, 10
    spread = highest - lowest
    if spread > 0:
        inv_freq_norm = (inv_freq - lowest) / spread * (upper_bound - lower_bound) + lower_bound
    else:
        # All classes equally frequent: nothing to rebalance.
        inv_freq_norm = np.full_like(inv_freq, lower_bound, dtype=np.float64)

    pos_weight = torch.from_numpy(inv_freq_norm)
    return pos_weight.type(torch.float32).reshape(-1).to(device)

In [None]:
def evaluate(model, dataloader, device, criterion, mode="img"):
    """Compute the mean loss and exact-match accuracy of ``model`` on ``dataloader``.

    A sample counts as correct only when every rounded sigmoid output equals
    its label. The model is switched to eval mode and left in it; callers that
    continue training must call ``model.train()`` afterwards.

    Args:
        model: callable whose result exposes the raw scores as ``.logits``.
        dataloader: iterable of ``(imgs, labels)`` batches.
        device: device the batches are moved to.
        criterion: loss called as ``criterion(logits, labels)``; its value is
            weighted by the batch size, i.e. it is treated as a per-batch mean.
        mode: unused; kept so existing calls keep working.

    Returns:
        ``(loss, acc)`` as Python floats, averaged over the samples actually
        iterated.

    Raises:
        ValueError: if the dataloader yields no samples.
    """
    total_loss = 0.0
    num_correct = 0
    num_samples = 0

    model.eval()
    with torch.no_grad():
        for imgs, labels in dataloader:
            imgs, labels = imgs.to(device), labels.to(device)
            output = model(imgs).logits
            preds = torch.round(torch.sigmoid(output))

            batch_len = imgs.size(0)
            total_loss += criterion(output, labels).item() * batch_len
            # all(dim=1): every label of a sample has to match.
            num_correct += preds.eq(labels).all(dim=1).sum().item()
            num_samples += batch_len

    if num_samples == 0:
        raise ValueError("dataloader yielded no samples to evaluate")

    # Divide by the samples seen, not len(dataloader.dataset): with a subset
    # sampler the loader covers only part of the dataset, and dividing by the
    # full size would shrink both metrics.
    return total_loss / num_samples, num_correct / num_samples

In [None]:
def _resize_step():
    """Resize to 224x224; interpolation=3 is PIL's integer code for bicubic."""
    return Resize((224, 224), interpolation=3)


def _tensor_steps():
    """Fresh copies of the steps that end both pipelines.

    ToTensor yields values in [0, 1]; Normalize with mean 0.5 and std 0.5 per
    channel then maps them to [-1, 1].
    """
    return [
        ToTensor(),
        ConvertImageDtype(torch.float32),
        Normalize(mean=[0.5] * 3, std=[0.5] * 3),
    ]


# Random augmentations used only for the augmented training view.
_augmentation_steps = [
    RandomHorizontalFlip(p=0.5),
    RandomAffine(degrees=15, translate=(0.1, 0.1), scale=(0.9, 1.1)),
    RandomPerspective(p=0.5, distortion_scale=0.5, interpolation=3),
    ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),
    GaussianBlur(kernel_size=23, sigma=(0.1, 2.0)),
]

# Training pipeline: resize -> augment -> tensor -> normalize.
train_transform = Compose([_resize_step(), *_augmentation_steps, *_tensor_steps()])

# Deterministic pipeline: resize -> tensor -> normalize.
test_transform = Compose([_resize_step(), *_tensor_steps()])

In [None]:
# Train / val / test data. Every training image appears twice in the combined
# set: once with the deterministic transform and once augmented. Validation
# gets (about) as many samples as the test set, the remainder is used for training.
SPLIT_SEED = 42  # fixed so the same images land in validation after every kernel restart

train_set_normal = MultimodalDataset(train=True, img_transform=test_transform, target_transform=ToTensor())
train_set_augmented = MultimodalDataset(train=True, img_transform=train_transform, target_transform=ToTensor())
train_set_combined = ConcatDataset([train_set_normal, train_set_augmented])
test_set = MultimodalDataset(train=False, img_transform=test_transform)

batch_size = 512
num_images = len(train_set_normal)
dataset_size = len(train_set_combined)

# Split by image, not by sample: shuffling the combined indices directly lets
# the plain view of an image end up in training while its augmented view is
# used for validation (or vice versa), which leaks validation data.
num_val_images = len(test_set) // 2  # two views per image -> ~len(test_set) val samples
image_order = np.random.default_rng(SPLIT_SEED).permutation(num_images)
val_images, train_images = image_order[:num_val_images], image_order[num_val_images:]

# In the ConcatDataset, index i is the plain view of image i and
# i + num_images is its augmented view.
train_indices = np.concatenate([train_images, train_images + num_images]).tolist()
val_indices = np.concatenate([val_images, val_images + num_images]).tolist()
train_size = len(train_indices)
indices = train_indices + val_indices

assert len(train_indices) + len(val_indices) == dataset_size
assert set(train_indices).isdisjoint(val_indices)

# SubsetRandomSampler draws its indices in a fresh random order on every pass,
# so the training batches are already shuffled each epoch.
train_sampler = SubsetRandomSampler(train_indices)
val_sampler = SubsetRandomSampler(val_indices)

train_loader = DataLoader(train_set_combined, batch_size=batch_size, sampler=train_sampler)
val_loader = DataLoader(train_set_combined, batch_size=batch_size, sampler=val_sampler)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

In [None]:
# Pick the compute backend: CUDA first, then Apple's MPS, otherwise the CPU.
if torch.cuda.is_available():
    type_device = "cuda"
else:
    _mps = torch.backends.mps
    type_device = "mps" if (_mps.is_available() and _mps.is_built()) else "cpu"

# type_device = "cpu" # might need to override, depending on computer
print(f"Using {type_device} device")
device = torch.device(type_device)

In [None]:
# `model` is only created in the next cell, so evaluating it here raises a
# NameError after Restart Kernel -> Run All. Inspect the model after that cell.

In [None]:
# https://huggingface.co/microsoft/beit-base-patch16-224-pt22k-ft22k

model = BeitForImageClassification.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')

# Freeze every pretrained weight; only the head attached below is trained.
for param in model.parameters():
    param.requires_grad = False

# Replacement head with 18 outputs. Its layers are created after the freeze
# above, so their parameters keep requires_grad=True.
model.classifier = nn.Sequential(
    nn.Linear(in_features=768, out_features=256), # ie. 512
    nn.BatchNorm1d(num_features=256),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(in_features=256, out_features=128),
    nn.BatchNorm1d(num_features=128),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(in_features=128, out_features=64),
    nn.BatchNorm1d(num_features=64),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(in_features=64, out_features=18)
)

# Move to the device only now: doing it before the head is swapped leaves the
# new layers on the CPU, and the forward pass then fails on cuda/mps with a
# device mismatch.
model = model.to(device)

In [None]:
learning_rate = 0.0001 # before 1e-3
num_epochs = 100
losses = []  # training loss of every batch, in order
best_loss = float("inf")  # any finite validation loss beats this
best_model = None

# Per-class weights for the positive term of the loss.
pos_weight = get_pos_weight(train_set_combined, device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# Hand the optimizer only the parameters that can receive gradients, so it
# does not have to track the frozen ones.
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = optim.Adam(trainable_params, lr=learning_rate)

In [None]:
# Optionally resume from an earlier checkpoint. Nothing in this notebook
# writes this file, so it is missing on a clean run; in that case training
# starts from the pretrained weights.
CHECKPOINT_PATH = "./best_model.pth"
if os.path.isfile(CHECKPOINT_PATH):
    # map_location lets a checkpoint saved on another device type (e.g. cuda)
    # load on the machine running this notebook.
    model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=device))
    print(f"Resumed from {CHECKPOINT_PATH}")
else:
    print(f"{CHECKPOINT_PATH} not found - starting from the pretrained weights")

In [None]:
# The checkpoints below are written into ./saved; create it up front.
os.makedirs("./saved", exist_ok=True)
num_batches = len(train_loader)

for epoch in range(num_epochs):
    print(f"Epoch {epoch}")

    # evaluate() switches the model to eval mode at the end of every epoch.
    # Re-enable train mode here, otherwise every epoch after the first runs
    # with dropout off and BatchNorm statistics frozen.
    model.train()
    for batch, (imgs, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        imgs, labels = imgs.to(device), labels.to(device)
        outputs = model(imgs).logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        if batch % 10 == 0:
            print('batch: [%d/%d], loss: %.5f' %(batch, num_batches, loss.item()))

    val_loss, val_acc = evaluate(model, val_loader, device, criterion)
    print(f"val loss: {val_loss:.5f}\tval acc: {val_acc:.4f}")
    # Keep a copy of (and save) the weights whenever validation loss improves.
    if val_loss < best_loss:
        best_loss = val_loss
        best_model = copy.deepcopy(model)
        torch.save(best_model.state_dict(), f"./saved/model_{epoch}_{val_loss:.4f}_{val_acc:.4f}.pth")

In [None]:
# Persist the weights the model holds at this point (the last trained state,
# which is not necessarily the best-validation one).
final_weights = model.state_dict()
torch.save(final_weights, "./saved_model.pth")

In [None]:
# Per-batch training loss. The title now names the model trained in this
# notebook; the previous one described a different architecture.
fig, ax = plt.subplots()
ax.plot(losses)
ax.set_title("BEiT (frozen backbone + MLP head): training loss")
ax.set_xlabel("Batch")
ax.set_ylabel("Loss")

# Final metrics on the training and validation loaders.
train_loss, train_acc = evaluate(model, train_loader, device, criterion)
val_loss, val_acc = evaluate(model, val_loader, device, criterion)
print(f"train loss: {train_loss}\ttrain acc: {train_acc}")
print(f"val loss: {val_loss}\tval acc: {val_acc}")

In [176]:
def _predict_labels(batch_imgs):
    """Return the 0/1 predictions (rounded sigmoid of the logits) for one batch."""
    logits = model(batch_imgs.to(device)).logits
    return torch.sigmoid(logits).round()


# One prediction tensor per batch, each of shape (samples in batch, 18);
# the last batch may hold fewer samples than batch_size.
outputs = []
model.eval()
with torch.no_grad():
    print(f"Num batches: {len(test_loader)}")
    for batch, (imgs, _) in enumerate(test_loader):
        print(f"Batch {batch}")
        outputs.append(_predict_labels(imgs))

# Untouched copy, in case a later cell modifies `outputs`.
saved_outputs = copy.deepcopy(outputs)

In [None]:
# Join the per-batch predictions along the sample axis. torch.cat accepts a
# shorter final batch, so no zero-padding, truncation or hard-coded sample
# count is needed, and `outputs` is left unmodified (the cell can be re-run).
# The old padding computed the remainder from outputs[0] (a full batch), which
# gave 0 and made the slice assignment fail.
new_outputs = torch.cat(outputs, dim=0)
assert new_outputs.shape == (len(test_loader.dataset), 18)

In [None]:
# How many test samples were assigned each of the 18 class slots.
sums = torch.sum(new_outputs, dim=0)
for label_no, count in enumerate(sums, start=1):
    print(f"label: {label_no}\tnum preds: {count}")

In [None]:
def _original_label_id(slot):
    """Map a 0-based class slot to the dataset's label id.

    Slots are numbered from 1; ids from 12 upwards move one further because
    label 12 has no samples.
    """
    label_id = slot + 1
    return label_id + 1 if label_id >= 12 else label_id


# One space-separated string of label ids per test sample.
label_strs = np.array([
    " ".join(
        str(_original_label_id(slot.item()))
        for slot in torch.where(sample == 1)[0]
    )
    for sample in new_outputs
])

In [None]:
# Re-read the test CSV, replace the caption column (if any) with the predicted
# labels and write the result to ./model_eval.csv.
FILENAME = "./COMP5329S1A2Dataset/test.csv"
with open(FILENAME) as file:
    # Prefix in-field double quotes with "/" so read_csv (escapechar="/")
    # keeps them as literal characters.
    escaped_csv = ''.join(
        re.sub(r'([^,])"(\s*[^\n])', r'\1/"\2', line) for line in file
    )

test_df = (
    pd.read_csv(StringIO(escaped_csv), escapechar="/")
    .drop(columns="Caption", errors="ignore")
    .assign(Labels=label_strs)
)
test_df.to_csv("./model_eval.csv", sep=",", index=False)