### Demo Code for Model 1: Self-Ensembling ViT

Import all the libraries required for setting up the data, loading the model, and running inference.

In [None]:
import torch
import timm
import pandas as pd
import os
import gdown

from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image

Set all necessary constants to be used in setting up the test dataset and testing the model.

In [None]:
# Local file name under which the downloaded checkpoint is stored.
MODEL_PATH = "model1_vit_ensemble.pth"
# Google Drive link from which the trained weights are downloaded.
MODEL_URL = "https://drive.google.com/uc?id=11y4GhlkjiE2GFcVmuO-E3O3p89BawFNF"

# Folder holding the unlabeled test images.
IMAGE_PATH = "banana_test"

# Number of outputs of the model's classification head.
NUMBER_OF_CLASSES = 4

# Per-channel mean and standard deviation passed to transforms.Normalize.
# NOTE(review): these look like the usual ImageNet statistics — confirm they match training.
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

Start by loading a non-pretrained Vision Transformer (ViT) architecture and initializing it with the weights of the final trained model, which are downloaded from Google Drive.

Next, define the test images transforms: resize the input images, convert them to tensors, and normalize them, to ensure compatibility with the model’s expected input format.

An inference dataset object is then defined to handle the unlabeled test images. It automatically applies the defined transforms when samples are queried.

Once setup is complete, a DataLoader is instantiated from the prepared dataset. The model generates predictions, which are then formatted according to the specified CSV submission requirements. Finally, the predictions are saved to a .csv file.

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the ViT architecture without pretrained weights, download the trained
# checkpoint, and load it into the model before switching to evaluation mode.
# NOTE(review): torch.load unpickles the downloaded file — only use a checkpoint
# from a trusted source.
model = timm.create_model(
    "vit_base_patch16_224",
    pretrained=False,
    num_classes=NUMBER_OF_CLASSES,
)
gdown.download(MODEL_URL, MODEL_PATH, quiet=False)
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model = model.to(device).eval()

# Preprocessing for test images: resize to 224x224, convert to a tensor, normalize.
test_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD),
])

# Define an inference dataset object to handle unlabeled datasets. Transformed images are returned
# when samples are queried.
class InferenceDataset(Dataset):
    """Dataset over the image files of a single folder, for unlabeled inference.

    Each sample is an ``(image, file_name)`` pair, where ``image`` is the RGB
    image after ``transform`` (when one is given) has been applied.

    Args:
        image_folder: Directory that directly contains the image files.
        transform: Optional callable applied to every loaded PIL image.
    """

    # Accepted file suffixes, matched case-insensitively against each name.
    IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")

    def __init__(self, image_folder, transform=None):
        self.image_folder = image_folder
        self.transform = transform
        # Sorted so the sample order (and thus the prediction order) is
        # deterministic. Non-file entries are skipped: a sub-directory named
        # like an image would otherwise only fail later, inside __getitem__.
        self.image_files = sorted(
            name
            for name in os.listdir(image_folder)
            if name.lower().endswith(self.IMAGE_EXTENSIONS)
            and os.path.isfile(os.path.join(image_folder, name))
        )

    def __len__(self):
        """Return the number of image files found in the folder."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load the ``idx``-th image and return ``(image, file_name)``."""
        image_name = self.image_files[idx]
        image_path = os.path.join(self.image_folder, image_name)
        # The context manager closes the underlying file deterministically;
        # convert() returns an independent, fully loaded copy.
        with Image.open(image_path) as source:
            image = source.convert("RGB")
        if self.transform is not None:
            image = self.transform(image)
        return image, image_name
    
# Wrap the test images in a dataset and an unshuffled loader for batched inference.
test_dataset = InferenceDataset(IMAGE_PATH, transform=test_transforms)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

image_filenames = []
predicted_labels = []
labels = ["cordana", "healthy", "pestalotiopsis", "sigatoka"]

# Run the model batch by batch without gradient tracking and collect one
# (prefixed file name, class label) pair per image.
with torch.no_grad():
    for inputs, filenames in test_loader:
        inputs = inputs.to(device)
        outputs = model(inputs)
        # Index of the highest score along the class dimension.
        preds = torch.max(outputs, 1).indices

        for filename, class_index in zip(filenames, preds.tolist()):
            image_filenames.append(f"image_{filename}")
            predicted_labels.append(labels[class_index])

# Assemble the two columns into a dataframe and write it out as a .csv file.
submission_df = pd.DataFrame(
    {"image_filename": image_filenames, "predicted_label": predicted_labels}
)
submission_df.to_csv("model1_output.csv", index=False)

# Delete the downloaded checkpoint once the predictions have been saved.
os.remove(MODEL_PATH)


### Validation Code for Model 1: Self-Ensembling ViT

In [None]:
import timm
import torch
import gdown
import os

from sklearn.metrics import classification_report
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms

In [None]:
# Seed for the generator that drives the random train/validation split.
MANUAL_SEED = 27
# Number of outputs of the model's classification head.
NUMBER_OF_CLASSES = 4

# Local file name under which the downloaded checkpoint is stored.
MODEL_PATH = "model1_vit_ensemble.pth"
# Google Drive link from which the trained weights are downloaded.
MODEL_URL = "https://drive.google.com/uc?id=11y4GhlkjiE2GFcVmuO-E3O3p89BawFNF"

# Root folder handed to datasets.ImageFolder.
RAW_DATA_DIR = "training_data"

# Per-channel mean and standard deviation passed to transforms.Normalize.
# NOTE(review): these look like the usual ImageNet statistics — confirm they match training.
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

In [None]:
# Read the labeled images from RAW_DATA_DIR.
base_dataset = datasets.ImageFolder(root=RAW_DATA_DIR)

# 80% of the samples form the training part; the remainder is the validation part.
train_size = int(0.8 * len(base_dataset))
validation_size = len(base_dataset) - train_size

# Seed a dedicated generator so the random split is reproducible, then keep
# only the validation subset (the training subset is discarded).
generator = torch.Generator()
generator.manual_seed(MANUAL_SEED)
_, validation_dataset = random_split(
    base_dataset,
    [train_size, validation_size],
    generator=generator,
)

In [None]:
# Validation preprocessing, matching the inference pipeline: resize to the
# 224x224 input of vit_base_patch16_224, convert to a tensor, then normalize.
validation_transform = transforms.Compose([
    # Without the resize, images of any other size cannot be stacked into a
    # batch or fed to the fixed-resolution ViT.
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=NORMALIZE_MEAN,
        std=NORMALIZE_STD
    ),
])

# Attach the transform to the ImageFolder behind the validation Subset so that
# images are transformed lazily, batch by batch, instead of materializing the
# whole transformed split in memory up front.
validation_dataset.dataset.transform = validation_transform
validation_loader = DataLoader(validation_dataset, batch_size=32, shuffle=False)

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the ViT architecture without pretrained weights, download the trained
# checkpoint, and load it into the model before switching to evaluation mode.
# NOTE(review): torch.load unpickles the downloaded file — only use a checkpoint
# from a trusted source.
model = timm.create_model(
    "vit_base_patch16_224",
    pretrained=False,
    num_classes=NUMBER_OF_CLASSES,
)
gdown.download(MODEL_URL, MODEL_PATH, quiet=False)
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model = model.to(device).eval()

# Inference-style preprocessing: resize to 224x224, convert to a tensor, normalize.
# NOTE(review): the evaluation loop in this cell reads from validation_loader and
# never references test_transforms.
test_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD),
])

all_predictions = []
all_labels = []

# Display names for the class indices 0..3, in index order.
# NOTE(review): assumes the ImageFolder class indices follow this order — confirm
# against the sub-folder names of the training data.
target_names = ["cordana", "healthy", "pestalotiopsis", "sigatoka"]

# Predict every validation batch without gradient tracking and gather the
# predicted and true class indices as plain Python ints.
with torch.no_grad():
    for inputs, labels_batch in validation_loader:
        outputs = model(inputs.to(device))
        _, predictions = torch.max(outputs, 1)

        all_predictions.extend(predictions.cpu().tolist())
        # Labels are only compared on the host, so they never need to be
        # copied to the device.
        all_labels.extend(labels_batch.tolist())

# Passing the label indices explicitly keeps the report aligned with
# target_names even when a class is absent from both the true and the
# predicted labels (otherwise classification_report raises on the mismatch).
print(classification_report(
    all_labels,
    all_predictions,
    labels=list(range(len(target_names))),
    target_names=target_names,
    zero_division=0,
))

# Delete the downloaded checkpoint once the evaluation has finished.
os.remove(MODEL_PATH)