In [None]:
# util imports
import os
import random
import numpy as np

# pytorch imports
import torch
import torch.nn as nn
import torchvision
import torchvision.models as models
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torchvision.models import vit_b_16, ViT_B_16_Weights

In [None]:
# set seed
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    The same value is handed, in order, to Python's ``random`` module,
    NumPy's global generator and PyTorch's default generator.

    Args:
        seed: Seed value forwarded unchanged to each library.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

# Fix the RNG seeds before any data loading or model construction happens.
set_seed(42)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training images as laid out inside the extracted archive.
# NOTE: a later cell reassigns train_path to "./data/train/".
train_path = os.path.join('unzipped_folder', 'train', 'train')
print(train_path)

In [None]:
# Optional Google Colab setup (disabled). Uncommenting the lines below mounts
# Google Drive and unpacks the dataset ZIP into /content/unzipped_folder.
# from google.colab import drive
# import zipfile
# import os

# # Mount Google Drive
# drive.mount('/content/drive')

# # Define the path to your ZIP file in Google Drive
# zip_path = os.path.join("drive", "My Drive", "CSE-144-Final-Dataset", "ucsc-cse-144-winter-2025-final-project.zip")

# # Extract the ZIP file
# extract_path = "/content/unzipped_folder"
# with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#     zip_ref.extractall(extract_path)

# print(f"ZIP file extracted to: {extract_path}")

# # List extracted files
# print(os.listdir(extract_path))

# Local dataset location. This overrides the train_path assigned in the
# previous cell, so the ImageFolder below reads from ./data/train/.
train_path = "./data/train/"

In [None]:
# Training-time preprocessing and augmentation.
# An earlier pipeline without augmentation (Resize to 224x224 -> ToTensor ->
# Normalize with mean 0.48 / std 0.0039 per channel) is no longer used.
train_steps = [
    transforms.RandomResizedCrop(size=(224, 224), scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    # Disabled augmentation experiments, kept for reference:
    # transforms.RandomRotation(20),
    # transforms.RandomAffine(degrees=15, translate=(0.1, 0.1), shear=10),
    # transforms.RandomPerspective(distortion_scale=0.1, p=0.2, interpolation=2),  # Perspective transform
    # transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.15, hue=0.075),
    # transforms.RandomGrayscale(p=0.2),
    transforms.RandAugment(num_ops=3, magnitude=6),
    transforms.ToTensor(),
    # Same per-channel mean/std values as the test-time transform further down.
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=(0.229, 0.224, 0.225)),
]
transform = transforms.Compose(train_steps)

# Every access to trainset[i] re-applies the random transforms above.
trainset = datasets.ImageFolder(root=train_path, transform=transform)

In [None]:
# For matrix operations
import numpy as np

# Data visualizaton.
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import random as rn

# Preview a 2x6 grid of randomly chosen (augmented) training samples.
fig, ax = plt.subplots(2, 6)
fig.set_size_inches(6, 6)

# Per-channel statistics passed to transforms.Normalize in the training
# transform; used here to map tensors back to the [0, 1] range imshow expects.
# Keep these in sync with the transform if its normalization changes.
norm_mean = np.array([0.485, 0.456, 0.406])
norm_std = np.array([0.229, 0.224, 0.225])

for axis in ax.flat:
    # randrange excludes the upper bound. rn.randint(0, len(trainset)) is
    # inclusive on both ends and could return len(trainset) -> IndexError.
    sample_idx = rn.randrange(len(trainset))

    # Fetch the sample once: each trainset[...] access re-runs the random
    # augmentations, so a second access would yield a different image.
    image, label = trainset[sample_idx]

    # CHW tensor -> HWC array, undo the normalization, clip for display.
    pixels = np.transpose(image.numpy(), (1, 2, 0)) * norm_std + norm_mean
    axis.imshow(np.clip(pixels, 0.0, 1.0))
    axis.set_title('Label: ' + str(label))
    # Hide grid lines
    axis.grid(False)
    # Hide axes ticks
    axis.set_xticks([])
    axis.set_yticks([])

plt.tight_layout()

In [None]:
# Batch the training set: 16 samples per batch, reshuffled every epoch,
# loaded by 2 worker processes.
train_loader = DataLoader(
    trainset,
    batch_size=16,
    shuffle=True,
    num_workers=2,
)

class ViT_B16_Head(nn.Module):
    """ImageNet-pretrained ViT-B/16 with a frozen backbone and a new trainable head.

    Args:
        num_classes: Number of logits produced by the replacement head
            (defaults to 100).
    """

    def __init__(self, num_classes=100):
        super().__init__()

        # Load the pretrained network and freeze all of its parameters.
        backbone = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1)
        backbone.requires_grad_(False)

        # Disabled experiment: append two extra transformer encoder layers.
        # encoder_layer = nn.TransformerEncoderLayer(
        #     d_model=768,  # Embedding dimension
        #     nhead=12,     # Number of attention heads
        #     dim_feedforward=3072,  # Hidden layer size in FFN
        #     dropout=0.1,  # Dropout rate
        #     activation='gelu'  # Activation function used in FFN
        # )
        # backbone.encoder.layers.extend([encoder_layer, encoder_layer])

        # Swap the original classification head for a small MLP that maps the
        # 768-dim input features to num_classes raw logits (no softmax;
        # nn.CrossEntropyLoss expects logits).
        backbone.heads = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(768, 2048),
            nn.BatchNorm1d(2048),
            nn.LeakyReLU(),
            nn.Linear(2048, num_classes),
        )

        # The new head is the only part that is trained.
        backbone.heads.requires_grad_(True)

        # Disabled experiment: also unfreeze the last encoder block(s).
        # NOTE(review): torchvision appears to name these parameters
        # "encoder.layers.encoder_layer_11..."; confirm the substring filter
        # below matches anything before re-enabling it.
        # for name, param in backbone.named_parameters():
        #     # if "encoder.layer.10" in name or "encoder.layer.11" in name:
        #     if "encoder.layer.11" in name:
        #         param.requires_grad = True

        self.base_model = backbone

    def forward(self, x):
        """Return the class logits produced by the wrapped ViT for batch ``x``."""
        return self.base_model(x)

# Build the classifier with the default num_classes=100 and print its module tree.
model = ViT_B16_Head()
print(model)

In [None]:
model = model.to(device)

# Build the parameter groups from *trainable* parameters only.
# The previous version filled its low-learning-rate group with the frozen
# parameters (requires_grad is False). Those never receive gradients, so the
# optimizer skipped them and the group had no effect. Worse, any backbone
# layer unfrozen in the model would have ended up in neither group.
head_params = [p for p in model.base_model.heads.parameters() if p.requires_grad]
head_param_ids = {id(p) for p in head_params}

# base_model.parameters() also yields the head's parameters, so exclude them
# to keep every parameter in exactly one group.
backbone_params = [
    p for p in model.base_model.parameters()
    if p.requires_grad and id(p) not in head_param_ids
]

param_groups = []
if backbone_params:
    # Unfrozen backbone layers (none by default): very small learning rate.
    param_groups.append({'params': backbone_params, 'lr': 1e-6, 'weight_decay': 1e-6})
# Freshly initialised head: larger learning rate.
param_groups.append({'params': head_params, 'lr': 1e-4, 'weight_decay': 1e-3})

optimizer = torch.optim.AdamW(param_groups)

loss_fn = nn.CrossEntropyLoss()

# Training the network
num_epochs = 15

In [None]:
def calculate_accuracy(loader, model):
    """Return the top-1 accuracy of ``model`` over ``loader`` as a percentage.

    The model is switched to eval mode and is left in that mode on return.
    Batches are moved to the module-level ``device`` before the forward pass.

    Args:
        loader: Iterable yielding ``(images, labels)`` batches.
        model: Network whose output has one score per class along dim 1.

    Returns:
        ``100 * correct / total``, or ``0`` when the loader yields no samples.
    """
    model.eval()
    n_correct, n_seen = 0, 0

    with torch.no_grad():
        for batch_images, batch_labels in loader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)

            # Highest-scoring class per sample.
            predicted = model(batch_images).argmax(dim=1)

            n_seen += batch_labels.shape[0]
            n_correct += (predicted == batch_labels).sum().item()

    if n_seen == 0:
        return 0
    return 100 * n_correct / n_seen

In [None]:
# Per-epoch history: filled by the training loop, read by the plotting cell.
train_losses, train_acc = [], []

In [None]:
# Sanity check: pull a single batch from the loader and show the image tensor's shape.
first_batch = next(iter(train_loader))
images, target = first_batch
images.shape

In [None]:
for epoch in range(num_epochs):
    # Re-enter training mode every epoch: calculate_accuracy() at the end of
    # the previous epoch left the model in eval mode.
    model.train()
    running_loss = 0

    for images, targets in train_loader:
        images, targets = images.to(device), targets.to(device)

        # Drop gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Forward pass and loss.
        loss = loss_fn(model(images), targets)

        # Backpropagation and parameter update.
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    train_loss = running_loss / len(train_loader)
    train_losses.append(train_loss)

    # Accuracy from a separate eval-mode pass over the training loader.
    train_accuracy = calculate_accuracy(train_loader, model)
    train_acc.append(train_accuracy)

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.7f}, Train Acc: {train_accuracy:.7f}%')


In [None]:
# Plot the per-epoch training loss and accuracy side by side.
# Only training curves exist; the validation lines stay commented out until
# test_losses / test_acc are actually collected.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Training loss, with epochs numbered from 1 to match the printed log.
loss_ax.plot(range(1, len(train_losses) + 1), train_losses, label='Train Loss')
# loss_ax.plot(range(1, len(test_losses) + 1), test_losses, label='Validation Loss')
loss_ax.set_title('Loss Over Epochs')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

# Training accuracy (percentage, as returned by calculate_accuracy).
acc_ax.plot(range(1, len(train_acc) + 1), train_acc, label='Train Accuracy')
# acc_ax.plot(range(1, len(test_acc) + 1), test_acc, label='Validation Accuracy')
acc_ax.set_title('Accuracy Over Epochs')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy (%)')
acc_ax.legend()

plt.show()

In [None]:
def save_model(model, folder="models", base_filename="ViT_model.pth"):
    """Save ``model.state_dict()`` under ``folder`` without overwriting earlier files.

    If ``base_filename`` already exists in ``folder``, ``_1``, ``_2``, ... is
    inserted before the extension until an unused name is found.

    Args:
        model: Object exposing ``state_dict()``.
        folder: Target directory; created when missing.
        base_filename: Preferred file name inside ``folder``.

    Returns:
        The path the state dict was written to.
    """
    os.makedirs(folder, exist_ok=True)

    stem, ext = os.path.splitext(base_filename)
    filepath = os.path.join(folder, base_filename)

    # Bump the numeric suffix until the candidate path is free.
    suffix = 0
    while os.path.exists(filepath):
        suffix += 1
        filepath = os.path.join(folder, f"{stem}_{suffix}{ext}")

    torch.save(model.state_dict(), filepath)
    print(f"Model saved at: {filepath}")
    return filepath

# Persist the trained weights under ./models/ (default folder and file name);
# the returned path is shown as the cell output.
save_model(model)

In [None]:
from PIL import Image
import pandas as pd

In [None]:
# def testset_predictions(model, root_dir, transform):

#     model.eval()

#     preds = []

#     # Get length of testset
#     num_images = len(os.listdir(root_dir))

#     for idx in range(num_images):
#         # get the image path
#         img_path = os.path.join(root_dir, f'{idx}.jpg')

#         # open the image with the pillow library
#         image = Image.open(img_path)

#         # Transform if necessary
#         if transform:
#             image = transform(image)

#         # add a dimension for batch
#         image = torch.unsqueeze(image, 0)

#         output = model(image.to(device))

#         _, predicted_idx = torch.max(output.cpu().data, 1)

#         preds.append([f'{idx}.jpg', predicted_idx.item()])

#     # Once all predictions are collected, create the DataFrame
#     df_preds = pd.DataFrame(preds, columns=['ID', 'Predicted_Label'])

#     return df_preds

def testset_predictions(model, root_dir, transform):
    model.eval()
    preds = []

    # Get sorted list of image filenames
    image_files = sorted([f for f in os.listdir(root_dir) if f.endswith('.jpg')])

    with torch.no_grad():  # Ensure inference is done without gradient tracking
        for img_name in image_files:
            img_path = os.path.join(root_dir, img_name)

            # Open image and ensure it has 3 channels (RGB)
            image = Image.open(img_path).convert('RGB')

            # Apply transform if provided
            if transform:
                image = transform(image)
            else:
                image = transforms.ToTensor()(image)  # Convert to tensor if no transform

            # Add batch dimension
            image = image.unsqueeze(0).to(device)

            # Get model prediction
            output = model(image)
            _, predicted_idx = torch.max(output.cpu().data, 1)

            preds.append([img_name, predicted_idx.item()])

    # Convert to DataFrame
    df_preds = pd.DataFrame(preds, columns=['ID', 'Predicted_Label'])
    
    return df_preds


In [None]:
test_path = "./data/test/"

# Deterministic inference-time preprocessing: resize to 224x224, convert to a
# tensor, then normalize with the same mean/std values as the training transform.
# (A variant that first applied Grayscale(num_output_channels=3) is disabled.)
test_steps = [
    # transforms.Grayscale(num_output_channels=3),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
test_transform = transforms.Compose(test_steps)

# One prediction row per test image.
df_preds = testset_predictions(model, test_path, test_transform)

In [None]:
# Display the prediction table (columns: ID, Predicted_Label).
df_preds

In [None]:
# Write the predictions to the submission CSV, omitting the DataFrame index column.
df_preds.to_csv('ViT_test_submission.csv', index=False)