In [None]:
import os
import torch
from torch import nn, optim
from torchvision import datasets, transforms
from transformers import ViTFeatureExtractor, ViTForImageClassification
from torch.utils.data import DataLoader

from google.colab import drive
# Mount Google Drive so the dataset folders below are reachable from the runtime.
drive.mount("/content/drive")

# Root folders handed to ImageFolder (which expects one sub-directory per class).
train_dir = '/content/drive/MyDrive/hackathon/datasets/train'
test_dir = '/content/drive/MyDrive/hackathon/datasets/test'

# Preprocessing shared by both splits:
#   1. resize every image to 224x224,
#   2. convert to a float tensor in [0, 1],
#   3. shift/scale each channel with mean 0.5 and std 0.5, i.e. into [-1, 1].
transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

train_dataset = datasets.ImageFolder(root=train_dir, transform=transform)
test_dataset = datasets.ImageFolder(root=test_dir, transform=transform)

# Only the training split is shuffled; the test split keeps its on-disk order.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

# Use the GPU when one is attached, otherwise fall back to CPU so this cell
# does not raise on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained ViT backbone. The checkpoint ships a 1000-class head, so
# ignore_mismatched_sizes=True lets from_pretrained drop it and create a
# 2-class head instead (this is what the "newly initialized" warning reports).
model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224",
    num_labels=2,
    ignore_mismatched_sizes=True
)
# Replace the head with a fresh nn.Linear of the same (hidden_size -> 2) shape.
# from_pretrained already built a 2-class head; this line only swaps in a newly
# constructed layer of the same shape.
model.classifier = nn.Linear(model.config.hidden_size, 2)
# Assigning the result keeps the module repr out of the cell output.
model = model.to(device)

# Cross-entropy over the two class logits; Adam updates every model parameter
# (backbone and head) with a small fine-tuning learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

epochs = 15
# Send batches to whichever device holds the model's weights instead of
# hard-coding 'cuda', so the loop cannot disagree with where the model lives.
device = next(model.parameters()).device
model.train()

for epoch in range(epochs):
    running_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        # The HF model returns an output object; .logits holds the class scores.
        outputs = model(images).logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # loss.item() is the mean loss of this batch (a Python float).
        running_loss += loss.item()

    # Reported value is the mean of the per-batch mean losses for the epoch.
    print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader)}")

# Measure top-1 accuracy on the held-out test split.
model.eval()
# Follow the model's own device rather than assuming 'cuda'.
device = next(model.parameters()).device
correct = 0
total = 0

# No gradients are needed for evaluation.
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images).logits
        # Predicted class = index of the largest logit in each row.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy:.2f}%")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/15, Loss: 0.1922098556323383
Epoch 2/15, Loss: 0.03783556456236463
Epoch 3/15, Loss: 0.01235702453370213
Epoch 4/15, Loss: 0.005758738086586888
Epoch 5/15, Loss: 0.002793970336188751
Epoch 6/15, Loss: 0.001683603021179683
Epoch 7/15, Loss: 0.0011463491307375463
Epoch 8/15, Loss: 0.0008383270384152105
Epoch 9/15, Loss: 0.0006363530955329901
Epoch 10/15, Loss: 0.0004950884062861604
Epoch 11/15, Loss: 0.0003938096991426809
Epoch 12/15, Loss: 0.0003194870488303258
Epoch 13/15, Loss: 0.00026273339575627483
Epoch 14/15, Loss: 0.0002192551924635243
Epoch 15/15, Loss: 0.0001849282312025736
Test Accuracy: 96.00%


In [None]:
# Persist only the learned parameters (the state_dict), not the module object,
# to a file on the mounted Drive.
model_save_path = "/content/drive/MyDrive/hackathon/fine_tuned_vit_hotdog_not_hotdog.pth"
torch.save(obj=model.state_dict(), f=model_save_path)

print(f"Model saved to {model_save_path}")

Model saved to /content/drive/MyDrive/hackathon/fine_tuned_vit_hotdog_not_hotdog.pth


# Continue training

In [None]:
import os
import torch
from torch import nn, optim
from torchvision import datasets, transforms
from transformers import ViTFeatureExtractor, ViTForImageClassification
from torch.utils.data import DataLoader

from google.colab import drive
# Drive must be mounted before the checkpoint below can be read.
drive.mount("/content/drive")
# NOTE(review): none of the save cells in this notebook writes
# "..._continue_2.pth" (they write "...not_hotdog.pth" and "..._continue_3.pth").
# Confirm this file already exists on Drive from an earlier session.
model_save_path = "/content/drive/MyDrive/hackathon/fine_tuned_vit_hotdog_not_hotdog_continue_2.pth"

# Use the GPU when one is attached, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the 2-class ViT architecture. num_labels=2 together with
# ignore_mismatched_sizes=True already yields a (hidden_size -> 2) classifier,
# so the head does not need to be re-created by hand: every parameter,
# including the head, is overwritten by load_state_dict below.
model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224",
    num_labels=2,
    ignore_mismatched_sizes=True
)

# map_location="cpu" lets a checkpoint saved from a CUDA model load on any
# runtime; weights_only=True restricts unpickling to tensors/containers so a
# tampered file cannot execute arbitrary code.
state_dict = torch.load(model_save_path, map_location="cpu", weights_only=True)
model.load_state_dict(state_dict)

# Assign the result so the cell does not end on a bare expression and dump
# the full module repr into the notebook output.
model = model.to(device)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

In [None]:
# Root folders handed to ImageFolder (which expects one sub-directory per class).
train_dir = '/content/drive/MyDrive/hackathon/datasets/train'
test_dir = '/content/drive/MyDrive/hackathon/datasets/test'

# Same preprocessing as the first training run: 224x224 resize (the input size
# of the ViT checkpoint), tensor conversion, then per-channel (x - 0.5) / 0.5.
transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

train_dataset = datasets.ImageFolder(root=train_dir, transform=transform)
test_dataset = datasets.ImageFolder(root=test_dir, transform=transform)
# Only the training split is shuffled; the test split keeps its on-disk order.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

criterion = nn.CrossEntropyLoss()
# A brand-new Adam instance: Adam's running statistics start from scratch here.
# The learning rate is 10x smaller than the 1e-5 used in the first run.
optimizer = optim.Adam(model.parameters(), lr=1e-6)

In [None]:
# Continue fine-tuning the reloaded model for a few more epochs.
additional_epochs = 10
# Send batches to whichever device holds the model's weights instead of
# hard-coding 'cuda', so the loop cannot disagree with where the model lives.
device = next(model.parameters()).device
model.train()

for epoch in range(additional_epochs):
    running_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        # The HF model returns an output object; .logits holds the class scores.
        outputs = model(images).logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # loss.item() is the mean loss of this batch (a Python float).
        running_loss += loss.item()

    # Reported value is the mean of the per-batch mean losses for the epoch.
    print(f"Epoch {epoch+1}/{additional_epochs}, Loss: {running_loss/len(train_loader)}")

Epoch 1/10, Loss: 5.321841530533297e-10
Epoch 2/10, Loss: 5.041744782732677e-10
Epoch 3/10, Loss: 5.041744782732677e-10
Epoch 4/10, Loss: 4.761647767810729e-10
Epoch 5/10, Loss: 4.481550786278947e-10
Epoch 6/10, Loss: 4.2014537880520816e-10
Epoch 7/10, Loss: 4.2014537880520816e-10
Epoch 8/10, Loss: 3.9213570402514624e-10
Epoch 9/10, Loss: 3.9213570402514624e-10
Epoch 10/10, Loss: 3.9213570402514624e-10


In [None]:
# Re-measure top-1 accuracy on the test split after the extra epochs.
model.eval()
# Follow the model's own device rather than assuming 'cuda'.
device = next(model.parameters()).device
correct = 0
total = 0

# No gradients are needed for evaluation.
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images).logits
        # Predicted class = index of the largest logit in each row.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy:.2f}%")

Test Accuracy: 96.75%


In [None]:
# Write the further-trained parameters to a new file so the checkpoint that
# was loaded for this session is not overwritten.
model_save_path = "/content/drive/MyDrive/hackathon/fine_tuned_vit_hotdog_not_hotdog_continue_3.pth"
torch.save(obj=model.state_dict(), f=model_save_path)

print(f"Model saved to {model_save_path}")

Model saved to /content/drive/MyDrive/hackathon/fine_tuned_vit_hotdog_not_hotdog_continue_3.pth
