In [None]:
import torch
import numpy as np
import random
import copy

def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU plus all CUDA devices), then switches cuDNN to
    deterministic mode with auto-tuning disabled.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Python, NumPy and PyTorch share the same call shape: seeder(seed).
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    torch.cuda.manual_seed_all(seed)

    # Trade cuDNN speed for run-to-run repeatability.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


set_seed(42)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files under MyDrive are reachable from this runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import zipfile
import os

# Zipped dataset on Drive, and the local folder it is unpacked into.
zip_path = "/content/drive/MyDrive/splitted_dataset_DL_project.zip"
extract_path = "/content/splitted_dataset_DL_project"

# Unpack the whole archive; the context manager closes the zip file afterwards.
with zipfile.ZipFile(zip_path, mode='r') as zip_ref:
    zip_ref.extractall(path=extract_path)

print("Extraction complete.")

Extraction complete.


In [None]:
import os
import pandas as pd

# Root folder laid out as <class_name>/<split_name>/.../<image file>.
dataset_root = "/content/splitted_dataset_DL_project/splitted_dataset"

# File extensions (compared case-insensitively) that count as images.
IMAGE_EXTENSIONS = ('.jpg', '.png', '.jpeg')

image_paths = []
labels = []
splits = []

# os.listdir and os.walk yield entries in arbitrary, filesystem-dependent
# order. Sorting makes the row order of the index (and everything seeded on
# top of it, such as the shuffled training loader) reproducible across runs.
for class_name in sorted(os.listdir(dataset_root)):
    class_path = os.path.join(dataset_root, class_name)

    # Skip stray files sitting next to the class folders.
    if not os.path.isdir(class_path):
        continue

    for split_name in ["train", "test", "validation"]:
        split_path = os.path.join(class_path, split_name)

        # A class may lack one of the splits; just move on.
        if not os.path.isdir(split_path):
            continue

        for root, dirs, files in os.walk(split_path):
            # In-place sort fixes the order in which os.walk descends.
            dirs.sort()
            for f in sorted(files):
                if f.lower().endswith(IMAGE_EXTENSIONS):
                    full_path = os.path.join(root, f)

                    image_paths.append(full_path)
                    labels.append(class_name)
                    splits.append(split_name)

# One row per image: where it lives, its class name and its split.
df = pd.DataFrame({
    "path": image_paths,
    "label": labels,
    "split": splits
})

print("Dataset loaded:")
print(df.head())
print("Total images:", len(df))

# Persist the index on Drive so later cells can reload it.
output_path = "/content/drive/MyDrive/dataset_index.csv"
df.to_csv(output_path, index=False)

print("Saved to:", output_path)
# Cell output: number of images per (class, split) pair.
df.groupby(["label", "split"]).size()

Dataset loaded:
                                                path                 label  \
0  /content/splitted_dataset_DL_project/splitted_...  Potato___Late_blight   
1  /content/splitted_dataset_DL_project/splitted_...  Potato___Late_blight   
2  /content/splitted_dataset_DL_project/splitted_...  Potato___Late_blight   
3  /content/splitted_dataset_DL_project/splitted_...  Potato___Late_blight   
4  /content/splitted_dataset_DL_project/splitted_...  Potato___Late_blight   

   split  
0  train  
1  train  
2  train  
3  train  
4  train  
Total images: 30173
Saved to: /content/drive/MyDrive/dataset_index.csv


Unnamed: 0_level_0,Unnamed: 1_level_0,0
label,split,Unnamed: 2_level_1
Pepper__bell___Bacterial_spot,test,277
Pepper__bell___Bacterial_spot,train,907
Pepper__bell___Bacterial_spot,validation,278
Pepper__bell___healthy,test,406
Pepper__bell___healthy,train,1330
Pepper__bell___healthy,validation,399
Potato___Early_blight,test,278
Potato___Early_blight,train,913
Potato___Early_blight,validation,274
Potato___Late_blight,test,271


In [None]:
!pip install transformers datasets timm -q

In [None]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import ViTForImageClassification, AutoImageProcessor
from tqdm import tqdm

In [None]:
import pandas as pd

# Reload the dataset index (one row per image: path, label, split) that an
# earlier cell of this notebook wrote to Drive.
df = pd.read_csv("/content/drive/MyDrive/dataset_index.csv")
df.head()

Unnamed: 0,path,label,split
0,/content/splitted_dataset_DL_project/splitted_...,Potato___Late_blight,train
1,/content/splitted_dataset_DL_project/splitted_...,Potato___Late_blight,train
2,/content/splitted_dataset_DL_project/splitted_...,Potato___Late_blight,train
3,/content/splitted_dataset_DL_project/splitted_...,Potato___Late_blight,train
4,/content/splitted_dataset_DL_project/splitted_...,Potato___Late_blight,train


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the string class names as integer ids, stored in a new `label_id` column.
le = LabelEncoder()
df["label_id"] = le.fit_transform(df["label"])

# Number of distinct class ids; passed as `num_labels` when the model is loaded.
num_classes = df["label_id"].nunique()
num_classes

15

In [None]:
# Carve the index into one frame per split, each with a fresh 0..n-1 index.
train_df, val_df, test_df = (
    df.loc[df["split"] == wanted_split].reset_index(drop=True)
    for wanted_split in ("train", "validation", "test")
)

# Sanity check: sizes of the train / validation / test frames.
print(*(len(part) for part in (train_df, val_df, test_df)))

18763 5699 5711


In [None]:
# Checkpoint identifier, used here for the image processor and again when the
# classification model is loaded.
model_name = "google/vit-base-patch16-224-in21k"
# Image preprocessing for that checkpoint; the dataset class applies it per image.
processor = AutoImageProcessor.from_pretrained(model_name)

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


In [None]:
from torch.utils.data import Dataset
from PIL import Image
import torch

class VitImageDataset(Dataset):
    """Map-style dataset that loads an image from disk and preprocesses it.

    Args:
        df: DataFrame with a ``path`` column (image file location) and a
            ``label_id`` column (integer class id). Rows are addressed by
            position, so the index values themselves do not matter.
        processor: Callable invoked as
            ``processor(images=img, return_tensors="pt")`` that returns a
            mapping with a ``"pixel_values"`` tensor.
    """

    def __init__(self, df, processor):
        self.df = df
        self.processor = processor

    def __len__(self):
        """Return the number of rows (images) in the underlying frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load, convert and preprocess the image at position ``idx``.

        Returns:
            dict with ``"pixel_values"`` (processor output with its leading
            dimension squeezed away) and ``"labels"`` (scalar ``torch.long``
            tensor holding the row's ``label_id``).
        """
        row = self.df.iloc[idx]

        # The context manager closes the file handle as soon as the pixels
        # have been decoded, instead of leaving that to garbage collection.
        # `convert` returns a new image, so it stays usable after the close.
        with Image.open(row["path"]) as img:
            rgb_img = img.convert("RGB")

        processed = self.processor(images=rgb_img, return_tensors="pt")

        return {
            # Drop the batch dimension the processor adds for a single image.
            "pixel_values": processed["pixel_values"].squeeze(0),
            "labels": torch.tensor(row["label_id"], dtype=torch.long)
        }

In [None]:
from torch.utils.data import DataLoader

# Wrap each split's frame in a dataset that shares the same image processor.
train_dataset, val_dataset, test_dataset = (
    VitImageDataset(split_frame, processor)
    for split_frame in (train_df, val_df, test_df)
)

# All loaders use the same batch size; only the training data is shuffled.
loader_kwargs = {"batch_size": 16}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [None]:
# Load the pretrained ViT with a classification head sized for `num_classes`
# labels (the loader's warning below reports the head weights as newly initialised).
base_model = ViTForImageClassification.from_pretrained(model_name, num_labels=num_classes)

# Prefer the GPU when one is available; the training helpers read this global `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
base_model.to(device)


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermed

In [None]:
def freeze_all(model):
    """Freeze the whole network except its classification head.

    Every parameter of ``model`` has ``requires_grad`` switched off, then the
    parameters under ``model.classifier`` are switched back on.

    Args:
        model: ``torch.nn.Module`` exposing a ``classifier`` submodule.
    """
    model.requires_grad_(False)
    model.classifier.requires_grad_(True)


def unfreeze_last_n_blocks(model, n):
    """Make only the last ``n`` encoder blocks and the classifier trainable.

    All parameters are frozen first; then the final ``n`` entries of
    ``model.vit.encoder.layer`` and everything under ``model.classifier``
    get ``requires_grad = True``.

    Args:
        model: Module exposing ``vit.encoder.layer`` (a sequence of blocks)
            and a ``classifier`` submodule.
        n: How many trailing encoder blocks to unfreeze. ``0`` leaves the
            whole encoder frozen; values larger than the number of blocks
            unfreeze all of them.

    Raises:
        ValueError: If ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    blocks = model.vit.encoder.layer
    for param in model.parameters():
        param.requires_grad = False  # freeze all first

    # A plain blocks[-n:] would select *every* block for n == 0 (since
    # -0 == 0), so compute the start index explicitly and clamp it at 0.
    first_trainable = max(len(blocks) - n, 0)
    for block in blocks[first_trainable:]:
        for param in block.parameters():
            param.requires_grad = True

    # always train classification head
    for param in model.classifier.parameters():
        param.requires_grad = True

def unfreeze_all(model):
    """Mark every parameter of ``model`` as trainable.

    Args:
        model: ``torch.nn.Module`` whose parameters should all receive
            gradients.
    """
    model.requires_grad_(True)


In [None]:
def train_one_epoch(model, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader`` and report epoch metrics.

    Batches are dicts with ``"pixel_values"`` and ``"labels"`` tensors; both
    are moved to the notebook-level ``device`` before the forward pass.

    Args:
        model: Network called as ``model(pixel_values=...)`` whose output
            exposes ``.logits``.
        loader: Iterable of batches that also supports ``len()``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(logits, labels)``.

    Returns:
        ``(mean_loss, accuracy)``: the per-batch losses averaged over the
        number of batches, and the fraction of correctly predicted samples.
    """
    model.train()
    running_loss, n_correct, n_seen = 0, 0, 0

    for batch in tqdm(loader):
        optimizer.zero_grad()
        pixel_values = batch["pixel_values"].to(device)
        targets = batch["labels"].to(device)

        logits = model(pixel_values=pixel_values).logits
        batch_loss = criterion(logits, targets)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        # Predicted class = index of the largest logit along the class axis.
        predicted = logits.max(1).indices
        n_correct += (predicted == targets).sum().item()
        n_seen += targets.size(0)

    mean_loss = running_loss / len(loader)
    accuracy = n_correct / n_seen
    return mean_loss, accuracy


def evaluate(model, loader, criterion):
    """Score ``model`` on ``loader`` without updating any weights.

    The model is put in eval mode and gradient tracking is disabled for the
    whole pass. Batches are dicts with ``"pixel_values"`` and ``"labels"``
    tensors, moved to the notebook-level ``device``.

    Args:
        model: Network called as ``model(pixel_values=...)`` whose output
            exposes ``.logits``.
        loader: Iterable of batches that also supports ``len()``.
        criterion: Loss called as ``criterion(logits, labels)``.

    Returns:
        ``(mean_loss, accuracy)``: the per-batch losses averaged over the
        number of batches, and the fraction of correctly predicted samples.
    """
    model.eval()
    running_loss, n_correct, n_seen = 0, 0, 0

    with torch.no_grad():
        for batch in loader:
            pixel_values = batch["pixel_values"].to(device)
            targets = batch["labels"].to(device)

            logits = model(pixel_values=pixel_values).logits
            running_loss += criterion(logits, targets).item()

            # Predicted class = index of the largest logit along the class axis.
            predicted = logits.max(1).indices
            n_correct += (predicted == targets).sum().item()
            n_seen += targets.size(0)

    mean_loss = running_loss / len(loader)
    accuracy = n_correct / n_seen
    return mean_loss, accuracy


def train_model(model, train_loader, val_loader, optimizer, criterion, epochs=5):
    """Train for ``epochs`` epochs, validating and printing metrics after each.

    Each epoch runs ``train_one_epoch`` on ``train_loader`` followed by
    ``evaluate`` on ``val_loader``, then prints the loss and accuracy of both.

    Args:
        model: Network to optimise (updated in place).
        train_loader: Batches used for the optimisation pass.
        val_loader: Batches used for the validation pass.
        optimizer: Optimizer handed to ``train_one_epoch``.
        criterion: Loss function shared by training and validation.
        epochs: Number of train + validate rounds.
    """
    for epoch in range(epochs):
        train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion)
        val_loss, val_acc = evaluate(model, val_loader, criterion)

        # One report per epoch; the trailing "\n" leaves a blank line after it.
        report_lines = (
            f"Epoch {epoch+1}/{epochs}",
            f"Train: loss={train_loss:.4f}, acc={train_acc:.4f}",
            f"Valid: loss={val_loss:.4f}, acc={val_acc:.4f}\n",
        )
        print(*report_lines, sep="\n")

Feature Extraction

In [None]:
# Train a private copy so `base_model` itself is not modified by this run.
model = copy.deepcopy(base_model)
model = model.to(device)

# Backbone frozen; only the classification head receives gradients.
freeze_all(model)

criterion = nn.CrossEntropyLoss()
# Hand the optimizer just the parameters that are still trainable.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=1e-3)

print("Running FEATURE EXTRACTION...")
train_model(model, train_loader, val_loader, optimizer, criterion, epochs=5)


Running FEATURE EXTRACTION...


100%|██████████| 1173/1173 [04:37<00:00,  4.22it/s]


Epoch 1/5
Train: loss=0.6864, acc=0.8397
Valid: loss=0.3242, acc=0.9254



100%|██████████| 1173/1173 [04:15<00:00,  4.58it/s]


Epoch 2/5
Train: loss=0.2609, acc=0.9390
Valid: loss=0.2073, acc=0.9556



100%|██████████| 1173/1173 [04:16<00:00,  4.58it/s]


Epoch 3/5
Train: loss=0.1800, acc=0.9585
Valid: loss=0.1569, acc=0.9621



100%|██████████| 1173/1173 [04:15<00:00,  4.58it/s]


Epoch 4/5
Train: loss=0.1402, acc=0.9680
Valid: loss=0.1283, acc=0.9712



100%|██████████| 1173/1173 [04:16<00:00,  4.57it/s]


Epoch 5/5
Train: loss=0.1145, acc=0.9743
Valid: loss=0.1058, acc=0.9784



Partial fine-tuning

In [None]:
# Fine-tune a private copy. Binding `model = base_model` would train the
# shared base model in place, so any later cell that reuses `base_model`
# would silently start from these fine-tuned weights.
model = copy.deepcopy(base_model).to(device)
# Only the last 4 encoder blocks and the classification head are trainable.
unfreeze_last_n_blocks(model, n=4)

criterion = nn.CrossEntropyLoss()
# Optimise just the parameters left trainable above.
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4)

print("Running PARTIAL FINE-TUNING...")
train_model(model, train_loader, val_loader, optimizer, criterion, epochs=5)

Running PARTIAL FINE-TUNING...


100%|██████████| 1173/1173 [06:34<00:00,  2.97it/s]


Epoch 1/5
Train: loss=0.3300, acc=0.9462
Valid: loss=0.0415, acc=0.9942



100%|██████████| 1173/1173 [06:34<00:00,  2.97it/s]


Epoch 2/5
Train: loss=0.0366, acc=0.9934
Valid: loss=0.0576, acc=0.9846



100%|██████████| 1173/1173 [06:34<00:00,  2.97it/s]


Epoch 3/5
Train: loss=0.0222, acc=0.9951
Valid: loss=0.0753, acc=0.9791



100%|██████████| 1173/1173 [06:35<00:00,  2.97it/s]


Epoch 4/5
Train: loss=0.0161, acc=0.9959
Valid: loss=0.0065, acc=0.9986



100%|██████████| 1173/1173 [06:34<00:00,  2.97it/s]


Epoch 5/5
Train: loss=0.0119, acc=0.9974
Valid: loss=0.0059, acc=0.9989



Full fine-tuning

In [None]:
# Fine-tune a private copy so `base_model` is never trained in place by this run.
# NOTE(review): this is an independent experiment only if `base_model` still
# holds its initial weights here, i.e. no earlier cell trained it in place.
# Verify when comparing results across strategies.
model = copy.deepcopy(base_model).to(device)
# Every parameter (backbone and head) is trainable.
unfreeze_all(model)

criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=1e-5)

print("Running FULL FINE-TUNING...")
train_model(model, train_loader, val_loader, optimizer, criterion, epochs=5)


Running FULL FINE-TUNING...


100%|██████████| 1173/1173 [11:16<00:00,  1.73it/s]


Epoch 1/5
Train: loss=0.0108, acc=0.9972
Valid: loss=0.0076, acc=0.9981



100%|██████████| 1173/1173 [11:16<00:00,  1.73it/s]


Epoch 2/5
Train: loss=0.0031, acc=0.9994
Valid: loss=0.0033, acc=0.9991



 43%|████▎     | 510/1173 [04:53<06:23,  1.73it/s]