## Run Baseline ResNet-50 

TASK: Detect Insect Type \
Dataset: iNaturalist \
Model: Resnet-50 (pretrained)

In [1]:
# pip install datasets
# pip install scikit-learn

In [3]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split, KFold
import os
from PIL import Image
import torch
import torch.nn as nn
import torchvision.models as models
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader
import peft
from peft import LoraConfig, get_peft_model, PeftModel
from utils.label_mappings import iNat_to_clean_map

2026-01-25 07:23:34.886840: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-01-25 07:23:34.886939: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-01-25 07:23:34.931696: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-01-25 07:23:35.027486: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


#### Data Cleaning

In [2]:
# https://huggingface.co/datasets/sxj1215/inaturalist
dataset = load_dataset("sxj1215/inaturalist", split="train")

In [5]:
os.makedirs("images", exist_ok=True)

rows = []
img_id = 0

for row in dataset:
    original_label = row["messages"][1]["content"]

    if original_label not in iNat_to_clean_map:
        continue  # skip unmapped species (?), should there be a different OTHER class

    clean_label = iNat_to_clean_map.get(original_label, original_label)

    image_path = f"images/{img_id}.jpg"
    row["images"][0].save(image_path)

    rows.append(f"{image_path},{clean_label}")
    img_id += 1

OSError: image file is truncated (122 bytes not processed)

In [6]:
with open("labels.csv", "w") as f:
    f.write("image,label\n")
    f.write("\n".join(rows))

Custom Dataset Transformations

In [8]:
# uses RandomResizedCrop, RandomHorizontalFlip to help model generalize
# applies standard ImageNet normalization

train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

val_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])


In [3]:
# custom PyTorch Dataset class to load images and convert labels to integers

class JPGDataset(Dataset):
    def __init__(self, data_df, transform=None):
        self.data = data_df  # now accepts a df directly
        self.transform = transform

        # map label strings to integers
        self.classes = sorted(self.data.label.unique())
        self.class_to_idx = {c: i for i, c in enumerate(self.classes)}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        image = Image.open(row.image).convert("RGB")
        label = self.class_to_idx[row.label]
        if self.transform:
            image = self.transform(image)
        return image, label


In [4]:
def train_one_epoch(model, loader):
    model.train()
    correct, total = 0, 0

    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        correct += (outputs.argmax(1) == labels).sum().item()
        total += labels.size(0)

    return correct / total

In [5]:
def evaluate(model, loader):
    model.eval()
    correct, total = 0, 0

    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            correct += (outputs.argmax(1) == labels).sum().item()
            total += labels.size(0)

    return correct / total

### Model Architecture/Training (random sample, cross-val)

In [11]:
device = "cuda" if torch.cuda.is_available() else "cpu"
df = pd.read_csv("labels.csv")
kf = KFold(n_splits=5, shuffle=True, random_state=42)

fold_accuracies = []

for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
    print(f"\n--- Starting Fold {fold + 1}/5 ---")
    
    # create df for eacg fold
    train_df = df.iloc[train_idx]
    val_df = df.iloc[val_idx]
    
    # initialize datasets and loaders
    train_dataset = JPGDataset(train_df, transform=train_transforms)
    val_dataset   = JPGDataset(val_df, transform=val_transforms)
    
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader   = DataLoader(val_dataset, batch_size=32, shuffle=False)
    
    num_classes = len(train_dataset.classes)

    # re-initialize the model for every fold to prevent weight leakage
    model = models.resnet50(weights="IMAGENET1K_V1")
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    model = model.to(device)

    # freeze backbone weights and unfreeze the new head
    for param in model.parameters():
        param.requires_grad = False
    for param in model.fc.parameters():
        param.requires_grad = True
        
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.fc.parameters(), lr=1e-3)

    # train for 2 epochs per fold
    for epoch in range(2):
        train_acc = train_one_epoch(model, train_loader)
        val_acc = evaluate(model, val_loader)
        print(f"Epoch {epoch+1}: train={train_acc:.3f}, val={val_acc:.3f}")
    
    fold_accuracies.append(val_acc)



--- Starting Fold 1/5 ---


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /home/dsenthil/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 198MB/s] 


Epoch 1: train=0.736, val=0.940
Epoch 2: train=0.909, val=0.935

--- Starting Fold 2/5 ---
Epoch 1: train=0.715, val=0.919
Epoch 2: train=0.897, val=0.927

--- Starting Fold 3/5 ---
Epoch 1: train=0.740, val=0.952
Epoch 2: train=0.892, val=0.940

--- Starting Fold 4/5 ---
Epoch 1: train=0.705, val=0.895
Epoch 2: train=0.904, val=0.935

--- Starting Fold 5/5 ---
Epoch 1: train=0.761, val=0.960
Epoch 2: train=0.885, val=0.951


In [12]:
print(f"\nAverage 5-Fold Accuracy: {np.mean(fold_accuracies):.3f}")


Average 5-Fold Accuracy: 0.938


In [13]:
# save the final weights and class names
torch.save({
    "model": model.state_dict(),
    "classes": train_dataset.classes
}, "resnet50_insects_cv.pth")

### LoRA Optimized Experiment

In [6]:
df = pd.read_csv("labels.csv")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

lora_config = LoraConfig(
    r=16, 
    lora_alpha=32,
    target_modules=["conv1"],
    lora_dropout=0.1,
    modules_to_save=["fc"] 
)

In [9]:
# 5-fold cross val loop

kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_results = []

for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
    print(f"\n--- Starting Fold {fold + 1}/5 ---")
    
    train_df = df.iloc[train_idx]
    val_df = df.iloc[val_idx]
    
    train_dataset = JPGDataset(train_df, transform=train_transforms)
    val_dataset = JPGDataset(val_df, transform=val_transforms)
    
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    base_model = models.resnet50(weights="IMAGENET1K_V1")
    base_model.fc = nn.Linear(base_model.fc.in_features, len(train_dataset.classes)) 
    model = get_peft_model(base_model, lora_config).to(device)
    
    optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-4)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(2):
        train_acc = train_one_epoch(model, train_loader)
        val_acc = evaluate(model, val_loader)
        print(f"Fold {fold+1}, Epoch {epoch+1}: Train Acc={train_acc:.3f}, Val Acc={val_acc:.3f}")

    fold_results.append(val_acc)


--- Starting Fold 1/5 ---
Fold 1, Epoch 1: Train Acc=0.721, Val Acc=0.895
Fold 1, Epoch 2: Train Acc=0.914, Val Acc=0.968

--- Starting Fold 2/5 ---
Fold 2, Epoch 1: Train Acc=0.726, Val Acc=0.931
Fold 2, Epoch 2: Train Acc=0.921, Val Acc=0.972

--- Starting Fold 3/5 ---
Fold 3, Epoch 1: Train Acc=0.732, Val Acc=0.931
Fold 3, Epoch 2: Train Acc=0.910, Val Acc=0.980

--- Starting Fold 4/5 ---
Fold 4, Epoch 1: Train Acc=0.718, Val Acc=0.895
Fold 4, Epoch 2: Train Acc=0.899, Val Acc=0.976

--- Starting Fold 5/5 ---
Fold 5, Epoch 1: Train Acc=0.748, Val Acc=0.968
Fold 5, Epoch 2: Train Acc=0.918, Val Acc=0.976


In [10]:
print(f"\nFinal 5-Fold Mean Accuracy: {np.mean(fold_results):.4f}")


Final 5-Fold Mean Accuracy: 0.9742


### Model Architecture/Training (random sample, no cross-val)

In [None]:
df = pd.read_csv("labels.csv")

train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df.label,
    random_state=42
)

train_df.to_csv("train.csv", index=False)
val_df.to_csv("val.csv", index=False)


In [None]:
train_dataset = JPGDataset("train.csv", transform=train_transforms)
val_dataset   = JPGDataset("val.csv", transform=val_transforms)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=32, shuffle=False)

num_classes = len(train_dataset.classes)

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# load the pre-trained resnet50 model
model = models.resnet50(weights="IMAGENET1K_V1")

# replace the final layer with new layer with the number of insect classes
model.fc = nn.Linear(model.fc.in_features, num_classes)
model = model.to(device)

In [None]:
# the backbone weights of the ResNet are frozen
for param in model.parameters():
    param.requires_grad = False

# only the new final layer will be updated during the training (fine-tuning step 1)
for param in model.fc.parameters():
    param.requires_grad = True

In [None]:
# model trained using Adam optimizer and CrossEntropyLoss

criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(
    model.fc.parameters(),
    lr=1e-3
)

In [16]:
epochs = 2

for epoch in range(epochs):
    train_acc = train_one_epoch(model, train_loader)
    val_acc = evaluate(model, val_loader)

    print(f"Epoch {epoch+1}: train={train_acc:.3f}, val={val_acc:.3f}")


Epoch 1: train=0.767, val=0.927
Epoch 2: train=0.891, val=0.956


In [None]:
# save the final weights and class names

torch.save({
    "model": model.state_dict(),
    "classes": train_dataset.classes
}, "resnet50_insects.pth")
