<a href="https://colab.research.google.com/github/dhyaneswaran-thilagar/AI-CIA/blob/main/Image_and_Video_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
vishaldbs_supervised_path = kagglehub.dataset_download('vishaldbs/supervised')

print('Data source import complete.')


Downloading from https://www.kaggle.com/api/v1/datasets/download/vishaldbs/supervised?dataset_version_number=1...


100%|██████████| 19.9G/19.9G [03:59<00:00, 89.0MB/s]

Extracting files...





Data source import complete.


In [3]:
print(vishaldbs_supervised_path)


/root/.cache/kagglehub/datasets/vishaldbs/supervised/versions/1


In [7]:
self.rgb_dir = os.path.join(root_dir, split, "rgb")
self.nir_dir = os.path.join(root_dir, split, "nir")
self.mask_dir = os.path.join(root_dir, split, "masks")


NameError: name 'root_dir' is not defined

In [14]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm
import logging
from PIL import Image
import numpy as np
import random
import time

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

# One-hot encoding function
def _to_one_hot(mask, num_classes):
    one_hot = torch.zeros((num_classes, mask.shape[0], mask.shape[1]), dtype=torch.float32)
    for i in range(num_classes):
        one_hot[i] = (mask == i).float()
    return one_hot

# Dataset
class AgricultureVisionDataset(Dataset):
    def __init__(self, root_dir, split="train", subset_size=1000, transform=None):
        self.root_dir = root_dir
        self.split = split
        self.subset_size = subset_size
        self.transform = transform
        self.num_classes = 9

        self.rgb_dir = os.path.join(root_dir, split, "images", "rgb")
        self.nir_dir = os.path.join(root_dir, split, "images", "nir")
        self.mask_dir = os.path.join(root_dir, split, "masks")

        self.rgb_images = sorted([f for f in os.listdir(self.rgb_dir) if f.endswith(".png")])
        if subset_size and len(self.rgb_images) > subset_size:
            self.rgb_images = random.sample(self.rgb_images, subset_size)

    def __len__(self):
        return len(self.rgb_images)

    def __getitem__(self, idx):
        base_name = os.path.splitext(self.rgb_images[idx])[0]
        rgb_path = os.path.join(self.rgb_dir, self.rgb_images[idx])
        nir_path = os.path.join(self.nir_dir, f"{base_name}.jpg")
        mask_path = os.path.join(self.mask_dir, f"{base_name}.png")

        rgb_image = Image.open(rgb_path).convert("RGB")
        nir_image = Image.open(nir_path).convert("L")
        mask = Image.open(mask_path)

        rgb_array = np.array(rgb_image)
        nir_array = np.array(nir_image)
        mask_array = np.array(mask)

        image = np.concatenate([rgb_array, nir_array[..., np.newaxis]], axis=-1)
        image = torch.from_numpy(image).permute(2, 0, 1).float() / 255.0
        mask_one_hot = _to_one_hot(torch.from_numpy(mask_array).long(), self.num_classes)

        return {"image": image, "mask": mask_one_hot}

# UNet blocks
def double_conv(in_channels, out_channels):
    return nn.Sequential(
        nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
        nn.ReLU(inplace=True),
        nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
        nn.ReLU(inplace=True),
    )

class UNet(nn.Module):
    def __init__(self, n_channels, n_classes):
        super(UNet, self).__init__()
        self.inc = double_conv(n_channels, 64)
        self.down1 = nn.Sequential(nn.MaxPool2d(2), double_conv(64, 128))
        self.down2 = nn.Sequential(nn.MaxPool2d(2), double_conv(128, 256))
        self.down3 = nn.Sequential(nn.MaxPool2d(2), double_conv(256, 512))
        self.down4 = nn.Sequential(nn.MaxPool2d(2), double_conv(512, 1024))

        self.up1 = nn.ConvTranspose2d(1024, 512, kernel_size=2, stride=2)
        self.conv1 = double_conv(1024, 512)
        self.up2 = nn.ConvTranspose2d(512, 256, kernel_size=2, stride=2)
        self.conv2 = double_conv(512, 256)
        self.up3 = nn.ConvTranspose2d(256, 128, kernel_size=2, stride=2)
        self.conv3 = double_conv(256, 128)
        self.up4 = nn.ConvTranspose2d(128, 64, kernel_size=2, stride=2)
        self.conv4 = double_conv(128, 64)

        self.outc = nn.Conv2d(64, n_classes, kernel_size=1)

    def forward(self, x):
        x1 = self.inc(x)
        x2 = self.down1(x1)
        x3 = self.down2(x2)
        x4 = self.down3(x3)
        x5 = self.down4(x4)
        x = self.up1(x5)
        x = self.conv1(torch.cat([x, x4], dim=1))
        x = self.up2(x)
        x = self.conv2(torch.cat([x, x3], dim=1))
        x = self.up3(x)
        x = self.conv3(torch.cat([x, x2], dim=1))
        x = self.up4(x)
        x = self.conv4(torch.cat([x, x1], dim=1))
        return self.outc(x)

# mIoU calculation
def calculate_miou(outputs, targets):
    outputs = torch.argmax(outputs, dim=1)
    targets = torch.argmax(targets, dim=1)
    ious = []
    for cls in range(outputs.max().item() + 1):
        intersection = ((outputs == cls) & (targets == cls)).float().sum()
        union = ((outputs == cls) | (targets == cls)).float().sum()
        iou = (intersection + 1e-6) / (union + 1e-6)
        ious.append(iou.item())
    return np.mean(ious)

# Data paths
data_dir = "/root/.cache/kagglehub/datasets/vishaldbs/supervised/versions/1"

# Datasets
train_dataset = AgricultureVisionDataset(data_dir, split="train", subset_size=1000)
val_dataset = AgricultureVisionDataset(data_dir, split="val", subset_size=1000)

# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=2)

# Model, optimizer, loss
model = UNet(n_channels=4, n_classes=9).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=3)

# AMP only if CUDA is available
use_amp = torch.cuda.is_available()
if use_amp:
    from torch.amp import autocast, GradScaler
    scaler = GradScaler(device_type="cuda")

# Training loop
epochs = 1
best_val_miou = 0.0

for epoch in range(epochs):
    model.train()
    epoch_loss = 0.0
    epoch_miou = 0.0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        images = batch["image"].to(device)
        masks = batch["mask"].to(device)

        optimizer.zero_grad()

        if use_amp:
            with autocast(device_type="cuda"):
                outputs = model(images)
                loss = criterion(outputs, masks)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            outputs = model(images)
            loss = criterion(outputs, masks)
            loss.backward()
            optimizer.step()

        epoch_loss += loss.item()
        epoch_miou += calculate_miou(outputs.detach(), masks.detach())

    epoch_loss /= len(train_loader)
    epoch_miou /= len(train_loader)

    logger.info(f"Epoch {epoch+1}: Train Loss={epoch_loss:.4f}, mIoU={epoch_miou:.4f}")

    # Validation
    model.eval()
    val_loss = 0.0
    val_miou = 0.0

    with torch.no_grad():
        for batch in val_loader:
            images = batch["image"].to(device)
            masks = batch["mask"].to(device)
            outputs = model(images)
            loss = criterion(outputs, masks)
            val_loss += loss.item()
            val_miou += calculate_miou(outputs, masks)

    val_loss /= len(val_loader)
    val_miou /= len(val_loader)
    logger.info(f"Validation Loss={val_loss:.4f}, mIoU={val_miou:.4f}")

    scheduler.step(val_loss)

    if val_miou > best_val_miou:
        best_val_miou = val_miou
        torch.save(model.state_dict(), "best_model.pth")
        logger.info(f"New best model saved with mIoU: {best_val_miou:.4f}")


FileNotFoundError: [Errno 2] No such file or directory: '/root/.cache/kagglehub/datasets/vishaldbs/supervised/versions/1/train/images/rgb'

In [11]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm
import logging
from PIL import Image
import numpy as np
import random

# Logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

# One-hot encoding function
def _to_one_hot(mask, num_classes):
    one_hot = torch.zeros((num_classes, mask.shape[0], mask.shape[1]), dtype=torch.float32)
    for i in range(num_classes):
        one_hot[i] = (mask == i).float()
    return one_hot

# Dataset class (with corrected paths)
class AgricultureVisionDataset(Dataset):
    def __init__(self, root_dir, split="train", subset_size=1000, transform=None):
        self.root_dir = root_dir
        self.split = split
        self.subset_size = subset_size
        self.transform = transform
        self.num_classes = 9

        self.rgb_dir = os.path.join(root_dir, split, "rgb")
        self.nir_dir = os.path.join(root_dir, split, "nir")
        self.mask_dir = os.path.join(root_dir, split, "masks")

        self.rgb_images = sorted([f for f in os.listdir(self.rgb_dir) if f.endswith(".jpg")])
        if subset_size and len(self.rgb_images) > subset_size:
            self.rgb_images = random.sample(self.rgb_images, subset_size)

    def __len__(self):
        return len(self.rgb_images)

    def __getitem__(self, idx):
        base_name = os.path.splitext(self.rgb_images[idx])[0]
        rgb_path = os.path.join(self.rgb_dir, self.rgb_images[idx])
        nir_path = os.path.join(self.nir_dir, f"{base_name}.jpg")
        mask_path = os.path.join(self.mask_dir, f"{base_name}.png")

        rgb_image = Image.open(rgb_path).convert("RGB")
        nir_image = Image.open(nir_path).convert("L")
        mask = Image.open(mask_path)

        rgb_array = np.array(rgb_image)
        nir_array = np.array(nir_image)
        mask_array = np.array(mask)

        image = np.concatenate([rgb_array, nir_array[..., np.newaxis]], axis=-1)
        image = torch.from_numpy(image).permute(2, 0, 1).float() / 255.0
        mask_one_hot = _to_one_hot(torch.from_numpy(mask_array).long(), self.num_classes)

        return {"image": image, "mask": mask_one_hot}

# UNet model
def double_conv(in_channels, out_channels):
    return nn.Sequential(
        nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
        nn.ReLU(inplace=True),
        nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
        nn.ReLU(inplace=True),
    )

class UNet(nn.Module):
    def __init__(self, n_channels, n_classes):
        super(UNet, self).__init__()
        self.inc = double_conv(n_channels, 64)
        self.down1 = nn.Sequential(nn.MaxPool2d(2), double_conv(64, 128))
        self.down2 = nn.Sequential(nn.MaxPool2d(2), double_conv(128, 256))
        self.down3 = nn.Sequential(nn.MaxPool2d(2), double_conv(256, 512))
        self.down4 = nn.Sequential(nn.MaxPool2d(2), double_conv(512, 1024))

        self.up1 = nn.ConvTranspose2d(1024, 512, kernel_size=2, stride=2)
        self.conv1 = double_conv(1024, 512)
        self.up2 = nn.ConvTranspose2d(512, 256, kernel_size=2, stride=2)
        self.conv2 = double_conv(512, 256)
        self.up3 = nn.ConvTranspose2d(256, 128, kernel_size=2, stride=2)
        self.conv3 = double_conv(256, 128)
        self.up4 = nn.ConvTranspose2d(128, 64, kernel_size=2, stride=2)
        self.conv4 = double_conv(128, 64)

        self.outc = nn.Conv2d(64, n_classes, kernel_size=1)

    def forward(self, x):
        x1 = self.inc(x)
        x2 = self.down1(x1)
        x3 = self.down2(x2)
        x4 = self.down3(x3)
        x5 = self.down4(x4)
        x = self.up1(x5)
        x = self.conv1(torch.cat([x, x4], dim=1))
        x = self.up2(x)
        x = self.conv2(torch.cat([x, x3], dim=1))
        x = self.up3(x)
        x = self.conv3(torch.cat([x, x2], dim=1))
        x = self.up4(x)
        x = self.conv4(torch.cat([x, x1], dim=1))
        return self.outc(x)

# mIoU metric
def calculate_miou(outputs, targets):
    outputs = torch.argmax(outputs, dim=1)
    targets = torch.argmax(targets, dim=1)
    ious = []
    for cls in range(outputs.max().item() + 1):
        intersection = ((outputs == cls) & (targets == cls)).float().sum()
        union = ((outputs == cls) | (targets == cls)).float().sum()
        iou = (intersection + 1e-6) / (union + 1e-6)
        ious.append(iou.item())
    return np.mean(ious)

# ✅ Set correct dataset path from KaggleHub
data_dir = "/root/.cache/kagglehub/datasets/vishaldbs/supervised/versions/1"

# Load datasets
train_dataset = AgricultureVisionDataset(data_dir, split="train", subset_size=1000)
val_dataset = AgricultureVisionDataset(data_dir, split="val", subset_size=1000)

# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=2)

# Model setup
model = UNet(n_channels=4, n_classes=9).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=3)

# AMP setup
use_amp = torch.cuda.is_available()
if use_amp:
    from torch.amp import autocast, GradScaler
    scaler = GradScaler(device_type="cuda")

# Training loop
epochs = 1
best_val_miou = 0.0

for epoch in range(epochs):
    model.train()
    epoch_loss = 0.0
    epoch_miou = 0.0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        images = batch["image"].to(device)
        masks = batch["mask"].to(device)
        optimizer.zero_grad()

        if use_amp:
            with autocast(device_type="cuda"):
                outputs = model(images)
                loss = criterion(outputs, masks)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            outputs = model(images)
            loss = criterion(outputs, masks)
            loss.backward()
            optimizer.step()

        epoch_loss += loss.item()
        epoch_miou += calculate_miou(outputs.detach(), masks.detach())

    epoch_loss /= len(train_loader)
    epoch_miou /= len(train_loader)
    logger.info(f"Epoch {epoch+1}: Train Loss={epoch_loss:.4f}, mIoU={epoch_miou:.4f}")

    # Validation
    model.eval()
    val_loss = 0.0
    val_miou = 0.0
    with torch.no_grad():
        for batch in val_loader:
            images = batch["image"].to(device)
            masks = batch["mask"].to(device)
            outputs = model(images)
            loss = criterion(outputs, masks)
            val_loss += loss.item()
            val_miou += calculate_miou(outputs, masks)

    val_loss /= len(val_loader)
    val_miou /= len(val_loader)
    logger.info(f"Validation Loss={val_loss:.4f}, mIoU={val_miou:.4f}")

    scheduler.step(val_loss)

    if val_miou > best_val_miou:
        best_val_miou = val_miou
        torch.save(model.state_dict(), "best_model.pth")
        logger.info(f"New best model saved with mIoU: {best_val_miou:.4f}")


FileNotFoundError: [Errno 2] No such file or directory: '/root/.cache/kagglehub/datasets/vishaldbs/supervised/versions/1/train/rgb'

In [None]:
import os
import random
import numpy as np
from PIL import Image
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# One-hot encoding
def _to_one_hot(mask, num_classes):
    one_hot = torch.zeros((num_classes, mask.shape[0], mask.shape[1]), dtype=torch.float32)
    for i in range(num_classes):
        one_hot[i] = (mask == i).float()
    return one_hot

# Dataset
class AgricultureVisionDataset(Dataset):
    def __init__(self, root_dir, split="train", subset_size=1000):
        self.root_dir = root_dir
        self.split = split
        self.num_classes = 9

        self.rgb_dir = os.path.join(root_dir, split, "images", "rgb")
        self.nir_dir = os.path.join(root_dir, split, "images", "nir")
        self.mask_dir = os.path.join(root_dir, split, "masks")

        self.rgb_images = sorted([f for f in os.listdir(self.rgb_dir) if f.endswith(".jpg")])
        if subset_size and len(self.rgb_images) > subset_size:
            self.rgb_images = random.sample(self.rgb_images, subset_size)

    def __len__(self):
        return len(self.rgb_images)

    def __getitem__(self, idx):
        base_name = os.path.splitext(self.rgb_images[idx])[0]
        rgb_path = os.path.join(self.rgb_dir, f"{base_name}.jpg")
        nir_path = os.path.join(self.nir_dir, f"{base_name}.jpg")
        mask_path = os.path.join(self.mask_dir, f"{base_name}.png")

        rgb = Image.open(rgb_path).convert("RGB")
        nir = Image.open(nir_path).convert("L")
        mask = Image.open(mask_path)

        rgb_np = np.array(rgb)
        nir_np = np.expand_dims(np.array(nir), axis=-1)
        image = np.concatenate([rgb_np, nir_np], axis=-1)

        mask_np = np.array(mask)
        image_tensor = torch.from_numpy(image).permute(2, 0, 1).float() / 255.0
        mask_tensor = _to_one_hot(torch.from_numpy(mask_np).long(), self.num_classes)

        return {"image": image_tensor, "mask": mask_tensor}

# UNet definition
def double_conv(in_c, out_c):
    return nn.Sequential(
        nn.Conv2d(in_c, out_c, kernel_size=3, padding=1),
        nn.ReLU(inplace=True),
        nn.Conv2d(out_c, out_c, kernel_size=3, padding=1),
        nn.ReLU(inplace=True),
    )

class UNet(nn.Module):
    def __init__(self, n_channels, n_classes):
        super(UNet, self).__init__()
        self.inc = double_conv(n_channels, 64)
        self.down1 = nn.Sequential(nn.MaxPool2d(2), double_conv(64, 128))
        self.down2 = nn.Sequential(nn.MaxPool2d(2), double_conv(128, 256))
        self.down3 = nn.Sequential(nn.MaxPool2d(2), double_conv(256, 512))
        self.down4 = nn.Sequential(nn.MaxPool2d(2), double_conv(512, 1024))
        self.up1 = nn.ConvTranspose2d(1024, 512, kernel_size=2, stride=2)
        self.conv1 = double_conv(1024, 512)
        self.up2 = nn.ConvTranspose2d(512, 256, kernel_size=2, stride=2)
        self.conv2 = double_conv(512, 256)
        self.up3 = nn.ConvTranspose2d(256, 128, kernel_size=2, stride=2)
        self.conv3 = double_conv(256, 128)
        self.up4 = nn.ConvTranspose2d(128, 64, kernel_size=2, stride=2)
        self.conv4 = double_conv(128, 64)
        self.outc = nn.Conv2d(64, n_classes, kernel_size=1)

    def forward(self, x):
        x1 = self.inc(x)
        x2 = self.down1(x1)
        x3 = self.down2(x2)
        x4 = self.down3(x3)
        x5 = self.down4(x4)
        x = self.up1(x5)
        x = self.conv1(torch.cat([x, x4], dim=1))
        x = self.up2(x)
        x = self.conv2(torch.cat([x, x3], dim=1))
        x = self.up3(x)
        x = self.conv3(torch.cat([x, x2], dim=1))
        x = self.up4(x)
        x = self.conv4(torch.cat([x, x1], dim=1))
        return self.outc(x)

# mIoU
def calculate_miou(outputs, targets):
    outputs = torch.argmax(outputs, dim=1)
    targets = torch.argmax(targets, dim=1)
    ious = []
    for cls in range(outputs.max().item() + 1):
        inter = ((outputs == cls) & (targets == cls)).float().sum()
        union = ((outputs == cls) | (targets == cls)).float().sum()
        iou = (inter + 1e-6) / (union + 1e-6)
        ious.append(iou.item())
    return np.mean(ious)

# Data directory
data_dir = "/root/.cache/kagglehub/datasets/vishaldbs/supervised/versions/1/Agriculture-Vision-2021"

# Load data
train_dataset = AgricultureVisionDataset(data_dir, "train", subset_size=500)
val_dataset = AgricultureVisionDataset(data_dir, "val", subset_size=200)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Model and training setup
model = UNet(n_channels=4, n_classes=9).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()
scheduler = ReduceLROnPlateau(optimizer, mode="min", patience=3)

# Train
epochs = 1
for epoch in range(epochs):
    model.train()
    total_loss = 0
    total_miou = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        images = batch["image"].to(device)
        masks = batch["mask"].to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, masks)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_miou += calculate_miou(outputs.detach(), masks.detach())

    print(f"[Epoch {epoch+1}] Train Loss: {total_loss/len(train_loader):.4f} | mIoU: {total_miou/len(train_loader):.4f}")


Using device: cpu


Epoch 1:  73%|███████▎  | 46/63 [2:54:32<1:03:42, 224.83s/it]