In [None]:
# Upgrade the Hugging Face `datasets` package in the kernel's environment,
# then print the metadata of the installed `fsspec` package.
# NOTE(review): the upgrade is unpinned, so re-runs may resolve to different
# versions - consider pinning an exact version for reproducibility.
%pip install -U datasets
%pip show fsspec

Name: fsspec
Version: 2025.3.0
Summary: File-system specification
Home-page: https://github.com/fsspec/filesystem_spec
Author: 
Author-email: 
License: BSD 3-Clause License

Copyright (c) 2018, Martin Durant
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS

In [None]:
import os

# Report how many logical CPUs the runtime exposes.
logical_cpu_count = os.cpu_count()
print(logical_cpu_count)

2


In [None]:
from datasets import load_dataset, DatasetDict
from torchvision import transforms
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.optim as optim

---------------------------------------------------<br>
🟢 Load and split the dataset<br>
---------------------------------------------------

In [None]:
# Download (or reuse the cached copy of) the Hub dataset and keep its single
# "train" split; the train/validation/test partition is derived from it later.
dataset = load_dataset("shrashraddha/medical_image_cleaning", split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/507 [00:00<?, ?B/s]

train-00000-of-00010.parquet:   0%|          | 0.00/299M [00:00<?, ?B/s]

train-00001-of-00010.parquet:   0%|          | 0.00/478M [00:00<?, ?B/s]

train-00002-of-00010.parquet:   0%|          | 0.00/179M [00:00<?, ?B/s]

train-00003-of-00010.parquet:   0%|          | 0.00/393M [00:00<?, ?B/s]

train-00004-of-00010.parquet:   0%|          | 0.00/368M [00:00<?, ?B/s]

train-00005-of-00010.parquet:   0%|          | 0.00/404M [00:00<?, ?B/s]

train-00006-of-00010.parquet:   0%|          | 0.00/211M [00:00<?, ?B/s]

train-00007-of-00010.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

train-00008-of-00010.parquet:   0%|          | 0.00/851M [00:00<?, ?B/s]

train-00009-of-00010.parquet:   0%|          | 0.00/666M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/136341 [00:00<?, ? examples/s]

This is a large dataset: split it carefully, and avoid `batched=True` mapping over the entire dataset to save memory and time.

In [None]:
# First cut: hold out 20% of all examples as the test set.
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_val, test = split_dataset["train"], split_dataset["test"]

# Second cut: carve 10% of the remaining examples off as the validation set.
split_train_val = train_val.train_test_split(test_size=0.1, seed=42)
train, val = split_train_val["train"], split_train_val["test"]

In [None]:
# Bundle the three splits under their conventional names.
# NOTE(review): this rebinds `dataset` from the single loaded split to a
# DatasetDict, so cells that expect the original object must not be re-run
# after this one without reloading.
dataset = DatasetDict(train=train, validation=val, test=test)

---------------------------------------------------<br>
🟨 Encode multi-word class labels to integers<br>
---------------------------------------------------

In [None]:
# Build the label vocabulary from every split, not only "train": a class name
# that happens to occur only in validation/test would otherwise be missing from
# `label2id` and make `encode_label` fail with a KeyError.
# `DatasetDict.unique` returns one list of distinct column values per split,
# which also avoids pulling the full "txt" column into a Python list first.
unique_labels = sorted(set().union(*dataset.unique("txt").values()))

# Class name -> integer id, assigned in sorted order so ids are stable across runs.
label2id = {label: idx for idx, label in enumerate(unique_labels)}
# Inverse mapping, for turning predicted ids back into class names.
id2label = {idx: label for label, idx in label2id.items()}
num_classes = len(label2id)

In [None]:
def encode_label(example):
    """Attach the integer class id to a single example.

    Reads the class name stored under ``"txt"``, looks it up in the
    module-level ``label2id`` mapping and stores the id under ``"label"``
    on the same example object, which is then returned.

    Raises:
        KeyError: if the class name is not a key of ``label2id``.
    """
    class_name = example["txt"]
    example["label"] = label2id[class_name]
    return example

In [None]:
# Add the integer "label" column to each split, one example at a time.
for split_name in ("train", "validation", "test"):
    dataset[split_name] = dataset[split_name].map(encode_label)

Map:   0%|          | 0/98164 [00:00<?, ? examples/s]

Map:   0%|          | 0/10908 [00:00<?, ? examples/s]

Map:   0%|          | 0/27269 [00:00<?, ? examples/s]

---------------------------------------------------<br>
🎨 Image transforms (convert grayscale to RGB)<br>
---------------------------------------------------

In [None]:
def to_rgb(image):
    """Return ``image`` converted to the three-channel "RGB" mode.

    ``image`` must provide a ``convert(mode)`` method; the result of
    ``image.convert("RGB")`` is returned unchanged.
    """
    rgb_image = image.convert("RGB")
    return rgb_image

In [None]:
def _build_transform(augment):
    """Assemble the preprocessing pipeline for one split.

    Every pipeline resizes to 224x224, converts to RGB, turns the image into a
    tensor and normalizes it with the mean/std values below. When ``augment``
    is true a random horizontal flip is inserted right after the resize.
    """
    steps = [transforms.Resize((224, 224))]
    if augment:
        steps.append(transforms.RandomHorizontalFlip())
    steps.extend([
        transforms.Lambda(to_rgb),  # convert grayscale to RGB
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    return transforms.Compose(steps)


# Only the training split is augmented; validation and test share the same
# deterministic preprocessing (each gets its own Compose instance).
data_transforms = {
    "train": _build_transform(augment=True),
    "validation": _build_transform(augment=False),
    "test": _build_transform(augment=False),
}

---------------------------------------------------<br>
Custom Dataset wrapper to apply transforms on the fly<br>
---------------------------------------------------

In [None]:
from torch.utils.data import Dataset

In [None]:
class HuggingFaceDataset(Dataset):
    """Map-style PyTorch dataset over an indexable collection of dict-like rows.

    Each row is fetched lazily in ``__getitem__`` and the optional transform is
    applied to its image at that moment, so nothing is pre-computed or cached.

    Args:
        hf_dataset: Object supporting ``len()`` and integer indexing, where each
            item is a mapping containing the image and label entries.
        transform: Callable applied to the image of every fetched row. If
            ``None``, the image is returned as stored.
        image_key: Key under which each row stores its image.
        label_key: Key under which each row stores its label.
    """

    def __init__(self, hf_dataset, transform=None, image_key="image", label_key="label"):
        self.dataset = hf_dataset
        self.transform = transform
        self.image_key = image_key
        self.label_key = label_key

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for row ``idx``, with the transform applied to the image."""
        item = self.dataset[idx]
        image = item[self.image_key]
        if self.transform is not None:
            image = self.transform(image)
        return image, item[self.label_key]

Wrap each split so that transforms are applied on the fly when an item is fetched, instead of precomputing them with `.map()` (more memory-efficient for large datasets).

In [None]:
# Pair every split with the transform pipeline registered under the same key.
train_dataset, val_dataset, test_dataset = (
    HuggingFaceDataset(dataset[split_key], data_transforms[split_key])
    for split_key in ("train", "validation", "test")
)

---------------------------------------------------<br>
📦 DataLoaders with multiple workers for speed<br>
---------------------------------------------------

In [None]:
# DataLoader settings shared by the train/validation/test loaders.
num_workers = 2  # number of loader worker processes; tune to the available CPU cores
batch_size = 16  # raised from 4 for better throughput; lower it if memory is tight

In [None]:
# Settings common to all three loaders, kept in one place so they cannot drift.
# Pinned host memory only helps host-to-GPU copies, so request it only when a
# CUDA device is actually present (avoids a pointless warning on CPU runtimes).
loader_kwargs = dict(
    batch_size=batch_size,
    num_workers=num_workers,
    pin_memory=torch.cuda.is_available(),
)

# Only the training data is shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

---------------------------------------------------<br>
🧠 CNN Model (no changes)<br>
---------------------------------------------------

In [None]:
class MedicalCNN(nn.Module):
    """Small two-stage convolutional classifier for 3-channel images.

    Architecture: (3x3 conv -> ReLU -> 2x2 max-pool) twice, going 3 -> 16 -> 32
    channels, followed by a 64-unit hidden linear layer and a linear output
    layer producing one logit per class.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        # fc1 expects a 32 x 56 x 56 feature map, which is what a 224 x 224
        # input yields after the two 2x2 poolings. The adaptive pool forces that
        # spatial size for any other input resolution and is an exact identity
        # for 224 x 224 inputs. It has no parameters, so state_dict keys are
        # unchanged.
        self.adapt = nn.AdaptiveAvgPool2d((56, 56))
        self.fc1 = nn.Linear(32 * 56 * 56, 64)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(64, num_classes)

    def forward(self, x):
        """Map a batch of images ``(N, 3, H, W)`` to class logits ``(N, num_classes)``."""
        x = self.pool1(self.relu1(self.conv1(x)))
        x = self.pool2(self.relu2(self.conv2(x)))
        x = self.adapt(x)
        x = x.flatten(1)  # keep the batch dimension, flatten channels and space
        x = self.relu3(self.fc1(x))
        x = self.fc2(x)
        return x

---------------------------------------------------<br>
🏋️ Training Setup (no changes)<br>
---------------------------------------------------

In [None]:
# Seed PyTorch's global RNG before the model is built so that weight
# initialization and the loaders' shuffling order are repeatable across runs.
# NOTE(review): some GPU kernels may still be nondeterministic.
SEED = 42
torch.manual_seed(SEED)

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MedicalCNN(num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()  # expects raw logits and integer class ids
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 10

---------------------------------------------------<br>
🔁 Training and Validation Loop (no changes)<br>
---------------------------------------------------

In [None]:
def _train_one_epoch(model, loader, criterion, optimizer, device):
    """Run one optimization pass over ``loader`` and return the mean per-sample loss."""
    model.train()
    running_loss = 0.0
    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # The criterion returns a batch mean; weight it by the batch size so the
        # final division by the dataset size gives a true per-sample average.
        running_loss += loss.item() * images.size(0)
    return running_loss / len(loader.dataset)


def _evaluate(model, loader, criterion, device):
    """Score ``model`` on ``loader`` without gradient tracking.

    Returns:
        ``(mean per-sample loss, accuracy)`` where accuracy is a 0-dim double
        tensor on ``device``.
    """
    model.eval()
    running_loss = 0.0
    # Tensor accumulator: stays a tensor even if the loader yields no batches.
    correct = torch.zeros((), dtype=torch.long, device=device)
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_loss += loss.item() * images.size(0)
            _, preds = torch.max(outputs, 1)  # index of the largest logit per row
            correct += torch.sum(preds == labels)
    num_samples = len(loader.dataset)
    return running_loss / num_samples, correct.double() / num_samples


for epoch in range(num_epochs):
    train_loss = _train_one_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, val_accuracy = _evaluate(model, val_loader, criterion, device)
    print(f"Epoch [{epoch+1}/{num_epochs}] | "
          f"Train Loss: {train_loss:.4f} | "
          f"Val Loss: {val_loss:.4f} | "
          f"Val Accuracy: {val_accuracy:.4f}")

Epoch [1/10] | Train Loss: 1.8421 | Val Loss: 1.3569 | Val Accuracy: 0.5466
Epoch [2/10] | Train Loss: 1.2188 | Val Loss: 1.2000 | Val Accuracy: 0.5915
Epoch [3/10] | Train Loss: 1.0275 | Val Loss: 1.0483 | Val Accuracy: 0.6459
Epoch [4/10] | Train Loss: 0.8915 | Val Loss: 1.2657 | Val Accuracy: 0.5907
Epoch [5/10] | Train Loss: 0.7853 | Val Loss: 1.0675 | Val Accuracy: 0.6542
Epoch [6/10] | Train Loss: 0.6879 | Val Loss: 1.0439 | Val Accuracy: 0.6630
Epoch [7/10] | Train Loss: 0.6227 | Val Loss: 1.1095 | Val Accuracy: 0.6664
Epoch [8/10] | Train Loss: 0.5598 | Val Loss: 1.1811 | Val Accuracy: 0.6657
Epoch [9/10] | Train Loss: 0.5015 | Val Loss: 1.2243 | Val Accuracy: 0.6684
Epoch [10/10] | Train Loss: 0.4547 | Val Loss: 1.2864 | Val Accuracy: 0.6733
