# ConvNeXt V2 Skin Lesion Classification
This notebook builds a ConvNeXt V2 baseline for the ISIC 2018 lesion classification task, covering data preparation, model assembly, training, evaluation, and inference.

## 1. Setup and Imports

In [1]:
import math
import random
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple

import numpy as np
import pandas as pd
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader, Dataset

from torchvision import transforms
from torchvision.transforms import InterpolationMode
from transformers import AutoImageProcessor, ConvNextV2ForImageClassification


from sklearn.metrics import classification_report, confusion_matrix

# Seed the Python, NumPy and PyTorch global RNGs with the same value.
# torch.manual_seed stays last so the cell's displayed value is unchanged.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)


2025-11-18 16:24:17.013292: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763483057.267129      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763483057.334202      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

<torch._C.Generator at 0x7a805fb298d0>

## 2. Configuration

In [2]:
@dataclass
class Config:
    """Paths and hyperparameters shared by the data, model and training cells.

    Note: ``warmup_epochs``, ``mixup_alpha`` and ``cutmix_alpha`` are declared
    here but are not referenced by the code visible in this notebook.
    """

    # Dataset root; loaders read images from <data_root>/<split>/input.
    data_root: Path
    # Ground-truth CSVs: an "image" column plus one column per class label.
    train_csv: Path
    val_csv: Path
    # Optional; a test loader is only built when this is set and the file exists.
    test_csv: Optional[Path] = None
    # Crop size used by the transforms and written to the processor's shortest_edge.
    image_size: int = 224
    # Passed to the model as the number of output labels.
    num_classes: int = 7
    batch_size: int = 16
    # AdamW learning rate and weight decay.
    base_lr: float = 1e-4
    weight_decay: float = 0.01
    warmup_epochs: int = 1
    # Number of training epochs; also feeds the cosine schedule length.
    max_epochs: int = 10
    # Forwarded to nn.CrossEntropyLoss(label_smoothing=...).
    label_smoothing: float = 0.05
    mixup_alpha: float = 0.0
    cutmix_alpha: float = 0.0
    # Enables autocast and the gradient scaler.
    amp: bool = True
    # Max gradient norm for clip_grad_norm_; None disables clipping.
    gradient_clip_value: Optional[float] = 1.0
    # Cosine T_max is int(max_epochs * t_max_ratio).
    t_max_ratio: float = 1.0

# Run configuration: only the dataset locations are overridden, every other
# field keeps its Config default.
config = Config(
    data_root=Path("/kaggle/input/multi-task-learning-isic-challenge/dataset/classification"),
    train_csv=Path("/kaggle/input/multi-task-learning-isic-challenge/dataset/classification/train/ground_truth/ISIC2018_Task3_Training_GroundTruth.csv"),
    val_csv=Path("/kaggle/input/multi-task-learning-isic-challenge/dataset/classification/val/ground_truth/ISIC2018_Task3_Validation_GroundTruth.csv"),
    test_csv=Path("/kaggle/input/multi-task-learning-isic-challenge/dataset/classification/test/ground_truth/ISIC2018_Task3_Test_GroundTruth.csv"),
)

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Processor shipped with the pretrained checkpoint; when it exposes a
# "shortest_edge" size, override it with the configured image size.
image_processor = AutoImageProcessor.from_pretrained("facebook/convnextv2-large-22k-224")
if "shortest_edge" in image_processor.size:
    image_processor.size["shortest_edge"] = config.image_size


preprocessor_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


## 3. Dataset and DataLoaders

In [3]:
def build_transforms(cfg: Config) -> Tuple[transforms.Compose, transforms.Compose]:
    """Create the training and evaluation image transform pipelines.

    Args:
        cfg: Run configuration; only ``cfg.image_size`` is read.

    Returns:
        ``(train_tfms, val_tfms)``. Training uses a random resized crop
        (75-100% of the area) plus a random horizontal flip. Evaluation
        resizes to ``int(image_size * 1.14)`` and takes a centre crop of
        ``image_size``. Both use bicubic interpolation.
    """
    side = cfg.image_size
    bicubic = InterpolationMode.BICUBIC

    augmenting_steps = [
        transforms.RandomResizedCrop(side, scale=(0.75, 1.0), interpolation=bicubic),
        transforms.RandomHorizontalFlip(),
    ]
    deterministic_steps = [
        transforms.Resize(int(side * 1.14), interpolation=bicubic),
        transforms.CenterCrop(side),
    ]

    return transforms.Compose(augmenting_steps), transforms.Compose(deterministic_steps)


class ISICDataset(Dataset):
    """Map-style dataset yielding ``(pixel_values, class_index)`` pairs.

    Args:
        df: Ground-truth table with an ``"image"`` column of image ids and one
            column per class; every non-``"image"`` column is treated as a label.
        image_dir: Directory containing ``<image id>.jpg`` files.
        transform: Optional transform applied to the RGB PIL image before the
            processor runs (``None`` skips it).
        processor: Hugging Face image processor that produces ``pixel_values``.

    NOTE(review): the processor may apply its own resize / centre crop on top
    of ``transform`` -- confirm the combined preprocessing is intended.
    """

    def __init__(self, df: pd.DataFrame, image_dir: Path, transform: transforms.Compose, processor: AutoImageProcessor):
        self.df = df.reset_index(drop=True)
        self.image_dir = image_dir
        self.transform = transform
        self.processor = processor
        self.label_cols = [col for col in df.columns if col != "image"]
        # Row-aligned label matrix; the class index is the argmax of each row.
        self.targets = self.df[self.label_cols].values.astype(np.float32)
        self.images = self.df["image"].tolist()

    def __len__(self) -> int:
        return len(self.df)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Load, transform and process one image; return it with its class index."""
        image_id = self.images[idx]
        image_path = self.image_dir / f"{image_id}.jpg"
        # Close the file handle deterministically; convert() returns a new,
        # fully loaded image that remains valid after the file is closed.
        with Image.open(image_path) as raw_image:
            img = raw_image.convert("RGB")
        if self.transform is not None:
            img = self.transform(img)
        inputs = self.processor(images=img, return_tensors="pt")
        # The processor returns a batch of one; drop the batch dimension.
        pixel_values = inputs["pixel_values"].squeeze(0)
        label_idx = int(self.targets[idx].argmax())
        target = torch.tensor(label_idx, dtype=torch.long)
        return pixel_values, target


def build_dataloaders(cfg: Config) -> Tuple[DataLoader, DataLoader, Optional[DataLoader], torch.Tensor, List[str]]:
    """Build the train / val / (optional) test loaders and class metadata.

    Args:
        cfg: Run configuration providing CSV paths, data root and batch size.

    Returns:
        ``(train_loader, val_loader, test_loader, class_weights, label_cols)``.
        ``test_loader`` is ``None`` when ``cfg.test_csv`` is unset or missing.
        ``class_weights`` is ``total / per-class count`` from the training CSV,
        placed on the module-level ``device``. ``label_cols`` are the training
        CSV's non-``"image"`` columns, in order.

    Raises:
        ValueError: If the val or test CSV lacks a column present in the
            training CSV.
    """
    train_df = pd.read_csv(cfg.train_csv)
    label_cols = [col for col in train_df.columns if col != "image"]
    expected_cols = ["image", *label_cols]

    def _read_aligned(csv_path: Path) -> pd.DataFrame:
        """Read a split CSV and impose the training CSV's column order."""
        frame = pd.read_csv(csv_path)
        missing = [col for col in expected_cols if col not in frame.columns]
        if missing:
            raise ValueError(f"{csv_path} is missing expected columns: {missing}")
        # Class indices come from column position, so the order must match
        # the one used for label_cols and class_weights.
        return frame[expected_cols]

    def _make_loader(frame: pd.DataFrame, split: str, transform: transforms.Compose, training: bool) -> DataLoader:
        """Wrap one split in an ISICDataset and a DataLoader."""
        dataset = ISICDataset(
            frame,
            cfg.data_root / split / "input",
            transform=transform,
            processor=image_processor,
        )
        # Only the training loader shuffles and drops the last partial batch.
        return DataLoader(
            dataset,
            batch_size=cfg.batch_size,
            shuffle=training,
            drop_last=training,
        )

    # Inverse-frequency weights; the epsilon avoids division by zero for a
    # class with no training samples.
    class_counts = train_df[label_cols].sum().values + 1e-6
    class_weights = (class_counts.sum() / class_counts)
    class_weights = torch.tensor(class_weights, dtype=torch.float32, device=device)

    train_tfms, val_tfms = build_transforms(cfg)

    test_loader: Optional[DataLoader] = None
    if cfg.test_csv is not None and cfg.test_csv.exists():
        test_loader = _make_loader(_read_aligned(cfg.test_csv), "test", val_tfms, training=False)

    train_loader = _make_loader(train_df, "train", train_tfms, training=True)
    val_loader = _make_loader(_read_aligned(cfg.val_csv), "val", val_tfms, training=False)

    return train_loader, val_loader, test_loader, class_weights, label_cols


# Materialise the loaders once; class_weights feeds the loss and label_names
# (the training CSV's label columns) names the classes in reports.
train_loader, val_loader, test_loader, class_weights, label_names = build_dataloaders(config)

## 4. Model Definition

In [4]:
# Index <-> class-name lookups, in label-column order, handed to the model.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in enumerate(label_names)}

class ConvNeXtV2Classifier(nn.Module):
    """Thin wrapper around a pretrained ConvNeXt V2 classifier that returns raw logits.

    Args:
        num_classes: Number of output labels for the classification head.
        id2label: Mapping from class index to class name.
        label2id: Mapping from class name to class index.
        pretrained_name: Checkpoint identifier passed to ``from_pretrained``.
            Defaults to the checkpoint this notebook uses.
    """

    def __init__(
        self,
        num_classes: int,
        id2label: Dict[int, str],
        label2id: Dict[str, int],
        pretrained_name: str = "facebook/convnextv2-large-22k-224",
    ):
        super().__init__()
        # ignore_mismatched_sizes lets the head be re-created with
        # `num_classes` outputs when the checkpoint's head has another shape.
        self.backbone = ConvNextV2ForImageClassification.from_pretrained(
            pretrained_name,
            num_labels=num_classes,
            id2label=id2label,
            label2id=label2id,
            ignore_mismatched_sizes=True,
        )

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """Run the backbone and return its ``logits`` tensor."""
        outputs = self.backbone(pixel_values=pixel_values)
        return outputs.logits


# Instantiate the classifier with the dataset's label maps and move it to the selected device.
model = ConvNeXtV2Classifier(config.num_classes, id2label, label2id).to(device)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/792M [00:00<?, ?B/s]

Some weights of ConvNextV2ForImageClassification were not initialized from the model checkpoint at facebook/convnextv2-large-22k-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([7]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 1536]) in the checkpoint and torch.Size([7, 1536]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 5. Training Utilities

In [5]:
class AverageMeter:
    """Running weighted average of a scalar metric.

    Attributes:
        val: Most recent value passed to ``update``.
        sum: Weighted sum of all values seen since the last reset.
        count: Total weight (e.g. number of samples) seen since the last reset.
        avg: ``sum / count``, or ``0.0`` while ``count`` is zero.
    """

    def __init__(self):
        self.reset()

    def reset(self) -> None:
        """Clear all accumulated statistics."""
        self.val = self.avg = self.sum = 0.0
        self.count = 0

    def update(self, val: float, n: int = 1) -> None:
        """Record ``val`` as observed ``n`` times and refresh the average."""
        self.val = val
        self.sum += val * n
        self.count += n
        if self.count:
            self.avg = self.sum / self.count
        else:
            # No weight accumulated yet: report zero instead of dividing by zero.
            self.avg = 0.0


def accuracy(logits: torch.Tensor, targets: torch.Tensor, topk: Tuple[int, ...] = (1,)) -> List[torch.Tensor]:
    """Compute top-k accuracy (as a fraction in [0, 1]) for each requested k.

    Args:
        logits: ``(batch, num_classes)`` scores.
        targets: ``(batch,)`` integer class indices.
        topk: The k values to evaluate. A k larger than the number of classes
            is treated as "any class".

    Returns:
        One single-element float tensor per entry of ``topk``. An empty batch
        yields zeros.
    """
    batch_size = targets.size(0)
    if batch_size == 0:
        # No samples: return zeros rather than dividing by zero.
        return [torch.zeros(1, device=logits.device) for _ in topk]

    # topk() raises if k exceeds the class dimension, so clamp to it.
    maxk = min(max(topk), logits.size(1))
    _, pred = logits.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()  # (maxk, batch): row r holds each sample's rank-r prediction
    correct = pred.eq(targets.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # Slicing past maxk simply keeps every available rank.
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(1.0 / batch_size))
    return res


# Class-weighted cross-entropy with label smoothing.
criterion = nn.CrossEntropyLoss(
    weight=class_weights,
    label_smoothing=config.label_smoothing,
).to(device)

# AdamW over every model parameter.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=config.base_lr,
    weight_decay=config.weight_decay,
)

# Cosine decay down to 1e-6; the scheduler is stepped once per epoch by
# train_one_epoch, so T_max is expressed in epochs.
cosine_period = int(config.max_epochs * config.t_max_ratio)
lr_scheduler = CosineAnnealingLR(optimizer, T_max=cosine_period, eta_min=1e-6)

# Loss scaler for mixed-precision training; inactive when config.amp is False.
scaler = torch.amp.GradScaler('cuda', enabled=config.amp)

## 6. Training and Validation Loops

In [6]:
def train_one_epoch(epoch: int) -> Dict[str, float]:
    """Train the module-level ``model`` for one pass over ``train_loader``.

    Uses the module-level ``criterion``, ``optimizer``, ``scaler`` and
    ``lr_scheduler``; the scheduler is stepped once at the end of the epoch.

    Args:
        epoch: Epoch number, echoed back in the result.

    Returns:
        Dict with ``epoch``, sample-weighted mean ``loss``, ``acc1`` / ``acc5``
        in percent, and ``lr`` (the learning rate used during this epoch).
    """
    model.train()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # Capture the LR before the scheduler advances so the reported value is
    # the one this epoch actually trained with.
    epoch_lr = optimizer.param_groups[0]["lr"]

    for images, targets in train_loader:
        images = images.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)

        # torch.amp.autocast('cuda', ...) is the non-deprecated spelling of
        # torch.cuda.amp.autocast(...).
        with torch.amp.autocast('cuda', enabled=config.amp):
            logits = model(images)
            loss = criterion(logits, targets)

        scaler.scale(loss).backward()
        if config.gradient_clip_value is not None:
            # Gradients must be unscaled before clipping by norm.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.gradient_clip_value)
        scaler.step(optimizer)
        scaler.update()

        acc1, acc5 = accuracy(logits.detach(), targets, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1.item(), images.size(0))
        top5.update(acc5.item(), images.size(0))

    lr_scheduler.step()

    return {
        "epoch": epoch,
        "loss": losses.avg,
        "acc1": top1.avg * 100.0,
        "acc5": top5.avg * 100.0,
        "lr": epoch_lr,
    }


def validate(epoch: int) -> Dict[str, float]:
    """Evaluate the module-level ``model`` on ``val_loader`` without gradients.

    Args:
        epoch: Epoch number, echoed back in the result.

    Returns:
        Dict with ``epoch``, sample-weighted mean ``loss`` and ``acc1`` /
        ``acc5`` in percent.
    """
    model.eval()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    with torch.no_grad():
        for images, targets in val_loader:
            images = images.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)

            # Non-deprecated replacement for torch.cuda.amp.autocast(...).
            with torch.amp.autocast('cuda', enabled=config.amp):
                logits = model(images)
                loss = criterion(logits, targets)

            acc1, acc5 = accuracy(logits, targets, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1.item(), images.size(0))
            top5.update(acc5.item(), images.size(0))

    return {
        "epoch": epoch,
        "loss": losses.avg,
        "acc1": top1.avg * 100.0,
        "acc5": top5.avg * 100.0,
    }

## 7. Main Training Loop

In [7]:
# Per-epoch curves for later inspection.
history = {
    "train_loss": [],
    "train_acc1": [],
    "val_loss": [],
    "val_acc1": [],
}

best_acc = 0.0
# Payload of the most recently written checkpoint (None until the first improvement).
best_state: Optional[Dict[str, object]] = None
artifact_dir = Path("artifacts/convnext_v2")
artifact_dir.mkdir(parents=True, exist_ok=True)
checkpoint_path = artifact_dir / "convnext_v2_classifier.pth"

# Store the config as plain builtin types (paths as strings) so the checkpoint
# does not depend on unpickling the notebook's Config class.
config_snapshot = {
    key: str(value) if isinstance(value, Path) else value
    for key, value in vars(config).items()
}

for epoch in range(1, config.max_epochs + 1):
    train_metrics = train_one_epoch(epoch)
    val_metrics = validate(epoch)

    history["train_loss"].append(train_metrics["loss"])
    history["train_acc1"].append(train_metrics["acc1"])
    history["val_loss"].append(val_metrics["loss"])
    history["val_acc1"].append(val_metrics["acc1"])

    # Checkpoint only when validation top-1 strictly improves.
    if val_metrics["acc1"] > best_acc:
        best_acc = val_metrics["acc1"]
        best_state = {
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            # Saved so the cosine schedule can be resumed along with the optimizer.
            "lr_scheduler": lr_scheduler.state_dict(),
            "scaler": scaler.state_dict(),
            "epoch": epoch,
            "config": config_snapshot,
            "best_acc": best_acc,
        }
        torch.save(best_state, checkpoint_path)

    print(
        f"Epoch {epoch:03d} | "
        f"Train Loss: {train_metrics['loss']:.4f} | Train Acc@1: {train_metrics['acc1']:.2f}% | "
        f"Val Loss: {val_metrics['loss']:.4f} | Val Acc@1: {val_metrics['acc1']:.2f}%"
    )

print(f"Best Val Acc@1: {best_acc:.2f}% saved to {checkpoint_path}")

  with autocast(enabled=config.amp):
  with autocast(enabled=config.amp):


Epoch 001 | Train Loss: 1.9089 | Train Acc@1: 56.83% | Val Loss: 1.2590 | Val Acc@1: 76.68%
Epoch 002 | Train Loss: 1.4520 | Train Acc@1: 76.80% | Val Loss: 1.2717 | Val Acc@1: 80.83%
Epoch 003 | Train Loss: 1.2466 | Train Acc@1: 84.94% | Val Loss: 1.1534 | Val Acc@1: 85.49%
Epoch 004 | Train Loss: 1.1223 | Train Acc@1: 91.00% | Val Loss: 1.3185 | Val Acc@1: 83.42%
Epoch 005 | Train Loss: 1.0279 | Train Acc@1: 95.41% | Val Loss: 1.3236 | Val Acc@1: 84.46%
Epoch 006 | Train Loss: 0.9747 | Train Acc@1: 97.33% | Val Loss: 1.3877 | Val Acc@1: 90.16%
Epoch 007 | Train Loss: 0.9310 | Train Acc@1: 99.08% | Val Loss: 1.3800 | Val Acc@1: 88.60%
Epoch 008 | Train Loss: 0.9250 | Train Acc@1: 99.64% | Val Loss: 1.4431 | Val Acc@1: 89.64%
Epoch 009 | Train Loss: 0.9259 | Train Acc@1: 99.71% | Val Loss: 1.4305 | Val Acc@1: 90.16%
Epoch 010 | Train Loss: 0.9238 | Train Acc@1: 99.92% | Val Loss: 1.4112 | Val Acc@1: 90.67%
Best Val Acc@1: 90.67% saved to artifacts/convnext_v2/convnext_v2_classifier.pth

## 8. Evaluation Helpers

In [8]:
def evaluate(loader: DataLoader, class_names: List[str]) -> Dict[str, object]:
    """Run the module-level ``model`` over ``loader`` and summarise the predictions.

    Args:
        loader: Yields ``(images, targets)`` batches with integer class targets.
        class_names: Class names in index order, used for the report and for
            the confusion-matrix size.

    Returns:
        Dict with ``loss`` (sample-weighted mean), ``accuracy`` (percent),
        ``report`` (text from ``classification_report``) and
        ``confusion_matrix``. For an empty loader the accuracy is ``0.0`` and
        the matrix is all zeros.
    """
    model.eval()
    all_preds: List[int] = []
    all_targets: List[int] = []
    losses = AverageMeter()

    with torch.no_grad():
        for images, targets in loader:
            images = images.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)

            # Non-deprecated replacement for torch.cuda.amp.autocast(...).
            with torch.amp.autocast('cuda', enabled=config.amp):
                logits = model(images)
                loss = criterion(logits, targets)

            losses.update(loss.item(), images.size(0))
            preds = logits.argmax(dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_targets.extend(targets.cpu().tolist())

    num_classes = len(class_names)
    if not all_preds:
        return {
            "loss": losses.avg,
            "accuracy": 0.0,
            "report": "No samples to evaluate.",
            "confusion_matrix": np.zeros((num_classes, num_classes), dtype=int),
        }

    # Named accuracy_pct so the local does not shadow the accuracy() helper.
    accuracy_pct = (np.array(all_preds) == np.array(all_targets)).mean() * 100.0
    report = classification_report(all_targets, all_preds, target_names=class_names, digits=4)
    # Pass labels explicitly so classes absent from this split still get a row and column.
    conf_mat = confusion_matrix(all_targets, all_preds, labels=list(range(num_classes)))
    return {
        "loss": losses.avg,
        "accuracy": accuracy_pct,
        "report": report,
        "confusion_matrix": conf_mat
    }

In [9]:
def load_best_checkpoint(path: Path) -> Optional[Dict[str, object]]:
    """Restore model, optimizer and scaler state from a saved checkpoint.

    Also raises the module-level ``best_acc`` to the checkpoint's ``best_acc``
    when that is higher.

    Args:
        path: Checkpoint file written by the training loop.

    Returns:
        The loaded checkpoint dict, or ``None`` if ``path`` does not exist.
    """
    global best_acc
    if not path.exists():
        print(f"Checkpoint {path} not found.")
        return None
    # weights_only=False: the checkpoint holds more than tensors (e.g. the run
    # config), which the restricted loader rejects. This unpickles arbitrary
    # objects, so only load checkpoints written by this notebook.
    state = torch.load(path, map_location=device, weights_only=False)
    model.load_state_dict(state["model"])
    optimizer.load_state_dict(state["optimizer"])
    scaler.load_state_dict(state["scaler"])
    best_metric = state.get("best_acc", 0.0)
    # Tolerate running this before the training cell has defined best_acc.
    best_acc = max(globals().get("best_acc", 0.0), best_metric)
    print(
        f"Loaded checkpoint from epoch {state.get('epoch', 'N/A')} with "
        f"val acc {best_metric:.2f}%.")
    return state

## 9. Validation Summary

In [10]:
# Evaluate the best validation checkpoint, not whatever weights the final
# epoch left in memory (the two differ whenever the last epoch was not the best).
if checkpoint_path.exists():
    # weights_only=False: the checkpoint holds more than tensors (e.g. the run
    # config); only load checkpoints written by this notebook.
    best_checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
    model.load_state_dict(best_checkpoint["model"])

val_results = evaluate(val_loader, label_names)
print(f"Validation Accuracy: {val_results['accuracy']:.2f}%")
print(val_results["report"])
print("Confusion Matrix:\n", val_results["confusion_matrix"])

  with autocast(enabled=config.amp):


Validation Accuracy: 90.67%
              precision    recall  f1-score   support

         MEL     0.8947    0.8095    0.8500        21
          NV     0.9752    0.9593    0.9672       123
         BCC     0.8125    0.8667    0.8387        15
       AKIEC     0.5000    0.5000    0.5000         8
         BKL     0.7692    0.9091    0.8333        22
          DF     0.0000    0.0000    0.0000         1
        VASC     1.0000    1.0000    1.0000         3

    accuracy                         0.9067       193
   macro avg     0.7074    0.7207    0.7128       193
weighted avg     0.9060    0.9067    0.9053       193

Confusion Matrix:
 [[ 17   2   0   0   2   0   0]
 [  2 118   1   0   2   0   0]
 [  0   0  13   2   0   0   0]
 [  0   1   2   4   1   0   0]
 [  0   0   0   2  20   0   0]
 [  0   0   0   0   1   0   0]
 [  0   0   0   0   0   0   3]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 10. Test Set Evaluation

In [11]:
# The test split is optional: report on it only when a loader was built.
if test_loader is not None:
    test_results = evaluate(test_loader, label_names)
    print(f"Test Accuracy: {test_results['accuracy']:.2f}%")
    test_report = test_results["report"]
    print(test_report)
    test_conf_mat = test_results["confusion_matrix"]
    print("Confusion Matrix (Test):\n", test_conf_mat)

  with autocast(enabled=config.amp):


Test Accuracy: 83.73%
              precision    recall  f1-score   support

         MEL     0.6369    0.6257    0.6313       171
          NV     0.8862    0.9428    0.9136       909
         BCC     0.8214    0.7419    0.7797        93
       AKIEC     0.6500    0.6047    0.6265        43
         BKL     0.7850    0.7235    0.7530       217
          DF     0.9643    0.6136    0.7500        44
        VASC     0.9200    0.6571    0.7667        35

    accuracy                         0.8373      1512
   macro avg     0.8091    0.7013    0.7458      1512
weighted avg     0.8359    0.8373    0.8341      1512

Confusion Matrix (Test):
 [[107  46   1   2  13   0   2]
 [ 32 857   3   1  16   0   0]
 [  4   7  69   2  11   0   0]
 [  5   2   6  26   3   1   0]
 [ 19  30   5   6 157   0   0]
 [  1  14   0   2   0  27   0]
 [  0  11   0   1   0   0  23]]


## 11. Segmentation Label Inference

In [12]:
# Predict a lesion class for every image of the segmentation dataset and write
# the predictions as one-hot rows to a CSV.
segmentation_root = Path("/kaggle/input/multi-task-learning-isic-challenge/dataset/segmentation")
prediction_rows: List[Dict[str, object]] = []
splits = ["train", "val", "test"]
# Apply the same deterministic transform that ISICDataset applies before the
# processor at validation time, so inference inputs match what was evaluated.
_, seg_eval_tfms = build_transforms(config)
model.eval()
with torch.no_grad():
    for split in splits:
        split_input_dir = segmentation_root / split / "input"
        if not split_input_dir.exists():
            continue
        for img_path in sorted(split_input_dir.glob("*.jpg")):
            # Close the file handle deterministically; convert() returns a
            # fully loaded copy that remains valid afterwards.
            with Image.open(img_path) as raw_img:
                img = raw_img.convert("RGB")
            img = seg_eval_tfms(img)
            processed = image_processor(images=img, return_tensors="pt")
            pixel_values = processed["pixel_values"].to(device)
            # Non-deprecated replacement for torch.cuda.amp.autocast(...).
            with torch.amp.autocast('cuda', enabled=config.amp):
                logits = model(pixel_values)
            pred_idx = logits.argmax(dim=1).item()
            rel_path = img_path.relative_to(segmentation_root)
            row = {"image": rel_path.as_posix()}
            # One-hot encode the predicted class over the label columns.
            row.update({label: int(idx == pred_idx) for idx, label in enumerate(label_names)})
            prediction_rows.append(row)
if prediction_rows:
    predictions_df = pd.DataFrame(prediction_rows)
    output_path = Path("artifacts/segmentation/convnext_v2_segmentation_predictions.csv")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    predictions_df.to_csv(output_path, index=False)
    print(f"Wrote {len(predictions_df)} predictions to {output_path}")
else:
    print("No segmentation images found for inference.")

  with autocast(enabled=config.amp):


Wrote 3694 predictions to artifacts/segmentation/convnext_v2_segmentation_predictions.csv
