In [None]:
import os

dataset_path = "../FIVES A Fundus Image Dataset for AI-based Vessel Segmentation"

# Map every sub-folder (path relative to the dataset root) to the number of PNG
# files it directly contains. Folders without any PNG are left out of the result.
counts = {}
for root, dirs, files in os.walk(dataset_path):
    png_count = sum(1 for name in files if name.lower().endswith(".png"))
    if not png_count:
        continue
    counts[os.path.relpath(root, dataset_path)] = png_count

counts


In [None]:
import os

base_path = "/data/FIVES A Fundus Image Dataset for AI-based Vessel Segmentation"

train_orig = os.path.join(base_path, "train/Original")
train_gt = os.path.join(base_path, "train/Ground truth")

# Alphabetical PNG listings for both folders (the extension check ignores case).
orig_files = sorted(name for name in os.listdir(train_orig) if name.lower().endswith(".png"))
gt_files = sorted(name for name in os.listdir(train_gt) if name.lower().endswith(".png"))

# Filename differences in both directions.
orig_names, gt_names = set(orig_files), set(gt_files)
missing_in_gt = orig_names.difference(gt_names)
extra_in_gt = gt_names.difference(orig_names)

# Cell output: (images without a mask, masks without an image).
(len(missing_in_gt), len(extra_in_gt))


In [None]:
import os

base_path = "/data/FIVES A Fundus Image Dataset for AI-based Vessel Segmentation"

train_orig = os.path.join(base_path, "train/Original")
train_gt = os.path.join(base_path, "train/Ground truth")

# PNG filenames of each folder, alphabetically ordered.
orig_files = sorted(name for name in os.listdir(train_orig) if name.lower().endswith(".png"))
gt_files = sorted(name for name in os.listdir(train_gt) if name.lower().endswith(".png"))

# Masks whose filename has no counterpart among the original images.
extra_in_gt = set(gt_files).difference(orig_files)

# Report the total and up to 20 names (set order, i.e. arbitrary) for inspection.
print("Extra ground truth files:", len(extra_in_gt))
preview = list(extra_in_gt)[:20]
print(preview)


In [None]:
import os

base_path = "/data/FIVES A Fundus Image Dataset for AI-based Vessel Segmentation"

train_orig = os.path.join(base_path, "train/Original")
train_gt = os.path.join(base_path, "train/Ground truth")

# PNG filenames present in each folder.
orig_files = {name for name in os.listdir(train_orig) if name.lower().endswith(".png")}
gt_files = {name for name in os.listdir(train_gt) if name.lower().endswith(".png")}

# Masks that have no same-named original image.
extra_in_gt = gt_files.difference(orig_files)

print(f"Found {len(extra_in_gt)} redundant ground truth files.")

# Up to 20 names (arbitrary set order) so the selection can be eyeballed.
print("Examples:", list(extra_in_gt)[:20])

# WARNING: the loop below permanently deletes files from the ground-truth folder.
for redundant_name in extra_in_gt:
    os.remove(os.path.join(train_gt, redundant_name))

print("✅ Redundant files removed.")


In [None]:
import os

base_path = "/data/FIVES A Fundus Image Dataset for AI-based Vessel Segmentation"

def check_integrity(split):
    """Print PNG counts and filename mismatches between a split's two folders.

    Compares ``<base_path>/<split>/Original`` with
    ``<base_path>/<split>/Ground truth`` (``base_path`` is the notebook-level
    variable) by filename, ignoring the case of the ``.png`` extension.

    Args:
        split: Name of the split sub-folder, e.g. "train" or "test".
    """
    def png_names(folder):
        return {entry for entry in os.listdir(folder) if entry.lower().endswith(".png")}

    orig_files = png_names(os.path.join(base_path, split, "Original"))
    gt_files = png_names(os.path.join(base_path, split, "Ground truth"))

    missing_in_gt = orig_files.difference(gt_files)
    missing_in_orig = gt_files.difference(orig_files)

    summary = [
        f"--- {split.upper()} ---",
        f"Original count: {len(orig_files)}",
        f"Ground Truth count: {len(gt_files)}",
        f"Missing in GT: {len(missing_in_gt)}",
        f"Missing in Original: {len(missing_in_orig)}",
    ]
    print("\n".join(summary))

    # Up to five example names per direction, only when there is a mismatch.
    for caption, names in (
        ("Examples missing in GT:", missing_in_gt),
        ("Examples missing in Original:", missing_in_orig),
    ):
        if names:
            print(caption, list(names)[:5])
    print()

# Print the image/mask consistency report for the train split, then the test split.
check_integrity("train")
check_integrity("test")


In [None]:
import os
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

# Paths (update if needed).
# base_path is the dataset root; every other path below is derived from it.
base_path = "/data/FIVES A Fundus Image Dataset for AI-based Vessel Segmentation"
# Ground-truth mask folders of the train and test splits.
train_gt = os.path.join(base_path, "train/Ground truth")
test_gt = os.path.join(base_path, "test/Ground truth")
# Workbook read later with pd.read_excel (sheets "Train" and "Test").
excel_path = os.path.join(base_path, "Quality Assessment.xlsx")


In [None]:
import os

# Confirm that every configured location exists before it is used.
for _what, _where in (
    ("Base path", base_path),
    ("Train GT path", train_gt),
    ("Test GT path", test_gt),
    ("Excel file", excel_path),
):
    print(f"{_what} exists:", os.path.exists(_where))

# First five directory entries of each mask folder (os.listdir order).
train_gt_preview = os.listdir(train_gt)[:5]
print("\nSample Train GT files:", train_gt_preview)
test_gt_preview = os.listdir(test_gt)[:5]
print("Sample Test GT files:", test_gt_preview)


In [None]:
# Read the "Train" and "Test" sheets of the workbook, one DataFrame each.
df_train, df_test = (
    pd.read_excel(excel_path, sheet_name=sheet_name)
    for sheet_name in ("Train", "Test")
)

# Disease code -> integer class id used as the classification target.
label_map = {
    "A": 0,
    "D": 1,
    "G": 2,
    "N": 3,
}
for _sheet in (df_train, df_test):
    _sheet["label"] = _sheet["Disease"].map(label_map)

print("Train sheet sample:", df_train.head(), sep="\n")

print("\nTest sheet sample:", df_test.head(), sep="\n")


In [None]:
class FundusMaskDataset(Dataset):
    """Classification dataset over PNG files named ``<Number>_<Disease>.png``.

    Every PNG in ``img_dir`` is matched against ``df`` on its ``Number`` and
    ``Disease`` columns; the ``label`` of the first matching row becomes the
    sample's target. Files whose name cannot be parsed, that have no matching
    row, or whose label is missing (NaN) are skipped instead of raising.

    Args:
        img_dir: Directory that contains the PNG files.
        df: DataFrame with ``Number``, ``Disease`` and ``label`` columns.
        transform: Optional callable applied to the loaded RGB ``PIL.Image``.

    Attributes:
        samples: List of ``(filename, int label)`` tuples, sorted by filename.
    """

    def __init__(self, img_dir, df, transform=None):
        self.img_dir = img_dir
        self.df = df
        self.transform = transform
        self.samples = []

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # index -> file mapping reproducible between runs and machines.
        for fname in sorted(os.listdir(img_dir)):
            # Case-insensitive, like the integrity-check cells of this notebook.
            if not fname.lower().endswith(".png"):
                continue

            # Format: Number_Disease.png (e.g., 1_A.png); skip anything else.
            parts = fname.split("_")
            if len(parts) < 2:
                continue
            try:
                number = int(parts[0])
            except ValueError:
                continue
            disease = parts[1].split(".")[0]

            # Match with sheet data
            row = df[(df["Disease"] == disease) & (df["Number"] == number)]
            if row.empty:
                continue

            # An unmapped disease code yields NaN, which int() cannot convert.
            label_value = row["label"].values[0]
            if pd.isna(label_value):
                continue
            self.samples.append((fname, int(label_value)))

    def __len__(self):
        """Return the number of matched samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``; image is RGB, transformed if set."""
        fname, label = self.samples[idx]
        img_path = os.path.join(self.img_dir, fname)

        # convert() returns an independent image, so the file can be closed
        # right away instead of staying open until garbage collection.
        with Image.open(img_path) as handle:
            img = handle.convert("RGB")
        if self.transform:
            img = self.transform(img)

        return img, label


In [None]:
# Masks are resized to the 224x224 network input and converted to tensors.
_mask_steps = [transforms.Resize((224,224)), transforms.ToTensor()]
transform = transforms.Compose(_mask_steps)

train_dataset = FundusMaskDataset(img_dir=train_gt, df=df_train, transform=transform)
test_dataset = FundusMaskDataset(img_dir=test_gt, df=df_test, transform=transform)

# Only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

for _split_name, _dataset in (("Train", train_dataset), ("Test", test_dataset)):
    print(f"{_split_name} samples:", len(_dataset))


In [None]:
# Prefer the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ImageNet-pretrained ResNet-18. The `weights=` enum replaces the deprecated
# `pretrained=True` flag and matches the model construction used later in
# this notebook.
model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
model.fc = nn.Linear(model.fc.in_features, 4)  # 4 classes (A, D, G, N)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)


In [None]:
def train_one_epoch(model, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader`` and report loss and accuracy.

    Batches are moved to the device the model's parameters live on, so the
    function does not depend on a notebook-level ``device`` variable.

    Args:
        model: ``nn.Module`` to train; switched to train mode.
        loader: Iterable yielding ``(images, labels)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss callable ``criterion(outputs, labels)``. The per-sample
            average below assumes it returns the batch mean.

    Returns:
        ``(mean_loss, accuracy)`` over the samples actually processed. For an
        empty loader this is ``(nan, 0.0)`` rather than a ZeroDivisionError.
    """
    model.train()
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    loss_sum, correct, seen = 0.0, 0, 0
    for imgs, labels in loader:
        imgs, labels = imgs.to(run_device), labels.to(run_device)
        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight each batch by its size: a short last batch would otherwise
        # count as much as a full one in the epoch average.
        batch_n = labels.size(0)
        loss_sum += loss.item() * batch_n
        correct += (outputs.argmax(1) == labels).sum().item()
        seen += batch_n

    if seen == 0:
        return float("nan"), 0.0
    # Divide by the samples seen, not len(loader.dataset): the two differ when
    # the loader drops or sub-samples data.
    return loss_sum / seen, correct / seen

def evaluate(model, loader, criterion):
    """Score ``model`` on ``loader`` without updating any weights.

    Batches are moved to the device the model's parameters live on, so the
    function does not depend on a notebook-level ``device`` variable.

    Args:
        model: ``nn.Module`` to evaluate; switched to eval mode.
        loader: Iterable yielding ``(images, labels)`` batches.
        criterion: Loss callable ``criterion(outputs, labels)``. The per-sample
            average below assumes it returns the batch mean.

    Returns:
        ``(mean_loss, accuracy)`` over the samples actually processed. For an
        empty loader this is ``(nan, 0.0)`` rather than a ZeroDivisionError.
    """
    model.eval()
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    loss_sum, correct, seen = 0.0, 0, 0
    with torch.no_grad():
        for imgs, labels in loader:
            imgs, labels = imgs.to(run_device), labels.to(run_device)
            outputs = model(imgs)
            loss = criterion(outputs, labels)

            # Weight by batch size so a short last batch does not skew the mean.
            batch_n = labels.size(0)
            loss_sum += loss.item() * batch_n
            correct += (outputs.argmax(1) == labels).sum().item()
            seen += batch_n

    if seen == 0:
        return float("nan"), 0.0
    return loss_sum / seen, correct / seen


In [None]:
# Short first run of five epochs: train, then score on the test split each time.
for epoch in range(5):
    epoch_no = epoch + 1
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_loader, criterion)
    print(
        f"Epoch {epoch_no}: "
        f"Train Acc={train_acc:.2f}, "
        f"Test Acc={test_acc:.2f}"
    )


In [None]:
# NOTE(review): on POSIX "/../" resolves to the filesystem root, so this writes
# to "/fundus_baseline_resnet18.pth" — confirm that location is intended.
save_path = "/../fundus_baseline_resnet18.pth"

# Persist only the parameters (state dict), not the whole module object.
baseline_weights = model.state_dict()
torch.save(baseline_weights, save_path)
print(f"Model saved at: {save_path}")


In [None]:
import numpy as np

def show_predictions(model, loader, n=6):
    """Plot up to ``n`` images of the first batch with true and predicted class.

    Uses the notebook-level ``device``, ``label_map`` and ``plt``. Images are
    drawn on a two-row grid; titles read ``T:<true>, P:<predicted>``.

    Args:
        model: Classifier to run; switched to eval mode.
        loader: Iterable whose batches start with ``(images, labels)``. Extra
            batch elements (e.g. filenames) are ignored.
        n: Maximum number of images to show. Clamped to the batch size.
    """
    model.eval()
    batch = next(iter(loader))
    imgs, labels = batch[0], batch[1]
    imgs, labels = imgs.to(device), labels.to(device)
    with torch.no_grad():
        preds = model(imgs).argmax(1)

    # Never index past the end of the batch.
    n = min(n, imgs.size(0))
    if n <= 0:
        return

    # (N, C, H, W) -> (N, H, W, C), the layout plt.imshow expects.
    imgs = imgs.cpu().numpy().transpose(0,2,3,1)

    # Class id -> code. Unknown ids fall back to the id itself, so a label_map
    # that has been redefined elsewhere degrades the titles instead of raising.
    idx_to_code = {idx: code for code, idx in label_map.items()}

    # Round up so an odd n (or n == 1) still fits on the two-row grid.
    cols = (n + 1) // 2

    plt.figure(figsize=(12,6))
    for i in range(n):
        plt.subplot(2, cols, i+1)
        plt.imshow(imgs[i])
        plt.axis("off")
        true_id, pred_id = labels[i].item(), preds[i].item()
        true_lbl = idx_to_code.get(true_id, str(true_id))
        pred_lbl = idx_to_code.get(pred_id, str(pred_id))
        plt.title(f"T:{true_lbl}, P:{pred_lbl}")
    plt.show()

# Show six predictions from the first test batch.
show_predictions(model, test_loader, n=6)


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import numpy as np

def plot_confusion_matrix(model, loader, classes):
    """Predict over ``loader`` and draw the resulting confusion matrix.

    Args:
        model: Classifier to run; switched to eval mode.
        loader: Iterable yielding ``(images, labels)`` batches.
        classes: Display names, one per class id ``0 .. len(classes) - 1``.
    """
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for imgs, labels in loader:
            imgs = imgs.to(device)
            labels = labels.to(device)
            batch_preds = model(imgs).argmax(1)
            all_preds.extend(batch_preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Fix the label set so rows/columns line up with `classes` even when a
    # class never occurs in the data.
    class_ids = list(range(len(classes)))
    cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
    disp.plot(cmap="Blues", values_format="d")
    plt.title("Confusion Matrix")
    plt.show()

# Display names in class-id order 0..3, i.e. the A, D, G, N order of label_map.
class_names = ["A (AMD)", "D (DR)", "G (Glaucoma)", "N (Normal)"]

plot_confusion_matrix(model, test_loader, class_names)


In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from PIL import Image

# Paths (update if needed).
# Dataset root: same "/data/..." location as the other absolute paths in this
# notebook. The previous "/../..." prefix pointed at the parent of the
# filesystem root, i.e. a different place than the data used for training.
base_path = "/data/FIVES A Fundus Image Dataset for AI-based Vessel Segmentation"
# Raw fundus images and their vessel masks, per split.
train_original = os.path.join(base_path, "train/Original")
train_segmented = os.path.join(base_path, "train/Ground truth")
test_original = os.path.join(base_path, "test/Original")
test_segmented = os.path.join(base_path, "test/Ground truth")
# Workbook with the "Train" and "Test" sheets.
excel_path = os.path.join(base_path, "Quality Assessment.xlsx")


In [None]:
# One DataFrame per workbook sheet ("Train", then "Test").
df_train, df_test = (
    pd.read_excel(excel_path, sheet_name=sheet_name)
    for sheet_name in ("Train", "Test")
)

train_preview = df_train.head()
test_preview = df_test.head()
print("Train Sheet:\n", train_preview)
print("\nTest Sheet:\n", test_preview)


In [None]:
# Disease code -> readable name, used for the axis tick labels.
label_map = {"A": "AMD", "D": "DR", "G": "Glaucoma", "N": "Normal"}

for _sheet in (df_train, df_test):
    _sheet["DiseaseName"] = _sheet["Disease"].map(label_map)

# One count plot per split, side by side.
fig, axes = plt.subplots(1, 2, figsize=(14,6))
_panels = (
    (df_train, "viridis", "Train Set Disease Distribution"),
    (df_test, "magma", "Test Set Disease Distribution"),
)
for _ax, (_sheet, _palette, _title) in zip(axes, _panels):
    sns.countplot(data=_sheet, x="DiseaseName", ax=_ax, palette=_palette)
    _ax.set_title(_title)
    _ax.set_ylabel("Count")
    _ax.set_xlabel("Disease")

plt.show()


In [None]:
def count_pngs(path):
    """Return how many entries directly inside ``path`` have a PNG extension.

    The extension is compared case-insensitively, consistent with the
    integrity checks earlier in this notebook, so ``x.PNG`` is counted too.
    """
    return sum(1 for name in os.listdir(path) if name.lower().endswith(".png"))

# PNG count for each of the four image folders, keyed by a display label.
counts = {
    folder_label: count_pngs(folder)
    for folder_label, folder in (
        ("Train Original", train_original),
        ("Train Segmented", train_segmented),
        ("Test Original", test_original),
        ("Test Segmented", test_segmented),
    )
}

print("Image counts per folder:")
for folder_label, n_images in counts.items():
    print(f"{folder_label}: {n_images}")


In [None]:
def show_random_pairs(n=4):
    """Show up to ``n`` random training images above their vessel masks.

    Reads from the notebook-level ``train_original`` and ``train_segmented``
    folders. Only PNG files that have a same-named mask are candidates, and
    ``n`` is clamped to the number of candidates.

    Args:
        n: Number of image/mask pairs to draw.
    """
    mask_names = set(os.listdir(train_segmented))
    # Skip non-image entries and images without a mask; sorted so that the
    # selection is reproducible once the `random` module is seeded.
    candidates = sorted(
        name for name in os.listdir(train_original)
        if name.lower().endswith(".png") and name in mask_names
    )

    # random.sample raises ValueError when asked for more items than exist.
    n = min(n, len(candidates))
    if n <= 0:
        print("No image/mask pairs available to display.")
        return

    files = random.sample(candidates, n)
    plt.figure(figsize=(12, 6))
    for i, fname in enumerate(files):
        # convert() returns an independent image, so each file is closed as
        # soon as its pixels have been read.
        with Image.open(os.path.join(train_original, fname)) as raw:
            orig = raw.convert("RGB")
        with Image.open(os.path.join(train_segmented, fname)) as raw:
            seg = raw.convert("L")

        # Top row: original image
        plt.subplot(2, n, i+1)
        plt.imshow(orig)
        plt.axis("off")
        plt.title(f"Original: {fname}")

        # Bottom row: its mask, directly underneath
        plt.subplot(2, n, n+i+1)
        plt.imshow(seg, cmap="gray")
        plt.axis("off")
        plt.title("Segmented")

    plt.tight_layout()
    plt.show()

# Draw four randomly chosen training images with their masks.
show_random_pairs(n=4)


In [None]:
# Collect the (width, height) of every training image.
sizes = []
for fname in os.listdir(train_original):
    # Case-insensitive, like the other PNG filters in this notebook.
    if fname.lower().endswith(".png"):
        # Close each file right after reading its size; otherwise one handle
        # per image stays open until garbage collection.
        with Image.open(os.path.join(train_original, fname)) as img:
            sizes.append(img.size)

df_sizes = pd.DataFrame(sizes, columns=["Width", "Height"])

# Overlaid histograms of the two dimensions.
plt.figure(figsize=(10,5))
sns.histplot(df_sizes["Width"], bins=20, kde=True, color="blue", label="Width")
sns.histplot(df_sizes["Height"], bins=20, kde=True, color="orange", label="Height")
plt.legend()
plt.title("Image Size Distribution (Train Original)")
plt.show()


# CLASSIFICATION USING RAW FUNDUS IMAGES

In [None]:
# Core
import os, re, random, numpy as np, pandas as pd
from PIL import Image
import matplotlib.pyplot as plt

# Torch / Vision
import torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

# Metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Progress bar
from tqdm.auto import tqdm

# ---- REPRODUCIBILITY: seed the Python, NumPy and PyTorch random generators ----
# Without a fixed seed every run of this section gives different results.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# ---- PATHS: update base_path if needed ----
base_path = "/data/FIVES A Fundus Image Dataset for AI-based Vessel Segmentation"
train_original = os.path.join(base_path, "train/Original")
test_original  = os.path.join(base_path, "test/Original")
excel_path     = os.path.join(base_path, "Quality Assessment.xlsx")

# Fail early and visibly if any location is missing.
print("Base exists:", os.path.exists(base_path))
print("Train Original exists:", os.path.exists(train_original))
print("Test Original exists:", os.path.exists(test_original))
print("Excel exists:", os.path.exists(excel_path))

# Prefer the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


In [None]:
# The workbook must contain sheets named exactly "Train" and "Test".
df_train, df_test = (
    pd.read_excel(excel_path, sheet_name=sheet_name)
    for sheet_name in ("Train", "Test")
)

# Disease code <-> integer class id, in both directions.
label_map = {"A": 0, "D": 1, "G": 2, "N": 3}
inv_label_map = dict((class_id, code) for code, class_id in label_map.items())

# Upper-case the disease codes as strings before mapping them to class ids.
for _sheet in (df_train, df_test):
    _sheet["Disease"] = _sheet["Disease"].astype(str).str.upper()
for _sheet in (df_train, df_test):
    _sheet["label"] = _sheet["Disease"].map(label_map)

train_preview = df_train.head(3)
test_preview = df_test.head(3)
print("Train sheet sample:\n", train_preview)
print("\nTest sheet sample:\n", test_preview)


In [None]:
class FundusOriginalDataset(Dataset):
    """
    Loads ORIGINAL RGB images for classification.

    Expects filenames like 1_A.png, 23_D.png, etc. (disease letter and
    extension in any case). The (Number, Disease) pair parsed from the name is
    matched against the sheet to obtain the label; files that do not follow the
    pattern, have no matching row, or whose label is missing are skipped.

    Args:
        img_dir: Directory that contains the PNG files.
        df: DataFrame with ``Number``, ``Disease`` (upper-case codes) and
            ``label`` columns. A copy is stored on the dataset.
        transform: Optional callable applied to the loaded RGB ``PIL.Image``.

    Attributes:
        samples: List of ``(filename, int label)`` tuples, sorted by filename.
    """
    def __init__(self, img_dir, df, transform=None):
        self.img_dir = img_dir
        self.df = df.copy()
        self.transform = transform
        self.samples = []   # list of (fname, label)

        # Case-insensitive so the pattern accepts every name the ".png" filter
        # below lets through (e.g. "3_G.Png").
        pattern = re.compile(r"^\s*(\d+)_([ADGN])\.png$", re.IGNORECASE)

        # os.listdir order is arbitrary; sorting keeps the index -> file
        # mapping reproducible between runs and machines.
        files = sorted(f for f in os.listdir(img_dir) if f.lower().endswith(".png"))

        for fname in files:
            m = pattern.match(fname)
            if not m:
                continue
            number = int(m.group(1))
            disease = m.group(2).upper()

            row = self.df[(self.df["Number"] == number) & (self.df["Disease"] == disease)]
            if row.empty:
                continue

            # A disease code missing from the label mapping yields NaN, which
            # int() cannot convert.
            label_value = row["label"].values[0]
            if pd.isna(label_value):
                continue
            self.samples.append((fname, int(label_value)))

        # Basic sanity
        if len(self.samples) == 0:
            print(f"[WARN] No samples matched in {img_dir}. Check naming and Excel mapping.")

    def __len__(self):
        """Return the number of matched samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(image, label, filename)`` for sample ``idx``."""
        fname, label = self.samples[idx]
        path = os.path.join(self.img_dir, fname)
        # convert() returns an independent image, so the file can be closed
        # right away instead of staying open until garbage collection.
        with Image.open(path) as handle:
            img = handle.convert("RGB")
        if self.transform:
            img = self.transform(img)
        return img, label, fname


In [None]:
# Channel statistics used to normalise inputs for the ImageNet-pretrained backbone.
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]

# Both pipelines end the same way: tensor conversion, then normalisation.
_tensor_tail = [
    transforms.ToTensor(),
    transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
]

# Training: mild random crop and horizontal flip as augmentation.
train_tfms = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomResizedCrop(224, scale=(0.9, 1.0)),
    transforms.RandomHorizontalFlip(p=0.5),
    *_tensor_tail,
])

# Evaluation: deterministic resize and centre crop.
test_tfms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    *_tensor_tail,
])

train_ds = FundusOriginalDataset(img_dir=train_original, df=df_train, transform=train_tfms)
test_ds = FundusOriginalDataset(img_dir=test_original, df=df_test, transform=test_tfms)

batch_size = 16
_loader_opts = dict(batch_size=batch_size, num_workers=2, pin_memory=True)
train_loader = DataLoader(train_ds, shuffle=True, **_loader_opts)
test_loader = DataLoader(test_ds, shuffle=False, **_loader_opts)

print("Train samples:", len(train_ds))
print("Test samples:", len(test_ds))

# Print the filename -> label mapping of the first few training samples.
for i in range(min(5, len(train_ds))):
    sample = train_ds[i]
    lbl, fname = sample[1], sample[2]
    print(f"Example -> {fname}  => label {lbl} ({inv_label_map[lbl]})")


In [None]:
# Class weights come from the samples the dataset actually matched, so files
# missing on disk do not distort the counts.
labels_in_train = [sample_label for _fname, sample_label in train_ds.samples]
class_counts = np.bincount(labels_in_train, minlength=4)

# Inverse-frequency weight per class: N / (4 * count); a count of zero is
# treated as one to avoid dividing by zero.
safe_counts = np.maximum(class_counts, 1)
class_weights = (len(labels_in_train) / (4.0 * safe_counts)).astype(np.float32)
print("Class counts:", class_counts)
print("Class weights:", class_weights)

weights_tensor = torch.tensor(class_weights, dtype=torch.float32, device=device)

# ImageNet-pretrained ResNet-18 with a fresh four-way classification head.
backbone = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
backbone.fc = nn.Linear(backbone.fc.in_features, 4)
model = backbone.to(device)

criterion = nn.CrossEntropyLoss(weight=weights_tensor)
optimizer = optim.Adam(model.parameters(), lr=1e-4)


In [None]:
def train_one_epoch(model, loader, optimizer, criterion, epoch):
    """Train ``model`` for one pass over ``loader`` with a progress bar.

    Args:
        model: Network to optimise; switched to train mode.
        loader: Iterable yielding ``(images, labels, filenames)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss callable ``criterion(logits, labels)``.
        epoch: Epoch number, shown in the progress-bar caption only.

    Returns:
        ``(loss, accuracy)`` averaged over all samples seen; each batch loss is
        weighted by its batch size.
    """
    model.train()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    progress = tqdm(loader, desc=f"Epoch {epoch} [train]", leave=False)
    for imgs, labels, _fnames in progress:
        imgs = imgs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = model(imgs)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item() * imgs.size(0)
        n_correct += (logits.argmax(1) == labels).sum().item()
        n_seen += labels.size(0)

        # Running averages displayed next to the progress bar.
        progress.set_postfix(loss=loss_sum/n_seen, acc=n_correct/n_seen)

    return loss_sum/n_seen, n_correct/n_seen

def evaluate(model, loader, criterion):
    """Compute loss and accuracy of ``model`` on ``loader`` without gradients.

    Args:
        model: Network to score; switched to eval mode.
        loader: Iterable yielding ``(images, labels, filenames)`` batches.
        criterion: Loss callable ``criterion(logits, labels)``.

    Returns:
        ``(loss, accuracy)`` averaged over all samples seen; each batch loss is
        weighted by its batch size.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for imgs, labels, _fnames in loader:
            imgs = imgs.to(device)
            labels = labels.to(device)
            logits = model(imgs)
            batch_loss = criterion(logits, labels)

            loss_sum += batch_loss.item() * imgs.size(0)
            n_correct += (logits.argmax(1) == labels).sum().item()
            n_seen += labels.size(0)

    return loss_sum/n_seen, n_correct/n_seen


In [None]:
epochs = 5

# Train, then score on the test split, once per epoch (epochs count from 1).
for ep in range(1, epochs+1):
    tr_loss, tr_acc = train_one_epoch(model, train_loader, optimizer, criterion, ep)
    te_loss, te_acc = evaluate(model, test_loader, criterion)
    print(
        f"Epoch {ep:02d} "
        f"| train: loss {tr_loss:.4f}, acc {tr_acc:.3f} "
        f"| test: loss {te_loss:.4f}, acc {te_acc:.3f}"
    )


In [None]:
def preds_and_labels(model, loader):
    """Collect predicted and true class ids over a whole loader.

    Args:
        model: Network to run; switched to eval mode.
        loader: Iterable yielding ``(images, labels, filenames)`` batches.

    Returns:
        ``(predictions, labels)`` as two NumPy arrays in loader order.
    """
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for imgs, labels, _fnames in loader:
            logits = model(imgs.to(device))
            batch_preds = logits.argmax(1).cpu().numpy()
            all_preds.extend(batch_preds)
            # Labels are never moved to the device, so no .cpu() is applied.
            all_labels.extend(labels.numpy())

    return np.array(all_preds), np.array(all_labels)

# Predictions and ground truth for the whole test split.
preds, gts = preds_and_labels(model, test_loader)

# Class ids 0..3 correspond to the codes A, D, G, N.
class_codes = ["A", "D", "G", "N"]
cm = confusion_matrix(gts, preds, labels=list(range(len(class_codes))))
print("Confusion Matrix (rows=true A,D,G,N; cols=pred A,D,G,N):\n", cm)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_codes)
disp.plot(values_format="d", cmap="Blues")
plt.title("Original-only Baseline — Confusion Matrix")
plt.show()


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Fixed label set, same as the confusion matrix: ids 0..3 are A, D, G, N.
# Passing it explicitly keeps every metric a four-class figure, and keeps
# classification_report from raising when a class is absent from both `gts`
# and `preds` (target_names must match the number of labels).
class_ids = [0, 1, 2, 3]
class_codes = ["A", "D", "G", "N"]

# 1) Accuracy
acc = accuracy_score(gts, preds)
print("Accuracy:", acc)

# 2) Precision, Recall, F1-score (macro/micro/weighted)
# Macro: unweighted mean per class
# Micro: global metrics by counting all TP, FP, FN
# Weighted: mean per class weighted by support
# zero_division=0 scores an undefined ratio as 0 (the value the default also
# uses) without emitting a warning for each affected class.
precision = precision_score(gts, preds, labels=class_ids, average='macro', zero_division=0)
recall = recall_score(gts, preds, labels=class_ids, average='macro', zero_division=0)
f1 = f1_score(gts, preds, labels=class_ids, average='macro', zero_division=0)
print(f"Precision: {precision}, Recall: {recall}, F1-score: {f1}")

# 3) Per-class metrics
print("\nClassification Report:\n")
print(classification_report(
    gts, preds, labels=class_ids, target_names=class_codes, zero_division=0
))
