In [None]:
# Install the notebook's dependencies; --quiet keeps pip's log out of the cell output.
!pip install kagglehub --quiet
!pip install torch torchvision torchaudio --upgrade --quiet
# NOTE(review): the wheel index below is pinned to torch-2.0.0+cpu, while the line
# above upgrades torch to the newest release — confirm the PyG extension wheels
# (torch-scatter, torch-sparse, ...) actually match the installed torch build.
!pip install torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv -f https://data.pyg.org/whl/torch-2.0.0+cpu.html --quiet
!pip install scikit-learn opencv-python scikit-image matplotlib --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m865.2/865.2 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m393.1/393.1 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m71.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.7/897.7 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m571.0/571.0 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.2/200.2 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import kagglehub
import os

# Kaggle handle of the breast histopathology dataset, named once so the source
# of the data is easy to spot and change.
DATASET_HANDLE = "paultimothymooney/breast-histopathology-images"

# dataset_download returns the local directory that holds the dataset files.
dataset_dir = kagglehub.dataset_download(DATASET_HANDLE)
print("📁 Dataset loaded from:", dataset_dir)


📁 Dataset loaded from: /kaggle/input/breast-histopathology-images


In [None]:
from glob import glob
import random

def collect_image_paths(dataset_dir, max_patients=10, max_per_class=100, seed=None):
    """Sample labelled image paths from a ``<patient>/<class>/*.png`` directory tree.

    Args:
        dataset_dir: Root directory whose sub-directories are patient folders.
        max_patients: Maximum number of patient folders to visit (taken in
            sorted name order).
        max_per_class: Maximum number of images sampled per patient and class.
        seed: Optional seed for a private random generator. When ``None`` the
            global ``random`` module state is used, as before.

    Returns:
        A list of ``(image_path, label)`` tuples where ``label`` is ``0`` or ``1``,
        taken from the name of the class sub-directory.
    """
    # A private generator keeps seeded sampling independent of the global RNG state.
    rng = random.Random(seed) if seed is not None else random

    # Only directories can be patient folders; stray files in the root must not
    # use up one of the `max_patients` slots.
    patient_dirs = sorted(
        entry for entry in os.listdir(dataset_dir)
        if os.path.isdir(os.path.join(dataset_dir, entry))
    )[:max_patients]

    image_paths = []
    for patient_id in patient_dirs:
        for class_label in ('0', '1'):
            class_path = os.path.join(dataset_dir, patient_id, class_label)
            if not os.path.isdir(class_path):
                continue
            # glob order depends on the filesystem; sorting makes the sample
            # reproducible for a given seed.
            images = sorted(glob(os.path.join(class_path, "*.png")))
            selected = rng.sample(images, min(max_per_class, len(images)))
            image_paths.extend((img_path, int(class_label)) for img_path in selected)
    return image_paths

# Seed Python's global RNG before sampling so that the random.sample() draws made
# while collecting images are repeatable on Restart & Run All.
# NOTE(review): the order in which files are listed can still vary between
# filesystems, so identical samples are only expected on the same machine.
SAMPLING_SEED = 42
random.seed(SAMPLING_SEED)

image_data = collect_image_paths(dataset_dir, max_patients=10, max_per_class=100)
print("📊 Total labeled images:", len(image_data))


📊 Total labeled images: 1824


In [None]:
import torch
from torch_geometric.data import Data
from skimage.io import imread
from skimage.segmentation import slic
from skimage.color import rgb2lab
import numpy as np

def _segment_edges(segments):
    """Return the unique directed edges between 4-adjacent segments as an (E, 2) int64 array."""
    vertical = (segments[:-1, :], segments[1:, :])
    horizontal = (segments[:, :-1], segments[:, 1:])
    pairs = []
    for first, second in (vertical, horizontal):
        boundary = first != second
        pairs.append(np.stack([first[boundary], second[boundary]], axis=1))
    edges = np.concatenate(pairs, axis=0).astype(np.int64)
    if edges.shape[0] == 0:
        # No boundary between different segments (e.g. a single segment).
        return edges.reshape(0, 2)
    # Every adjacency is stored in both directions; np.unique removes duplicates
    # and yields a deterministic (sorted) edge order.
    both_directions = np.concatenate([edges, edges[:, ::-1]], axis=0)
    return np.unique(both_directions, axis=0)


def convert_image_to_graph(image_path, label, num_segments=75):
    """Turn an image into a superpixel graph.

    The image is over-segmented with SLIC. Each segment becomes a node whose
    feature is the mean L*a*b* colour of its pixels, and two nodes are connected
    (in both directions) when their segments touch in the 4-neighbourhood.

    Args:
        image_path: Path of the image file to read.
        label: Integer class label stored on the graph.
        num_segments: Requested number of SLIC segments.

    Returns:
        A ``Data`` object with ``x`` of shape ``(num_nodes, 3)`` (float),
        ``edge_index`` of shape ``(2, num_edges)`` (long) and ``y`` of shape
        ``(1,)`` (long).
    """
    image = imread(image_path)
    if image.ndim == 2:
        # Grayscale input: replicate the channel so the colour conversion works.
        image = np.stack([image] * 3, axis=-1)
    elif image.shape[-1] == 4:
        # Drop the alpha channel.
        image = image[:, :, :3]

    segments = slic(image, n_segments=num_segments, compactness=10, start_label=0)
    image_lab = rgb2lab(image)

    # Re-label segments to 0..K-1 so that a gap in the label ids can never
    # produce an empty node (whose mean would be NaN).
    _, compact = np.unique(segments, return_inverse=True)
    segments = compact.reshape(segments.shape)
    num_nodes = int(segments.max()) + 1

    node_features = np.zeros((num_nodes, 3))
    for node in range(num_nodes):
        node_features[node] = image_lab[segments == node].mean(axis=0)

    # Vectorised adjacency; the (E, 2) layout transposes to (2, E) and stays
    # (2, 0) when there are no edges, which is the shape graph layers expect.
    edges = _segment_edges(segments)
    edge_index = torch.as_tensor(edges, dtype=torch.long).t().contiguous()
    x = torch.tensor(node_features, dtype=torch.float)
    y = torch.tensor([label], dtype=torch.long)

    return Data(x=x, edge_index=edge_index, y=y)




In [None]:
from tqdm import tqdm

# Upper bound on how many images are converted to graphs. Slicing is idempotent,
# so re-running this cell leaves image_data unchanged.
MAX_GRAPHS = 1000
image_data = image_data[:MAX_GRAPHS]

# Build the superpixel graphs exactly once, with a progress bar. The conversion
# was previously repeated in three consecutive cells that all overwrote
# graph_data with the same result.
graph_data = [
    convert_image_to_graph(path, label, num_segments=60)
    for path, label in tqdm(image_data)
]


100%|██████████| 200/200 [00:04<00:00, 46.35it/s]


In [None]:
# NOTE(review): scikit-image is already installed by the first cell and has been
# imported earlier in the notebook (skimage.io, skimage.segmentation,
# skimage.color), so this cell looks redundant — confirm and remove.
!pip install scikit-image --quiet


In [None]:
from torch_geometric.loader import DataLoader
from torch.utils.data import random_split

# Fixed seed so the train/validation/test membership is identical on every run;
# without it the reported test metrics are computed on a different subset each time.
SPLIT_SEED = 42

# 70 % train, 15 % validation, and whatever remains goes to the test split.
total = len(graph_data)
train_len = int(0.7 * total)
val_len = int(0.15 * total)
test_len = total - train_len - val_len

# NOTE(review): the split is over individual graphs, so graphs that come from the
# same patient folder can end up in both train and test — confirm this is acceptable.
train_set, val_set, test_set = random_split(
    graph_data,
    [train_len, val_len, test_len],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
train_loader = DataLoader(train_set, batch_size=8, shuffle=True)
val_loader = DataLoader(val_set, batch_size=8)
test_loader = DataLoader(test_set, batch_size=8)

print(f"📊 Dataset sizes → Train: {train_len}, Val: {val_len}, Test: {test_len}")


📊 Dataset sizes → Train: 140, Val: 30, Test: 30


In [None]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, BatchNorm, global_mean_pool

class GCNEncoder(nn.Module):
    """Graph encoder: two GCN layers followed by mean pooling over each graph.

    Every layer applies graph convolution, batch normalisation, ReLU and dropout
    (p=0.3, active only in training mode). The node embeddings are then averaged
    per graph according to ``batch``.
    """

    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        # Sub-modules are created in the order conv1, bn1, conv2, bn2.
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.bn1 = BatchNorm(hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.bn2 = BatchNorm(hidden_channels)

    def forward(self, x, edge_index, batch):
        """Return one embedding per graph for node features ``x``."""
        hidden = x
        for conv, norm in ((self.conv1, self.bn1), (self.conv2, self.bn2)):
            hidden = norm(conv(hidden, edge_index))
            hidden = F.dropout(F.relu(hidden), p=0.3, training=self.training)
        return global_mean_pool(hidden, batch)

class AGCLModel(nn.Module):
    """GCN encoder with a projection head, a classifier and a learned edge scorer.

    Args:
        in_channels: Number of features per node.
        hidden_dim: Width of the graph embedding produced by the encoder.
        proj_dim: Width of the projection used for the contrastive loss.
    """

    def __init__(self, in_channels=3, hidden_dim=64, proj_dim=64):
        super().__init__()
        self.encoder = GCNEncoder(in_channels, hidden_dim)
        self.proj_head = nn.Sequential(
            nn.Linear(hidden_dim, proj_dim),
            nn.ReLU(),
            nn.Linear(proj_dim, proj_dim)
        )
        # The edge scorer sees the features of both endpoints concatenated, so
        # its input width must follow in_channels (6 for the default of 3).
        self.edge_mlp = nn.Sequential(
            nn.Linear(2 * in_channels, 32), nn.ReLU(), nn.Linear(32, 1), nn.Sigmoid()
        )
        self.classifier = nn.Linear(hidden_dim, 2)

    def forward(self, data, mask_edges=False):
        """Encode a batch of graphs.

        Args:
            data: Batch object exposing ``x``, ``edge_index`` and ``batch``.
            mask_edges: When true and the module is in training mode, each edge
                is kept with the probability predicted by ``edge_mlp``.

        Returns:
            ``(proj, out)``: the projected graph embeddings and the class logits.
        """
        x, edge_index, batch = data.x, data.edge_index, data.batch
        if mask_edges and self.training:
            row, col = edge_index
            edge_feat = torch.cat([x[row], x[col]], dim=1)
            # squeeze(-1) keeps a 1-D mask even when the batch holds a single
            # edge; a bare squeeze() would give a 0-d mask and break the
            # (2, E) shape of edge_index.
            edge_prob = self.edge_mlp(edge_feat).squeeze(-1)
            # NOTE(review): the Bernoulli draw and boolean indexing are not
            # differentiable, so this path gives edge_mlp no gradient — confirm
            # that a fixed random scorer is intended.
            keep = edge_prob.bernoulli().bool()
            edge_index = edge_index[:, keep]

        z = self.encoder(x, edge_index, batch)
        proj = self.proj_head(z)
        out = self.classifier(z)
        return proj, out


In [None]:
def contrastive_loss(z1, z2, temperature=0.5):
    """Cross-entropy contrastive loss between two views of the same batch.

    Rows of ``z1`` and ``z2`` are L2-normalised, and their pairwise dot products,
    divided by ``temperature``, are used as logits. Row ``i`` of ``z1`` is
    expected to match row ``i`` of ``z2``; all other rows act as negatives.
    """
    anchors = F.normalize(z1, dim=1)
    positives = F.normalize(z2, dim=1)
    logits = (anchors @ positives.t()) / temperature
    # The matching pair for every anchor sits on the diagonal.
    targets = torch.arange(anchors.size(0), device=anchors.device)
    return F.cross_entropy(logits, targets)


In [None]:
from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score, roc_auc_score

# Optimiser hyper-parameters, named so they are easy to find and tune.
LEARNING_RATE = 5e-4
WEIGHT_DECAY = 1e-4

# Use the GPU when one is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

model = AGCLModel()
model = model.to(device)
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)
loss_fn = nn.CrossEntropyLoss()

def train_epoch(model, loader, optimizer):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Every batch is encoded twice: once on the unmodified graphs and once with
    ``mask_edges=True``. The objective is the classification loss on the first
    pass plus 0.5 times the contrastive loss between the two projections.
    Relies on the module-level ``device`` and ``loss_fn``.
    """
    model.train()
    batch_losses = []
    for graphs in loader:
        graphs = graphs.to(device)

        # Two views of the same batch: original edges and masked edges.
        proj_clean, logits = model(graphs, mask_edges=False)
        proj_masked, _ = model(graphs, mask_edges=True)

        supervised = loss_fn(logits, graphs.y)
        contrast = contrastive_loss(proj_clean, proj_masked)
        objective = supervised + 0.5 * contrast

        optimizer.zero_grad()
        objective.backward()
        optimizer.step()
        batch_losses.append(objective.item())
    return sum(batch_losses) / len(loader)

def evaluate(model, loader, label="Test"):
    """Evaluate ``model`` on ``loader`` and print a one-line metric summary.

    Args:
        model: Model returning ``(projection, logits)`` for a batch.
        loader: Iterable of batches exposing ``y`` and a ``to(device)`` method.
        label: Name of the split, used only in the printed summary.

    Returns:
        ``(accuracy, f1_macro, f1_weighted, kappa, auc)``. ``auc`` is NaN when
        the split contains fewer than two distinct classes, because ROC AUC is
        undefined in that case.

    Relies on the module-level ``device``.
    """
    model.eval()
    y_true, y_pred, y_prob = [], [], []
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            _, out = model(batch)
            probs = F.softmax(out, dim=1)
            preds = probs.argmax(dim=1)

            y_true.extend(batch.y.cpu().numpy())
            y_pred.extend(preds.cpu().numpy())
            # Probability of class 1 is the score used for the ROC curve.
            y_prob.extend(probs[:, 1].cpu().numpy())

    acc = accuracy_score(y_true, y_pred)
    f1_macro = f1_score(y_true, y_pred, average='macro')
    f1_weighted = f1_score(y_true, y_pred, average='weighted')
    kappa = cohen_kappa_score(y_true, y_pred)
    # A small split can contain a single class; report NaN instead of letting
    # the AUC computation abort the whole evaluation.
    if len({int(value) for value in y_true}) < 2:
        auc = float("nan")
    else:
        auc = roc_auc_score(y_true, y_prob)
    print(f"📈 {label} → Acc: {acc:.4f} | F1-macro: {f1_macro:.4f} | F1-weighted: {f1_weighted:.4f} | Kappa: {kappa:.4f} | AUC: {auc:.4f}")
    return acc, f1_macro, f1_weighted, kappa, auc


In [None]:
# Length of the run and how often the validation split is scored.
NUM_EPOCHS = 20
VALIDATE_EVERY = 5

for epoch in range(1, NUM_EPOCHS + 1):
    loss = train_epoch(model, train_loader, optimizer)
    print(f"Epoch {epoch:02d} | 🔧 Loss: {loss:.4f}")
    is_validation_epoch = (epoch % VALIDATE_EVERY == 0)
    if is_validation_epoch:
        evaluate(model, val_loader, "Validation")

print("✅ Final Evaluation on Test Set:")
# Left as the last expression so the metric tuple is shown as the cell output.
evaluate(model, test_loader, "Test")


Epoch 01 | 🔧 Loss: 0.6177
Epoch 02 | 🔧 Loss: 0.6793
Epoch 03 | 🔧 Loss: 0.5815
Epoch 04 | 🔧 Loss: 0.6132
Epoch 05 | 🔧 Loss: 0.7278
📈 Validation → Acc: 0.9000 | F1-macro: 0.8330 | F1-weighted: 0.8894 | Kappa: 0.6715 | AUC: 0.8634
Epoch 06 | 🔧 Loss: 0.6537
Epoch 07 | 🔧 Loss: 0.6389
Epoch 08 | 🔧 Loss: 0.6001
Epoch 09 | 🔧 Loss: 0.6684
Epoch 10 | 🔧 Loss: 0.5859
📈 Validation → Acc: 0.8667 | F1-macro: 0.7600 | F1-weighted: 0.8453 | Kappa: 0.5349 | AUC: 0.8820
Epoch 11 | 🔧 Loss: 0.6721
Epoch 12 | 🔧 Loss: 0.6890
Epoch 13 | 🔧 Loss: 0.6529
Epoch 14 | 🔧 Loss: 0.6116
Epoch 15 | 🔧 Loss: 0.6966
📈 Validation → Acc: 0.9333 | F1-macro: 0.9068 | F1-weighted: 0.9333 | Kappa: 0.8137 | AUC: 0.8758
Epoch 16 | 🔧 Loss: 0.6715
Epoch 17 | 🔧 Loss: 0.6570
Epoch 18 | 🔧 Loss: 0.5632
Epoch 19 | 🔧 Loss: 0.6472
Epoch 20 | 🔧 Loss: 0.6389
📈 Validation → Acc: 0.9000 | F1-macro: 0.8527 | F1-weighted: 0.8972 | Kappa: 0.7059 | AUC: 0.8820
✅ Final Evaluation on Test Set:
📈 Test → Acc: 0.9000 | F1-macro: 0.8901 | F1-weighted: 0

(0.9,
 0.8901098901098901,
 0.898901098901099,
 np.float64(0.7804878048780488),
 np.float64(0.9856459330143541))