In [1]:
!unzip /content/CRC-VAL-HE-7K.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: CRC-VAL-HE-7K/DEB/DEB-TCGA-ACMSDEFF.tif  
  inflating: CRC-VAL-HE-7K/DEB/DEB-TCGA-ACQQYLLS.tif  
  inflating: CRC-VAL-HE-7K/DEB/DEB-TCGA-ADCHTGEE.tif  
  inflating: CRC-VAL-HE-7K/DEB/DEB-TCGA-AFELDRPS.tif  
  inflating: CRC-VAL-HE-7K/DEB/DEB-TCGA-AFFMDFQV.tif  
  inflating: CRC-VAL-HE-7K/DEB/DEB-TCGA-AFQQTGKI.tif  
  inflating: CRC-VAL-HE-7K/DEB/DEB-TCGA-AGKPYMDE.tif  
  inflating: CRC-VAL-HE-7K/DEB/DEB-TCGA-AHDNMNIT.tif  
  inflating: CRC-VAL-HE-7K/DEB/DEB-TCGA-AHKLPKMS.tif  
  inflating: CRC-VAL-HE-7K/DEB/DEB-TCGA-AHQCDGMY.tif  
  inflating: CRC-VAL-HE-7K/DEB/DEB-TCGA-AIIGEWYP.tif  
  inflating: CRC-VAL-HE-7K/DEB/DEB-TCGA-ALLMHHRT.tif  
  inflating: CRC-VAL-HE-7K/DEB/DEB-TCGA-ALQTIPLF.tif  
  inflating: CRC-VAL-HE-7K/DEB/DEB-TCGA-APHIEAQK.tif  
  inflating: CRC-VAL-HE-7K/DEB/DEB-TCGA-AQGAYQML.tif  
  inflating: CRC-VAL-HE-7K/DEB/DEB-TCGA-ARHEISPN.tif  
  inflating: CRC-VAL-HE-7K/DEB/DEB-TCGA-ARIHITHS.tif  


In [2]:
# Install required packages
!pip install torchcam pytorch_lightning transformers datasets evaluate

Collecting torchcam
  Downloading torchcam-0.4.0-py3-none-any.whl.metadata (31 kB)
Collecting pytorch_lightning
  Downloading pytorch_lightning-2.5.2-py3-none-any.whl.metadata (21 kB)
Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Collecting numpy<2.0.0,>=1.17.2 (from torchcam)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m61.0/61.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0 (from pytorch_lightning)
  Downloading torchmetrics-1.7.3-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch_lightning)
  Downloading lightning_utilities-0.14.3-py3-none-any.whl.metadata (5.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3.0.0,>=2.0.0->torchcam)
  Downloading nvidia_cuda_nvrtc_cu12-

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from PIL import Image

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torchvision import transforms, models
import pytorch_lightning as pl
from transformers import ViTModel
from sklearn.metrics import accuracy_score, cohen_kappa_score, roc_auc_score

In [2]:
# Binary label mapping for tissue classes:
# 0 = non-cancerous/normal tissue
# 1 = cancerous/tumor-related tissue
# 
# Mapping:
# 'ADI'  - Adipose tissue (0)
# 'BACK' - Background (0)
# 'DEB'  - Debris (0)
# 'LYM'  - Lymphocytes (0)
# 'MUC'  - Mucus (0)
# 'MUS'  - Muscle (0)
# 'NORM' - Normal colon mucosa (0)
# 'STR'  - Stroma, often altered in tumors (1)
# 'TUM'  - Tumor epithelium (1)
binary_map = {'ADI':0,'BACK':0,'DEB':0,'LYM':0,'MUC':0,'MUS':0,'NORM':0,'STR':1,'TUM':1}

# Adjust dataset_dir path as needed
dataset_dir = '/content/CRC-VAL-HE-7K'
rows = []

for cls in os.listdir(dataset_dir):
    p = os.path.join(dataset_dir, cls)
    if os.path.isdir(p) and cls in binary_map:
        for f in os.listdir(p):
            if f.lower().endswith(('.tif','.png','.jpg')):
                rows.append([os.path.join(p, f), binary_map[cls], cls])

df = pd.DataFrame(rows, columns=['filepath','label','tissue_class'])
df.to_csv('colon_binary_labels.csv', index=False)
print(df['label'].value_counts())

label
0    5526
1    1654
Name: count, dtype: int64


In [3]:
df = pd.read_csv('colon_binary_labels.csv')
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5]*3, std=[0.5]*3)
])

class ColonDataset(Dataset):
    def __init__(self, df, transform=None):
        self.df = df.reset_index(drop=True)
        self.transform = transform
    def __len__(self): return len(self.df)
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img = Image.open(row.filepath).convert('RGB')
        if self.transform: img = self.transform(img)
        return {'pixel_values': img, 'labels': torch.tensor(int(row.label))}


In [4]:
train_ds = ColonDataset(train_df, transform)
val_ds = ColonDataset(val_df, transform)

w = 1.0 / np.bincount(train_df['label'])
sam = WeightedRandomSampler(w[train_df['label']], len(train_df))

train_loader = DataLoader(train_ds, batch_size=32, sampler=sam, num_workers=4)
val_loader = DataLoader(val_ds, batch_size=32, shuffle=False, num_workers=4)




In [5]:
class CNN_ViT_Hybrid(pl.LightningModule):
    def __init__(self, lr=5e-5):
        super().__init__()
        self.save_hyperparameters()
        cnn = models.resnet50(weights="IMAGENET1K_V1")
        self.cnn = nn.Sequential(*list(cnn.children())[:-2])
        self.conv1x1 = nn.Conv2d(2048, 3, kernel_size=1)
        self.up = nn.Upsample((224,224), mode='bilinear', align_corners=False)
        self.vit = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
        self.fc = nn.Linear(self.vit.config.hidden_size, 2)
        self.ce = nn.CrossEntropyLoss()

    def forward(self, x):
        f = self.cnn(x)
        f = self.up(self.conv1x1(f))
        s = self.vit(pixel_values=f).last_hidden_state[:,0,:]
        return self.fc(s)

    def training_step(self, b,_):
        logits = self(b['pixel_values'])
        loss = self.ce(logits, b['labels'])
        acc = (logits.argmax(1)==b['labels']).float().mean()
        self.log('train_loss', loss); self.log('train_acc', acc)
        return loss

    def configure_optimizers(self): return torch.optim.AdamW(self.parameters(), lr=self.hparams.lr)


In [6]:
# Returns accuracy, Cohen's Kappa, and AUC:
# - accuracy_score: basic performance metric
# - cohen_kappa_score: measures agreement beyond chance, important for imbalanced datasets
# - The kappa statistic, which is a number between -1 and 1. The maximum value means complete agreement; zero or lower means chance agreement.
# - roc_auc_score: evaluates ranking quality of predicted probabilities

from sklearn.metrics import accuracy_score, cohen_kappa_score, roc_auc_score

def eval_model(model, loader):
    model.eval()
    preds, probs, labs = [], [], []
    with torch.no_grad():
        for b in loader:
            logits = model(b['pixel_values'])
            probs.extend(torch.softmax(logits,1)[:,1].cpu().tolist())
            p = logits.argmax(1).cpu().tolist()
            preds.extend(p); labs.extend(b['labels'].cpu().tolist())
    return accuracy_score(labs, preds), cohen_kappa_score(labs, preds), roc_auc_score(labs, probs)


In [8]:

# For hybrid:
model2 = CNN_ViT_Hybrid()
trainer = pl.Trainer(max_epochs=2, accelerator='gpu' if torch.cuda.is_available() else 'cpu')
trainer.fit(model2, train_loader, val_loader)

acc2, kappa2, auc2 = eval_model(model2, val_loader)
print("CNN‚ÄìViT Hybrid ‚Üí Acc: {:.4f}, Kappa: {:.4f}, AUC: {:.4f}".format(acc2, kappa2, auc2))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.rank_zero:üí° Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/configuration_validator.py:68: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | cnn     | Sequential       | 23.5 M | train
1 | conv1x1 | Conv2d           | 6.1 K  | t

Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.


CNN‚ÄìViT Hybrid ‚Üí Acc: 0.9937, Kappa: 0.9825, AUC: 0.9998


In [9]:
print("CNN‚ÄìViT Hybrid ‚Üí Acc: {:.4f}, Kappa: {:.4f}, AUC: {:.4f}".format(acc2, kappa2, auc2))

CNN‚ÄìViT Hybrid ‚Üí Acc: 0.9937, Kappa: 0.9825, AUC: 0.9998


In [7]:
class ResNetClassifier(pl.LightningModule):
    def __init__(self, lr=5e-5):
        super().__init__()
        self.save_hyperparameters()
        self.model = models.resnet50(weights="IMAGENET1K_V1")
        self.model.fc = nn.Linear(self.model.fc.in_features, 2)
        self.ce = nn.CrossEntropyLoss()

    def forward(self, x):
        return self.model(x)

    def training_step(self, b, _):
        logits = self(b['pixel_values'])
        loss = self.ce(logits, b['labels'])
        acc = (logits.argmax(1) == b['labels']).float().mean()
        self.log('train_loss', loss); self.log('train_acc', acc)
        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.hparams.lr)

# Train and evaluate ResNet model
model1 = ResNetClassifier()
trainer = pl.Trainer(max_epochs=1, accelerator='gpu' if torch.cuda.is_available() else 'cpu')
trainer.fit(model1, train_loader, val_loader)

acc1, kappa1, auc1 = eval_model(model1, val_loader)
print("ResNet50 ‚Üí Acc: {:.4f}, Kappa: {:.4f}, AUC: {:.4f}".format(acc1, kappa1, auc1))

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 97.8M/97.8M [00:01<00:00, 77.2MB/s]
INFO:pytorch_lightning.utilities.rank_zero:üí° Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/configuration_validator.py:68: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.mo

Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


ResNet50 ‚Üí Acc: 0.9909, Kappa: 0.9748, AUC: 0.9999
