In [80]:
import pandas as pd
import os

# Output folder for the CSV files this notebook writes; created if it does not exist yet.
csv_dir = "./dataset_csv/"
os.makedirs(csv_dir, exist_ok=True)

# Slide-level tables, one for the training slides and one for the test slides.
train_df = pd.read_csv("/media/nfs/LN/CAMELYON16/train_slides.csv")
test_df = pd.read_csv("/media/nfs/LN/CAMELYON16/test_slides.csv")

# Cell result: the (rows, columns) shape of each table.
tuple(frame.shape for frame in (train_df, test_df))

((270, 4), (129, 4))

In [71]:
# Directory holding the per-slide patch files used by the rest of the notebook.
patch_dir = "/media/nfs/LN/CAMELYON16/patches/"
# Second directory of patch files, searched for slides that are absent from patch_dir.
patch_dir_normal = "/media/nfs/LN/CAMELYON16/SegPatches_224p/patches/"

# Slide ids already present in patch_dir: the file name minus its last three
# characters (assumed to be the ".h5" suffix).
patch_list = [i[:-3] for i in os.listdir(patch_dir)]

# Slides listed in the CSV tables that have no file in patch_dir.
normal_tests = list(test_df["slide_id"][~test_df["slide_id"].isin(patch_list)].values)
normal_trains = list(train_df["slide_id"][~train_df["slide_id"].isin(patch_list)].values)

# List the fallback directory ONCE and keep it as a set: the previous version
# called os.listdir() again for every candidate in both comprehensions, i.e.
# one directory scan per slide plus a linear search through the result.
normal_h5_files = set(os.listdir(patch_dir_normal))
wanted_h5_files = [i + ".h5" for i in normal_tests + normal_trains]

# Files that can be copied from the fallback directory / files found nowhere.
add_list = [name for name in wanted_h5_files if name in normal_h5_files]
miss_list = [name for name in wanted_h5_files if name not in normal_h5_files]
len(add_list), len(miss_list)

(80, 2)

In [72]:
# Display the file names that were not found in patch_dir_normal either.
miss_list

['normal_027.h5', 'normal_045.h5']

In [74]:
# Distinct labels of the test slides whose file names appear in add_list.
added_slide_ids = [h5_name[:-3] for h5_name in add_list]
test_df.loc[test_df["slide_id"].isin(added_slide_ids), "label"].unique()

array([0])

In [None]:
import shutil
# Copy each file of add_list from the fallback directory into patch_dir,
# keeping the file name unchanged.
for h5_name in add_list:
    source_path = f"{patch_dir_normal}{h5_name}"
    target_path = f"{patch_dir}{h5_name}"
    shutil.copyfile(source_path, target_path)

In [81]:
# List patch_dir again and rebuild the slide ids it contains
# (file name minus its last three characters).
patch_dir = "/media/nfs/LN/CAMELYON16/patches/"
patch_list = [h5_name[:-3] for h5_name in os.listdir(patch_dir)]

# Keep only the slides for which a patch file exists.
train_has_patches = train_df["slide_id"].isin(patch_list)
test_has_patches = test_df["slide_id"].isin(patch_list)
train_df = train_df.loc[train_has_patches]
test_df = test_df.loc[test_has_patches]
train_df.shape, test_df.shape

((268, 4), (129, 4))

In [82]:
import numpy as np
from h5py import File

# Collect [slide_id, number_of_patches] for every file found in patch_dir.
nb_patches = []
for i in os.listdir(patch_dir):
    with File(patch_dir+i, "r") as hf:
        # Only the length of the "coords" dataset is needed, so read it from the
        # dataset's shape instead of loading the whole array into memory
        # (the previous np.array(hf["coords"]) read every coordinate of every file).
        nb_patches.append([i[:-3], hf["coords"].shape[0]])

# One row per patch file; the cell displays the table shape and the smallest /
# largest patch count.
patch_df = pd.DataFrame(nb_patches, columns=["slide_id", "nb_patches"])
patch_df.shape, patch_df["nb_patches"].min(), patch_df["nb_patches"].max()

((397, 2), 6, 188603)

In [83]:
# Attach the per-slide patch count to both tables by joining on slide_id.
# NOTE: re-running this cell on already-merged frames would make pandas add
# suffixed nb_patches_x / nb_patches_y columns, so run it once per kernel.
train_df = train_df.merge(patch_df, on="slide_id")
test_df = test_df.merge(patch_df, on="slide_id")

# Shapes after the join, plus a check for missing values in either table.
train_has_nan = train_df.isna().any().any()
test_has_nan = test_df.isna().any().any()
train_df.shape, test_df.shape, train_has_nan, test_has_nan

((268, 5), (129, 5), False, False)

In [84]:
# Display the training table, which now carries the nb_patches column.
train_df

Unnamed: 0,slide_id,label,class,case_id,nb_patches
0,normal_001,0,negative,normal_001,8810
1,normal_002,0,negative,normal_002,11605
2,normal_003,0,negative,normal_003,27299
3,normal_004,0,negative,normal_004,6322
4,normal_005,0,negative,normal_005,8766
...,...,...,...,...,...
263,tumor_107,1,micro,tumor_107,196
264,tumor_108,1,macro,tumor_108,1530
265,tumor_109,1,macro,tumor_109,2482
266,tumor_110,1,macro,tumor_110,22605


In [85]:
# Write both tables to disk without the DataFrame index column.
for frame, csv_path in (
    (train_df, "./dataset_csv/train_data.csv"),
    (test_df, "./dataset_csv/test_data.csv"),
):
    frame.to_csv(csv_path, index=False)

In [97]:
import os
from PIL import Image
import numpy as np
import pandas as pd
import math
from tqdm import tqdm
import openslide
import h5py
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler
from torch import nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import models, transforms

from torcheval.metrics import MulticlassAccuracy, BinaryAccuracy
from sklearn.metrics import accuracy_score

class PatchIDCounter:
    """Flat index over the (slide, patch) pairs described by a slide table.

    Args:
        df: DataFrame with one row per slide. It must contain an ``nb_patches``
            column and a slide identifier column named ``slide_nb`` (the name
            this class originally required) or ``slide_id`` (the name used by
            the CSV files written earlier in this notebook).
        shuffle: when true, the finished list of pairs is shuffled.
        cap: optional maximum number of patches kept per slide. Slides with
            more patches are sub-sampled at random, without replacement.

    Attributes:
        mapping: list of ``(slide_id, patch_id)`` tuples, one per selected patch.
    """

    # Accepted names for the slide identifier column, in lookup order.
    # 'slide_nb' stays first so frames that worked before behave identically.
    _SLIDE_COLUMNS = ('slide_nb', 'slide_id')

    def __init__(self, df, shuffle=False, cap=None):
        self.df = df
        self.shuffle = shuffle
        self.cap = cap
        self.mapping = self._compute_mapping()

    def _slide_column(self):
        """Return the name of the slide identifier column present in ``self.df``.

        Raises:
            KeyError: if none of the accepted column names is present.
        """
        for candidate in self._SLIDE_COLUMNS:
            if candidate in self.df.columns:
                return candidate
        raise KeyError("df must contain a 'slide_id' (or legacy 'slide_nb') column")

    def _compute_mapping(self):
        """Build the list of ``(slide_id, patch_id)`` pairs for every slide in ``self.df``."""
        # Re-seeds NumPy's GLOBAL generator, so the sub-sampling and the shuffle
        # are identical on every construction (and the global state is reset).
        np.random.seed(7)
        slide_col = self._slide_column()

        mapping = []
        for slide_id, nb_of_patches in zip(self.df[slide_col], self.df['nb_patches']):
            # Determine the patch indices we will use
            if self.cap and nb_of_patches > self.cap:
                selected_patch_indices = np.random.choice(nb_of_patches, self.cap, replace=False)
            else:
                selected_patch_indices = range(nb_of_patches)

            mapping.extend((slide_id, patch_id) for patch_id in selected_patch_indices)

        if self.shuffle:
            np.random.shuffle(mapping)
        return mapping

    def next_id(self, idx):
        """Return the ``(slide_id, patch_id)`` pair stored at position ``idx``."""
        return self.mapping[idx]


class PatchDataset(torch.utils.data.Dataset):
    """Dataset indexed by (slide, patch) pairs taken from a slide table.

    Args:
        data_dir: directory of the slide images (used by the disabled loading
            code in ``__getitem__`` as ``data_dir/slide_id + ext``).
        h5_dir: directory of the per-slide ``.h5`` files holding ``coords``.
        df: slide table forwarded to ``PatchIDCounter``.
        label_dict: mapping from the table's ``label`` value to the target
            (only used by the disabled loading code).
        image_size: side length, in pixels, the patch is resized to.
        patch_size: side length, in pixels, of the region read from the slide.
        cap: maximum number of patches per slide, forwarded to ``PatchIDCounter``.
        augmentation: enables random augmentations, but only if ``training`` is
            also true.
        training: also controls whether the (slide, patch) pairs are shuffled.
        ext: file extension of the slide images.
    """

    def __init__(self, data_dir, h5_dir, df, label_dict, image_size, patch_size, cap=100, augmentation=False, training=False, ext=".tif"):
        resize = [transforms.Resize((image_size, image_size))]
        # Tensor conversion and channel normalization always come LAST.
        # The mean/std values are presumably the ImageNet channel statistics.
        to_normalized_tensor = [
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]

        if augmentation and training:
            # Augmentations run on the resized image BEFORE ToTensor/Normalize.
            # They used to run after Normalize, where ColorJitter operates on
            # already-normalized values (it expects image-range input and
            # clamps float tensors to [0, 1]) and rotation padded with 0,
            # which is not black once the data is normalized.
            augment = [
                transforms.RandomHorizontalFlip(),
                transforms.RandomVerticalFlip(),
                transforms.RandomRotation(30),
                transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1),
            ]
        else:
            augment = []

        self.transform = transforms.Compose(resize + augment + to_normalized_tensor)

        self.data_dir = data_dir
        self.h5_dir = h5_dir
        self.df = df
        self.label_dict = label_dict
        self.ext = ext
        self.patch_size = (patch_size, patch_size)

        # Pre-computed list of (slide_id, patch_id) pairs; shuffled only for training.
        self.counter = PatchIDCounter(df, shuffle=training, cap=cap)

    def __len__(self):
        """Return the number of (slide, patch) pairs selected by the counter."""
        return len(self.counter.mapping)
        # return 10

    def __getitem__(self, idx):
        """Return the ``(slide_id, patch_id)`` pair at position ``idx``.

        NOTE(review): the image-loading implementation below is commented out,
        so this currently returns identifiers instead of (tensor, label).
        """
        slide_id, patch_id = self.counter.next_id(idx)
        return slide_id, patch_id
        # label = self.df["label"][self.df["slide_id"] == slide_id].item()

        # wsi = openslide.open_slide(os.path.join(self.data_dir, slide_id + self.ext))
        # with h5py.File(os.path.join(self.h5_dir, slide_id + ".h5"), "r") as hf:
        #     c = np.array(hf["coords"])[patch_id]
        # patch = wsi.read_region(c, 0, self.patch_size).convert("RGB")
        # return self.transform(patch), np.array(self.label_dict[label])[np.newaxis]

In [96]:
# Scratch check: seed NumPy's global generator, then print one draw from
# 0..99 and display a permutation of 0..9 obtained from the same seeded state.
np.random.seed(7)
print(np.random.choice(range(100)))
np.random.permutation(range(10))

47


array([8, 5, 0, 2, 1, 9, 7, 3, 6, 4])

In [91]:
# No seed is set in this cell, so the value drawn depends on whatever state
# the global NumPy generator is in when the cell runs.
np.random.choice(range(100))

25

In [100]:
def main(
        data_dir="/media/nfs/LN/CAMELYON16/images/",
        h5_dir="/media/nfs/LN/CAMELYON16/patches/",
        training_csv="./dataset_csv/train_data.csv",
        testing_csv="./dataset_csv/test_data.csv",
        run_name=None,
        backbone="resnet50",
        finetuning="mid",
        optimizer="Adam",
        lr=1e-3,
        l2_reg=1e-5,
        earlystopping=20,
        image_size=224,
        ext=".tif",
        patch_size=224,
        max_patches=100,
        augmentation=True,
        batch_size=128,
        val_batch_size=64,
        cross_validation=5,
        epochs=100,
        load_from=None,
        seed=7,
        multi_gpus=True,
        wandb=False
    ):
    """Build a hyper-parameter dictionary from the call arguments.

    Six parameters (optimizer, backbone, finetuning, image_size, lr, l2_reg)
    always map to a fixed search-space entry, whatever value was passed for
    them. Every other argument is wrapped as ``{"value": <argument>}``.
    The layout is presumably meant for a Weights & Biases sweep -- confirm.

    Returns:
        dict: the six search-space entries first, followed by the remaining
        arguments in signature order.
    """
    # Must stay the first statement: at this point the local namespace holds
    # only the call arguments.
    arguments = dict(locals())

    search_space = {
        'optimizer': {"values": ['Adam', 'SGD']},
        'backbone': {"values": ["resnet50", "densenet", "mobilenet", "vit", "ctp"]},
        'finetuning': {"values": [None, "deep", "mid", "shallow"]},
        'image_size': {"values": [224, 512]},
        'lr': {'distribution': 'uniform', 'min': 2e-4, 'max': 1e-1},
        'l2_reg': {'distribution': 'uniform', 'min': 1e-5, 'max': 1e-3},
    }

    # Arguments that are not searched over keep the value they were called with.
    fixed_values = {}
    for name, value in arguments.items():
        if name in search_space:
            continue
        fixed_values[name] = {"value": value}

    return {**search_space, **fixed_values}

# Call with the default arguments and display the resulting dictionary.
main()

{'optimizer': {'values': ['Adam', 'SGD']},
 'backbone': {'values': ['resnet50', 'densenet', 'mobilenet', 'vit', 'ctp']},
 'finetuning': {'values': [None, 'deep', 'mid', 'shallow']},
 'image_size': {'values': [224, 512]},
 'lr': {'distribution': 'uniform', 'min': 0.0002, 'max': 0.1},
 'l2_reg': {'distribution': 'uniform', 'min': 1e-05, 'max': 0.001},
 'data_dir': {'value': '/media/nfs/LN/CAMELYON16/images/'},
 'h5_dir': {'value': '/media/nfs/LN/CAMELYON16/patches/'},
 'training_csv': {'value': './dataset_csv/train_data.csv'},
 'testing_csv': {'value': './dataset_csv/test_data.csv'},
 'run_name': {'value': None},
 'earlystopping': {'value': 20},
 'ext': {'value': '.tif'},
 'patch_size': {'value': 224},
 'max_patches': {'value': 100},
 'augmentation': {'value': True},
 'batch_size': {'value': 128},
 'val_batch_size': {'value': 64},
 'cross_validation': {'value': 5},
 'epochs': {'value': 100},
 'load_from': {'value': None},
 'seed': {'value': 7},
 'multi_gpus': {'value': True},
 'wandb': {