# Imports and Parameters

In [1]:
import os
import shutil

import numpy as np
import pandas as pd

import torch
from torchvision import transforms
from PIL import Image
from torch.utils.data import Dataset

from sklearn.model_selection import train_test_split

In [None]:
# !pip install kaggle


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
!mkdir -p ~/.kaggle

In [5]:
mv ~/Documents/API_Kaggle/kaggle.json ~/.kaggle/


mv: /Users/giangluong/Documents/API_Kaggle/kaggle.json: No such file or directory


In [7]:
mkdir -p ./data/original_data

In [None]:
# !kaggle competitions download -c histopathologic-cancer-detection -p ./data/original_data
# import zipfile
# import os

# data_path = './data/original_data'
# zip_files = [f for f in os.listdir(data_path) if f.endswith('.zip')]

# for zip_file in zip_files:
#     with zipfile.ZipFile(os.path.join(data_path, zip_file), 'r') as zip_ref:
#         zip_ref.extractall(data_path)



In [9]:
ORIGINAL_DATA_FOLDER = r"./data/original_data"
PREPARED_DATA_FOLDER = r"./data/prepared_data"

DO_SPLIT_VAL = False
SAMPLE_PROPORTION = 0.1
TEST_SPLIT = 0.2  # also used for val if DO_SPLIT_VAL is True

# Transformation parameters as constants
HORIZONTAL_FLIP_PROB = 0.5
VERTICAL_FLIP_PROB = 0.0
ROTATION_DEGREES = 15
COLORJITTER_BRIGHTNESS = 0.15
COLORJITTER_CONTRAST = 0.15
COLORJITTER_SATURATION = 0.10
COLORJITTER_HUE = 0.02
GAUSSIANBLUR_KERNEL_SIZE = 3
GAUSSIANBLUR_SIGMA = (0.1, 1.5)

# Load and sample

In [10]:
labels_df = pd.read_csv(os.path.join(ORIGINAL_DATA_FOLDER, "train_labels.csv"))
labels_df

Unnamed: 0,id,label
0,f38a6374c348f90b587e046aac6079959adf3835,0
1,c18f2d887b7ae4f6742ee445113fa1aef383ed77,1
2,755db6279dae599ebb4d39a9123cce439965282d,0
3,bc3f0c64fb968ff4a8bd33af6971ecae77c75e08,0
4,068aba587a4950175d04c680d38943fd488d6a9d,0
...,...,...
220020,53e9aa9d46e720bf3c6a7528d1fca3ba6e2e49f6,0
220021,d4b854fe38b07fe2831ad73892b3cec877689576,1
220022,3d046cead1a2a5cbe00b2b4847cfb7ba7cf5fe75,0
220023,f129691c13433f66e1e0671ff1fe80944816f5a2,0


In [11]:
labels_df.shape


(220025, 2)

In [12]:
sample_size = int(len(labels_df) * SAMPLE_PROPORTION)
sample_size

22002

In [13]:
labels_df["label"].value_counts()


label
0    130908
1     89117
Name: count, dtype: int64

In [14]:
# Perform stratified sampling to keep the same ratio of 0 and 1 as the original data
label_counts = labels_df["label"].value_counts(normalize=True)
n_0 = int(round(sample_size * label_counts[0]))
n_1 = sample_size - n_0  # ensure total is exactly SAMPLE_SIZE

stratified_sample_0 = labels_df[labels_df["label"] == 0].sample(n=n_0, random_state=0)
stratified_sample_1 = labels_df[labels_df["label"] == 1].sample(n=n_1, random_state=0)

stratified_sample = pd.concat(
    [stratified_sample_0, stratified_sample_1], ignore_index=True
)
stratified_sample = stratified_sample.sample(frac=1, random_state=0).reset_index(
    drop=True
)  # shuffle

stratified_sample


Unnamed: 0,id,label
0,8a0657ec177b9eb56818104dd2b47506c6a39527,1
1,afab792594de5e1d307921ce71056ae20c87fa63,0
2,4abdc97be8cfb42c396f85b7efb06f7d444d8166,0
3,033fdf44b55943ee73b8b40e94b789033463b136,1
4,0cbd532db8b2bd65e8dab08250d157babc832f4d,0
...,...,...
21997,681768428a6cad7eafcb31581464bab65784321a,1
21998,ecf900769a47084b571213ad9d01e5bd330c9e4e,1
21999,3adca6cf48552e5b31e9ea3f942c91ef874ab030,0
22000,cd5b2fa479d71e14265b554ff7aec534a0b21f0c,0


In [15]:
stratified_sample["label"].value_counts()


label
0    13091
1     8911
Name: count, dtype: int64

In [16]:
original_data_dir = os.path.join(ORIGINAL_DATA_FOLDER, "train")  # directory of files
prepared_data_dir = os.path.join(PREPARED_DATA_FOLDER, "full_data")

# Create the destination directory if it doesn't exist
os.makedirs(prepared_data_dir, exist_ok=True)

file_names = [f"{img_id}.tif" for img_id in stratified_sample["id"]]
print(len(file_names))
full_filenames = [
    os.path.join(original_data_dir, f) for f in file_names
]  # get the full path to images

# Copy files to the prepared_data_dir
for src_path in full_filenames:
    dst_path = os.path.join(prepared_data_dir, os.path.basename(src_path))
    shutil.copy2(src_path, dst_path)


22002


In [17]:
stratified_sample.to_csv(
    os.path.join(PREPARED_DATA_FOLDER, "full_data_labels.csv"), index=False
)

# Train, val, test split

In [19]:
torch.manual_seed(0)  # fix random seed


class pytorch_data(Dataset):
    def __init__(self, data_dir, transform, data_type="train"):
        # Get Image File Names
        cdm_data = os.path.join(data_dir, data_type)  # directory of files

        file_names = [f for f in os.listdir(cdm_data) if f != ".gitkeep"]

        self.full_filenames = [
            os.path.join(cdm_data, f) for f in file_names
        ]  # get the full path to images

        # Get Labels
        labels_data = os.path.join(data_dir, f"{data_type}_labels.csv")
        labels_df = pd.read_csv(labels_data)
        labels_df.set_index("id", inplace=True)  # set data frame index to id

        self.labels = []
        for filename in file_names:
            try:
                label = labels_df.loc[filename[:-4]].values[0]
                self.labels.append(label)
            except Exception as e:
                print(f"Problem with filename: {filename} - {e}")
                raise
        self.transform = transform

    def __len__(self):
        return len(self.full_filenames)  # size of dataset

    def __getitem__(self, idx):
        # open image, apply transforms and return with label
        image = Image.open(self.full_filenames[idx])  # Open Image with PIL
        image = self.transform(image)  # Apply Specific Transformation to Image
        return image, self.labels[idx]


In [20]:
# define transformation that converts a PIL image into PyTorch tensors
from torchvision import transforms

data_transformer = transforms.Compose([transforms.ToTensor()])

In [22]:
# Define an object of the custom dataset for the train folder.
img_dataset = pytorch_data(
    PREPARED_DATA_FOLDER, data_transformer, "full_data"
)  # Histopathalogic images

In [23]:
# Get all labels from the dataset
all_labels = np.array(img_dataset.labels)
all_indices = np.arange(len(img_dataset))

# First, split into train and test in a stratified way
train_indices, test_indices = train_test_split(
    all_indices, test_size=TEST_SPLIT, stratify=all_labels, random_state=42
)

if DO_SPLIT_VAL:
    # Further split train into train/val in a stratified way
    train_labels = all_labels[train_indices]
    train_indices, val_indices = train_test_split(
        train_indices,
        test_size=TEST_SPLIT / (1 - TEST_SPLIT),
        stratify=train_labels,
        random_state=42,
    )
    val_ts = torch.utils.data.Subset(img_dataset, val_indices)

train_ts = torch.utils.data.Subset(img_dataset, train_indices)
test_ts = torch.utils.data.Subset(img_dataset, test_indices)

# Save the labels to respective CSV files in PREPARED_DATA_FOLDER
# Get the corresponding filenames for each split
all_filenames = np.array(img_dataset.full_filenames)


def get_id_from_path(path):
    # Assumes filename is the last part and ends with .tif
    return os.path.splitext(os.path.basename(path))[0]


# Ensure the PREPARED_DATA_FOLDER exists
os.makedirs(PREPARED_DATA_FOLDER, exist_ok=True)

# Train labels
train_ids = [get_id_from_path(all_filenames[i]) for i in train_indices]
train_labels_arr = all_labels[train_indices]
train_df = pd.DataFrame({"id": train_ids, "label": train_labels_arr})
train_df.to_csv(os.path.join(PREPARED_DATA_FOLDER, "train_labels.csv"), index=False)

# Validation labels (if applicable)
if DO_SPLIT_VAL:
    val_ids = [get_id_from_path(all_filenames[i]) for i in val_indices]
    val_labels_arr = all_labels[val_indices]
    val_df = pd.DataFrame({"id": val_ids, "label": val_labels_arr})
    val_df.to_csv(os.path.join(PREPARED_DATA_FOLDER, "val_labels.csv"), index=False)

# Test labels
test_ids = [get_id_from_path(all_filenames[i]) for i in test_indices]
test_labels_arr = all_labels[test_indices]
test_df = pd.DataFrame({"id": test_ids, "label": test_labels_arr})
test_df.to_csv(os.path.join(PREPARED_DATA_FOLDER, "test_labels.csv"), index=False)

print("train dataset size:", len(train_ts))
if DO_SPLIT_VAL:
    print("validation dataset size:", len(val_ts))
print("test dataset size:", len(test_ts))

train dataset size: 17601
test dataset size: 4401


In [24]:
# getting the torch tensor image & target variable
ii = -1
for x, y in train_ts:
    print(x.shape, y)
    ii += 1
    if ii > 5:
        break

torch.Size([3, 96, 96]) 1
torch.Size([3, 96, 96]) 1
torch.Size([3, 96, 96]) 1
torch.Size([3, 96, 96]) 0
torch.Size([3, 96, 96]) 0
torch.Size([3, 96, 96]) 0
torch.Size([3, 96, 96]) 1


# Transform and save

In [25]:
# Define the transformations for the training dataset, with explanations for each step
tr_transf = transforms.Compose(
    [
        # Randomly flip the image horizontally with a probability to augment data and help the model generalize
        transforms.RandomHorizontalFlip(p=HORIZONTAL_FLIP_PROB),
        # Randomly flip the image vertically with a probability (disabled here).
        # Set to 0.5 only if the label is invariant to vertical flips.
        transforms.RandomVerticalFlip(p=VERTICAL_FLIP_PROB),
        # Randomly rotate the image within a range of ±ROTATION_DEGREES degrees to introduce rotational invariance.
        # The output image keeps its original size (no expand).
        transforms.RandomRotation(degrees=ROTATION_DEGREES),
        # Randomly change the brightness, contrast, saturation, and hue of the image to simulate different lighting conditions.
        transforms.ColorJitter(
            brightness=COLORJITTER_BRIGHTNESS,  # Adjust brightness by ±15%
            contrast=COLORJITTER_CONTRAST,  # Adjust contrast by ±15%
            saturation=COLORJITTER_SATURATION,  # Adjust saturation by ±10%
            hue=COLORJITTER_HUE,  # Adjust hue by ±2%
        ),
        # Apply Gaussian blur with a kernel size and a sigma randomly chosen in the range.
        # This helps the model become robust to slight blurring in images.
        transforms.GaussianBlur(
            kernel_size=GAUSSIANBLUR_KERNEL_SIZE, sigma=GAUSSIANBLUR_SIGMA
        ),
        # Convert the PIL Image or numpy.ndarray to a PyTorch tensor.
        # Note: Do not normalize here; normalization can be done later if needed.
        transforms.ToTensor(),
    ]
)

In [26]:
def save_dataset_split(
    dataset,
    subset_indices,
    out_dir,
    img_dataset,
    transform=None,
    apply_transform=False,
    file_format="TIFF",
):
    os.makedirs(out_dir, exist_ok=True)
    for local_idx, ds_idx in enumerate(subset_indices):
        orig_path = img_dataset.full_filenames[ds_idx]
        base_name = os.path.splitext(os.path.basename(orig_path))[0]
        save_path = os.path.join(out_dir, f"{base_name}.tif")

        img, label = dataset[local_idx]

        if isinstance(img, Image.Image):
            img_pil = img
        else:
            img_pil = transforms.ToPILImage()(img)

        if apply_transform and transform is not None:
            img_transformed = transform(img_pil)
            if not isinstance(img_transformed, Image.Image):
                img_transformed = transforms.ToPILImage()(img_transformed)
            img_to_save = img_transformed
        else:
            if not isinstance(img_pil, Image.Image):
                img_pil = transforms.ToPILImage()(img_pil)
            img_to_save = img_pil

        img_to_save.save(save_path, format=file_format)


# --- Save train set with transformation ---
out_dir = os.path.join(PREPARED_DATA_FOLDER, "train")
subset_indices = (
    train_ts.indices if hasattr(train_ts, "indices") else list(range(len(train_ts)))
)
save_dataset_split(
    train_ts,
    subset_indices,
    out_dir,
    img_dataset,
    transform=tr_transf if "tr_transf" in globals() else None,
    apply_transform=True,
    file_format="TIFF",
)

# --- Save val set without transformation ---
if DO_SPLIT_VAL:
    out_dir_val = os.path.join(PREPARED_DATA_FOLDER, "val")
    val_subset_indices = (
        val_ts.indices if hasattr(val_ts, "indices") else list(range(len(val_ts)))
    )
    save_dataset_split(
        val_ts,
        val_subset_indices,
        out_dir_val,
        img_dataset,
        transform=None,
        apply_transform=False,
        file_format="TIFF",
    )

# --- Save test set without transformation ---
out_dir_test = os.path.join(PREPARED_DATA_FOLDER, "test")
test_subset_indices = (
    test_ts.indices if hasattr(test_ts, "indices") else list(range(len(test_ts)))
)
save_dataset_split(
    test_ts,
    test_subset_indices,
    out_dir_test,
    img_dataset,
    transform=None,
    apply_transform=False,
    file_format="TIFF",
)