Breast cancer stage prediction from pathological whole slide images with hierarchical image pyramid transformers.
Project developed under the "High Risk Breast Cancer Prediction Contest Phase 2" 
by Nightingale, Association for Health Learning & Inference (AHLI)
and Providence St. Joseph Health

Copyright (C) 2023 Zsolt Bedohazi, Andras Biricz, Istvan Csabai

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
import random

In [None]:
# 10846 slides - train
# 14466 slides - holdout 

### get biopsy_df for the training data 

In [None]:
# Goal: build a table that links every training slide to its biopsy outcome,
# from which the slide_id -> cancer stage mapping is derived.
slide_manifest_csv = "/home/ngsci/datasets/brca-psj-path/contest-phase-2/slide-manifest-train.csv"
outcomes_csv = "/home/ngsci/datasets/brca-psj-path/contest-phase-2/csv-train/outcomes.csv"

slide_biop_df = pd.read_csv(slide_manifest_csv)
outcomes_df = pd.read_csv(outcomes_csv)

# Inner join: only rows whose biopsy_id appears in both tables are kept.
slide_stage_df = slide_biop_df.merge(outcomes_df, how="inner", on="biopsy_id")

# map cancer stage to 0 - 4:
# outcomes_df["stage"].unique()
#     ['IA', 'IIB', 'IIA', '0', nan, 'IIIC', 'IV', 'IIIA', 'IIIB', 'IB']
def stage_to_int(stage):
    """Collapse a cancer stage label into its ordinal class.

    Sub-stages are merged into their parent stage: "0" -> 0, "IA"/"IB" -> 1,
    "IIA"/"IIB" -> 2, "IIIA"/"IIIB"/"IIIC" -> 3 and "IV" -> 4.

    Args:
        stage: Stage label to convert.

    Returns:
        The integer class 0-4, or ``np.nan`` for any value that is not one of
        the labels listed above (including missing values).
    """
    # Position in this tuple is the class returned for the labels it holds.
    stage_groups = (
        ("0",),
        ("IA", "IB"),
        ("IIA", "IIB"),
        ("IIIA", "IIIB", "IIIC"),
        ("IV",),
    )
    for level, labels in enumerate(stage_groups):
        if stage in labels:
            return level
    return np.nan


# Replace the textual stage labels by their ordinal class (NaN when unmapped).
stage_classes = slide_stage_df["stage"].apply(stage_to_int)
slide_stage_df["stage"] = stage_classes

# Keep only the identifier and label columns, then discard every row that has
# a missing value in any of them and renumber the remaining rows from 0.
label_columns = ["slide_id", "biopsy_id", "stage"]
labels_df = slide_stage_df[label_columns].copy()
labels_df = labels_df.dropna(how="any").reset_index(drop=True)

# No NaN is left in the column, so it can be stored as integers.
labels_df["stage"] = labels_df["stage"].astype(int)

# Reorder the rows by slide_id; the index was just reset, so the positions
# returned by argsort are also valid labels for .loc.
sort_idx = np.argsort(labels_df.slide_id.values)
labels_df = labels_df.loc[sort_idx].reset_index(drop=True)

labels_df.head(5)

In [None]:
# (rows, columns) of the slide-level label table; one row per labelled slide.
labels_df.shape

In [None]:
# Collapse the slide-level table to one row per biopsy, keeping the stage of
# the first slide listed for that biopsy. sort=False keeps the biopsies in
# order of first appearance. The built-in "first" aggregation replaces a
# per-group Python lambda; it returns the same value here because the stage
# column contains no missing values at this point.
biopsy_df = labels_df.groupby("biopsy_id", sort=False)["stage"].first().reset_index()
biopsy_df.head()

In [None]:
# (rows, columns) of the biopsy-level table; one row per biopsy_id.
biopsy_df.shape

In [None]:
# Number of biopsies in each stage class.
Counter(biopsy_df['stage'])

In [None]:
# Histogram of the biopsy-level stage classes (matplotlib default binning).
plt.hist(biopsy_df['stage'])

### Generate local test set (first fold of an 8-fold stratified split, i.e. 12.5%)

In [None]:
# Number of stratified folds; only the first fold is used further down as the
# local test set, i.e. 1/8 of the biopsies are held out.
n_splits = 8

# shuffle=True with a fixed random_state gives a shuffled but reproducible split.
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=87)

In [None]:
# Row labels of biopsy_df; they are also used positionally with .iloc below.
data_idx = biopsy_df.index.values

# Only the first stratified fold is needed: its validation part becomes the
# local test set and its training part is everything else.
train_idx, val_idx = next(skf.split(data_idx, biopsy_df.iloc[data_idx]['stage']))

train_splits = [train_idx]
val_splits = [val_idx]

In [None]:
# Sizes of the first fold: (number of remaining biopsies, number of held-out biopsies).
len(train_splits[0]), len(val_splits[0])

In [None]:
# Stage classes present in the held-out fold and how many biopsies each has.
print(np.unique(biopsy_df.iloc[val_splits[0]]['stage'], return_counts=True))

In [None]:
# Positional row indices (into biopsy_df) of the held-out fold.
val_splits[0]

In [None]:
# Rows of biopsy_df that form the local test set; the original row labels of
# biopsy_df are kept as the index.
biopsy_df_test = biopsy_df.iloc[val_splits[0]]
biopsy_df_test

### save test set

In [None]:

# Output directory for the split files; created if it does not exist yet.
save_dir = 'cv_splits_stratified_with_test_set_10fold/'
os.makedirs(save_dir, exist_ok=True)

# Write the held-out test biopsies without the DataFrame index column.
test_csv_path = f'{save_dir}test_split_stratified.csv'
biopsy_df_test.to_csv(test_csv_path, index=False)

### Generate cv split indices - stratified

In [None]:
# Preview of the biopsies that are NOT in the local test set.
# The rows are removed by the index labels of biopsy_df_test rather than via
# val_splits[0]: val_splits is overwritten by the cross-validation folds further
# down, so re-running this cell afterwards would otherwise select wrong rows.
biopsy_df.drop(index=biopsy_df_test.index)

In [None]:
# Biopsies left for cross-validation: everything except the local test set,
# renumbered from 0 so that labels and positions coincide again.
# The rows are removed by the index labels of biopsy_df_test rather than via
# val_splits[0]: val_splits is overwritten by the cross-validation folds further
# down, so re-running this cell afterwards would otherwise select wrong rows.
biopsy_df_rest = biopsy_df.drop(index=biopsy_df_test.index).reset_index(drop=True)

In [None]:
# Display the biopsies that remain for cross-validation.
biopsy_df_rest

In [None]:
# Number of cross-validation folds generated from the remaining biopsies.
n_splits = 10

In [None]:
# Stratified splitter for the cross-validation folds; shuffle=True with a fixed
# random_state gives a shuffled but reproducible split.
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=87)

In [None]:
# Display the splitter configuration.
skf

In [None]:
# Row labels of biopsy_df_rest; they are also used positionally with .iloc.
data_idx = biopsy_df_rest.index.values
stage_labels = biopsy_df_rest.iloc[data_idx]['stage']

# Collect the train / validation index arrays of every stratified fold;
# entry k of each list belongs to fold k.
train_splits, val_splits = [], []
for train_idx, val_idx in skf.split(data_idx, stage_labels):
    train_splits += [train_idx]
    val_splits += [val_idx]

In [None]:
# Sizes of the first cross-validation fold: (train biopsies, validation biopsies).
len(train_splits[0]), len(val_splits[0])

In [None]:
# Print, for each fold, the stage classes in its training part and their counts.
for fold_train_idx in train_splits[:n_splits]:
    fold_train_stages = biopsy_df_rest['stage'].iloc[fold_train_idx]
    print(np.unique(fold_train_stages, return_counts=True))

In [None]:
# Share of each stage class in the training part of the first fold, computed
# from the current split instead of hand-typed counts so that it stays correct
# whenever n_splits or the data change.
biopsy_df_rest.iloc[train_splits[0]]['stage'].value_counts(normalize=True).sort_index()

In [None]:
# Print, for each fold, the stage classes in its validation part and their counts.
for fold_val_idx in val_splits[:n_splits]:
    fold_val_stages = biopsy_df_rest['stage'].iloc[fold_val_idx]
    print(np.unique(fold_val_stages, return_counts=True))

In [None]:
# Share of each stage class in the validation part of the first fold, computed
# from the current split instead of hand-typed counts so that it stays correct
# whenever n_splits or the data change.
biopsy_df_rest.iloc[val_splits[0]]['stage'].value_counts(normalize=True).sort_index()

In [None]:
# Histogram of the stage classes in the training part of the first fold.
plt.hist(biopsy_df_rest.iloc[train_splits[0]]['stage'])

In [None]:
# Check if there is any overlap in the val sets. Every pair of folds is
# compared (not only fold 0 against the others); a line "i j [shared indices]"
# is printed for each overlapping pair, followed by a one-line summary.
val_sets = [set(fold) for fold in val_splits]
n_overlapping_pairs = 0
for i, fold_i in enumerate(val_sets):
    for j, fold_j in enumerate(val_sets[i + 1:], start=i + 1):
        shared = sorted(fold_i & fold_j)
        if shared:
            n_overlapping_pairs += 1
            print(i, j, shared)
print(f'{n_overlapping_pairs} overlapping pair(s) among {len(val_sets)} validation folds')

### save train and val splits - stratified

In [None]:
# Output directory for the split files; created if it does not exist yet.
save_dir = 'cv_splits_stratified_with_test_set_10fold/'
os.makedirs(save_dir, exist_ok=True)

# One train CSV and one val CSV per fold, numbered by fold and written without
# the DataFrame index column.
for s, fold_train_idx, fold_val_idx in zip(range(n_splits), train_splits, val_splits):
    train_csv_path = f'{save_dir}train_split_stratified_{s}.csv'
    val_csv_path = f'{save_dir}val_split_stratified_{s}.csv'

    biopsy_df_rest.iloc[fold_train_idx].to_csv(train_csv_path, index=False)
    biopsy_df_rest.iloc[fold_val_idx].to_csv(val_csv_path, index=False)