# HIPT TCGA BRCA Dataset Splitting

In [None]:
import os
import pandas as pd
from pathlib import Path

# disable pandas' chained-assignment warning for the rest of the notebook
pd.set_option('mode.chained_assignment', None)

In [None]:
# work from the hipt project directory: the relative 'data/tcga_brca/...' output
# paths used further down are resolved against this working directory
os.chdir('/data/pathology/projects/ais-cap/code/git/clemsgrs/hipt')

### Create label dataframe

Get the HIPT BRCA slide ids.

In [None]:
# root directory of the HIPT code base; the split and label csv files used below are read from it
hipt_dir = Path('/data/pathology/projects/ais-cap/code/git/opensource/HIPT')

In [None]:
# split file of fold 0; its first column is used as the index
fold_0_fp = hipt_dir / '2-Weakly-Supervised-Subtyping/splits/10foldcv_subtype/tcga_brca/splits_0.csv'
df = pd.read_csv(filepath_or_buffer=fold_0_fp, index_col=0)

In [None]:
# unique, non-null slide ids found in the train / val / test columns of the split file
hipt_train_brca, hipt_tune_brca, hipt_test_brca = (
    list(df[column].dropna().unique()) for column in ('train', 'val', 'test')
)

In [None]:
# turn the three id lists into sets so they can be combined with set operations
hipt_train_brca, hipt_tune_brca, hipt_test_brca = map(
    set, (hipt_train_brca, hipt_tune_brca, hipt_test_brca)
)

In [None]:
# every slide id that appears in any of the three partitions
hipt_brca = set().union(hipt_train_brca, hipt_tune_brca, hipt_test_brca)
len(hipt_brca)

In [None]:
# load the HIPT label table (zipped csv, first column used as index)
hipt_labels_csv_path = Path(hipt_dir, '2-Weakly-Supervised-Subtyping/dataset_csv/tcga_brca_subset.csv.zip')
hipt_labels_df = pd.read_csv(hipt_labels_csv_path, index_col=0)
# reduce each slide_id to its file stem (directory part and last extension removed);
# the ids are compared against the split-file slide ids in the next step
hipt_labels_df['slide_id'] = hipt_labels_df['slide_id'].apply(lambda x: Path(x).stem)

In [None]:
# keep only the labelled slides that belong to the fold-0 train/val/test split;
# take an explicit copy because a 'label' column is added to this frame later on,
# which would otherwise be an assignment on a slice of hipt_labels_df
hipt_fold_labels_df = hipt_labels_df[hipt_labels_df['slide_id'].isin(hipt_brca)].copy()
# number of distinct slides kept
hipt_fold_labels_df.slide_id.dropna().nunique()

In [None]:
# distribution of oncotree codes among the kept slides, missing values included
hipt_fold_labels_df['oncotree_code'].value_counts(dropna=False)

In [None]:
def map_otc_to_int(otc: str, missing_label: int = -1):
    """Map an oncotree code to an integer class label.

    'IDC' maps to 0 and 'ILC' maps to 1; any other value (including
    non-string values) maps to ``missing_label``.
    """
    for code, label in (('IDC', 0), ('ILC', 1)):
        if otc == code:
            return label
    return missing_label

In [None]:
# add an integer 'label' column derived from the oncotree code (see map_otc_to_int)
hipt_fold_labels_df['label'] = hipt_fold_labels_df['oncotree_code'].apply(map_otc_to_int)

In [None]:
# value distribution of the 'train' column, missing values included
hipt_fold_labels_df['train'].value_counts(dropna=False)

In [None]:
# label_df is only created in the next cell, so referencing it here fails on a
# top-to-bottom run; it is hipt_fold_labels_df minus one column, hence the same row count
len(hipt_fold_labels_df)

In [None]:
# drop the columns that are not needed in the exported label file
cols_to_drop = ['train']
label_df = hipt_fold_labels_df.drop(columns=cols_to_drop)
# make sure the output directory exists before writing (to_csv does not create it)
labels_csv_path = Path('data/tcga_brca/labels.csv')
labels_csv_path.parent.mkdir(parents=True, exist_ok=True)
label_df.to_csv(labels_csv_path, index=False)

### Create per-fold train/tune/test label files from the HIPT splits

In [None]:
# For each of the HIPT cross-validation folds, write train/tune/test label csv
# files restricted to the slides present in label_df.
nfold = 10
cols = ['case_id', 'slide_id', 'label']
# build the lookup of labelled slide ids once instead of recomputing
# label_df.slide_id.unique() for every slide id of every split
labelled_slide_ids = set(label_df['slide_id'].unique())
for i in range(nfold):
    fold_fp = Path(hipt_dir, f'2-Weakly-Supervised-Subtyping/splits/10foldcv_subtype/tcga_brca/splits_{i}.csv')
    df = pd.read_csv(fold_fp, index_col=0)
    # retrieve train/tune/test slide ids (HIPT names the tuning split 'val')
    train = [s for s in df['train'].dropna().unique() if s in labelled_slide_ids]
    tune = [s for s in df['val'].dropna().unique() if s in labelled_slide_ids]
    test = [s for s in df['test'].dropna().unique() if s in labelled_slide_ids]
    # create train/tune/test label dataframes
    train_df = label_df[label_df['slide_id'].isin(train)][cols]
    tune_df = label_df[label_df['slide_id'].isin(tune)][cols]
    test_df = label_df[label_df['slide_id'].isin(test)][cols]
    # save train/tune/test label df to disk
    split_dir = Path(f'data/tcga_brca/splits/fold_{i}')
    split_dir.mkdir(parents=True, exist_ok=True)
    train_df.to_csv(Path(split_dir, 'train.csv'), index=False)
    tune_df.to_csv(Path(split_dir, 'tune.csv'), index=False)
    test_df.to_csv(Path(split_dir, 'test.csv'), index=False)