In [26]:
import pandas as pd
from pathlib import Path
from loguru import logger

# Cohort identifier; it prefixes the clinical table name built below.
COHORT = "CAMELYON17"
# Directory that contains the clinical table.
METADATA_DIR = Path("/metadata")

# Input clinical table: <METADATA_DIR>/<COHORT>_clinical.csv
clinical_file = METADATA_DIR.joinpath(f"{COHORT}_clinical.csv")

In [34]:
# Build the clinical table used for the per-center splits.
#
# Result: `dfc`, indexed by PATIENT (the id with ".tif" removed), with the
# original stage column renamed to `lymph_status` plus two derived columns:
#   lymph  - binary label derived from the stage
#   center - integer center id derived from the patient number

# Consecutive blocks of this many patient numbers share one center id
# (patient_000 -> center 0, patient_099 -> center 4).
PATIENTS_PER_CENTER = 20

# Every non-"negative" stage counts as "positive"; stages missing from this
# mapping become NaN in the `lymph` column.
LYMPH_BY_STAGE = {"micro": "positive", "macro": "positive", "itc": "positive", "negative": "negative"}

clinical_raw = pd.read_csv(clinical_file)

# Keep only rows whose id ends in ".tif". `na=False` treats missing ids as
# non-matching instead of yielding a NaN mask that boolean indexing rejects.
# The explicit copy makes the column assignments below operate on an
# independent frame (no SettingWithCopyWarning, `clinical_raw` stays intact).
has_tif_suffix = clinical_raw["patient"].str.endswith(".tif", na=False)
dfc = clinical_raw.loc[has_tif_suffix].copy()

dfc["patient"] = dfc["patient"].str.replace(".tif", "", regex=False)
dfc["lymph"] = dfc["stage"].map(LYMPH_BY_STAGE)

# expand=False returns a Series for the single capture group, so the
# arithmetic below yields a plain column rather than a one-column DataFrame.
patient_number = dfc["patient"].str.extract(r"patient_(\d+)_node_\d+", expand=False)
dfc["center"] = patient_number.astype(int) // PATIENTS_PER_CENTER

dfc = dfc.rename(columns={"patient": "PATIENT", "stage": "lymph_status"}).set_index("PATIENT")
dfc

Unnamed: 0_level_0,lymph_status,lymph,center
PATIENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
patient_000_node_0,negative,negative,0
patient_000_node_1,negative,negative,0
patient_000_node_2,negative,negative,0
patient_000_node_3,negative,negative,0
patient_000_node_4,negative,negative,0
...,...,...,...
patient_099_node_0,negative,negative,4
patient_099_node_1,negative,negative,4
patient_099_node_2,negative,negative,4
patient_099_node_3,negative,negative,4


In [35]:
# Leave-one-center-out splits: each center id in turn becomes the test set
# and all remaining rows form the training set. Both tables are written to
# METADATA_DIR as <COHORT>_<center>_{train,test}_CLINI.csv.
for center in dfc["center"].unique():
    is_held_out = dfc["center"] == center
    train_df, test_df = dfc.loc[~is_held_out], dfc.loc[is_held_out]

    train_path = METADATA_DIR / f"{COHORT}_{center}_train_CLINI.csv"
    test_path = METADATA_DIR / f"{COHORT}_{center}_test_CLINI.csv"
    train_df.to_csv(train_path)
    test_df.to_csv(test_path)

    # NOTE(review): the counts below are table rows (one per PATIENT index
    # entry, e.g. patient_000_node_0), not distinct patients.
    logger.info(f"Saved {len(train_df)} train and {len(test_df)} test patients for center {center}")

[32m2023-11-04 20:28:27.223[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mSaved 400 train and 100 test patients for center 0[0m
[32m2023-11-04 20:28:27.239[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mSaved 400 train and 100 test patients for center 1[0m
[32m2023-11-04 20:28:27.253[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mSaved 400 train and 100 test patients for center 2[0m
[32m2023-11-04 20:28:27.265[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mSaved 400 train and 100 test patients for center 3[0m
[32m2023-11-04 20:28:27.277[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mSaved 400 train and 100 test patients for center 4[0m
