In [5]:
import pandas as pd
from pathlib import Path
import re

# Cohort to index. Edit this single assignment to switch cohorts.
# Supported values: "TCGA-BRCA", "TCGA-CRC", "CPTAC-BRCA", "CPTAC-CRC", "CAMELYON17"
COHORT = "CAMELYON17"

METADATA_DIR = Path("/metadata")
DATA_DIR = Path("/data")

# PATTERN is matched against each feature store's file name; capture group 1
# becomes the PATIENT identifier in the slide table.
if "TCGA" in COHORT:
    # First three hyphen-separated tokens of the name.
    PATTERN = r"^(\w+-\w+-\w+)-.*$"
elif "CPTAC" in COHORT:
    # Everything before the first hyphen.
    PATTERN = r"^(\w+)-.*$"
elif "CAMELYON" in COHORT:
    # "patient_<digits>_node_<single digit>" prefix of the name.
    PATTERN = r"^(patient_\d+_node_\d).*$"
else:
    # Fail here rather than with a NameError on PATTERN in a later cell.
    raise ValueError(f"No filename pattern defined for cohort {COHORT!r}")

features_dir = DATA_DIR / "histaug" / "features" / COHORT / "ctranspath"
slide_file = METADATA_DIR / f"{COHORT}_SLIDE.csv"  # output file

In [6]:
def find_zarrs(path: Path):
    """Recursively yield every path below ``path`` whose suffix is ``.zarr``.

    If ``path`` itself ends in ``.zarr`` it is yielded and not explored any
    further, so the contents of a ``.zarr`` directory are never listed.
    Otherwise each sub-directory of ``path`` is searched in turn; entries that
    are not directories are ignored.
    """
    if path.suffix == ".zarr":
        # Treat the store as a leaf: report it and stop descending.
        yield path
        return

    subdirectories = (entry for entry in path.iterdir() if entry.is_dir())
    for subdirectory in subdirectories:
        yield from find_zarrs(subdirectory)


# Gather every .zarr store below features_dir as a path relative to it, in sorted order.
files = [zarr_path.relative_to(features_dir) for zarr_path in find_zarrs(features_dir)]
files.sort()

In [7]:
# Compile once; the same expression is applied to every feature store name.
pattern = re.compile(PATTERN)

# Match every store name up front so that names the cohort pattern does not
# cover are reported together with a clear message, instead of surfacing as an
# AttributeError from calling .group() on None.
matches = [pattern.match(x.name) for x in files]
unmatched = [str(x) for x, m in zip(files, matches) if m is None]
if unmatched:
    raise ValueError(
        f"{len(unmatched)} file(s) do not match PATTERN {PATTERN!r}: {unmatched[:10]}"
    )

patient_id = [m.group(1) for m in matches]

# One row per feature store, indexed by the captured identifier; FILENAME holds
# the store path relative to features_dir. The table is written to slide_file.
df = pd.DataFrame({"PATIENT": patient_id, "FILENAME": files}).set_index("PATIENT")
df.to_csv(slide_file, index=True)