In [32]:
import pandas as pd
from pathlib import Path
import re

COHORT = "TCGA-BRCA"
METADATA_DIR = Path("/metadata")

# Sample IDs have the form "<study>:<sample>"; the single capture group is the patient ID.
# TCGA: capture three dash-separated fields and ignore the trailing "-<digits>" suffix.
TCGA_PATIENT_ID_PATTERN = r"^[\w]+:(\w+-\w+-\w+)-\d+$"
# CPTAC: skip a run of capital letters and capture from the first digit onwards.
CPTAC_PATIENT_ID_PATTERN = r"^[\w]+:[A-Z]+(\d[\w]*)$"
PATIENT_ID_PATTERN = re.compile(
    TCGA_PATIENT_ID_PATTERN if COHORT.upper().startswith("TCGA") else CPTAC_PATIENT_ID_PATTERN
)

# Per-cohort settings:
#   clinical_targets: clinical-file column name -> output column name
#   replace_categorical_values: output column name -> {raw value -> harmonised value}
COHORT_CONFIGS = {
    "TCGA-BRCA": {
        "clinical_targets": {"Subtype": "subtype"},
        "replace_categorical_values": {
            "subtype": {
                "BRCA_LumA": "LumA",
                "BRCA_LumB": "LumB",
                "BRCA_Her2": "Her2",
                "BRCA_Basal": "Basal",
                "BRCA_Normal": "Normal",
            }
        },
    },
    "CPTAC-BRCA": {
        "clinical_targets": {"PAM50": "subtype"},
        "replace_categorical_values": {
            "subtype": {
                "LumA": "LumA",
                "LumB": "LumB",
                "Her2": "Her2",
                "Basal": "Basal",
                "Normal": "Normal-like",
            }
        },
    },
}


def get_cohort_config(cohort):
    """Return the settings dict for ``cohort``.

    Raises:
        ValueError: if ``cohort`` has no entry in ``COHORT_CONFIGS``. Failing here
            keeps later cells from hitting a NameError or silently reusing the
            targets of a previously configured cohort left in the kernel.
    """
    try:
        return COHORT_CONFIGS[cohort]
    except KeyError:
        supported = ", ".join(sorted(COHORT_CONFIGS))
        raise ValueError(
            f"Unsupported cohort {cohort!r}; supported cohorts: {supported}"
        ) from None


_cohort_config = get_cohort_config(COHORT)
CLINICAL_TARGETS = _cohort_config["clinical_targets"]
REPLACE_CATEGORICAL_VALUES = _cohort_config["replace_categorical_values"]

sample_matrix_file = METADATA_DIR / f"{COHORT}_sample_matrix.tsv"
clinical_file = METADATA_DIR / f"{COHORT}_clinical.tsv"
clini_table = METADATA_DIR / f"{COHORT}_CLINI.csv"  # output file

In [33]:
# Clinical table: index rows by patient, rename the cohort's target columns to
# their output names, and keep only those target columns.
clinical_raw = pd.read_csv(clinical_file, sep="\t")
dfc = (
    clinical_raw
    .set_index(clinical_raw["Patient ID"].rename("PATIENT"))
    .rename(columns=CLINICAL_TARGETS)
    .loc[:, list(CLINICAL_TARGETS.values())]
)

In [34]:
# Load the sample matrix; each row is keyed by a "<study>:<sample>" identifier.
df = pd.read_csv(sample_matrix_file, sep="\t")

id_col = "studyID:sampleId"

# Derive the patient ID with the cohort-specific pattern from the config cell.
# A hard-coded TCGA regex here would fail on every CPTAC sample ID.
patient_id = df[id_col].str.extract(PATIENT_ID_PATTERN.pattern, expand=False)

# Fail loudly, naming offending IDs, instead of crashing on a failed regex match.
unparsed = df.loc[patient_id.isna(), id_col]
if not unparsed.empty:
    raise ValueError(
        f"{len(unparsed)} sample IDs do not match the patient ID pattern for "
        f"{COHORT}, e.g. {unparsed.head().tolist()}"
    )
df["PATIENT"] = patient_id

# Check if there are conflicting values for a feature for a patient.
# dropna=False counts a missing value as a value of its own: "0 and NaN" is a
# conflict, while a feature that is missing in every sample of a patient is not.
n_values = df.drop(columns=[id_col]).groupby("PATIENT").nunique(dropna=False)
dup = n_values.index[(n_values > 1).any(axis=1)]

if len(dup) > 0:
    print(f"Found {len(dup)} patients with multiple values for a feature:")
    print(df[df["PATIENT"].isin(dup)])
    print("Dropping these patients.")
    df = df[~df["PATIENT"].isin(dup)]

# Every remaining patient has identical feature values across its samples, so
# collapsing to one row per patient loses nothing and keeps the PATIENT index
# unique for the merge with the clinical table.
df = (
    df.drop(columns=[id_col, "Altered"])
    .drop_duplicates(subset="PATIENT")
    .set_index("PATIENT")
)

In [35]:
# Outer join on the PATIENT index keeps patients present in either table, then
# harmonise the categorical labels of each configured column.
df = pd.merge(df, dfc, how="outer", left_index=True, right_index=True)
df = df.assign(
    **{
        column: df[column].replace(value_map)
        for column, value_map in REPLACE_CATEGORICAL_VALUES.items()
    }
)

In [36]:
# Persist the merged table; the PATIENT index is written as the first CSV column.
df.to_csv(path_or_buf=clini_table, index=True)

In [37]:
# Show the merged table that was written to `clini_table`.
df

Unnamed: 0_level_0,CDH1,TP53,PIK3CA,subtype
PATIENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TCGA-3C-AAAU,0,0,0,LumA
TCGA-3C-AALI,0,1,0,Her2
TCGA-3C-AALJ,0,0,0,LumB
TCGA-3C-AALK,0,0,1,LumA
TCGA-4H-AAAK,1,0,0,LumA
...,...,...,...,...
TCGA-WT-AB44,1,0,0,LumA
TCGA-XX-A899,1,0,1,LumA
TCGA-XX-A89A,1,0,1,LumA
TCGA-Z7-A8R5,1,0,1,LumA
