In [23]:
import pandas as pd
from pathlib import Path
import re

# Cohort to process. Supported values (each has matching cohort-specific
# settings further down in this cell):
#   "TCGA-BRCA", "CPTAC-BRCA", "TCGA-CRC", "CPTAC-CRC"
# Assign exactly one value here; stacking several assignments and relying on
# the last one to win makes it easy to misread which cohort is active.
COHORT = "CPTAC-CRC"
METADATA_DIR = Path("/metadata")

# Sample-matrix IDs are matched as "<word chars>:<sample id>".
# TCGA: capture the first three hyphen-separated fields after the colon and
#       require a trailing "-<digits>" suffix, which is not captured.
# CPTAC: skip any leading uppercase letters after the colon and capture the
#        rest, which must start with a digit.
TCGA_PATIENT_ID_PATTERN = r"^[\w]+:(\w+-\w+-\w+)-\d+$"
CPTAC_PATIENT_ID_PATTERN = r"^[\w]+:[A-Z]*(\d[\w]*)$"

# Evaluate the cohort family once so both patterns below stay in agreement.
IS_TCGA_COHORT = COHORT.upper().startswith("TCGA")

# Pattern applied to the sample-matrix ID column; group 1 is the patient ID.
SAMPLE_MATRIX_PATIENT_ID_PATTERN = re.compile(
    TCGA_PATIENT_ID_PATTERN if IS_TCGA_COHORT else CPTAC_PATIENT_ID_PATTERN
)
# Pattern applied to the clinical "Patient ID" column; group 1 is the patient ID.
# TCGA IDs are taken verbatim; CPTAC IDs drop any leading uppercase letters.
CLINICAL_PATIENT_ID_PATTERN = re.compile(r"(.+)" if IS_TCGA_COHORT else r"[A-Z]*(\d[\w]*)")


# Per-cohort settings, keyed by the value of COHORT.
#   "targets": clinical-table column name -> column name used in the output.
#   "replace": output column name -> {raw value: harmonised value}.
_COHORT_SETTINGS = {
    "TCGA-BRCA": {
        "targets": {"Subtype": "subtype"},
        "replace": {
            "subtype": {
                "BRCA_LumA": "LumA",
                "BRCA_LumB": "LumB",
                "BRCA_Her2": "Her2",
                "BRCA_Basal": "Basal",
                "BRCA_Normal": "Normal",
            }
        },
    },
    "CPTAC-BRCA": {
        "targets": {"PAM50": "subtype"},
        "replace": {
            "subtype": {
                "LumA": "LumA",
                "LumB": "LumB",
                "Her2": "Her2",
                "Basal": "Basal",
                "Normal-like": "Normal",
            }
        },
    },
    "TCGA-CRC": {
        "targets": {"MSI Status": "MSI"},
        "replace": {"MSI": {"MSI-H": "MSI", "MSI-L": "MSS", "MSS": "MSS", "NA": None}},
    },
    "CPTAC-CRC": {
        "targets": {"MSI Status": "MSI"},
        "replace": {
            "MSI": {
                "MSI-H": "MSI",
                "MSS": "MSS",
            }
        },
    },
}

# Fail here, with a readable message, rather than with a NameError in a later
# cell when the cohort name is misspelled or not yet configured.
if COHORT not in _COHORT_SETTINGS:
    raise ValueError(f"Unsupported COHORT {COHORT!r}; expected one of {sorted(_COHORT_SETTINGS)}")

CLINICAL_TARGETS = _COHORT_SETTINGS[COHORT]["targets"]
REPLACE_CATEGORICAL_VALUES = _COHORT_SETTINGS[COHORT]["replace"]


# Input and output locations, all derived from the cohort name.
sample_matrix_file = METADATA_DIR.joinpath(f"{COHORT}_sample_matrix.tsv")  # input: per-sample feature matrix
clinical_file = METADATA_DIR.joinpath(f"{COHORT}_clinical.tsv")  # input: clinical table
clini_table = METADATA_DIR.joinpath(f"{COHORT}_CLINI.csv")  # output file

In [24]:
# Load the clinical table, index it by the patient ID extracted from
# "Patient ID", and keep only the renamed target column(s).
dfc = (
    pd.read_csv(clinical_file, sep="\t")
    .assign(
        PATIENT=lambda frame: frame["Patient ID"].apply(
            lambda raw_id: CLINICAL_PATIENT_ID_PATTERN.match(raw_id).group(1)
        )
    )
    .set_index("PATIENT")
    .rename(columns=CLINICAL_TARGETS)
    .loc[:, list(CLINICAL_TARGETS.values())]
)
dfc

Unnamed: 0_level_0,MSI
PATIENT,Unnamed: 1_level_1
01CO001,MSS
01CO005,MSS
01CO006,MSS
01CO008,MSS
01CO013,MSS
...,...
21CO007,MSS
22CO004,MSS
22CO006,MSI-H
24CO005,MSS


In [25]:
# Load the per-sample feature matrix and reduce it to one row per patient.
df = pd.read_csv(sample_matrix_file, sep="\t")

id_col = "studyID:sampleId"

# Group 1 of the cohort-specific pattern is the patient ID; several sample
# rows can therefore map to the same patient.
patient_id = df[id_col].apply(lambda x: SAMPLE_MATRIX_PATIENT_ID_PATTERN.match(x).group(1))
df["PATIENT"] = patient_id

# Check if there are conflicting values for a feature for a patient.
# nunique() ignores missing values, so a feature that is entirely missing for
# a patient counts 0 distinct values; only counts above 1 are real conflicts.
values_per_patient = df.drop(columns=[id_col]).groupby("PATIENT").nunique()
dup = values_per_patient.index[(values_per_patient > 1).any(axis=1)]

if len(dup) > 0:
    print(f"Found {len(dup)} patients with multiple values for a feature:")
    print(df[df["PATIENT"].isin(dup)])
    print("Dropping these patients.")
    df = df[~df["PATIENT"].isin(dup)]

# Collapse the remaining rows to one per patient. After the conflict check each
# feature has at most one distinct non-missing value per patient, so taking the
# first non-missing value is unambiguous. Without this step, patients with
# several consistent sample rows would appear more than once in the index and
# be duplicated again by the later merge. sort=False keeps first-seen order.
df = df.drop(columns=[id_col, "Altered"]).groupby("PATIENT", sort=False).first()
df

Unnamed: 0_level_0,BRAF,KRAS,SMAD4
PATIENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01CO001,0,0,1
01CO005,0,0,0
01CO006,0,0,0
01CO008,0,1,0
01CO013,0,0,1
...,...,...,...
05CO014,0,0,0
05CO055,0,0,1
11CO059,0,0,0
16CO012,0,1,0


In [26]:
# Count patient IDs that occur in only one of the two tables:
# first number = clinical-only, second number = sample-matrix-only.
clinical_patients = set(dfc.index.values)
matrix_patients = set(df.index.values)
clinical_only = clinical_patients - matrix_patients
matrix_only = matrix_patients - clinical_patients
print("Unmatched:", len(clinical_only), len(matrix_only))

Unmatched: 0 0


In [27]:
# Join features and clinical targets on the patient index. The outer join keeps
# patients present in only one table; their missing side is filled with NaN.
# NOTE: this reassigns `df` from itself, so re-run the previous cell before
# re-running this one.
df = df.merge(dfc, left_index=True, right_index=True, how="outer")
for col, mapping in REPLACE_CATEGORICAL_VALUES.items():
    # Report only values that occur in the data but have no mapping entry.
    # Missing values are excluded, and mapping keys that never occur in the
    # data are not a problem, so a plain set difference is used here.
    if unknown := set(df[col].dropna().unique()) - set(mapping):
        print(f"WARNING: unmapped values for {col}: {unknown}")
    # replace() leaves values without a mapping entry unchanged.
    df[col] = df[col].replace(mapping)



In [28]:
# Write the merged feature + clinical table to the cohort's output CSV path.
df.to_csv(clini_table)

In [29]:
# Display the merged table that was written out, for a visual sanity check.
df

Unnamed: 0_level_0,BRAF,KRAS,SMAD4,MSI
PATIENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
01CO001,0,0,1,MSS
01CO005,0,0,0,MSS
01CO006,0,0,0,MSS
01CO008,0,1,0,MSS
01CO013,0,0,1,MSS
...,...,...,...,...
21CO007,0,1,0,MSS
22CO004,0,0,0,MSS
22CO006,1,0,0,MSI
24CO005,0,0,0,MSS
