In [1]:
import pandas as pd
from pathlib import Path
import re

COHORT = "TCGA-BRCA"
COHORT = "CPTAC-BRCA"
COHORT = "CPTAC-CRC"
METADATA_DIR = Path("/metadata")

TCGA_PATIENT_ID_PATTERN = r"^[\w]+:(\w+-\w+-\w+)-\d+$"
CPTAC_PATIENT_ID_PATTERN = r"^[\w]+:[A-Z]+(\d[\w]*)$"
SAMPLE_MATRIX_PATIENT_ID_PATTERN = re.compile(
    TCGA_PATIENT_ID_PATTERN if COHORT.upper().startswith("TCGA") else CPTAC_PATIENT_ID_PATTERN
)
CLINICAL_PATIENT_ID_PATTERN = re.compile(r"(.+)" if COHORT.upper().startswith("TCGA") else r"[A-Z]+(\d[\w]*)")


if COHORT == "TCGA-BRCA":
    CLINICAL_TARGETS = {"Subtype": "subtype"}
    REPLACE_CATEGORICAL_VALUES = {
        "subtype": {
            "BRCA_LumA": "LumA",
            "BRCA_LumB": "LumB",
            "BRCA_Her2": "Her2",
            "BRCA_Basal": "Basal",
            "BRCA_Normal": "Normal",
        }
    }
elif COHORT == "CPTAC-BRCA":
    CLINICAL_TARGETS = {"PAM50": "subtype"}
    REPLACE_CATEGORICAL_VALUES = {
        "subtype": {
            "LumA": "LumA",
            "LumB": "LumB",
            "Her2": "Her2",
            "Basal": "Basal",
            "Normal-like": "Normal",
        }
    }
elif COHORT == "CPTAC-CRC":
    CLINICAL_TARGETS = {"MSI Status": "msi"}
    REPLACE_CATEGORICAL_VALUES = {
        "msi": {
            "MSI-H": "MSI-H",
            "MSS": "MSS",
        }
    }


sample_matrix_file = METADATA_DIR / f"{COHORT}_sample_matrix.tsv"
clinical_file = METADATA_DIR / f"{COHORT}_clinical.tsv"
clini_table = METADATA_DIR / f"{COHORT}_CLINI.csv"  # output file

In [5]:
dfc

Unnamed: 0,Study ID,Patient ID,Sample ID,Age,Cancer Type,Cancer Type Detailed,Copy Number,Disease Free (Months),Disease Free Status,Fraction Genome Altered,...,Primary Site,Protein,Number of Samples Per Patient,Sequenced,Sex,Somatic Status,Specimen Preservation Method,Cancer Stage,TMB (nonsynonymous),Tumor Site
0,coad_cptac_2019,01CO001,01CO001,61.0,Colorectal Cancer,Colon Adenocarcinoma,1,32.0,0:DiseaseFree,0.1486,...,Sigmoid Colon,0,1,1,Male,Matched,Frozen Tissue,Stage III,2.566667,Sigmoid Colon
1,coad_cptac_2019,01CO005,01CO005,70.0,Colorectal Cancer,Colon Adenocarcinoma,1,,0:DiseaseFree,0.5642,...,Sigmoid Colon,1,1,1,Female,Matched,Frozen Tissue,Stage II,4.366667,Sigmoid Colon
2,coad_cptac_2019,01CO006,01CO006,75.0,Colorectal Cancer,Colon Adenocarcinoma,1,29.0,1:Recurred/Progressed,0.0105,...,Ascending Colon,1,1,1,Female,Matched,Frozen Tissue,Stage III,2.700000,Ascending Colon
3,coad_cptac_2019,01CO008,01CO008,54.0,Colorectal Cancer,Colon Adenocarcinoma,1,,1:Recurred/Progressed,0.0515,...,Descending Colon,1,1,1,Female,Matched,Frozen Tissue,Stage II,5.166667,Descending Colon
4,coad_cptac_2019,01CO013,01CO013,58.0,Colorectal Cancer,Colon Adenocarcinoma,1,13.0,0:DiseaseFree,0.1624,...,Sigmoid Colon,1,1,1,Male,Matched,Frozen Tissue,Stage I,2.633333,Sigmoid Colon
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,coad_cptac_2019,21CO007,21CO007,50.0,Colorectal Cancer,Colon Adenocarcinoma,1,,,0.4457,...,Sigmoid Colon,1,1,1,Male,Matched,Frozen Tissue,Stage IV,4.466667,Sigmoid Colon
106,coad_cptac_2019,22CO004,22CO004,52.0,Colorectal Cancer,Colon Adenocarcinoma,1,29.0,0:DiseaseFree,0.3811,...,Sigmoid Colon,1,1,1,Male,Matched,Frozen Tissue,Stage II,2.866667,Sigmoid Colon
107,coad_cptac_2019,22CO006,22CO006,57.0,Colorectal Cancer,Colon Adenocarcinoma,1,22.0,1:Recurred/Progressed,0.0055,...,Ascending Colon,1,1,1,Female,Matched,Frozen Tissue,Stage III,34.933333,Ascending Colon
108,coad_cptac_2019,24CO005,24CO005,41.0,Colorectal Cancer,Colon Adenocarcinoma,1,,,0.3783,...,Sigmoid Colon,0,1,1,Female,Matched,,Stage III,3.200000,Sigmoid Colon


In [2]:
dfc = pd.read_csv(clinical_file, sep="\t")
dfc["PATIENT"] = dfc["Patient ID"].apply(lambda x: CLINICAL_PATIENT_ID_PATTERN.match(x).group(1))
dfc.set_index("PATIENT", inplace=True)
dfc.rename(columns=CLINICAL_TARGETS, inplace=True)
dfc = dfc[list(CLINICAL_TARGETS.values())]
# dfc

AttributeError: 'NoneType' object has no attribute 'group'

In [43]:
df = pd.read_csv(sample_matrix_file, sep="\t")

id_col = "studyID:sampleId"

patient_id = df[id_col].apply(lambda x: SAMPLE_MATRIX_PATIENT_ID_PATTERN.match(x).group(1))
df["PATIENT"] = patient_id

# Check if there are duplicate values for a feature for a patient
dup = df.drop(columns=[id_col]).groupby("PATIENT").nunique() != 1
dup = dup.index[dup.any(axis=1)]

if len(dup) > 0:
    print(f"Found {len(dup)} patients with multiple values for a feature:")
    print(df[df["PATIENT"].isin(dup)])
    print("Dropping these patients.")
    df = df[~df["PATIENT"].isin(dup)]

df.drop(columns=[id_col, "Altered"], inplace=True)
df.set_index("PATIENT", inplace=True)
# df

In [44]:
print(
    "Unmatched:", len(set(dfc.index.values) - set(df.index.values)), len(set(df.index.values) - set(dfc.index.values))
)

Unmatched: 0 0


In [45]:
df = df.merge(dfc, left_index=True, right_index=True, how="outer")
for col, mapping in REPLACE_CATEGORICAL_VALUES.items():
    if unknown := set(df[col].unique()) ^ set(mapping.keys()):
        print(f"WARNING: unmapped values for {col}: {unknown}")
    df[col] = df[col].replace(mapping)

In [46]:
df.to_csv(clini_table)

In [47]:
df

Unnamed: 0_level_0,CDH1,TP53,PIK3CA,subtype
PATIENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
000814,0,1,0,Basal
001846,0,1,1,Basal
01BR001,0,1,0,Basal
01BR008,0,1,0,Basal
01BR009,0,0,0,Basal
...,...,...,...,...
21BR001,0,1,1,LumB
21BR002,0,0,1,LumA
21BR010,0,0,1,LumA
22BR005,0,1,1,LumA
