In [6]:
import pandas as pd
from pathlib import Path
import re

# Cohort whose metadata tables are handled by this notebook.
COHORT = "CPTAC-BRCA"
# Directory that holds the cohort metadata files.
METADATA_DIR = Path("/metadata")

# Input: tab-separated sample matrix of the cohort.
sample_matrix_file = METADATA_DIR.joinpath(f"{COHORT}_sample_matrix.tsv")
# Output: per-patient clinical table, written as CSV.
clini_file = METADATA_DIR.joinpath(f"{COHORT}_CLINI.csv")

In [11]:
# Build the per-patient clinical table (one row per PATIENT) from the
# per-sample matrix and write it to `clini_file`.
df = pd.read_csv(sample_matrix_file, sep="\t")

id_col = "studyID:sampleId"
# "<study>:<LETTERS><patient>": the capture group is the patient part, which
# has to start with a digit.
pattern = re.compile(r"^[\w]+:[A-Z]+(\d[\w]*)$")

# Vectorised extraction. IDs that do not match become NaN here, so they can be
# reported explicitly instead of failing with an AttributeError on `None.group`.
patient_id = df[id_col].str.extract(pattern.pattern, expand=False)
unmatched = df.loc[patient_id.isna(), id_col]
if not unmatched.empty:
    raise ValueError(
        f"{len(unmatched)} value(s) in {id_col!r} do not match "
        f"{pattern.pattern!r}, e.g. {unmatched.head(5).tolist()}"
    )
df["PATIENT"] = patient_id

# Check if there are conflicting values for a feature for a patient.
# dropna=False makes NaN count as a value: a feature that is missing in every
# sample of a patient yields 1 (consistent) rather than 0, so that patient is
# no longer mistaken for a conflict, while "NaN in one sample, value in
# another" is still flagged.
# NOTE(review): "Altered" takes part in this check although it is removed from
# the output below -- confirm that this is intended.
n_values = df.drop(columns=[id_col]).groupby("PATIENT").nunique(dropna=False)
dup = n_values.index[(n_values > 1).any(axis=1)]

if len(dup) > 0:
    print(f"Found {len(dup)} patients with multiple values for a feature:")
    print(df[df["PATIENT"].isin(dup)])
    print("Dropping these patients.")
    df = df[~df["PATIENT"].isin(dup)]

# Every remaining patient has identical feature values across all of its
# samples, so keeping the first row per patient loses no information and
# guarantees a unique PATIENT index in the output.
df = (
    df.drop(columns=[id_col, "Altered"])
    .drop_duplicates(subset="PATIENT")
    .set_index("PATIENT")
)
df.to_csv(clini_file, index=True)