## UCEC Recurrence Notebook - Pre-processing
This program processes data from ucec_tcga_pan_can_atlas_2018 to be ready for training a machine learning algorithm to predict recurrence. 

In [17]:
import config

import pandas as pd
import numpy as np
import joblib
from collections import Counter

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [18]:
# Load the mRNA expression table and the clinical patient table.
mrna_df = pd.read_csv("../ucec_tcga_pan_can_atlas_2018/data_mrna_seq_v2_rsem_zscores_ref_all_samples.txt", sep="\t", comment="#")

# Use forward slashes throughout: a backslash before "d" is an invalid escape sequence
# and the resulting path only resolves on Windows.
clinical_df = pd.read_csv("../ucec_tcga_pan_can_atlas_2018/data_clinical_patient.txt", sep="\t", comment="#", low_memory=False)
clinical_df = clinical_df.set_index('PATIENT_ID')
# There are 527 patients in the mRNA and 529 patients in the clinical data

# The first 2 columns of the mRNA data are labels (Hugo_Symbol then Entrez_Gene_Id).
# 13 of the genes do not have a Hugo_Symbol, so for these the Entrez_Gene_Id is used as the label.
missing_symbols = mrna_df['Hugo_Symbol'].isnull()
mrna_df.loc[missing_symbols, 'Hugo_Symbol'] = mrna_df.loc[missing_symbols, 'Entrez_Gene_Id'].astype(str)

# There are 7 rows that have both the same Hugo_Symbol and Entrez_Gene_Id but different values for the patients.
# These rows are renamed to have unique labels by appending -1-of-2 and -2-of-2 to the Hugo_Symbol.

# Number of rows carrying each Hugo_Symbol.
counts = mrna_df['Hugo_Symbol'].value_counts()

# 1-based position of every row within its Hugo_Symbol group.
# Computed once here instead of once per duplicated row inside the helper.
occurrence_in_group = mrna_df.groupby('Hugo_Symbol').cumcount() + 1


def label_duplicates(value, index):
    """Return a unique label for the gene symbol at row `index`.

    Parameters:
    - value: the Hugo_Symbol found at that row
    - index: the row label of that row in mrna_df

    Returns:
    - `value` unchanged when the symbol occurs once, otherwise
      "<value>-<k>-of-<n>" where k is the row's 1-based position among the
      rows sharing that symbol and n is how many rows share it.
    """
    if counts[value] == 1:
        return value  # Keep unique values unchanged
    return f"{value}-{occurrence_in_group[index]}-of-{counts[value]}"


# Apply the labeling function
mrna_df['Hugo_Symbol'] = [label_duplicates(value, idx) for idx, value in mrna_df['Hugo_Symbol'].items()]

# Re-orient the expression table: gene labels become the columns and samples become the rows.
mrna_df = (
    mrna_df
    .set_index('Hugo_Symbol')
    .drop(columns="Entrez_Gene_Id")  # second label column is not needed once genes are the index
    .transpose()
)
# Cut the last three characters (the extraneous "-01") off every row label so the IDs match the clinical data.
mrna_df.index = [sample_id[:-3] for sample_id in mrna_df.index]

def drop_patients_missing_data(clinical_df, mrna_df):
    """Remove from each dataframe the patients that are absent from the other.

    Patients are matched on the dataframe index. Each returned dataframe keeps
    its own row order.

    Returns:
        (clinical_df, mrna_df) restricted to patient IDs present in both.
    """
    clinical_ids = set(clinical_df.index)
    mrna_ids = set(mrna_df.index)
    # There are 2 patients ('TCGA-EY-A1GJ', 'TCGA-AP-A0LQ') in the clinical data that are not in the mRNA data.
    return (
        clinical_df.drop(index=clinical_ids - mrna_ids),
        mrna_df.drop(index=mrna_ids - clinical_ids),
    )

# Restrict both tables to the patients they have in common (matched on the index).
clinical_df, mrna_df = drop_patients_missing_data(clinical_df, mrna_df)
# Now both dataframes have 527 patients

In [19]:
# Guard: every gene label (column of mrna_df) must be a str; fail loudly otherwise.
non_str_cols = list(filter(lambda label: not isinstance(label, str), mrna_df.columns))
if len(non_str_cols) > 0:
    raise ValueError(f"Non-string column labels found: {non_str_cols}")

In [20]:
# # Genes from https://pmc.ncbi.nlm.nih.gov/articles/PMC7565375/ 
# # and https://pmc.ncbi.nlm.nih.gov/articles/PMC9929804/ FIXME: look more into this later
# literature_genes = set([
#     "MLH1", "MSH2", "MSH6", "PMS2", "PTEN", "POLD1", "POLE", "NTHL1", "MUTYH", "BRCA1", "GINS4", "ESR1"
# ])


# def prune_correlated_features(df, threshold=0.90, literature_genes=set()):
#     corr_matrix = df.corr().abs() # this line takes a while to run
#     np.fill_diagonal(corr_matrix.values, 0) # ignore self-correlation

#     # Build adjacency map of correlations above threshold
#     high_corr_map = {
#         gene: set(corr_matrix.index[corr_matrix.loc[gene] >= threshold])
#         for gene in corr_matrix.columns
#     }

#     genes_to_keep = set(corr_matrix.columns) # start with all genes

#     while True:
#         # Find genes that are still correlated
#         correlated_genes = {g: nbrs for g, nbrs in high_corr_map.items() if nbrs & genes_to_keep}
#         if not correlated_genes:
#             break

#         # Count connections for each gene
#         degrees = {g: len(nbrs & genes_to_keep) for g, nbrs in correlated_genes.items() if g in genes_to_keep}
#         if not degrees:
#             break

#         # Choose candidate for removal
#         worst_gene = max(degrees, key=lambda g: degrees[g])

#         # If literature gene vs. non-literature → skip removal of literature
#         if worst_gene in literature_genes:
#             # Try removing one of its correlated non-literature neighbors instead
#             neighbors = correlated_genes[worst_gene] & genes_to_keep
#             non_lit_neighbors = [n for n in neighbors if n not in literature_genes]
#             if non_lit_neighbors:
#                 worst_gene = min(non_lit_neighbors, key=lambda n: df[n].var())
#             else:
#                 # Can't drop a lit gene or its only neighbors → break
#                 break
#         else:
#             # break ties by variance (drop lower variance gene)
#             ties = [g for g, d in degrees.items() if d == degrees[worst_gene]]
#             if len(ties) > 1:
#                 worst_gene = min(ties, key=lambda g: df[g].var())

#         genes_to_keep.remove(worst_gene)

#     return df[list(genes_to_keep)]

# og_mrna_shape = mrna_df.shape
# mrna_df = prune_correlated_features(
#     mrna_df, 
#     threshold=config.CORRELATION_THRESHOLD, 
#     literature_genes=literature_genes)
# print("removed ", og_mrna_shape[1] - mrna_df.shape[1], " features")  # .shape is a tuple: compare column counts
# joblib.dump(mrna_df, "../data/pruned_mrna_df.pkl")  # same location the following cell loads from


In [21]:
# Replace the in-memory mrna_df with a previously saved copy read from disk.
# NOTE(review): this discards the mrna_df built in the cells above and requires the pickle to
# already exist — presumably it was written by the commented-out pruning cell; confirm it is
# regenerated whenever the upstream loading/cleaning steps change.
# joblib.load unpickles arbitrary Python objects, so only load files produced by this project.
mrna_df = joblib.load("../data/pruned_mrna_df.pkl")

In [22]:

def generate_recurrence_labels(treatment_file, status_file, clinical_file):
    """
    Generates a pd.Series of recurrence labels for all patients.

    Label rules:
     1 (recurred), if any of:
        * ANATOMIC_TREATMENT_SITE = "Local Recurrence" or "Distant Recurrence"
        * REGIMEN_INDICATION = "Recurrence"
        * STATUS = "Locoregional Recurrence" (surrounding whitespace ignored)
        * NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT = "Yes" (case/whitespace ignored)
     0 (no recurrence): NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT = "No" (and no other columns show recurrence)
     None (unknown): all other patients

    Parameters:
    - treatment_file: path or file-like object for the tab-separated treatment timeline
      (columns PATIENT_ID, ANATOMIC_TREATMENT_SITE, REGIMEN_INDICATION)
    - status_file: path or file-like object for the tab-separated status timeline
      (columns PATIENT_ID, STATUS)
    - clinical_file: path or file-like object for the tab-separated clinical table
      (columns PATIENT_ID, NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT)

    Returns:
    - pd.Series named "Recurrence_Label", indexed by PATIENT_ID, with one entry per
      patient seen in any of the three files. Patients are ordered by first appearance
      (clinical file, then treatment file, then status file) so the result is
      reproducible from run to run.

    Raises:
    - KeyError if one of the required columns is missing from its file.
    """
    # --- Load data (lines starting with "#" are metadata and skipped) ---
    read_options = dict(sep="\t", comment="#", low_memory=False)
    df_treatment = pd.read_csv(treatment_file, **read_options)
    df_status = pd.read_csv(status_file, **read_options)
    df_clinical = pd.read_csv(clinical_file, **read_options)

    # --- Set of patient IDs labeled as recurrence ---
    recur_patients = set()

    # From treatment file
    treatment_mask = df_treatment["ANATOMIC_TREATMENT_SITE"].isin(["Local Recurrence", "Distant Recurrence"])
    regimen_mask = df_treatment["REGIMEN_INDICATION"] == "Recurrence"
    recur_patients.update(df_treatment.loc[treatment_mask | regimen_mask, "PATIENT_ID"])

    # From status file
    status_mask = df_status["STATUS"].astype(str).str.strip() == "Locoregional Recurrence"
    recur_patients.update(df_status.loc[status_mask, "PATIENT_ID"])

    # From clinical file: normalise the answer once, then reuse it for both "yes" and "no"
    new_tumor_event = (
        df_clinical["NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT"].astype(str).str.strip().str.lower()
    )
    recur_patients.update(df_clinical.loc[new_tumor_event == "yes", "PATIENT_ID"])

    # --- Set of patients labeled as no recurrence ---
    no_recur_patients = set(df_clinical.loc[new_tumor_event == "no", "PATIENT_ID"])

    # --- Combine all patient IDs in a deterministic order (first appearance wins) ---
    all_patients = pd.concat(
        [df_clinical["PATIENT_ID"], df_treatment["PATIENT_ID"], df_status["PATIENT_ID"]],
        ignore_index=True,
    ).drop_duplicates()

    # --- Assign labels: any evidence of recurrence takes precedence over a clinical "No" ---
    labels = {}
    for pid in all_patients:
        if pid in recur_patients:
            labels[pid] = 1
        elif pid in no_recur_patients:
            labels[pid] = 0
        else:
            labels[pid] = None

    # Return as pd.Series
    label_series = pd.Series(labels, name="Recurrence_Label")
    label_series.index.name = "PATIENT_ID"

    return label_series

def drop_highly_uniform_columns(df, threshold=0.99):
    """
    Remove near-constant columns.

    A column is removed when its single most frequent non-NaN value makes up
    strictly more than `threshold` of its non-NaN entries. Columns with no
    non-NaN entries at all are left in place.

    Parameters:
    - df: pandas DataFrame
    - threshold: float (default 0.99), share of identical values above which a column is dropped

    Returns:
    - pandas DataFrame without the near-constant columns
    """
    def _is_near_constant(series):
        observed = series.dropna()
        if observed.empty:
            return False
        # value_counts sorts by frequency, so the first entry is the dominant value's share
        dominant_share = observed.value_counts(normalize=True).iloc[0]
        return dominant_share > threshold

    near_constant = [name for name in df.columns if _is_near_constant(df[name])]
    return df.drop(columns=near_constant)


In [23]:
# Keep only patients with a value in at least one of the two outcome columns
# (rows where both are missing carry no recurrence information).
clinical_df = clinical_df.loc[
    clinical_df[["DFS_STATUS", "NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT"]].notna().any(axis=1)
]
# Select the matching expression rows, in the clinical table's order.
mrna_df = mrna_df.loc[clinical_df.index]

# Derive the recurrence label per patient from the treatment, status and clinical files.
labels = generate_recurrence_labels(
    "../ucec_tcga_pan_can_atlas_2018/data_timeline_treatment.txt",
    "../ucec_tcga_pan_can_atlas_2018/data_timeline_status.txt",
    "../ucec_tcga_pan_can_atlas_2018/data_clinical_patient.txt",
)

def drop_patients_missing_data(clinical_df, mrna_df, labels=None):
    """
    Drops patients not shared across clinical_df, mrna_df, and (when given) labels.
    Also drops patients missing labeling data (None or NaN).

    `labels` is optional so that this definition stays call-compatible with the
    two-argument helper of the same name defined earlier in the notebook; re-running
    that earlier cell after this one therefore still works.

    Parameters:
        clinical_df: DataFrame indexed by patient ID
        mrna_df: DataFrame indexed by patient ID
        labels: optional Series indexed by patient ID

    Returns:
        (clinical_df_clean, mrna_df_clean, labels_clean) when labels is given,
        (clinical_df_clean, mrna_df_clean) when labels is None.
        All returned objects share the same index, in clinical_df's order.

    Raises:
        AssertionError if the cleaned objects end up misaligned.
    """
    # Step 1: Find shared patient IDs (preserve clinical_df order)
    shared_patients = clinical_df.index.intersection(mrna_df.index)

    if labels is None:
        # Two-argument form: only align the two dataframes
        return clinical_df.loc[shared_patients].copy(), mrna_df.loc[shared_patients].copy()

    shared_patients = shared_patients.intersection(labels.index)

    # Step 2: Subset all three to shared patients, in the same order
    clinical_df_clean = clinical_df.loc[shared_patients].copy()
    mrna_df_clean = mrna_df.loc[shared_patients].copy()
    labels_clean = labels.loc[shared_patients].copy()

    # Step 3: Drop patients with missing labels (None/NaN)
    valid_patients = labels_clean[labels_clean.notna()].index
    clinical_df_clean = clinical_df_clean.loc[valid_patients]
    mrna_df_clean = mrna_df_clean.loc[valid_patients]
    labels_clean = labels_clean.loc[valid_patients]

    # Step 4: Sanity checks
    assert clinical_df_clean.shape[0] == mrna_df_clean.shape[0] == labels_clean.shape[0], \
        "Dataframes have different number of patients after cleaning"
    assert not labels_clean.isna().any(), "Found unlabeled patient after cleaning"
    assert clinical_df_clean.index.equals(mrna_df_clean.index) and clinical_df_clean.index.equals(labels_clean.index), \
        "Indexes are not aligned"

    return clinical_df_clean, mrna_df_clean, labels_clean

# Align the clinical table, expression table and labels on a common set of labeled patients.
clinical_df, mrna_df, labels = drop_patients_missing_data(clinical_df, mrna_df, labels)


In [24]:
from sklearn.model_selection import train_test_split

def split_train_test(clinical_df, mrna_df, labels, test_size=0.2, random_state=1):
    """
    Partition patients into a training and a held-out test group and slice
    every input accordingly.

    The split is made on the patient IDs in ``labels.index`` and is stratified
    by the label values.

    Parameters
    ----------
    clinical_df : pd.DataFrame
        Clinical features, indexed by patient ID.
    mrna_df : pd.DataFrame
        mRNA expression features, indexed by patient ID.
    labels : pd.Series
        Precomputed labels, indexed by patient ID.
    test_size : float
        Share of patients placed in the test group.
    random_state : int
        Seed passed to the splitter for reproducibility.

    Returns
    -------
    dict with the keys
        "X_clinical_train", "X_clinical_test",
        "X_mrna_train", "X_mrna_test",
        "y_train", "y_test"
    """
    # Decide group membership once, on the IDs only
    patient_ids = labels.index
    id_groups = train_test_split(
        patient_ids,
        test_size=test_size,
        stratify=labels,
        random_state=random_state,
    )
    ids_for_training, ids_for_testing = id_groups

    # Then pull the corresponding rows out of each source
    return {
        "X_clinical_train": clinical_df.loc[ids_for_training],
        "X_clinical_test":  clinical_df.loc[ids_for_testing],
        "X_mrna_train":     mrna_df.loc[ids_for_training],
        "X_mrna_test":      mrna_df.loc[ids_for_testing],
        "y_train":          labels.loc[ids_for_training],
        "y_test":           labels.loc[ids_for_testing],
    }

# 80/20 stratified split with a fixed seed, then unpack the pieces into named variables.
splits = split_train_test(clinical_df, mrna_df, labels, test_size=0.2, random_state=1)

clinical_train, clinical_test = splits["X_clinical_train"], splits["X_clinical_test"]
mrna_train, mrna_test = splits["X_mrna_train"], splits["X_mrna_test"]
y_train, y_test = splits["y_train"], splits["y_test"]



In [25]:
def drop_null_heavy_columns(df, max_null_frac):
    """Drop columns whose share of null values is over `max_null_frac`.

    Prints the column count before and after, and the set of removed column names.

    Parameters:
    - df: pandas DataFrame
    - max_null_frac: float, largest tolerated fraction of nulls per column

    Returns:
    - pandas DataFrame without the null-heavy columns
    """
    print("length before removing null-heavy columns:", len(df.columns))
    # dropna(axis=1, thresh=k) keeps a column only if it has at least k non-null values
    kept = df.dropna(axis=1, thresh=len(df) * (1 - max_null_frac))
    print("length after removing null-heavy columns:", len(kept.columns))
    print(set(df.columns) - set(kept.columns))  # checking which columns were removed
    return kept


# remove the column if over MAX_NULL_FRAC percent null values
# (decided on the training rows only, so the test set does not influence feature selection)
clinical_train = drop_null_heavy_columns(clinical_train, config.MAX_NULL_FRAC)
mrna_train = drop_null_heavy_columns(mrna_train, config.MAX_NULL_FRAC)

# remove columns where over 99% of the non-null values are the same
clinical_train = drop_highly_uniform_columns(clinical_train)
mrna_train = drop_highly_uniform_columns(mrna_train)


cols_to_drop = [
    "DAYS_LAST_FOLLOWUP",              # follow-up time after diagnosis (future info)
    "FORM_COMPLETION_DATE",            # administrative metadata, not predictive
    "INFORMED_CONSENT_VERIFIED",       # administrative, no biological meaning
    "NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT",  # recurrence event → direct leakage
    "PERSON_NEOPLASM_CANCER_STATUS",   # disease status at follow-up → leakage
    "IN_PANCANPATHWAYS_FREEZE",        # technical/analysis flag, not biological
    "OS_STATUS",                       # overall survival outcome → leakage
    "OS_MONTHS",                       # overall survival time → leakage
    "DSS_STATUS",                      # disease-specific survival outcome → leakage
    "DSS_MONTHS",                      # disease-specific survival time → leakage
    "DFS_STATUS",                      # disease-free survival outcome → leakage
    "DFS_MONTHS",                      # disease-free survival time → leakage
    "PFS_STATUS",                      # progression-free survival outcome → leakage
    "PFS_MONTHS",                       # progression-free survival time → leakage
    "CANCER_TYPE_ACRONYM",
    "OTHER_PATIENT_ID",
    "SEX",
    "AJCC_PATHOLOGIC_TUMOR_STAGE",
    "DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS",
    "HISTORY_NEOADJUVANT_TRTYN",
    "PATH_M_STAGE",
    "ICD_O_3_SITE", # removed because is the same as ICD_10
    "ICD_O_3_"      # NOTE(review): looks like a truncated column name — confirm the intended column
]

# remove non-informational columns
# errors='ignore' skips names that are not present (e.g. already removed above), which also
# means a misspelled name in cols_to_drop is silently left in the data.
clinical_train = clinical_train.drop(columns=cols_to_drop, errors='ignore')

length before removing null-heavy columns: 37
length after removing null-heavy columns: 31
{'PRIMARY_LYMPH_NODE_PRESENTATION_ASSESSMENT', 'PATH_T_STAGE', 'PATH_M_STAGE', 'AJCC_PATHOLOGIC_TUMOR_STAGE', 'PATH_N_STAGE', 'ETHNICITY'}
length before removing null-heavy columns: 20359
length after removing null-heavy columns: 17372
{'C4orf35', 'CARD17', 'LINC00298', 'C6orf10', 'OR2W5', 'IFNA21', 'RLBP1', 'DUX4L1', 'FSCB', 'HTR3D', 'IFNA13', 'SCGB1D2', 'TRPC7', 'OR51A7', 'SNORD115-41', 'BLID', 'VENTXP1', 'SNORA71B', 'KRT27', 'TBC1D28', 'PWAR1', 'OR52A4', 'GK2', 'OR8B2', 'ARR3', 'LINC01734', 'PRAMEF12', 'OPRM1', 'CACNG1', 'OR4A15', 'CYP2A13', 'DYDC1', 'SCN10A', 'C8orf71', 'SNORA80', 'NLRP14', 'FAM27B', 'AKR1C6P', 'C18orf62', 'MCCD1', 'LYZL6', 'PABPC1L2A', 'HRK', 'IL17A', 'SOX3', 'LOC100133893', 'GABRA1', 'IQCF2', 'SULT6B1', 'OR7E5P', 'AMAC1L3', 'FAM170A', 'NXF5', 'OR2T1', 'ZP4', 'DMBT1L1', 'C9orf106', 'DDX3Y', 'SNORD114-30', 'TMIGD1', 'AFM', 'KRT2', 'CNGB3', 'PADI6', 'SNORA66', 'LRRC30', 'PCDH8

In [26]:
# Clinical columns that are treated as categorical and one-hot encoded.
# FIXME: do further research on what ICD_10 and ICD_O_3_SITE are
categorical_columns = [
    "ICD_10",
    "PRIOR_DX",
    "RACE",
    "RADIATION_THERAPY",
    "GENETIC_ANCESTRY_LABEL",
]


In [27]:
# Put the clinical and expression features of the training patients side by side.
X_train = clinical_train.join(mrna_train, how='inner')

# --- Numerical features ---
numerical_df = X_train.select_dtypes(include=['number']).copy()
num_medians = numerical_df.median()  # kept so the test set can be imputed with the same values
numerical_df = numerical_df.fillna(num_medians)

# --- Categorical features ---
categorical_df = X_train[categorical_columns].copy()
cat_modes = categorical_df.mode().iloc[0]  # first row of mode() = most frequent value per column; reused at test time
categorical_df = pd.get_dummies(
    categorical_df.fillna(cat_modes),
    drop_first=True,  # one level per column is left out to avoid redundancy
    dtype=float,
)

# Final training matrix: imputed numeric columns followed by the dummy columns.
X_train = pd.concat([numerical_df, categorical_df], axis=1)


In [28]:
# Build the test matrix with exactly the transformations fitted on the training data.
X_test = clinical_test.join(mrna_test, how='inner')

# --- Numerical: impute with the TRAINING medians ---
numerical_test = X_test.select_dtypes(include=['number']).copy()
numerical_test = numerical_test.fillna(num_medians)  # <- use train medians

# --- Categorical: impute with the TRAINING modes ---
categorical_test = X_test[categorical_columns].copy()
categorical_test = categorical_test.fillna(cat_modes)  # <- use train modes

# Encode every level present in the test set (no drop_first here). With drop_first=True
# pandas would drop the first level that happens to occur in the TEST rows; if the training
# baseline level is absent from the test set, a real level would be dropped and then
# zero-filled by the reindex below, i.e. silently encoded as the training baseline.
categorical_test = pd.get_dummies(categorical_test, dtype=float)

# Align test with training columns: the training baseline column and levels unseen in
# training are discarded, training levels missing from the test set become all-zero columns.
categorical_test = categorical_test.reindex(columns=categorical_df.columns, fill_value=0.0)

# Combine back
X_test = pd.concat([numerical_test, categorical_test], axis=1)

# Keep exactly the training features, in the training column order
X_test = X_test[X_train.columns]

Saving my split of training and testing data

In [29]:
# Sanity check: train/test feature matrices must have the same number of columns,
# and each label vector must have as many entries as its matrix has rows.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(375, 17392) (94, 17392) (375,) (94,)


In [33]:
# Show the final shapes once more, then persist the four pieces of the split with joblib.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# NOTE(review): the target directory is assumed to exist already — confirm before a fresh run.
joblib.dump(X_train, "../data/new_labels_old_preproc/X_train.pkl")
joblib.dump(X_test, "../data/new_labels_old_preproc/X_test.pkl")
joblib.dump(y_train, "../data/new_labels_old_preproc/y_train.pkl")
joblib.dump(y_test, "../data/new_labels_old_preproc/y_test.pkl")
# joblib.dump(X.columns, "../data/new_labels_old_preproc/columns.pkl")
# # # joblib.dump(X.columns, config.FEATURE_NAMES)

(375, 17392) (94, 17392) (375,) (94,)


['../data/new_labels_old_preproc/y_test.pkl']