In [110]:
import config

import pandas as pd
import numpy as np
import joblib
from collections import Counter

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [111]:
# Load the clinical and mRNA tables.
# Both paths use forward slashes so they resolve on every OS; a backslash before
# "data_clinical_patient.txt" would also form the invalid escape sequence "\d".
clinical_df = pd.read_csv("ucec_tcga_pan_can_atlas_2018/data_clinical_patient.txt", sep="\t", comment="#", low_memory=False)
clinical_df = clinical_df.set_index('PATIENT_ID')

mrna_df = pd.read_csv("ucec_tcga_pan_can_atlas_2018/data_mrna_seq_v2_rsem_zscores_ref_all_samples.txt", sep="\t", comment="#")

# There are 527 patients in the mRNA data and 529 patients in the clinical data.

# The first 2 columns of the mRNA data are labels (Hugo_Symbol then Entrez_Gene_Id).
# 13 of the genes do not have a Hugo_Symbol, so for these the Entrez_Gene_Id is used as the label.
missing_symbols = mrna_df['Hugo_Symbol'].isnull()
mrna_df.loc[missing_symbols, 'Hugo_Symbol'] = mrna_df.loc[missing_symbols, 'Entrez_Gene_Id'].astype(str)

# There are 7 rows that have both the same Hugo_Symbol and Entrez_Gene_Id but different values for the patients.
# These rows are renamed to unique labels by appending -1-of-2 and -2-of-2 to the Hugo_Symbol.

# Number of rows carrying each Hugo_Symbol.
counts = mrna_df['Hugo_Symbol'].value_counts()

# 1-based position of every row within its Hugo_Symbol group. This is computed
# once, before any renaming, rather than once per duplicated row inside
# label_duplicates (the grouping does not change while the labels are built).
occurrence = mrna_df.groupby('Hugo_Symbol').cumcount() + 1


def label_duplicates(value, index):
    """Return a unique label for the row `index` whose Hugo_Symbol is `value`.

    Symbols that occur exactly once are returned unchanged. Repeated symbols
    become "<symbol>-<k>-of-<n>", where k is the row's 1-based position among
    the rows sharing that symbol and n is how many rows share it.

    Reads the module-level `counts` and `occurrence` series.
    """
    total = counts[value]
    if total == 1:
        return value  # Keep unique values unchanged
    return f"{value}-{occurrence[index]}-of-{total}"


# Apply the labeling function to every row (row label -> current symbol).
mrna_df['Hugo_Symbol'] = [label_duplicates(value, idx) for idx, value in mrna_df['Hugo_Symbol'].items()]

# Index the table by gene label, discard the remaining label column, then flip it
# so that the patients become the index and the genes become the columns.
mrna_df = (
    mrna_df
    .set_index('Hugo_Symbol')
    .drop(columns="Entrez_Gene_Id")
    .transpose()
)
# Strip the last three characters (the extraneous "-01") from each sample ID so
# that the patient IDs match the clinical data.
mrna_df.index = [sample_id[:-3] for sample_id in mrna_df.index]



In [112]:
def assign_labels(clinical_df):
    '''Given the clinical dataframe, returns the corresponding recurrence labels.

    Each patient gets 1 for recurrence, 0 for no recurrence, and None if the
    patient has no usable recurrence information.

    NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT is used first ('Yes' -> 1, 'No' -> 0).
    When it holds anything else (NaN or an unrecognised value), DFS_STATUS is
    used instead ('1:Recurred/Progressed' -> 1, '0:DiseaseFree' -> 0). If neither
    column yields a label, the patient is labelled None.

    Exactly one label is produced per row, so the returned Series always has the
    same length and index as clinical_df.
    '''
    labels = []
    new_tumor_events = clinical_df['NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT']
    dfs_statuses = clinical_df['DFS_STATUS']
    for new_tumor_event, dfs_status in zip(new_tumor_events, dfs_statuses):
        if new_tumor_event == 'Yes':
            labels.append(1)
        elif new_tumor_event == 'No':
            labels.append(0)
        # From here on the primary column is missing or unrecognised: fall back
        # to the disease-free-survival status.
        elif dfs_status == '1:Recurred/Progressed':
            labels.append(1)
        elif dfs_status == '0:DiseaseFree':
            labels.append(0)
        else:
            labels.append(None)
    return pd.Series(labels, index=clinical_df.index)

    


def drop_patients_missing_data(clinical_df, mrna_df, labels):
    '''Drops patients from both dataframes that are not present in the other dataframe.
    Drops patients who are missing labeling data used to define recurrence.
    Returns the cleaned dataframes and labels.

    Rows are filtered with boolean masks rather than `drop(index=...)`, so an ID
    that is absent from one of the three objects (for example an mRNA-only
    patient, which can never appear in `labels`) is ignored instead of raising
    KeyError. Each object keeps its own row order; the inputs are not modified.

    NOTE(review): `labels` is assumed to be indexed by the same patient IDs as
    `clinical_df` (as produced by assign_labels) - confirm against callers.
    '''
    # Keep only the patient IDs shared between the two dataframes.
    # There are 2 patients ('TCGA-EY-A1GJ', 'TCGA-AP-A0LQ') in the clinical data that are not in the mRNA data.
    shared_ids = set(clinical_df.index) & set(mrna_df.index)
    clinical_df = clinical_df.loc[clinical_df.index.isin(shared_ids)]
    mrna_df = mrna_df.loc[mrna_df.index.isin(shared_ids)]
    labels = labels.loc[labels.index.isin(shared_ids)]
    assert clinical_df.shape[0] == mrna_df.shape[0] == labels.shape[0], "Dataframes have different number of patients after cleaning"

    # Now drop patients missing labeling data used to define recurrence:
    patients_no_label = labels.index[labels.isna()]
    clinical_df = clinical_df.loc[~clinical_df.index.isin(patients_no_label)]
    mrna_df = mrna_df.loc[~mrna_df.index.isin(patients_no_label)]
    labels = labels.loc[~labels.index.isin(patients_no_label)]
    assert not labels.isna().any(), "Found unlabeled patient after cleaning"

    return clinical_df, mrna_df, labels

In [113]:
def drop_post_diagnosis_clinical_columns(clinical_df):
    '''Removes the clinical columns that are recurrence indicators or are not available at diagnosis.

    Returns a new clinical dataframe without those columns; the input dataframe
    is not modified. Every listed column must be present, otherwise
    `DataFrame.drop` raises KeyError.
    '''
    # Column name -> why it must not be used as a feature.
    reasons_by_column = {
        "DAYS_LAST_FOLLOWUP": "follow-up time after diagnosis (future info)",
        "FORM_COMPLETION_DATE": "administrative metadata, not predictive",
        "INFORMED_CONSENT_VERIFIED": "administrative, no biological meaning",
        "NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT": "recurrence event -> direct leakage",
        "PERSON_NEOPLASM_CANCER_STATUS": "disease status at follow-up -> leakage",
        "IN_PANCANPATHWAYS_FREEZE": "technical/analysis flag, not biological",
        "OS_STATUS": "overall survival outcome -> leakage",
        "OS_MONTHS": "overall survival time -> leakage",
        "DSS_STATUS": "disease-specific survival outcome -> leakage",
        "DSS_MONTHS": "disease-specific survival time -> leakage",
        "DFS_STATUS": "disease-free survival outcome -> leakage",
        "DFS_MONTHS": "disease-free survival time -> leakage",
        "PFS_STATUS": "progression-free survival outcome -> leakage",
        "PFS_MONTHS": "progression-free survival time -> leakage",
    }
    return clinical_df.drop(columns=list(reasons_by_column))

In [114]:
from sklearn.model_selection import train_test_split

def split_train_test(clinical_df, mrna_df, labels, test_size=0.2, random_state=42):
    """
    Partition the patients into a train and a test group and slice every table accordingly.

    The split is computed once on the patient IDs in ``labels.index`` (stratified
    on the label values), then the same IDs are used to select rows from the
    clinical table, the mRNA table and the labels.

    Parameters
    ----------
    clinical_df : pd.DataFrame
        Clinical features (indexed by patient ID).
    mrna_df : pd.DataFrame
        mRNA expression features (indexed by patient ID).
    labels : pd.Series
        Precomputed labels indexed by patient ID.
    test_size : float
        Fraction of patients to hold out for testing.
    random_state : int
        Random seed for reproducibility.

    Returns
    -------
    dict
        Maps "X_clinical_train", "X_clinical_test", "X_mrna_train",
        "X_mrna_test", "y_train" and "y_test" to the corresponding slice.
    """
    # Split the patient IDs, not the rows, so all three objects share one partition.
    train_ids, test_ids = train_test_split(
        labels.index,
        test_size=test_size,
        stratify=labels,
        random_state=random_state,
    )

    # (result key, source object, patient IDs to select)
    selections = [
        ("X_clinical_train", clinical_df, train_ids),
        ("X_clinical_test", clinical_df, test_ids),
        ("X_mrna_train", mrna_df, train_ids),
        ("X_mrna_test", mrna_df, test_ids),
        ("y_train", labels, train_ids),
        ("y_test", labels, test_ids),
    ]
    return {key: source.loc[ids] for key, source, ids in selections}

In [115]:
def encode_clinical_features(X_train, X_test, categorical_cols=None, ordinal_cols=None, ordinal_mappings=None):
    """
    Encode clinical features: one-hot for categorical, ordinal for ordinal columns.

    Encoders are fitted on the training data only and then applied to the test
    data. The input dataframes are copied, not modified.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training clinical features.
    X_test : pd.DataFrame
        Test clinical features.
    categorical_cols : list of str
        Columns to one-hot encode. The original columns are replaced by the
        one-hot columns, which are appended after the remaining columns.
        Categories seen only in the test data are encoded as all zeros
        (handle_unknown='ignore').
    ordinal_cols : list of str
        Columns to ordinally encode (in place, keeping the column name).
    ordinal_mappings : dict
        Mapping of column name -> list of categories in order for ordinal encoding.
        Example: {'TUMOR_GRADE': ['G1', 'G2', 'G3']}

    Returns
    -------
    X_train_encoded, X_test_encoded : pd.DataFrame, pd.DataFrame
        Encoded training and test clinical dataframes.

    Raises
    ------
    ValueError
        If ordinal_cols is given without ordinal_mappings.
    KeyError
        If a column in ordinal_cols has no entry in ordinal_mappings.
    """

    X_train_encoded = X_train.copy()
    X_test_encoded = X_test.copy()

    # --- One-hot encode categorical columns ---
    if categorical_cols:
        # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed
        # `sparse`; try the current name first and fall back for older releases.
        try:
            ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
        ohe_train = ohe.fit_transform(X_train_encoded[categorical_cols])
        ohe_test  = ohe.transform(X_test_encoded[categorical_cols])

        ohe_columns = ohe.get_feature_names_out(categorical_cols)
        ohe_train_df = pd.DataFrame(ohe_train, columns=ohe_columns, index=X_train_encoded.index)
        ohe_test_df  = pd.DataFrame(ohe_test, columns=ohe_columns, index=X_test_encoded.index)

        X_train_encoded = pd.concat([X_train_encoded.drop(columns=categorical_cols), ohe_train_df], axis=1)
        X_test_encoded  = pd.concat([X_test_encoded.drop(columns=categorical_cols), ohe_test_df], axis=1)

    # --- Ordinal encode ordinal columns ---
    if ordinal_cols:
        if ordinal_mappings is None:
            raise ValueError("You must provide ordinal_mappings when encoding ordinal columns.")

        for col in ordinal_cols:
            # One encoder per column so each column gets its own category order.
            encoder = OrdinalEncoder(categories=[ordinal_mappings[col]])
            X_train_encoded[[col]] = encoder.fit_transform(X_train_encoded[[col]])
            X_test_encoded[[col]]  = encoder.transform(X_test_encoded[[col]])

    return X_train_encoded, X_test_encoded


In [None]:
# --- Label, clean and split the data ---
# NOTE(review): this cell rebinds clinical_df, mrna_df and labels, so it is not
# idempotent. After one run clinical_df no longer contains
# NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT / DFS_STATUS (they are removed by
# drop_post_diagnosis_clinical_columns), so a second run presumably fails inside
# assign_labels - re-run the loading cell first.
labels = assign_labels(clinical_df)
clinical_df, mrna_df, labels = drop_patients_missing_data(clinical_df, mrna_df, labels)
clinical_df = drop_post_diagnosis_clinical_columns(clinical_df)
splits = split_train_test(clinical_df, mrna_df, labels)

# Unpack the split dictionary into individually named train/test objects.
X_clinical_train = splits["X_clinical_train"]
X_clinical_test  = splits["X_clinical_test"]
X_mrna_train     = splits["X_mrna_train"]
X_mrna_test      = splits["X_mrna_test"]
y_train          = splits["y_train"]
y_test           = splits["y_test"]

# NOTE(review): proccess_clincal_df is not defined in the visible part of this
# notebook - confirm it is defined before this cell runs. Its result overwrites
# clinical_df with a value derived from the training split only.
clinical_df, cols_removed = proccess_clincal_df(X_clinical_train)

# Columns that should be removed: CANCER_TYPE_ACRONYM, OTHER_PATIENT_ID, SEX, AJCC_PATHOLOGIC_TUMOR_STAGE,
# DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS, HISTORY_NEOADJUVANT_TRTYN, PATH_M_STAGE,
# PATH_N_STAGE, PATH_T_STAGE, PRIMARY_LYMPH_NODE_PRESENTATION_ASSESSMENT

# AJCC_STAGING_EDITION looks odd and needs a closer look.
# ICD_10 and ICD_O_3_SITE are identical, so one of them can be removed. Checked with:
#   are_equal = X_clinical_train["ICD_10"].equals(X_clinical_train["ICD_O_3_SITE"])

# Columns intended for one-hot encoding via encode_clinical_features.
categorical_cols = ['SUBTYPE', 'ETHNICITY', "ICD_10", "ICD_O_3_HISTOLOGY", "PRIOR_DX", "RACE", "RADIATION_THERAPY", "GENETIC_ANCESTRY_LABEL"]

# Encoding step, currently disabled.
# NOTE(review): ordinal_cols and ordinal_mappings are not defined in the visible
# part of this notebook; define them before enabling this call.
# X_clinical_train_encoded, X_clinical_test_encoded = encode_clinical_features(
#     X_clinical_train,
#     X_clinical_test,
#     categorical_cols=categorical_cols,
#     ordinal_cols=ordinal_cols,
#     ordinal_mappings=ordinal_mappings
# )



Index(['SUBTYPE', 'CANCER_TYPE_ACRONYM', 'OTHER_PATIENT_ID', 'AGE', 'SEX',
       'AJCC_PATHOLOGIC_TUMOR_STAGE', 'AJCC_STAGING_EDITION', 'DAYS_TO_BIRTH',
       'DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS', 'ETHNICITY',
       'HISTORY_NEOADJUVANT_TRTYN', 'ICD_10', 'ICD_O_3_HISTOLOGY',
       'ICD_O_3_SITE', 'PATH_M_STAGE', 'PATH_N_STAGE', 'PATH_T_STAGE',
       'PRIMARY_LYMPH_NODE_PRESENTATION_ASSESSMENT', 'PRIOR_DX', 'RACE',
       'RADIATION_THERAPY', 'WEIGHT', 'GENETIC_ANCESTRY_LABEL'],
      dtype='object')
0


In [142]:
# Exploratory look at the training split of the clinical table.
print(X_clinical_train.columns)

# Values and distinct categories of the genetic ancestry column.
ancestry_labels = X_clinical_train["GENETIC_ANCESTRY_LABEL"]
print(ancestry_labels)
print(ancestry_labels.unique())

# Check whether ICD_10 and ICD_O_3_SITE carry exactly the same values.
icd_10 = X_clinical_train["ICD_10"]
are_equal = icd_10.equals(X_clinical_train["ICD_O_3_SITE"])
print("Columns are exactly the same:", are_equal)


Index(['SUBTYPE', 'CANCER_TYPE_ACRONYM', 'OTHER_PATIENT_ID', 'AGE', 'SEX',
       'AJCC_PATHOLOGIC_TUMOR_STAGE', 'AJCC_STAGING_EDITION', 'DAYS_TO_BIRTH',
       'DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS', 'ETHNICITY',
       'HISTORY_NEOADJUVANT_TRTYN', 'ICD_10', 'ICD_O_3_HISTOLOGY',
       'ICD_O_3_SITE', 'PATH_M_STAGE', 'PATH_N_STAGE', 'PATH_T_STAGE',
       'PRIMARY_LYMPH_NODE_PRESENTATION_ASSESSMENT', 'PRIOR_DX', 'RACE',
       'RADIATION_THERAPY', 'WEIGHT', 'GENETIC_ANCESTRY_LABEL'],
      dtype='object')
PATIENT_ID
TCGA-EY-A1GC    EUR
TCGA-D1-A0ZV    EUR
TCGA-B5-A3F9    AFR
TCGA-BG-A0M9    EUR
TCGA-AX-A0IZ    EUR
               ... 
TCGA-AP-A05J    EUR
TCGA-EY-A547    AFR
TCGA-B5-A0K9    EUR
TCGA-D1-A17N    EUR
TCGA-AX-A3GB    EUR
Name: GENETIC_ANCESTRY_LABEL, Length: 396, dtype: object
['EUR' 'AFR' 'AFR_ADMIX' 'EAS' ' ' 'SAS' 'AMR' 'ADMIX']
Columns are exactly the same: True
