## UCEC Recurrence Notebook - Pre-processing
This program processes data from ucec_tcga_pan_can_atlas_2018 to be ready for training a machine learning algorithm to predict recurrence. 

In [25]:
import config

import pandas as pd
import numpy as np
import joblib
from collections import Counter

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [None]:
# Loading in the mRNA and clinical data.
# Both paths use forward slashes: a backslash before "data_clinical_patient.txt" forms the
# invalid escape sequence "\d" and only resolves as a path separator on Windows.
mrna_df = pd.read_csv("ucec_tcga_pan_can_atlas_2018/data_mrna_seq_v2_rsem_zscores_ref_all_samples.txt", sep="\t", comment="#")

clinical_df = pd.read_csv("ucec_tcga_pan_can_atlas_2018/data_clinical_patient.txt", sep="\t", comment="#", low_memory=False)
clinical_df = clinical_df.set_index('PATIENT_ID')
# There are 527 patients in the mRNA and 529 patients in the clinical data

# The first 2 columns of the mRNA data are labels (Hugo_Symbol then Entrez_Gene_Id).
# 13 of the genes do not have Hugo_Symbols, so for these I will use the Entrez_Gene_Id as the label.
missing_symbols = mrna_df['Hugo_Symbol'].isnull()
mrna_df.loc[missing_symbols, 'Hugo_Symbol'] = mrna_df.loc[missing_symbols, 'Entrez_Gene_Id'].astype(str)

# There are 7 rows that have both the same Hugo_Symbol and Entrez_Gene_Id but different values for the patients.
# I will rename these rows to have unique labels by appending -1-of-2 and -2-of-2 to the Hugo_Symbol.
# Number of rows carrying each Hugo_Symbol (read later by label_duplicates)
counts = mrna_df['Hugo_Symbol'].value_counts()

# Build a unique label for every duplicated gene symbol
def label_duplicates(value, index):
    """Return `value` unchanged if it occurs once, else a numbered label.

    Reads the notebook-level `counts` (occurrences per Hugo_Symbol) and
    `mrna_df`. For a repeated symbol the result is
    "<value>-<k>-of-<total>", where k is the 1-based position of row
    `index` among the rows sharing that symbol.
    """
    total = counts[value]
    if total != 1:
        # 1-based running position of each row inside its Hugo_Symbol group
        position_in_group = mrna_df.groupby('Hugo_Symbol').cumcount() + 1
        return f"{value}-{position_in_group[index]}-of-{total}"
    return value  # unique symbols pass through untouched

# Relabel every row so that duplicated symbols become unique
mrna_df['Hugo_Symbol'] = [
    label_duplicates(symbol, row_idx)
    for row_idx, symbol in mrna_df['Hugo_Symbol'].items()
]

# Use the symbols as the index, discard the remaining label column, then transpose
# so that patients are the index and genes are the columns
mrna_df = (
    mrna_df
    .set_index('Hugo_Symbol')
    .drop(columns="Entrez_Gene_Id")
    .transpose()
)
# Strip the trailing 3 characters (the extraneous "-01") so the patient ids match the clinical data
mrna_df.index = [sample_id[:-3] for sample_id in mrna_df.index]

# Find patient IDs that appear in the clinical data but not in the mRNA data:
clinical_not_in_mrna = set(clinical_df.index).difference(mrna_df.index)
print("Patient IDs in clinical data but not in mRNA data:", clinical_not_in_mrna)
# There are 2 patients ('TCGA-EY-A1GJ', 'TCGA-AP-A0LQ') in the clinical data that are not in the mRNA data.
# They will be dropped when I join the dataframes.


Patient IDs in clinical data but not in mRNA data: {'TCGA-AP-A0LQ', 'TCGA-EY-A1GJ'}


In [None]:
# ========= STEP 1: Keep literature genes =========
# Genes from https://pmc.ncbi.nlm.nih.gov/articles/PMC7565375/, FIXME: look more into this later
literature_genes = set([
    "MLH1", "MSH2", "MSH6", "PMS2", "PTEN"
])

# Absolute-correlation cutoff above which a gene counts as redundant with a literature gene
threshold = 0.90

# Only pairs that contain a literature gene can cause a drop in this step, so each
# literature gene is correlated against all columns directly. This avoids building the
# full gene-by-gene matrix and scanning every pair in a Python-level double loop.
genes_to_drop = set()
for lit_gene in sorted(literature_genes.intersection(mrna_df.columns)):
    abs_corr = mrna_df.corrwith(mrna_df[lit_gene]).abs()
    # NaN correlations compare False against the threshold, so they are never selected
    partners = abs_corr.index[abs_corr >= threshold]
    # Literature genes (including lit_gene itself, which correlates 1.0) are always kept
    genes_to_drop.update(gene for gene in partners if gene not in literature_genes)

# Drop non-literature genes from correlated pairs
mrna_step1 = mrna_df.drop(columns=list(genes_to_drop))
print(f"Step 1 removed {len(genes_to_drop)} genes")

# ========= STEP 2: Iteratively prune correlation hubs =========
def prune_correlated_features(df, threshold=0.90):
    """Drop columns until no pair has absolute correlation >= `threshold`.

    On each round the column with the most above-threshold partners is
    removed; ties are broken by dropping the candidate with the lowest
    variance (first such column in column order).

    Parameters:
    - df: pandas DataFrame whose columns are the features to prune
    - threshold: float (default 0.90), absolute-correlation cutoff

    Returns:
    - pandas DataFrame without the pruned columns (`df` itself when
      nothing has to be removed); the input is never modified
    """
    corr_df = df.corr().abs()
    names = corr_df.columns
    if len(names) == 0:
        return df

    # Work on an explicit writable copy: writing through `DataFrame.values` fails
    # when pandas hands back a read-only array (Copy-on-Write).
    corr = corr_df.to_numpy(dtype=float, copy=True)
    np.fill_diagonal(corr, 0)  # a column is not its own partner
    linked = corr >= threshold  # NaN correlations compare False

    variances = df[names].var().to_numpy(dtype=float)

    # Pairwise correlations do not change when another column is dropped, so the
    # matrix is computed once and only the per-column partner counts are updated.
    link_counts = linked.sum(axis=0)
    removed = []
    while link_counts.max() > 0:
        # Columns tied for the most high-correlation partners
        tied = np.flatnonzero(link_counts == link_counts.max())
        # Break ties by variance (drop the lower-variance column)
        victim = tied[np.nanargmin(variances[tied])]
        removed.append(names[victim])

        # Remove the victim's contribution from every remaining column
        link_counts -= linked[victim].astype(link_counts.dtype)
        link_counts[victim] = 0
        linked[victim, :] = False
        linked[:, victim] = False

    if not removed:
        return df
    return df.drop(columns=removed)

# Run the Step 2 pruning on the Step 1 output and report the resulting size
mrna_final = prune_correlated_features(mrna_step1, threshold=0.90)
pruned_shape = mrna_final.shape
print(f"Final shape after pruning: {pruned_shape}")


In [None]:
# Combined clinical + genetic data frame. The inner join keeps only patients present in both
# tables; per the original note it has 527 patients (rows) and 20568 features (columns).
# NOTE(review): this joins the unpruned `mrna_df`, so `mrna_final` from the pruning cell is
# not used here - confirm whether that is intended.
df = clinical_df.join(mrna_df, how='inner')

# Every column label of the joined frame must be a string
non_str_cols = list(filter(lambda label: not isinstance(label, str), df.columns))
if len(non_str_cols) > 0:
    raise ValueError(f"Non-string column labels found: {non_str_cols}")

In [27]:
# Example patient data for a patient with recurrence (first 55 fields).
# This is the only patient that I am positive experienced recurrence.
df.loc["TCGA-BK-A139"].iloc[:55]

SUBTYPE                                                               UCEC_CN_HIGH
CANCER_TYPE_ACRONYM                                                           UCEC
OTHER_PATIENT_ID                              8d9e4917-334b-4c76-aee1-1e22be772db0
AGE                                                                           74.0
SEX                                                                         Female
AJCC_PATHOLOGIC_TUMOR_STAGE                                                    NaN
AJCC_STAGING_EDITION                                                           NaN
DAYS_LAST_FOLLOWUP                                                             309
DAYS_TO_BIRTH                                                             -27077.0
DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS                                           0.0
ETHNICITY                                                   Not Hispanic Or Latino
FORM_COMPLETION_DATE                                                       2/23/11
HIST

The following genes appear in the data more than once but have different data: 
['PALM2AKAP2', 'ELMOD1', 'FGF13', 'QSOX1', 'SNAP47', 'NKAIN3', 'TMEM8B']
Right now, I'm leaving every version of the gene in, but giving placeholder unique names like: PALM2AKAP2-1-of-2 and PALM2AKAP2-2-of-2

Removing unnecessary columns from clinical data.

In [28]:
# Sanity check: list the patients whose DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS is not 0, to see
# whether this informationless column gets removed in the pre-processing (it should be)
diagnosis_days = df["DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS"]
nonzero_patients = df[diagnosis_days != 0]
print(nonzero_patients["DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS"])


TCGA-AJ-A3BH   NaN
TCGA-DF-A2KN   NaN
TCGA-DF-A2KR   NaN
TCGA-DF-A2KS   NaN
TCGA-DF-A2KU   NaN
TCGA-DF-A2L0   NaN
Name: DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS, dtype: float64


In [29]:

def assign_label(row):
    '''Return 1 for recurrence and 0 for no recurrence for one patient row.

    NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT is used when it is present
    ('Yes' -> 1, 'No' -> 0). When it is NaN, DFS_STATUS is used instead
    ('1:Recurred/Progressed' -> 1, '0:DiseaseFree' -> 0).

    Raises ValueError when both columns are NaN, or when the column being
    read holds a value other than the ones listed above (instead of
    silently returning None).'''
    new_event = row['NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT']
    if not pd.isna(new_event):
        if new_event == 'Yes':
            return 1
        if new_event == 'No':
            return 0
        raise ValueError(
            f"Error: Unexpected NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT value "
            f"{new_event!r} at index {row.name}"
        )

    # Primary column is missing: fall back to DFS_STATUS
    dfs_status = row['DFS_STATUS']
    if pd.isna(dfs_status):
        raise ValueError(f"Error: Both columns are NaN at index {row.name}")
    if dfs_status == '1:Recurred/Progressed':
        return 1
    if dfs_status == '0:DiseaseFree':
        return 0
    raise ValueError(f"Error: Unexpected DFS_STATUS value {dfs_status!r} at index {row.name}")

def drop_highly_uniform_columns(df, threshold=0.99):
    """
    Remove every column whose most common non-NaN value makes up more than
    'threshold' of that column's non-NaN entries.

    Parameters:
    - df: pandas DataFrame
    - threshold: float (default 0.99), share above which a column is dropped

    Returns:
    - pandas DataFrame without the dominated columns; columns with no
      non-NaN values are kept
    """
    def _is_dominated(series):
        # value_counts ignores NaN by default and sorts shares in descending order
        shares = series.value_counts(normalize=True)
        return (not shares.empty) and shares.iloc[0] > threshold

    dominated = [name for name in df.columns if _is_dominated(df[name])]
    return df.drop(columns=dominated)


# Keep a column only if it has at least this many non-null entries, i.e. drop it when
# more than MAX_NULL_VALS of its values are null
min_non_null = len(df) * (1 - config.MAX_NULL_VALS)
df = df.dropna(axis="columns", thresh=min_non_null)

# Drop columns where over 99% of the non-null values are identical
df = drop_highly_uniform_columns(df)

# Drop the non-informational identifier column
df = df.drop(columns="OTHER_PATIENT_ID")

In [30]:
# `Counter` and `np` come from the import cell at the top of the notebook; they are not
# re-imported here so that all dependencies stay in one place.

# Group counts for label-related columns
pair_counts = (
    df.groupby(
        ["NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT", "DFS_STATUS", "PFS_STATUS"],
        dropna=False
    )
    .size()
    .reset_index(name="Count")
)

# Nicely print the pairings and counts
print("\n=== Pair Counts (Before Dropping Missing Labels) ===")
print(pair_counts.to_string(index=False))

# Drop rows with no recurrence label (both label columns missing)
df = df.dropna(
    subset=["DFS_STATUS", "NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT"],
    how="all"
)

# Labels array for recurrence (one entry per remaining row, in row order)
labels = np.array(df.apply(assign_label, axis=1))
label_counts = Counter(labels)

# Pretty print label distribution
print("\n=== Label Distribution (After Cleaning) ===")
total = sum(label_counts.values())
for label, count in label_counts.items():
    pct = (count / total) * 100
    print(f"{label:12} : {count:4} ({pct:5.1f}%)")



=== Pair Counts (Before Dropping Missing Labels) ===
NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT            DFS_STATUS    PFS_STATUS  Count
                                     No         0:DiseaseFree    0:CENSORED    325
                                     No         0:DiseaseFree 1:PROGRESSION      1
                                     No 1:Recurred/Progressed 1:PROGRESSION     12
                                     No                   NaN    0:CENSORED     37
                                     No                   NaN 1:PROGRESSION     12
                                    Yes         0:DiseaseFree 1:PROGRESSION      7
                                    Yes 1:Recurred/Progressed 1:PROGRESSION     38
                                    Yes                   NaN 1:PROGRESSION     34
                                    NaN         0:DiseaseFree    0:CENSORED     24
                                    NaN 1:Recurred/Progressed 1:PROGRESSION      5
                                 

In [31]:
# Exploratory look at how the three label-related columns co-occur
status_columns = ["NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT", 'DFS_STATUS', "PFS_STATUS"]
grouped = df.groupby(status_columns, dropna=False)
pair_counts = grouped.size().reset_index(name='Count')

# Show each combination together with its count
print(pair_counts)

# Discard the rows that carry no recurrence label at all, i.e. where both
# NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT and DFS_STATUS are unknown (32 rows per the original note)
df = df.dropna(how='all', subset=['DFS_STATUS', 'NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT'])

# Recurrence labels as a numpy array, one per remaining row
label_series = df.apply(assign_label, axis=1)
labels = np.array(label_series)
print(Counter(labels))


  NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT             DFS_STATUS  \
0                                      No          0:DiseaseFree   
1                                      No          0:DiseaseFree   
2                                      No  1:Recurred/Progressed   
3                                      No                    NaN   
4                                      No                    NaN   
5                                     Yes          0:DiseaseFree   
6                                     Yes  1:Recurred/Progressed   
7                                     Yes                    NaN   
8                                     NaN          0:DiseaseFree   
9                                     NaN  1:Recurred/Progressed   

      PFS_STATUS  Count  
0     0:CENSORED    325  
1  1:PROGRESSION      1  
2  1:PROGRESSION     12  
3     0:CENSORED     37  
4  1:PROGRESSION     12  
5  1:PROGRESSION      7  
6  1:PROGRESSION     38  
7  1:PROGRESSION     34  
8     0:CENSORED 

Transforms data by changing categorical data into numerical data and filling in missing data points with medians or modes.

In [32]:
# Checking that pre-processing worked as expected ###########################################################

# Columns that still contain NaN values after the column and row drops above
nan_counts = df.isna().sum()
nonzero_nans = nan_counts.loc[nan_counts.gt(0)]
print("Columns with NaN values:")
print(nonzero_nans)
print("")

# Header for the dominant-value report printed by check_dominant_columns
print("Rows with more than 80% the same value in a column:")
def check_dominant_columns(df, threshold=0.8):
    """
    Report every column whose single most frequent value (NaN included)
    covers at least `threshold` of all rows.

    Parameters
    ----------
    df : pandas DataFrame
        The dataset to inspect.
    threshold : float (default=0.8)
        Minimum share of rows the top value must reach to be reported.

    Returns
    -------
    None; the findings are printed.
    """
    total_rows = len(df)
    separator = "-" * 50
    for name in df.columns:
        # Frequencies are sorted in descending order, so the first entry is the top value
        frequencies = df[name].value_counts(dropna=False)
        top_value_count = frequencies.iloc[0]
        proportion = top_value_count / total_rows

        if not (proportion >= threshold):
            continue

        top_value = frequencies.idxmax()
        report = [
            f"Column: {name}",
            f"  Most frequent value: {top_value}",
            f"  Count: {top_value_count}/{total_rows} ({proportion:.1%})",
            f"  Unique values: {df[name].nunique(dropna=False)}",
            separator,
        ]
        for line in report:
            print(line)

# Report the columns dominated (>= 80% of rows) by a single value
check_dominant_columns(df, 0.8)

Columns with NaN values:
SUBTYPE                                    19
AGE                                         2
AJCC_STAGING_EDITION                       69
DAYS_LAST_FOLLOWUP                         33
DAYS_TO_BIRTH                               3
NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT    29
PERSON_NEOPLASM_CANCER_STATUS              24
RACE                                       30
RADIATION_THERAPY                           6
WEIGHT                                     20
DSS_STATUS                                  2
DFS_STATUS                                 83
DFS_MONTHS                                 83
dtype: int64

Rows with more than 80% the same value in a column:
Column: ICD_10
  Most frequent value: C54.1
  Count: 487/495 (98.4%)
  Unique values: 4
--------------------------------------------------
Column: ICD_O_3_SITE
  Most frequent value: C54.1
  Count: 487/495 (98.4%)
  Unique values: 4
--------------------------------------------------
Column: PERSON_NEOPLASM_CAN

In [33]:
categorical_columns = ["ICD_10",
                        "PRIOR_DX",
                        "RACE",
                        "RADIATION_THERAPY",
                        "IN_PANCANPATHWAYS_FREEZE",
                        "GENETIC_ANCESTRY_LABEL"] #FIXME: do further research on what ICD_10 and ICD_O_3_SITE are

# Fill numerical NaNs with median
numerical_df = df.select_dtypes(include=['number'])
# The *_MONTHS survival columns are excluded from the features
# NOTE(review): presumably because they leak the outcome - confirm
numerical_df = numerical_df.drop(columns=['OS_MONTHS', 'DSS_MONTHS', 'DFS_MONTHS', 'PFS_MONTHS'])
numerical_df = numerical_df.fillna(numerical_df.median())

# Fill categorical columns with the mode.
# DataFrame.mode() returns a frame indexed 0..k-1; passing that frame to fillna aligns it on
# the row index, so no patient row would be filled. Its first row, taken as a Series keyed
# by column name, gives one fill value per column.
categorical_df = df[categorical_columns]
column_modes = categorical_df.mode()
if not column_modes.empty:
    categorical_df = categorical_df.fillna(column_modes.iloc[0])


# One-Hot Encode categorical columns (drop first to avoid redundancy)
categorical_df = pd.get_dummies(categorical_df, drop_first=True, dtype=float)

# Feature matrix: imputed numerical columns followed by the encoded categorical columns
X = pd.concat([numerical_df, categorical_df], axis=1)

Saving my split of training and testing data

In [34]:
# Stratified 80/20 split with a fixed seed so the partition is reproducible
split_parts = train_test_split(X, labels, test_size=0.2, random_state=1, stratify=labels)
X_train, X_test, y_train, y_test = split_parts

# Shapes of X_train, X_test, y_train, y_test (in that order)
print(*(part.shape for part in split_parts))

# joblib.dump(X_train, config.X_TRAIN_PATH)
# joblib.dump(X_test, config.X_TEST_PATH)
# joblib.dump(y_train, config.Y_TRAIN_PATH)
# joblib.dump(y_test, config.Y_TEST_PATH)
# joblib.dump(X.columns, config.FEATURE_NAMES)

(396, 17529) (99, 17529) (396,) (99,)
