## UCEC Recurrence Notebook - Pre-processing
This program processes data from ucec_tcga_pan_can_atlas_2018 to be ready for training a machine learning algorithm to predict recurrence. 

In [18]:
import config

import pandas as pd
import numpy as np
import joblib
from collections import Counter

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [19]:
# Loading in the mRNA and clinical data.
# Both paths use forward slashes: a backslash before "d" is an invalid string escape
# and only resolves to a path separator on Windows.
mrna_df = pd.read_csv("ucec_tcga_pan_can_atlas_2018/data_mrna_seq_v2_rsem_zscores_ref_all_samples.txt", sep="\t", comment="#")

clinical_df = pd.read_csv("ucec_tcga_pan_can_atlas_2018/data_clinical_patient.txt", sep="\t", comment="#", low_memory=False)
clinical_df = clinical_df.set_index('PATIENT_ID')
# There are 527 patients in the mRNA and 529 patients in the clinical data

# The first 2 columns of the mRNA data are labels (Hugo_Symbol then Entrez_Gene_Id).
# 13 of the genes do not have Hugo_Symbols, so for these I will use the Entrez_Gene_Id as the label.
missing_symbols = mrna_df['Hugo_Symbol'].isnull()
mrna_df.loc[missing_symbols, 'Hugo_Symbol'] = mrna_df.loc[missing_symbols, 'Entrez_Gene_Id'].astype(str)

# Some genes appear on more than one row with the same Hugo_Symbol and Entrez_Gene_Id but different patient values.
# These rows are given unique labels by appending -1-of-2 and -2-of-2 (etc.) to the Hugo_Symbol.

# How many rows carry each symbol
counts = mrna_df['Hugo_Symbol'].value_counts()

# 1-based position of each row within its symbol group.
# Computed once here rather than inside label_duplicates, which would otherwise
# re-run the groupby over the whole frame for every duplicated row.
occurrences = mrna_df.groupby('Hugo_Symbol').cumcount() + 1


def label_duplicates(value, index):
    """Return a unique label for the gene symbol found at row `index`.

    Parameters:
    - value: the Hugo_Symbol on that row
    - index: the row's index label in mrna_df

    Returns:
    - `value` unchanged when the symbol occurs once, otherwise
      "<value>-<k>-of-<n>" where k is this row's occurrence number and
      n is the total number of rows sharing the symbol.
    """
    total = counts[value]
    if total == 1:
        return value  # Keep unique values unchanged
    return f"{value}-{occurrences[index]}-of-{total}"


# Apply the labeling function
mrna_df['Hugo_Symbol'] = [label_duplicates(value, idx) for idx, value in mrna_df['Hugo_Symbol'].items()]

# Use the gene labels as the index, discard the remaining label column,
# then transpose so that patients are the rows and genes are the columns.
mrna_df = (
    mrna_df
    .set_index('Hugo_Symbol')
    .drop(columns="Entrez_Gene_Id")
    .transpose()
)
# Strip the last three characters (the extraneous "-01") so the patient ids match the clinical data
mrna_df.index = [sample_id[:-3] for sample_id in mrna_df.index]

def drop_patients_missing_data(clinical_df, mrna_df):
    """Restrict both tables to the patient IDs (index labels) they have in common.

    Parameters:
    - clinical_df: DataFrame indexed by patient ID
    - mrna_df: DataFrame indexed by patient ID

    Returns:
    - (clinical_df, mrna_df) as new DataFrames with the unshared rows removed
    """
    clinical_ids = set(clinical_df.index)
    mrna_ids = set(mrna_df.index)
    # There are 2 patients ('TCGA-EY-A1GJ', 'TCGA-AP-A0LQ') in the clinical data that are not in the mRNA data.
    only_in_clinical = clinical_ids - mrna_ids
    only_in_mrna = mrna_ids - clinical_ids
    return (
        clinical_df.drop(index=only_in_clinical),
        mrna_df.drop(index=only_in_mrna),
    )

# Restrict both tables to the patient IDs they share before they are combined.
clinical_df, mrna_df = drop_patients_missing_data(clinical_df, mrna_df)
# Now both dataframes have 527 patients

In [20]:
# Combined clinical + genetic table: 527 patients (rows) and 20568 features (columns).
df = clinical_df.join(mrna_df, how='inner')

# Every column label of the joined frame must be a string; collect any that are not and fail loudly.
non_str_cols = list(filter(lambda label: not isinstance(label, str), df.columns))
if len(non_str_cols) > 0:
    raise ValueError(f"Non-string column labels found: {non_str_cols}")

In [21]:
# Example: first 55 fields of a patient with recurrence (the only patient I am positive experienced recurrence)
df.loc["TCGA-BK-A139"].iloc[:55]

SUBTYPE                                                               UCEC_CN_HIGH
CANCER_TYPE_ACRONYM                                                           UCEC
OTHER_PATIENT_ID                              8d9e4917-334b-4c76-aee1-1e22be772db0
AGE                                                                           74.0
SEX                                                                         Female
AJCC_PATHOLOGIC_TUMOR_STAGE                                                    NaN
AJCC_STAGING_EDITION                                                           NaN
DAYS_LAST_FOLLOWUP                                                             309
DAYS_TO_BIRTH                                                             -27077.0
DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS                                           0.0
ETHNICITY                                                   Not Hispanic Or Latino
FORM_COMPLETION_DATE                                                       2/23/11
HIST

The following genes appear in the data more than once but have different data: 
['PALM2AKAP2', 'ELMOD1', 'FGF13', 'QSOX1', 'SNAP47', 'NKAIN3', 'TMEM8B']
Right now, I'm leaving every version of the gene in, but giving placeholder unique names like: PALM2AKAP2-1-of-2 and PALM2AKAP2-2-of-2

Removing unnecessary columns from clinical data.

In [22]:
# Sanity check: list the patients whose DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS is not 0.
# This column carries no information and should be removed by the pre-processing below.
diagnosis_day = df["DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS"]
nonzero_patients = df.loc[diagnosis_day.ne(0)]
print(nonzero_patients["DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS"])


TCGA-AJ-A3BH   NaN
TCGA-DF-A2KN   NaN
TCGA-DF-A2KR   NaN
TCGA-DF-A2KS   NaN
TCGA-DF-A2KU   NaN
TCGA-DF-A2L0   NaN
Name: DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS, dtype: float64


In [23]:

def assign_label(row):
    '''Map one patient row to a recurrence label.

    Returns 1 for recurrence, 0 for no recurrence, and None when the row
    carries no usable recurrence information.

    NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT takes precedence ('Yes' -> 1,
    'No' -> 0). Only when it is NaN is DFS_STATUS consulted
    ('1:Recurred/Progressed' -> 1, '0:DiseaseFree' -> 0).'''
    new_tumor_event = row['NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT']
    if new_tumor_event == 'Yes':
        return 1
    if new_tumor_event == 'No':
        return 0
    if not pd.isna(new_tumor_event):
        # Some other non-missing value: no label can be derived
        return None

    # Primary field is missing: fall back to the disease-free-survival status
    dfs_status = row['DFS_STATUS']
    if dfs_status == '1:Recurred/Progressed':
        return 1
    if dfs_status == '0:DiseaseFree':
        return 0
    return None

def drop_highly_uniform_columns(df, threshold=0.99):
    """
    Remove columns dominated by a single value.

    A column is removed when its most common value makes up more than
    'threshold' of its non-NaN entries. Columns that are entirely NaN are kept.

    Parameters:
    - df: pandas DataFrame
    - threshold: float (default 0.99), proportion above which a column is dropped

    Returns:
    - pandas DataFrame with the dominated columns dropped
    """
    def _is_dominated(series):
        observed = series.dropna()
        if observed.empty:
            return False
        # value_counts sorts descending, so the first entry is the top share
        return observed.value_counts(normalize=True).iloc[0] > threshold

    dominated = [name for name in df.columns if _is_dominated(df[name])]
    return df.drop(columns=dominated)


In [24]:
# Drop rows with no recurrence label (neither label-source column is known)
label_source_columns = ["DFS_STATUS", "NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT"]
df = df.dropna(subset=label_source_columns, how="all")

# Labels array for recurrence.
# assign_label only reads the two label-source columns, so it is applied to that
# narrow slice; a row-wise apply over the full frame would build a Series spanning
# every gene column for each patient for no benefit.
labels = np.array(df[label_source_columns].apply(assign_label, axis=1))
label_counts = Counter(labels)


In [25]:
# from collections import Counter
# import numpy as np

# # Group counts for label-related columns
# pair_counts = (
#     df.groupby(
#         ["NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT", "DFS_STATUS", "PFS_STATUS"],
#         dropna=False
#     )
#     .size()
#     .reset_index(name="Count")
# )

# # Nicely print the pairings and counts
# print("\n=== Pair Counts (Before Dropping Missing Labels) ===")
# print(pair_counts.to_string(index=False))


# # Pretty print label distribution
# print("\n=== Label Distribution (After Cleaning) ===")
# total = sum(label_counts.values())
# for label, count in label_counts.items():
#     pct = (count / total) * 100
#     print(f"{label:12} : {count:4} ({pct:5.1f}%)")


In [26]:
# Features and labels must have the same number of rows
print(df.shape, labels.shape, sep="\n")

(495, 20568)
(495,)


In [27]:
# 80/20 split, stratified on the recurrence label, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    df,
    labels,
    test_size=0.2,
    random_state=1,
    stratify=labels,
)

In [28]:
# Remove a column when more than config.MAX_NULL_VALS (a fraction) of its training values are null
columns_before = set(X_train.columns)
print("length before removing null-heavy columns:", len(X_train.columns))
min_non_null = len(X_train) * (1 - config.MAX_NULL_VALS)  # non-null entries a column needs to be kept
X_train = X_train.dropna(axis="columns", thresh=min_non_null)
print("length after removing null-heavy columns:", len(X_train.columns))
print(columns_before - set(X_train.columns)) # checking which columns were removed

# Then remove columns where over 99% of the non-null values are the same,
# and finally the non-informational patient identifier column
X_train = (
    drop_highly_uniform_columns(X_train)
    .drop(columns=['OTHER_PATIENT_ID'])
)

length before removing null-heavy columns: 20568
length after removing null-heavy columns: 17538
{'ZNRF4', 'OR4K13', 'C4orf6', 'OTOS', 'OR6B3', 'PXT1', 'TTTY6B', 'PRG1', 'OR6C2', 'ADAM3A', 'DEFB108B', 'FAM75A6', 'H2BFM', 'NMS', 'NTSR2', 'SNORA11', 'CDK15', 'HRH3', 'SNORD116-29', 'FAM75A3', 'GOLGA6FP', 'NAA11', 'KRTAP10-11', 'BARHL1', 'SNORA2A', 'SNORD4A', 'SNORD19', 'CCL27', 'GABRA5', 'SNORD25', 'SNORD11B', 'SNORA52', 'LINC01845', 'ALLC', 'SEMG2', 'IL28B', 'APOL5', 'CCDC105', 'NCRNA00099', 'HIST1H1T', 'PASD1', 'SNORD115-17', 'ELMOD1-2-of-2', 'PDILT', 'DPPA3', 'AVP', 'CNTFR-AS1', 'SPANXB2', 'SLC6A18', 'RTL1', 'MYOG', 'COX7B2', 'MSGN1', 'DAZ4', 'NCRNA00161', 'KRT76', 'SNORD27', 'ZNF479', 'LELP1', 'C1orf158', 'ZFHX4-AS1', 'HFE2', 'IL31', 'IFNK', 'CNTNAP4', 'GJA10', 'SNORD36C', 'OR9K2', 'SRRM4', 'KRTAP3-1', 'LINGO3', 'SLC5A8', 'C20orf123', 'OR4D5', 'MCCD1', 'REXO1L1', 'SSX9P', 'HTR1E', 'GPR31', 'GPR142', 'DUX4L2', 'OR9A2', 'NPFFR2', 'PDE6H', 'SNORD115-32', 'PSMB11', 'CDH18', 'LHX9', 'TSGA1

In [29]:
# Just used for looking at the data for the labels
label_view_columns = ["NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT", 'DFS_STATUS', "PFS_STATUS"]
pair_counts = X_train.groupby(label_view_columns, dropna=False).size().reset_index(name='Count')

# Print the pairings and the count
print(pair_counts)

# Rows with no recurrence label (neither NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT nor DFS_STATUS known)
# are removed before the train/test split. Dropping rows from X_train here would leave it out of
# step with y_train, so the condition is verified instead of silently re-applied.
label_source_columns = ['DFS_STATUS', 'NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT']
unlabeled_rows = X_train[label_source_columns].isna().all(axis=1)
if unlabeled_rows.any():
    raise ValueError(
        f"{int(unlabeled_rows.sum())} training rows have no recurrence label; "
        "drop them before the train/test split so X_train and y_train stay aligned"
    )

# Recurrence labels for the training rows only. Stored under its own name so the
# full-dataset `labels` array used for the split is not overwritten.
train_labels = np.array(X_train[label_source_columns].apply(assign_label, axis=1))
print(Counter(train_labels))


  NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT             DFS_STATUS  \
0                                      No          0:DiseaseFree   
1                                      No          0:DiseaseFree   
2                                      No  1:Recurred/Progressed   
3                                      No                    NaN   
4                                      No                    NaN   
5                                     Yes          0:DiseaseFree   
6                                     Yes  1:Recurred/Progressed   
7                                     Yes                    NaN   
8                                     NaN          0:DiseaseFree   
9                                     NaN  1:Recurred/Progressed   

      PFS_STATUS  Count  
0     0:CENSORED    258  
1  1:PROGRESSION      1  
2  1:PROGRESSION     11  
3     0:CENSORED     31  
4  1:PROGRESSION     11  
5  1:PROGRESSION      7  
6  1:PROGRESSION     31  
7  1:PROGRESSION     26  
8     0:CENSORED 

Transforms data by changing categorical data into numerical data and filling in missing data points with medians or modes.

In [30]:
# Testing that pre-processing worked as expected ##########################################################

# Which columns still contain NaN values after the column/row removal above, and how many
nan_counts = X_train.isna().sum()
nonzero_nans = nan_counts.loc[nan_counts.gt(0)]
print("Columns with NaN values:", nonzero_nans, "", sep="\n")

print("Rows with more than 80% the same value in a column:")
def check_dominant_columns(X_train, threshold=0.8):
    """
    Print columns where the most frequent value accounts for at least
    `threshold` fraction of the entries. NaN counts as a value.

    Prints nothing (and does not raise) when the frame has no rows.

    Parameters
    ----------
    X_train : pandas DataFrame
        The dataset to check.
    threshold : float (default=0.8)
        Proportion cutoff for dominance.
    """
    n_rows = len(X_train)
    if n_rows == 0:
        # No rows means no most-frequent value to report
        return

    for col in X_train.columns:
        # Count once per column; value_counts sorts descending, so the first
        # entry holds both the most frequent value and its count.
        value_counts = X_train[col].value_counts(dropna=False)
        top_value = value_counts.index[0]
        top_value_count = value_counts.iloc[0]
        proportion = top_value_count / n_rows

        if proportion >= threshold:
            print(f"Column: {col}")
            print(f"  Most frequent value: {top_value}")
            print(f"  Count: {top_value_count}/{n_rows} ({proportion:.1%})")
            print(f"  Unique values: {X_train[col].nunique(dropna=False)}")
            print("-" * 50)

# Report every column whose most frequent value (NaN included) fills at least 80% of the rows
check_dominant_columns(X_train, threshold=0.8)

Columns with NaN values:
SUBTYPE                                    17
AGE                                         1
AJCC_STAGING_EDITION                       53
DAYS_LAST_FOLLOWUP                         25
DAYS_TO_BIRTH                               2
NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT    20
PERSON_NEOPLASM_CANCER_STATUS              18
RACE                                       21
RADIATION_THERAPY                           3
WEIGHT                                     13
DSS_STATUS                                  1
DFS_STATUS                                 68
DFS_MONTHS                                 68
dtype: int64

Rows with more than 80% the same value in a column:
Column: ICD_10
  Most frequent value: C54.1
  Count: 390/396 (98.5%)
  Unique values: 4
--------------------------------------------------
Column: ICD_O_3_SITE
  Most frequent value: C54.1
  Count: 390/396 (98.5%)
  Unique values: 4
--------------------------------------------------
Column: PERSON_NEOPLASM_CAN

In [31]:
# Clinical columns that are treated as categorical and one-hot encoded.
# FIXME: do further research on what ICD_10 and ICD_O_3_SITE are
categorical_columns = [
    "ICD_10",
    "PRIOR_DX",
    "RACE",
    "RADIATION_THERAPY",
    "IN_PANCANPATHWAYS_FREEZE",
    "GENETIC_ANCESTRY_LABEL",
]

# # Fill numerical NaNs with median
# numerical_df = X_train.select_dtypes(include=['number'])
# numerical_df = numerical_df.drop(columns=['OS_MONTHS', 'DSS_MONTHS', 'DFS_MONTHS', 'PFS_MONTHS'])
# numerical_df = numerical_df.fillna(numerical_df.median())

# # fill catagorical columns with mode
# categorical_df = X_train[categorical_columns]
# categorical_df = categorical_df.fillna(categorical_df.mode())


# # One-Hot Encode categorical columns (drop first to avoid redundancy)
# categorical_df = pd.get_dummies(categorical_df, drop_first=True, dtype=float)



In [32]:
# --- Numerical ---
# Keep the numeric columns, minus the *_MONTHS columns (skipped silently if already absent)
excluded_month_columns = ['OS_MONTHS', 'DSS_MONTHS', 'DFS_MONTHS', 'PFS_MONTHS']
numerical_df = (
    X_train
    .select_dtypes(include=['number'])
    .drop(columns=excluded_month_columns, errors="ignore")
)

# Training medians are kept so the test set can be imputed with the same values
num_medians = numerical_df.median()
numerical_df = numerical_df.fillna(num_medians)

# --- Categorical ---
raw_categorical = X_train[categorical_columns]

# Training modes are kept for the test set as well.
# mode() returns a DataFrame (one row per tied mode); the first row is used.
cat_modes = raw_categorical.mode().iloc[0]

# Impute, then one-hot encode (drop first level of each column to avoid redundancy)
categorical_df = pd.get_dummies(
    raw_categorical.fillna(cat_modes),
    drop_first=True,
    dtype=float,
)

# Final training matrix: numeric features followed by the encoded categorical features
X_train = pd.concat([numerical_df, categorical_df], axis=1)


In [None]:


# Apply the transformations fitted on the training data to the test data.

# --- Numerical ---
numerical_test = X_test.select_dtypes(include=['number']).copy()
numerical_test = numerical_test.fillna(num_medians)  # <- use train medians

# --- Categorical ---
categorical_test = X_test[categorical_columns].copy()
categorical_test = categorical_test.fillna(cat_modes)  # <- use train modes

# Encode every category present in the test data (no drop_first here).
# drop_first would drop the first category seen in the *test* split, which is not
# necessarily the baseline that was dropped for training; after the reindex below,
# rows of that category would be all zeros and therefore read as the training baseline.
categorical_test = pd.get_dummies(categorical_test, drop_first=False, dtype=float)

# Align test with training columns: the training baseline column and categories never
# seen in training are discarded, and training categories absent from the test split are added as 0.
categorical_test = categorical_test.reindex(columns=categorical_df.columns, fill_value=0.0)

# Combine back
X_test = pd.concat([numerical_test, categorical_test], axis=1)

# Keep exactly the training features, in the training order
X_test = X_test[X_train.columns]




Saving my split of training and testing data

In [35]:
# Shapes of the four pieces of the split, printed on one line
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))


(396, 17529) (99, 17529) (396,) (99,)


In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=1, stratify=labels)

# print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# joblib.dump(X_train, "og_preproc_data_rerun/X_train.pkl")
# joblib.dump(X_test, "og_preproc_data_rerun/X_test.pkl")
# joblib.dump(y_train, "og_preproc_data_rerun/y_train.pkl")
# joblib.dump(y_test, "og_preproc_data_rerun/y_test.pkl")
# # joblib.dump(X.columns, config.FEATURE_NAMES)

['og_preproc_data_rerun/y_test.pkl']