UCEC Reccurance Notebook

Importing data from ucec_tcga_pan_can_atlas_2018

In [226]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier  # one model I'm just going to try first
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score



In [257]:
mrna_df = pd.read_csv("ucec_tcga_pan_can_atlas_2018/data_mrna_seq_v2_rsem_zscores_ref_all_samples.txt", sep="\t", comment="#")
# I appear to have 527 patients in the mRNA and 529 patients in the clinical data

clinical_df = pd.read_csv("ucec_tcga_pan_can_atlas_2018\data_clinical_patient.txt", sep="\t", comment="#", low_memory=False)
clinical_df = clinical_df.set_index('PATIENT_ID')

# The first 2 columns of the mRNA data are labels. 13 of the genes do not have Hugo_symbols, so I am putting placeholder stings as labels for these gense
missing_symbols = mrna_df['Hugo_Symbol'].isnull()
mrna_df.loc[missing_symbols, 'Hugo_Symbol'] = [
    f'no_symbol_{i+1}' for i in range(missing_symbols.sum())
]

mrna_df = mrna_df.set_index('Hugo_Symbol')
mrna_df = mrna_df.drop(columns="Entrez_Gene_Id") # removing the label column before I transpose the df
mrna_df_transposed= mrna_df.transpose()
mrna_df_transposed.index = [id[:-3] for id in mrna_df_transposed.index] # removes extranious -01 so that the patient ids match the clinical data

df = clinical_df.join(mrna_df_transposed, how='inner') # this is the data frame for clinical and genetic data. It has 527 patients (rows) and 20568 features (columns).

<bound method NDFrame.head of                    SUBTYPE CANCER_TYPE_ACRONYM  \
TCGA-2E-A9G8  UCEC_CN_HIGH                UCEC   
TCGA-4E-A92E   UCEC_CN_LOW                UCEC   
TCGA-5B-A90C  UCEC_CN_HIGH                UCEC   
TCGA-5S-A9Q8   UCEC_CN_LOW                UCEC   
TCGA-A5-A0G1     UCEC_POLE                UCEC   
...                    ...                 ...   
TCGA-QS-A8F1  UCEC_CN_HIGH                UCEC   
TCGA-SJ-A6ZI      UCEC_MSI                UCEC   
TCGA-SJ-A6ZJ   UCEC_CN_LOW                UCEC   
TCGA-SL-A6J9  UCEC_CN_HIGH                UCEC   
TCGA-SL-A6JA      UCEC_MSI                UCEC   

                                  OTHER_PATIENT_ID   AGE     SEX  \
TCGA-2E-A9G8  9583C10B-B21A-4863-98FA-61E735E64EA5  59.0  Female   
TCGA-4E-A92E  B43DE98D-4BB1-41A1-91F2-CE4E7EA4D0CA  54.0  Female   
TCGA-5B-A90C  16AC4341-CF8F-45E2-B90B-2D12D5F74A59  69.0  Female   
TCGA-5S-A9Q8  8751429B-4A11-451E-B978-DC9E9DB0EB36  51.0  Female   
TCGA-A5-A0G1  53707bb3-426a-4

Removing unecessary columns from clinical data.

In [255]:
MAX_NULL_VALS = 0.3

def assign_label(row):
    '''given a row assigns 1 for recurrance and 0 for no recurrance. 
    Currently uses NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT to identify recurrance.
    If NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT is NaN, uses DSF_STATUS'''
    if row['NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT'] == 'Yes':
        return 1
    elif row['NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT'] == 'No':
        return 0
    elif pd.isna(row['NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT']):
        if row['DFS_STATUS'] == '1:Recurred/Progressed':
            return 1
        elif row['DFS_STATUS'] == '0:DiseaseFree':
            return 0
        else:
            raise ValueError(f"Error: Both columns are NaN at index {row.name}")

for col in df.columns:
    print(df[col])
    print(f"number of unique values in {col}: {len(df[col].unique())}")  # Show only first 5 unique values
    print("-" * 50)
    # if there is only one value for every patient, remove the column
    if len(df[col].dropna().unique()) <= 1:
        df.drop([col], axis=1, inplace=True)

# remove the column is over MAX_NULL_VALS percent null values
df.dropna(axis=1, thresh=len(df) * (1 - MAX_NULL_VALS))

# remove non-informational columns
df = df.drop(columns=['OTHER_PATIENT_ID'])

pair_counts = df.groupby(["NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT", 'DFS_STATUS', 'PFS_STATUS'], dropna=False).size().reset_index(name='Count')

# Print the pairings and the count
print(pair_counts)

# I am going to remove the 32 rows where we have no recurrance label (neither NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT nor DFS_STATUS are known)
# PFS_STATUS
df = df.dropna(subset=['DFS_STATUS', 'NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT'], how='all')

# # numpy array for the Labels for recurrance
labels = np.array(df.apply(assign_label, axis=1))

TCGA-2E-A9G8    UCEC_CN_HIGH
TCGA-4E-A92E     UCEC_CN_LOW
TCGA-5B-A90C    UCEC_CN_HIGH
TCGA-5S-A9Q8     UCEC_CN_LOW
TCGA-A5-A0G1       UCEC_POLE
                    ...     
TCGA-QS-A8F1    UCEC_CN_HIGH
TCGA-SJ-A6ZI        UCEC_MSI
TCGA-SJ-A6ZJ     UCEC_CN_LOW
TCGA-SL-A6J9    UCEC_CN_HIGH
TCGA-SL-A6JA        UCEC_MSI
Name: SUBTYPE, Length: 527, dtype: object
number of unique values in SUBTYPE: 5
--------------------------------------------------
TCGA-2E-A9G8    UCEC
TCGA-4E-A92E    UCEC
TCGA-5B-A90C    UCEC
TCGA-5S-A9Q8    UCEC
TCGA-A5-A0G1    UCEC
                ... 
TCGA-QS-A8F1    UCEC
TCGA-SJ-A6ZI    UCEC
TCGA-SJ-A6ZJ    UCEC
TCGA-SL-A6J9    UCEC
TCGA-SL-A6JA    UCEC
Name: CANCER_TYPE_ACRONYM, Length: 527, dtype: object
number of unique values in CANCER_TYPE_ACRONYM: 1
--------------------------------------------------
TCGA-2E-A9G8    9583C10B-B21A-4863-98FA-61E735E64EA5
TCGA-4E-A92E    B43DE98D-4BB1-41A1-91F2-CE4E7EA4D0CA
TCGA-5B-A90C    16AC4341-CF8F-45E2-B90B-2D12D5F74A59
TCGA-

AttributeError: 'DataFrame' object has no attribute 'unique'

Transforms data by changing catagorical data into numerical data and filling in missing data points with medians or modes.

In [182]:
categorical_columns = ["ETHNICITY",
                        "ICD_10", 
                        "PRIOR_DX", 
                        "RACE",
                        "RADIATION_THERAPY", 
                        "IN_PANCANPATHWAYS_FREEZE", 
                        "GENETIC_ANCESTRY_LABEL"] #FIXME: do further research on what ICD_10 and ICD_O_3_SITE are

#FIXME: I will need to go through the numerical categories and see if any need to be removed (because they represent data from after initial treatment)
# Fill numerical NaNs with median
numerical_df = clinical_df.select_dtypes(include=['number'])
numerical_df = numerical_df.drop(columns=['OS_MONTHS', 'DSS_MONTHS', 'DFS_MONTHS', 'PFS_MONTHS'])
numerical_df = numerical_df.fillna(numerical_df.median())

# fill catagorical columns with mode
categorical_df = clinical_df[categorical_columns]
categorical_df = categorical_df.fillna(categorical_df.mode())


# One-Hot Encode categorical columns (drop first to avoid redundancy)
categorical_df = pd.get_dummies(categorical_df, drop_first=True, dtype=float)
# comment, right now I'm making every column be numerical, may change some to boolean if that would also work

clinical_X = pd.concat([numerical_df, categorical_df], axis=1)
print(clinical_X.columns[15])
print(clinical_X.columns)
X = clinical_X.to_numpy()


GENETIC_ANCESTRY_LABEL_ADMIX
Index(['AGE', 'AJCC_STAGING_EDITION', 'DAYS_TO_BIRTH', 'WEIGHT',
       'ETHNICITY_Not Hispanic Or Latino', 'ICD_10_C54.1', 'ICD_10_C54.3',
       'ICD_10_C54.9', 'PRIOR_DX_Yes', 'RACE_Asian',
       'RACE_Black or African American',
       'RACE_Native Hawaiian or Other Pacific Islander', 'RACE_White',
       'RADIATION_THERAPY_Yes', 'IN_PANCANPATHWAYS_FREEZE_Yes',
       'GENETIC_ANCESTRY_LABEL_ADMIX', 'GENETIC_ANCESTRY_LABEL_AFR',
       'GENETIC_ANCESTRY_LABEL_AFR_ADMIX', 'GENETIC_ANCESTRY_LABEL_AMR',
       'GENETIC_ANCESTRY_LABEL_EAS', 'GENETIC_ANCESTRY_LABEL_EUR',
       'GENETIC_ANCESTRY_LABEL_EUR_ADMIX', 'GENETIC_ANCESTRY_LABEL_SAS'],
      dtype='object')


In [None]:
# we should have an X and y now, yay!. Lets do some machine learning 
print("Data shape: ", X.shape)
print("Labels shape: ", labels.shape)


X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2)
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

model = RandomForestClassifier(random_state=100)
# will add a k-fold cross validation, just have random numbers for now
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)  # Ensures balanced class splits

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    scoring='roc_auc',  # Change to 'f1', 'roc_auc', etc., if needed
    n_jobs=-1  # Use all CPU cores
)

grid_search.fit(X_train, y_train)  # Train with cross-validation

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)
best_model = grid_search.best_estimator_  # Retrieve the best model

y_test_pred = best_model.predict(X_test)

cm_test = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm_test.ravel()
print(f"True Positives (TP): {tp}")
print(f"False Positives (FP): {fp}")
print(f"True Negatives (TN): {tn}")
print(f"False Negatives (FN): {fn}")

# Obtain predicted probabilities for the positive class
y_test_proba = best_model.predict_proba(X_test)[:, 1] # FIXME: I don't understand what this is doing

# Compute the AUC-ROC score
auc_roc = roc_auc_score(y_test, y_test_proba)

# Print the AUC-ROC score
print(f"AUC-ROC Score: {auc_roc:.4f}")

Data shape:  (497, 23)
Labels shape:  (497,)
Best Parameters: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 200}
Best Score: 0.5131968641114982
True Positives (TP): 0
False Positives (FP): 0
True Negatives (TN): 205
False Negatives (FN): 44
AUC-ROC Score: 0.5055
