UCEC Reccurance Notebook

Importing data from ucec_tcga_pan_can_atlas_2018

In [126]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier  # one model I'm just going to try first



Removing unecessary columns from clinical data.

In [127]:
MAX_NULL_VALS = 0.3

df = pd.read_csv("ucec_tcga_pan_can_atlas_2018\data_clinical_patient.txt", sep="\t", comment="#", low_memory=False)

def assign_label(row):
    '''given a row assigns 1 for recurrance and 0 for no recurrance. 
    Currently uses NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT to identify recurrance.
    If NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT is NaN, uses DSF_STATUS'''
    if row['NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT'] == 'Yes':
        return 1
    elif row['NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT'] == 'No':
        return 0
    elif pd.isna(row['NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT']):
        if row['DFS_STATUS'] == '1:Recurred/Progressed':
            return 1
        elif row['DFS_STATUS'] == '0:DiseaseFree':
            return 0
        else:
            raise ValueError(f"Error: Both columns are NaN at index {row.name}")

print(df.columns)

for col in df.columns:
    # print(f"number of unique values in {col}: {len(df[col].unique())}")  # Show only first 5 unique values
    # print("-" * 50)
    # if there is only one value for every patient, remove the column
    if len(df[col].dropna().unique()) <= 1:
        df.drop([col], axis=1, inplace=True)

# remove the column is over MAX_NULL_VALS percent null values
df.dropna(axis=1, thresh=len(df) * (1 - MAX_NULL_VALS))

# remove non-informational columns
df = df.drop(columns=['PATIENT_ID', 'OTHER_PATIENT_ID'])

pair_counts = df.groupby(["NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT", 'DFS_STATUS'], dropna=False).size().reset_index(name='Count')

# Print the pairings and the count
print(pair_counts)

# # I am going to remove the 32 rows where we have no recurrance label (neither NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT nor DFS_STATUS are known)
df = df.dropna(subset=['DFS_STATUS', 'NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT'], how='all')

# # numpy array for the Labels for recurrance
labels = np.array(df.apply(assign_label, axis=1))

Index(['PATIENT_ID', 'SUBTYPE', 'CANCER_TYPE_ACRONYM', 'OTHER_PATIENT_ID',
       'AGE', 'SEX', 'AJCC_PATHOLOGIC_TUMOR_STAGE', 'AJCC_STAGING_EDITION',
       'DAYS_LAST_FOLLOWUP', 'DAYS_TO_BIRTH',
       'DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS', 'ETHNICITY',
       'FORM_COMPLETION_DATE', 'HISTORY_NEOADJUVANT_TRTYN', 'ICD_10',
       'ICD_O_3_HISTOLOGY', 'ICD_O_3_SITE', 'INFORMED_CONSENT_VERIFIED',
       'NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT', 'PATH_M_STAGE',
       'PATH_N_STAGE', 'PATH_T_STAGE', 'PERSON_NEOPLASM_CANCER_STATUS',
       'PRIMARY_LYMPH_NODE_PRESENTATION_ASSESSMENT', 'PRIOR_DX', 'RACE',
       'RADIATION_THERAPY', 'WEIGHT', 'IN_PANCANPATHWAYS_FREEZE', 'OS_STATUS',
       'OS_MONTHS', 'DSS_STATUS', 'DSS_MONTHS', 'DFS_STATUS', 'DFS_MONTHS',
       'PFS_STATUS', 'PFS_MONTHS', 'GENETIC_ANCESTRY_LABEL'],
      dtype='object')
  NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT             DFS_STATUS  Count
0                                      No          0:DiseaseFree    328
1     

Transforms data by changing catagorical data into numerical data and filling in missing data points with medians or modes.

In [128]:
categorical_columns = ["ETHNICITY",
                        "ICD_10", 
                        "PRIOR_DX", 
                        "RACE",
                        "RADIATION_THERAPY", 
                        "IN_PANCANPATHWAYS_FREEZE", 
                        "PFS_STATUS", 
                        "GENETIC_ANCESTRY_LABEL"] # do further research on what ICD_10 and ICD_O_3_SITE are

#FIXME: I will need to go through the numerical categories and see if any need to be removed (because they represent data from after initial treatment)
# Fill numerical NaNs with median
numerical_df = df.select_dtypes(include=['number'])
numerical_df = numerical_df.drop(columns=['OS_MONTHS', 'DSS_MONTHS', 'DFS_MONTHS', 'PFS_MONTHS'])
numerical_df = numerical_df.fillna(numerical_df.median())

# fill catagorical columns with mode
categorical_df = df[categorical_columns]
categorical_df = categorical_df.fillna(categorical_df.mode())


# One-Hot Encode categorical columns (drop first to avoid redundancy)
categorical_df = pd.get_dummies(categorical_df, drop_first=True, dtype=float)
# comment, right now I'm making every column be numerical, may change some to boolean if that would also work

clinical_X = pd.concat([numerical_df, categorical_df], axis=1)
print(clinical_X.columns)
X = df.to_numpy()


Index(['AGE', 'AJCC_STAGING_EDITION', 'DAYS_TO_BIRTH', 'WEIGHT',
       'ETHNICITY_Not Hispanic Or Latino', 'ICD_10_C54.1', 'ICD_10_C54.3',
       'ICD_10_C54.9', 'PRIOR_DX_Yes', 'RACE_Asian',
       'RACE_Black or African American',
       'RACE_Native Hawaiian or Other Pacific Islander', 'RACE_White',
       'RADIATION_THERAPY_Yes', 'IN_PANCANPATHWAYS_FREEZE_Yes',
       'PFS_STATUS_1:PROGRESSION', 'GENETIC_ANCESTRY_LABEL_ADMIX',
       'GENETIC_ANCESTRY_LABEL_AFR', 'GENETIC_ANCESTRY_LABEL_AFR_ADMIX',
       'GENETIC_ANCESTRY_LABEL_AMR', 'GENETIC_ANCESTRY_LABEL_EAS',
       'GENETIC_ANCESTRY_LABEL_EUR', 'GENETIC_ANCESTRY_LABEL_EUR_ADMIX',
       'GENETIC_ANCESTRY_LABEL_SAS'],
      dtype='object')


In [129]:
# we should have an X and y now, yay!. Lets do some machine learning 
print("Data shape: ", X.shape)
print("Labels shape: ", labels.shape)

X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2)
print(X_train)
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

model = RandomForestClassifier(random_state=42)

# will add a k-fold cross validation, just have random numbers for now
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # Ensures balanced class splits

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',  # Change to 'f1', 'roc_auc', etc., if needed
    n_jobs=-1  # Use all CPU cores
)

grid_search.fit(X_train, y_train)  # Train with cross-validation

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)
best_model = grid_search.best_estimator_  # Retrieve the best model




Data shape:  (497, 27)
Labels shape:  (497,)
[['UCEC_POLE' 40.0 2009.0 ... '0:CENSORED' 60.22947694 'EUR']
 ['UCEC_CN_LOW' 76.0 2009.0 ... '0:CENSORED' 51.08985107 'EUR']
 ['UCEC_POLE' 65.0 1988.0 ... '0:CENSORED' 85.84015518 'EAS']
 ...
 ['UCEC_CN_LOW' 31.0 2009.0 ... '0:CENSORED' 33.07361015 'EAS']
 [nan 64.0 2009.0 ... '0:CENSORED' 11.21083605 'AFR']
 ['UCEC_CN_HIGH' 83.0 2009.0 ... '1:PROGRESSION' 10.19166913 'EUR']]


ValueError: 
All the 135 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
27 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\gench\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\gench\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\gench\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\ensemble\_forest.py", line 360, in fit
    X, y = validate_data(
  File "c:\Users\gench\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
  File "c:\Users\gench\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 1370, in check_X_y
    X = check_array(
  File "c:\Users\gench\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "c:\Users\gench\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\_array_api.py", line 839, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
ValueError: could not convert string to float: 'UCEC_CN_LOW'

--------------------------------------------------------------------------------
108 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\gench\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\gench\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\gench\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\ensemble\_forest.py", line 360, in fit
    X, y = validate_data(
  File "c:\Users\gench\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
  File "c:\Users\gench\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 1370, in check_X_y
    X = check_array(
  File "c:\Users\gench\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "c:\Users\gench\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\_array_api.py", line 839, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
ValueError: could not convert string to float: 'UCEC_POLE'
