UCEC Reccurance Notebook

Importing data from ucec_tcga_pan_can_atlas_2018

In [302]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier  # one model I'm just going to try first
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from collections import Counter



In [303]:

df = pd.DataFrame({'A': ['apple', 'banana', 'apple', 'orange', 'banana', 'apple', 'grape']})

# Get value counts
counts = df['A'].value_counts()

# Apply labeling only to duplicates
df['A'] = df.apply(lambda row: row['A'] if counts[row['A']] == 1 
                    else f"{row['A']}-{(df.groupby('A').cumcount() + 1)[row.name]}-of-{counts[row['A']]}",
                    axis=1)

print(df)


               A
0   apple-1-of-3
1  banana-1-of-2
2   apple-2-of-3
3         orange
4  banana-2-of-2
5   apple-3-of-3
6          grape


In [304]:
mrna_df = pd.read_csv("ucec_tcga_pan_can_atlas_2018/data_mrna_seq_v2_rsem_zscores_ref_all_samples.txt", sep="\t", comment="#")
# I appear to have 527 patients in the mRNA and 529 patients in the clinical data

clinical_df = pd.read_csv("ucec_tcga_pan_can_atlas_2018\data_clinical_patient.txt", sep="\t", comment="#", low_memory=False)
clinical_df = clinical_df.set_index('PATIENT_ID')

# The first 2 columns of the mRNA data are labels. 13 of the genes do not have Hugo_symbols, so I am putting placeholder stings as labels for these gense
missing_symbols = mrna_df['Hugo_Symbol'].isnull()
mrna_df.loc[missing_symbols, 'Hugo_Symbol'] = [
    f'no_symbol_{i+1}' for i in range(missing_symbols.sum())
]

# Get value counts
counts = mrna_df['Hugo_Symbol'].value_counts()

# Generate unique labels for duplicates
def label_duplicates(value, index):
    if counts[value] == 1:
        return value  # Keep unique values unchanged
    occurrence = mrna_df.groupby('Hugo_Symbol').cumcount() + 1  # Count occurrences per group
    return f"{value}-{occurrence[index]}-of-{counts[value]}"

# Apply the labeling function
mrna_df['Hugo_Symbol'] = [label_duplicates(value, idx) for idx, value in mrna_df['Hugo_Symbol'].items()]

mrna_df = mrna_df.set_index('Hugo_Symbol')
mrna_df = mrna_df.drop(columns="Entrez_Gene_Id") # removing the label column before I transpose the df
mrna_df_transposed= mrna_df.transpose()
mrna_df_transposed.index = [id[:-3] for id in mrna_df_transposed.index] # removes extranious -01 so that the patient ids match the clinical data

df = clinical_df.join(mrna_df_transposed, how='inner') # this is the data frame for clinical and genetic data. It has 527 patients (rows) and 20568 features (columns).

10072        LINC00205
10073        LOC642929
10074            SMIM5
10075        LOC643387
10076           BRDTP1
10077          CCDC168
10078         SCGB1B2P
10079    NKAIN3-1-of-2
10080        LINC01128
10081    ELMOD1-2-of-2
10082          ZNF733P
10083           EXOC1L
10084            BCRP3
Name: Hugo_Symbol, dtype: object


The following genes appear in the data more than once but have different data: 
['PALM2AKAP2', 'ELMOD1', 'FGF13', 'QSOX1', 'SNAP47', 'NKAIN3', 'TMEM8B']

Removing unecessary columns from clinical data.

In [305]:
MAX_NULL_VALS = 0.3

def assign_label(row):
    '''given a row assigns 1 for recurrance and 0 for no recurrance. 
    Currently uses NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT to identify recurrance.
    If NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT is NaN, uses DSF_STATUS'''
    if row['NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT'] == 'Yes':
        return 1
    elif row['NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT'] == 'No':
        return 0
    elif pd.isna(row['NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT']):
        if row['DFS_STATUS'] == '1:Recurred/Progressed':
            return 1
        elif row['DFS_STATUS'] == '0:DiseaseFree':
            return 0
        else:
            raise ValueError(f"Error: Both columns are NaN at index {row.name}")

# print(df.iloc(0))
# print(df.iloc[:, 0].tolist())


# for column_num in len(df.columns):
#     print(df.iloc[:, column_num].to_list())
#     cur_col = df.iloc[:, column_num].to_list()
#     print("-" * 50)
#     # if there is only one value for every patient, remove the column
#     if len(cur_col.dropna().unique()) <= 1:
#         df.drop([col], axis=1, inplace=True)

for col in df.columns:
    unique_values = df[col].nunique(dropna=True)
    if unique_values <= 1:
        # print(f"Removing column: {col}")
        df.drop(columns=[col], inplace=True)


# remove the column is over MAX_NULL_VALS percent null values
df.dropna(axis=1, thresh=len(df) * (1 - MAX_NULL_VALS))

# remove non-informational columns
df = df.drop(columns=['OTHER_PATIENT_ID'])

pair_counts = df.groupby(["NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT", 'DFS_STATUS', 'PFS_STATUS'], dropna=False).size().reset_index(name='Count')

# Print the pairings and the count
print(pair_counts)

# I am going to remove the 32 rows where we have no recurrance label (neither NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT nor DFS_STATUS are known)
# PFS_STATUS
df = df.dropna(subset=['DFS_STATUS', 'NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT'], how='all')

# # numpy array for the Labels for recurrance
labels = np.array(df.apply(assign_label, axis=1))

Removing column: CANCER_TYPE_ACRONYM
Removing column: SEX
Removing column: AJCC_PATHOLOGIC_TUMOR_STAGE
Removing column: DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS
Removing column: INFORMED_CONSENT_VERIFIED
Removing column: PATH_M_STAGE
Removing column: PATH_N_STAGE
Removing column: PATH_T_STAGE
Removing column: PRIMARY_LYMPH_NODE_PRESENTATION_ASSESSMENT
Removing column: no_symbol_4
Removing column: no_symbol_6
Removing column: SPATA31B1P
Removing column: REXO1L6P
Removing column: SDR16C6P
Removing column: PPBPP1
Removing column: AMELY
Removing column: BCORL2
Removing column: BPY2
Removing column: C11orf40
Removing column: C13orf28
Removing column: C14orf177
Removing column: C16orf78
Removing column: C17orf105
Removing column: C20orf71
Removing column: C20orf79
Removing column: C21orf54
Removing column: C9orf27
Removing column: CDY1
Removing column: CDY1B
Removing column: CSPG4P2Y
Removing column: CT47A10
Removing column: CT47A11
Removing column: CT47A6
Removing column: CT47A7
Removing column

Transforms data by changing catagorical data into numerical data and filling in missing data points with medians or modes.

In [306]:
categorical_columns = ["ETHNICITY",
                        "ICD_10", 
                        "PRIOR_DX", 
                        "RACE",
                        "RADIATION_THERAPY", 
                        "IN_PANCANPATHWAYS_FREEZE", 
                        "GENETIC_ANCESTRY_LABEL"] #FIXME: do further research on what ICD_10 and ICD_O_3_SITE are

#FIXME: I will need to go through the numerical categories and see if any need to be removed (because they represent data from after initial treatment)
# Fill numerical NaNs with median
numerical_df = df.select_dtypes(include=['number'])
numerical_df = numerical_df.drop(columns=['OS_MONTHS', 'DSS_MONTHS', 'DFS_MONTHS', 'PFS_MONTHS'])
numerical_df = numerical_df.fillna(numerical_df.median())

# fill catagorical columns with mode
categorical_df = df[categorical_columns]
categorical_df = categorical_df.fillna(categorical_df.mode())


# One-Hot Encode categorical columns (drop first to avoid redundancy)
categorical_df = pd.get_dummies(categorical_df, drop_first=True, dtype=float)
# comment, right now I'm making every column be numerical, may change some to boolean if that would also work

clinical_X = pd.concat([numerical_df, categorical_df], axis=1)
print(clinical_X.columns[15])
print(clinical_X.columns)
X = clinical_X.to_numpy()


no_symbol_7
Index(['AGE', 'AJCC_STAGING_EDITION', 'DAYS_TO_BIRTH', 'WEIGHT', 'no_symbol_1',
       'no_symbol_2', 'UBE2Q2P2', 'HMGB1P1', 'no_symbol_3', 'no_symbol_5',
       ...
       'RADIATION_THERAPY_Yes', 'IN_PANCANPATHWAYS_FREEZE_Yes',
       'GENETIC_ANCESTRY_LABEL_ADMIX', 'GENETIC_ANCESTRY_LABEL_AFR',
       'GENETIC_ANCESTRY_LABEL_AFR_ADMIX', 'GENETIC_ANCESTRY_LABEL_AMR',
       'GENETIC_ANCESTRY_LABEL_EAS', 'GENETIC_ANCESTRY_LABEL_EUR',
       'GENETIC_ANCESTRY_LABEL_EUR_ADMIX', 'GENETIC_ANCESTRY_LABEL_SAS'],
      dtype='object', length=20044)


In [308]:
# we should have an X and y now, yay!. Lets do some machine learning 
print("Data shape: ", X.shape)
print("Labels shape: ", labels.shape)


X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2)
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

print(y_test)

model = RandomForestClassifier(random_state=100)
# will add a k-fold cross validation, just have random numbers for now
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)  # Ensures balanced class splits

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    scoring='roc_auc',  # Change to 'f1', 'roc_auc', etc., if needed
    n_jobs=-1  # Use all CPU cores
)

grid_search.fit(X_train, y_train)  # Train with cross-validation

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)
best_model = grid_search.best_estimator_  # Retrieve the best model

y_test_pred = best_model.predict(X_test)

cm_test = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm_test.ravel()
print(f"True Positives (TP): {tp}")
print(f"False Positives (FP): {fp}")
print(f"True Negatives (TN): {tn}")
print(f"False Negatives (FN): {fn}")

# Obtain predicted probabilities for the positive class
y_test_proba = best_model.predict_proba(X_test)[:, 1] # FIXME: I don't understand what this is doing

# Compute the AUC-ROC score
auc_roc = roc_auc_score(y_test, y_test_proba)

# Print the AUC-ROC score
print(f"AUC-ROC Score: {auc_roc:.4f}")

Data shape:  (495, 20044)
Labels shape:  (495,)
[1 0 1 1 0 0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 1 0 0 0 1]
Best Parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}
Best Score: 0.6657116515325472
True Positives (TP): 0
False Positives (FP): 0
True Negatives (TN): 78
False Negatives (FN): 21
AUC-ROC Score: 0.7131
