UCEC Reccurance Notebook - Pre-processing

Importing data from ucec_tcga_pan_can_atlas_2018

In [2]:
import pandas as pd
import numpy as np
import joblib
import config

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# TODO: Impliment these models
from xgboost import XGBClassifier
from sklearn.svm import SVC


from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel

from sklearn.pipeline import Pipeline

from collections import Counter
import matplotlib.pyplot as plt


In [3]:
mrna_df = pd.read_csv("ucec_tcga_pan_can_atlas_2018/data_mrna_seq_v2_rsem_zscores_ref_all_samples.txt", sep="\t", comment="#")
# I appear to have 527 patients in the mRNA and 529 patients in the clinical data

clinical_df = pd.read_csv("ucec_tcga_pan_can_atlas_2018\data_clinical_patient.txt", sep="\t", comment="#", low_memory=False)
clinical_df = clinical_df.set_index('PATIENT_ID')

# The first 2 columns of the mRNA data are labels. 13 of the genes do not have Hugo_symbols, so I am putting placeholder stings as labels for these genes
missing_symbols = mrna_df['Hugo_Symbol'].isnull()
mrna_df.loc[missing_symbols, 'Hugo_Symbol'] = [
    f'no_symbol_{i+1}' for i in range(missing_symbols.sum())
]

# Get value counts
counts = mrna_df['Hugo_Symbol'].value_counts()

# Generate unique labels for duplicates
def label_duplicates(value, index):
    if counts[value] == 1:
        return value  # Keep unique values unchanged
    occurrence = mrna_df.groupby('Hugo_Symbol').cumcount() + 1  # Count occurrences per group
    return f"{value}-{occurrence[index]}-of-{counts[value]}"

# Apply the labeling function
mrna_df['Hugo_Symbol'] = [label_duplicates(value, idx) for idx, value in mrna_df['Hugo_Symbol'].items()]

mrna_df = mrna_df.set_index('Hugo_Symbol')
mrna_df = mrna_df.drop(columns="Entrez_Gene_Id") # removing the label column before I transpose the df
mrna_df_transposed= mrna_df.transpose()
mrna_df_transposed.index = [id[:-3] for id in mrna_df_transposed.index] # removes extranious -01 so that the patient ids match the clinical data

df = clinical_df.join(mrna_df_transposed, how='inner') # this is the data frame for clinical and genetic data. It has 527 patients (rows) and 20568 features (columns).

In [4]:
#testing around
df["FKSG73"]
unique_values = df["FKSG73"].nunique(dropna=True)
print(unique_values)

counts = Counter(df["FKSG73"].dropna())

# Print unique values and their counts
for item, count in counts.items():
    print(f"{item}: {count}")


3
-3.7272: 171
-1.0: 1
1.0: 1


The following genes appear in the data more than once but have different data: 
['PALM2AKAP2', 'ELMOD1', 'FGF13', 'QSOX1', 'SNAP47', 'NKAIN3', 'TMEM8B']
Right now, I'm leaving every version of the gene in, but giving placeholder unique names like: PALM2AKAP2-1-of-2 and PALM2AKAP2-2-of-2

Removing unecessary columns from clinical data.

In [5]:

def assign_label(row):
    '''given a row assigns 1 for recurrance and 0 for no recurrance. 
    Currently uses NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT to identify recurrance.
    If NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT is NaN, uses DSF_STATUS'''
    if row['NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT'] == 'Yes':
        return 1
    elif row['NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT'] == 'No':
        return 0
    elif pd.isna(row['NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT']):
        if row['DFS_STATUS'] == '1:Recurred/Progressed':
            return 1
        elif row['DFS_STATUS'] == '0:DiseaseFree':
            return 0
        else:
            raise ValueError(f"Error: Both columns are NaN at index {row.name}")

def drop_highly_uniform_columns(df, threshold=0.99):
    """
    Drops columns where more than 'threshold' proportion of non-NaN values are the same.

    Parameters:
    - df: pandas DataFrame
    - threshold: float (default 0.99), proportion threshold to drop columns

    Returns:
    - pandas DataFrame with specified columns dropped
    """
    cols_to_drop = []
    for col in df.columns:
        non_na_values = df[col].dropna()
        if not non_na_values.empty:
            top_freq = non_na_values.value_counts(normalize=True).iloc[0]
            if top_freq > threshold:
                cols_to_drop.append(col)
    return df.drop(columns=cols_to_drop)

drop_highly_uniform_columns(df)

# remove the column if over MAX_NULL_VALS percent null values
df = df.dropna(axis=1, thresh=len(df) * (1 - config.MAX_NULL_VALS))

# remove non-informational columns
df = df.drop(columns=['OTHER_PATIENT_ID'])

# Just used for looking at the data for the labels
pair_counts = df.groupby(["NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT", 'DFS_STATUS'], dropna=False).size().reset_index(name='Count')

# Print the pairings and the count
print(pair_counts)

# I am going to remove the 32 rows where we have no recurrance label (neither NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT nor DFS_STATUS are known)
# PFS_STATUS
df = df.dropna(subset=['DFS_STATUS', 'NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT'], how='all')

# # numpy array for the Labels for recurrance
labels = np.array(df.apply(assign_label, axis=1)) 
#DIF DFI.time 

  NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT             DFS_STATUS  Count
0                                      No          0:DiseaseFree    326
1                                      No  1:Recurred/Progressed     12
2                                      No                    NaN     49
3                                     Yes          0:DiseaseFree      7
4                                     Yes  1:Recurred/Progressed     38
5                                     Yes                    NaN     34
6                                     NaN          0:DiseaseFree     24
7                                     NaN  1:Recurred/Progressed      5
8                                     NaN                    NaN     32


Transforms data by changing catagorical data into numerical data and filling in missing data points with medians or modes.

In [6]:
nan_counts = df.isnull().sum()
nonzero_nans = nan_counts[nan_counts > 0]
print("Columns with NaN values:")
print(nonzero_nans)


Columns with NaN values:
SUBTYPE                                     19
AGE                                          2
AJCC_STAGING_EDITION                        69
DAYS_LAST_FOLLOWUP                          33
DAYS_TO_BIRTH                                3
DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS         5
ETHNICITY                                  146
NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT     29
PERSON_NEOPLASM_CANCER_STATUS               24
RACE                                        30
RADIATION_THERAPY                            6
WEIGHT                                      20
DSS_STATUS                                   2
DFS_STATUS                                  83
DFS_MONTHS                                  83
dtype: int64


In [12]:
categorical_columns = ["ETHNICITY",
                        "ICD_10", 
                        "PRIOR_DX", 
                        "RACE",
                        "RADIATION_THERAPY", 
                        "IN_PANCANPATHWAYS_FREEZE", 
                        "GENETIC_ANCESTRY_LABEL"] #FIXME: do further research on what ICD_10 and ICD_O_3_SITE are

# Fill numerical NaNs with median
numerical_df = df.select_dtypes(include=['number'])
numerical_df = numerical_df.drop(columns=['OS_MONTHS', 'DSS_MONTHS', 'DFS_MONTHS', 'PFS_MONTHS'])
numerical_df = numerical_df.fillna(numerical_df.median())

# fill catagorical columns with mode
categorical_df = df[categorical_columns]
categorical_df = categorical_df.fillna(categorical_df.mode())


# One-Hot Encode categorical columns (drop first to avoid redundancy)
categorical_df = pd.get_dummies(categorical_df, drop_first=True, dtype=float)

X = pd.concat([numerical_df, categorical_df], axis=1)
print((X.columns))
# feature_names = {i: col for i, col in enumerate(X.columns)}
feature_names = X.columns
X = X.to_numpy()

Index(['AGE', 'AJCC_STAGING_EDITION', 'DAYS_TO_BIRTH',
       'DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS', 'WEIGHT', 'no_symbol_2',
       'UBE2Q2P2', 'HMGB1P1', 'no_symbol_3', 'no_symbol_5',
       ...
       'RADIATION_THERAPY_Yes', 'IN_PANCANPATHWAYS_FREEZE_Yes',
       'GENETIC_ANCESTRY_LABEL_ADMIX', 'GENETIC_ANCESTRY_LABEL_AFR',
       'GENETIC_ANCESTRY_LABEL_AFR_ADMIX', 'GENETIC_ANCESTRY_LABEL_AMR',
       'GENETIC_ANCESTRY_LABEL_EAS', 'GENETIC_ANCESTRY_LABEL_EUR',
       'GENETIC_ANCESTRY_LABEL_EUR_ADMIX', 'GENETIC_ANCESTRY_LABEL_SAS'],
      dtype='object', length=17531)


Saving my split of training and testing data

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=1, stratify=labels)

joblib.dump(X_train, config.X_TRAIN_PATH)
joblib.dump(X_test, config.X_TEST_PATH)
joblib.dump(y_train, config.Y_TRAIN_PATH)
joblib.dump(y_test, config.Y_TEST_PATH)

['data/y_test.pkl']