UCEC Recurrence Notebook

Importing data from ucec_tcga_pan_can_atlas_2018

In [69]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

Removing unnecessary columns from the clinical data.

In [74]:
# Maximum fraction of null values a column may contain before it is
# considered too sparse to keep.
MAX_NULL_VALS = 0.3

# Use a forward slash in the path: the original backslash form contained the
# invalid escape sequence "\d" (a SyntaxWarning on recent Python versions) and
# only resolved on Windows. Forward slashes work on every platform.
# Tab-separated file; text following a '#' is treated as a comment and skipped.
df = pd.read_csv(
    "ucec_tcga_pan_can_atlas_2018/data_clinical_patient.txt",
    sep="\t",
    comment="#",
    low_memory=False,
)

def assign_label(row):
    '''Return the binary recurrence label for a single patient row.

    NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT is the primary source of truth:
    'Yes' -> 1 (recurrence), 'No' -> 0 (no recurrence).
    If it is missing, DFS_STATUS is used instead:
    '1:Recurred/Progressed' -> 1, '0:DiseaseFree' -> 0.

    Parameters:
        row: a mapping-like row (e.g. a pandas Series produced by
            DataFrame.apply(axis=1)) holding both columns; row.name is used
            in error messages.

    Returns:
        1 for recurrence, 0 for no recurrence.

    Raises:
        ValueError: if both columns are missing, or if either column holds a
            value other than the ones listed above (previously an unexpected
            NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT value silently produced
            None).
    '''
    new_tumor_event = row['NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT']

    # Test for "missing" first so that pd.NA never reaches an == comparison.
    if not pd.isna(new_tumor_event):
        if new_tumor_event == 'Yes':
            return 1
        if new_tumor_event == 'No':
            return 0
        raise ValueError(
            f"Error: Unexpected NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT value "
            f"{new_tumor_event!r} at index {row.name}"
        )

    # Primary column is missing: fall back to the disease-free-survival status.
    dfs_status = row['DFS_STATUS']
    if pd.isna(dfs_status):
        raise ValueError(f"Error: Both columns are NaN at index {row.name}")
    if dfs_status == '1:Recurred/Progressed':
        return 1
    if dfs_status == '0:DiseaseFree':
        return 0
    raise ValueError(
        f"Error: Unexpected DFS_STATUS value {dfs_status!r} at index {row.name}"
    )


# Remove columns that carry no information: ignoring NaNs, they hold at most
# one distinct value across all patients.
df = df.loc[:, df.nunique(dropna=True) > 1]

# Remove columns whose fraction of null values exceeds MAX_NULL_VALS.
# DataFrame.dropna returns a new frame, so the result must be assigned back
# (the original call discarded it, which made this step a no-op).
# `thresh` is the minimum number of non-null values a column needs to survive.
# NOTE(review): any column referenced by later cells must survive this filter.
min_non_null = int(np.ceil(len(df) * (1 - MAX_NULL_VALS)))
df = df.dropna(axis=1, thresh=min_non_null)

# Remove non-informational identifier columns (ignored if already dropped above).
df = df.drop(columns=['PATIENT_ID', 'OTHER_PATIENT_ID'], errors='ignore')

# Count every (NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT, DFS_STATUS) pairing,
# keeping NaN as its own group so unlabeled rows are visible.
pair_counts = (
    df.groupby(['NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT', 'DFS_STATUS'], dropna=False)
    .size()
    .reset_index(name='Count')
)

# Print the pairings and the count
print(pair_counts)

# Remove the rows that have no recurrence label at all, i.e. where neither
# NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT nor DFS_STATUS is known.
df = df.dropna(subset=['DFS_STATUS', 'NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT'], how='all')

# numpy array of recurrence labels (1 = recurrence, 0 = no recurrence),
# in the same row order as df.
labels = df.apply(assign_label, axis=1).to_numpy()

  NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT             DFS_STATUS  Count
0                                      No          0:DiseaseFree    328
1                                      No  1:Recurred/Progressed     12
2                                      No                    NaN     49
3                                     Yes          0:DiseaseFree      7
4                                     Yes  1:Recurred/Progressed     38
5                                     Yes                    NaN     34
6                                     NaN          0:DiseaseFree     24
7                                     NaN  1:Recurred/Progressed      5
8                                     NaN                    NaN     32
84


Transform the data by encoding categorical columns as numerical columns and filling in missing values with the column median (numerical columns) or mode (categorical columns).

In [None]:
# Categorical clinical columns to one-hot encode.
# TODO: do further research on what ICD_10 and ICD_O_3_SITE are.
# NOTE(review): PFS_STATUS (and any outcome-derived numeric columns picked up
# below) may leak the recurrence label into the features — confirm before modeling.
categorical_columns = [
    "ETHNICITY",
    "ICD_10",
    "PRIOR_DX",
    "RACE",
    "RADIATION_THERAPY",
    "IN_PANCANPATHWAYS_FREEZE",
    "PFS_STATUS",
    "GENETIC_ANCESTRY_LABEL",
]

# Fill numerical NaNs with the column median.
numerical_df = df.select_dtypes(include=['number'])
numerical_df = numerical_df.fillna(numerical_df.median())

# Fill categorical NaNs with the column mode.
# DataFrame.mode() returns a DataFrame (one row per tied mode), and passing a
# DataFrame to fillna aligns on the row index, so only the rows whose index
# matches would be filled. Take the first mode row as a Series so that every
# row of each column is filled with that column's mode.
categorical_df = df[categorical_columns]
column_modes = categorical_df.mode()
if not column_modes.empty:
    categorical_df = categorical_df.fillna(column_modes.iloc[0])

# One-hot encode categorical columns (drop first to avoid redundancy).
# Every encoded column is numerical (float) for now; some could become
# boolean later if that also works downstream.
categorical_df = pd.get_dummies(categorical_df, drop_first=True, dtype=float)

# Final clinical feature matrix: imputed numeric columns + encoded categoricals.
clinical_X = pd.concat([numerical_df, categorical_df], axis=1)




0      Yes
1       No
2      Yes
3       No
4      NaN
      ... 
523     No
524     No
525     No
526     No
528    NaN
Name: NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT, Length: 497, dtype: object
