In [None]:
import bamboolib as bam
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Survival

In [None]:
# Survival
# Load the merged LUAD clinical table (tab-separated); the first column of the
# file is used as the DataFrame index.
url = r"Z:\HiWi\Popp\TCGA_NSCLC_2022\LUAD\Clinic\LUAD.clin.merged.txt"
df_EHR= pd.read_csv(url, sep='\t', index_col=0)

In [None]:
import pandas as pd; import numpy as np
# Move the index (the first column of the file) back into a regular column.
# Guarded so the cell is idempotent: calling reset_index() a second time on an
# already-reset frame would insert a spurious 'index' column holding the old
# RangeIndex.
if not isinstance(df_EHR.index, pd.RangeIndex):
    df_EHR = df_EHR.reset_index()
df_EHR

In [None]:
# Show the column labels at positions 10..109 of df_EHR for manual inspection.
df_EHR.columns[10:110]

## old

In [None]:
# Load the LUSC survival table (tab-separated; first column becomes the index).
url = r'Z:\HiWi\Popp\TCGA_NSCLC_2022\LUSC\Clinic\TCGA-LUSC.survival.tsv'
df_surv_LUSC = pd.read_csv(url, sep='\t', index_col=0)

# NOTE(review): df_surv_LUAD is not created anywhere in the visible notebook.
# It must be loaded before this cell runs, otherwise the next line raises
# NameError on a fresh kernel.

# Combine LUAD and LUSC. The tables are only stacked when their columns match
# in name and order. On a mismatch, fail loudly instead of skipping the concat
# and leaving df_surv undefined (or silently reusing a stale df_surv).
if list(df_surv_LUAD.columns) != list(df_surv_LUSC.columns):
    raise ValueError(
        "LUAD and LUSC survival tables have different columns: "
        f"{list(df_surv_LUAD.columns)} vs {list(df_surv_LUSC.columns)}"
    )
df_surv = pd.concat([df_surv_LUAD, df_surv_LUSC], axis=0)

# Turn the index into a column and give the sample identifier the name used
# as the merge key in the rest of the notebook.
df_surv = df_surv.reset_index().rename(columns={'sample': 'Sample_ID'})
df_surv

## Clinical overview

In [None]:
# Clinical
# Load the GDC phenotype tables for LUAD and LUSC (tab-separated; first column
# becomes the index).
url_clinical = r'Z:\HiWi\Popp\TCGA_NSCLC_2022\LUAD\Clinic\TCGA-LUAD.GDC_phenotype.tsv'
df_clinical_LUAD = pd.read_csv(url_clinical, sep='\t', index_col=0)
url_clinical = r'Z:\HiWi\Popp\TCGA_NSCLC_2022\LUSC\Clinic\TCGA-LUSC.GDC_phenotype.tsv'
df_clinical_LUSC = pd.read_csv(url_clinical, sep='\t', index_col=0)

# Combine LUAD and LUSC. The tables are only stacked when their columns match
# in name and order. On a mismatch, fail loudly instead of skipping the concat
# and leaving df_clinical undefined (or silently reusing a stale one).
if list(df_clinical_LUAD.columns) != list(df_clinical_LUSC.columns):
    raise ValueError(
        "LUAD and LUSC phenotype tables have different columns; "
        "align them before concatenating."
    )
df_clinical = pd.concat([df_clinical_LUAD, df_clinical_LUSC], axis=0)

# Turn the index into a column and rename the sample identifier to the merge
# key used in the rest of the notebook.
df_clinical = df_clinical.reset_index().rename(columns={'submitter_id.samples': 'Sample_ID'})
df_clinical

In [None]:
# Keep only samples whose ID ends in '01A' (this is how healthy samples are
# removed) and drop fully duplicated rows.
# The filter and the de-duplication are chained and reassigned, rather than
# calling drop_duplicates(inplace=True) on a filtered slice, which triggers
# pandas' chained-assignment warning.
df_clinical = df_clinical[df_clinical['Sample_ID'].str[-3:] == '01A'].drop_duplicates()
df_clinical.shape

## Metafeatures like TMB

In [None]:
# TMB
# Load the TMB table (comma-separated); the first column of the file is used
# as the index.
# NOTE(review): a later cell merges this frame on 'Sample_ID', so that name
# presumably has to be either a column or the index name here - confirm.
url = r'Z:\HiWi\Popp\TCGA_NSCLC_2022\LUNG\TCGA_LUNG_TMB.csv'
df_TMB = pd.read_csv(url, index_col=0)
df_TMB

## Combine dataframes into overview

In [None]:
# Match clinical, survival and TMB tables on the sample identifier.
# Outer joins keep samples that appear in only one of the tables.
combined_df = pd.merge(df_clinical, df_surv, how="outer", on='Sample_ID')
combined_df = pd.merge(combined_df, df_TMB, how='outer', on='Sample_ID')

# Keep only '01A' samples and drop fully duplicated rows.
# The mask must be built from combined_df itself: a mask taken from df_clinical
# has a different index and length than the merged frame, so it cannot be
# aligned with combined_df's rows.
combined_df = combined_df[combined_df['Sample_ID'].str[-3:] == '01A'].drop_duplicates()

# kick samples without OS or gender info (surrogate for all therapy infos)
combined_df = combined_df[combined_df['OS'].notna()]
combined_df = combined_df[combined_df['gender.demographic'].notna()]
combined_df = combined_df.reset_index(drop=True)
combined_df

#### Inspect missing features

In [None]:
# Subset of columns used for the missing-value inspection below.
# .copy() makes df an independent frame, so later column assignments on df
# neither raise chained-assignment warnings nor depend on combined_df.
df = combined_df[[
    'Sample_ID', 'OS', 'OS.time',
    'age_at_initial_pathologic_diagnosis', 'year_of_initial_pathologic_diagnosis',
    'cigarettes_per_day.exposures', 'pack_years_smoked.exposures',
    'prior_malignancy.diagnoses', 'tumor_stage.diagnoses', 'year_of_diagnosis.diagnoses',
    'pathologic_T', 'pathologic_M', 'pathologic_N',
    'gender.demographic', 'ethnicity.demographic', 'race.demographic',
]].copy()
df.head()

In [None]:
# Bar chart of df via msno.bar (used here to inspect missing features).
# NOTE(review): `msno` is not imported anywhere in the visible notebook
# (presumably `import missingno as msno`) - confirm and add it to the import cell.
msno.bar(df)

In [None]:
# Show the dtype pandas assigned to each selected column.
df.dtypes

In [None]:
# pandas profiling report for inspection
# NOTE(review): `ProfileReport` is not imported anywhere in the visible notebook
# (presumably from pandas_profiling / ydata_profiling) - confirm and add it to
# the import cell.
profile = ProfileReport(df, title="Pandas Profiling Report")
profile.to_widgets()

In [None]:
# save complete df for inspection and further use
# Writes the merged clinical/survival/TMB frame, including its index, as CSV.
combined_df.to_csv(r'Z:\HiWi\Popp\TCGA_NSCLC_2022\LUNG\TCGA_LUNG_clinical.csv')

## Make equal sized labels for MoGCN 

In [None]:
#change type
to_bool = ['prior_malignancy.diagnoses'] #no  yes
to_int = ['']
to_categorical = ['']

# Reconstruct features: collapse the tumor-stage strings into four integer
# classes (0-3). 'not reported' becomes missing.
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
map_stage = {
    'not reported': np.nan,
    'stage ia': 0,
    'stage i': 0,
    'stage ib': 1,
    'stage iia': 2,
    'stage ii': 2,
    'stage iib': 2,
    'stage iiia': 3,
    'stage iii': 3,
    'stage iiib': 3,
    'stage iv': 3,
}
# assign() builds a new frame, so the write never lands on a slice of another
# DataFrame. Values that are not keys of map_stage are left untouched by replace().
df = df.assign(**{"tumor_stage.diagnoses": df["tumor_stage.diagnoses"].replace(map_stage)})

#imputation needed for
numeric = ['age_at_initial_pathologic_diagnosis', 'year_of_initial_pathologic_diagnosis', 'cigarettes_per_day.exposures', 'pack_years_smoked.exposures']
categorical = ['']
boolean = ['']

df['tumor_stage.diagnoses'].value_counts()
#last save with cell above

## Make label df for tumor stage 1-4 for MoGCN

In [None]:
# Build the label table: one sample-identifier column named 'sample' plus a
# string 'label' column holding the mapped tumor-stage class.
# df carries the identifier as 'Sample_ID', so it is renamed to 'sample' here;
# rename() ignores missing keys, so a frame that already has a 'sample' column
# also works. .copy() keeps the column assignment below off a slice.
df_cat = df.rename(columns={'Sample_ID': 'sample'})[['sample']].copy()
# replace() is a no-op for values that are already mapped.
# NOTE(review): if the stage column is float (it can contain NaN), the string
# labels presumably look like '0.0' rather than '0' - confirm that this is what
# the consumer of the label file expects.
df_cat['label'] = df['tumor_stage.diagnoses'].replace(map_stage).astype("string")
# Drop samples without a stage label.
df_cat = df_cat.dropna()
df_cat['label'].value_counts()

In [None]:
# Write the label table (index included) as CSV.
# NOTE(review): the target path and file name say LUAD, while combined_df is
# built from LUAD and LUSC together - confirm this is the intended location.
df_cat.to_csv(r'Z:\HiWi\Popp\TCGA_NSCLC_2022\LUAD\Clinic\TCGA_LUAD_Tumor_Label.csv')

## Make clinical feature vector for training starting from [5]

In [None]:
import pandas as pd; import numpy as np
# Keep the first row per Sample_ID, then normalise pd.NA markers to np.nan.
combined_df = (
    combined_df
    .drop_duplicates(subset=['Sample_ID'], keep='first')
    .replace({pd.NA: np.nan})
)

In [None]:
# Step: Select columns
# .copy() makes combined_df_select an independent frame; assigning columns on a
# bare slice of combined_df triggers pandas' chained-assignment warning.
combined_df_select = combined_df[[
    'Sample_ID', 'primary_diagnosis.diagnoses', 'age_at_initial_pathologic_diagnosis',
    'number_pack_years_smoked', 'tobacco_smoking_history', 'other_dx',
    'prior_malignancy.diagnoses', 'pathologic_M', 'pathologic_N', 'pathologic_T',
    'tumor_stage.diagnoses', 'person_neoplasm_cancer_status',
    'gender.demographic', 'race.demographic',
]].copy()

# Step: Change data type of the identifier and the text-valued clinical columns
# to String/Text in a single astype() call.
combined_df_select = combined_df_select.astype({
    column_name: 'string'
    for column_name in [
        'Sample_ID', 'other_dx', 'prior_malignancy.diagnoses',
        'pathologic_M', 'pathologic_N', 'pathologic_T',
        'tumor_stage.diagnoses', 'person_neoplasm_cancer_status',
        'gender.demographic', 'race.demographic',
    ]
})

combined_df_select

In [None]:
#Categories for features
#Sample_ID, number_pack_years_smoked, tobacco_smoking_history, other_dx (other malignancy) prior_malignancy.diagnoses, pathologic_M, pathologic_N, pathologic_T, tumor_stage.diagnoses, person_neoplasm_cancer_status,  gender.demographic, race.demographic 
#low: eastern_cancer_oncology_group, karnofsky_performance_score, performance_status_scale_timing,  location_in_lung_parenchyma 
#biomarkers: egfr_mutation_performed --> egfr_mutation_result, eml4_alk_translocation_performed, kras_gene_analysis_performed --> kras_mutation_found
#future: followup_treatment_success, new_tumor_event_after_initial_treatment, postoperative_rx_tx, primary_therapy_outcome_success, radiation_therapy
#calc: year_of_initial_pathologic_diagnosis or age_at_diagnosis.diagnoses or year_of_diagnosis.diagnoses or age_at_initial_pathologic_diagnosis. diagnosis,, age_at_index.demographic or days_to_birth.demographic or year_of_birth.demographic, stopped_smoking_year - patient sample date, pre and post bronchodilator_fev1_percent --> dist?, year_of_tobacco_smoking_onset? diff, location 'location_in_lung_parenchyma' or diagnosis morphology

In [None]:
#numerics
# Turn text-valued clinical columns into 0/1 indicators.
# Missing values are tested with pd.isna() before any comparison: comparing
# pd.NA with a string yields pd.NA, and `'no' in pd.NA` is not defined, so an
# unguarded expression raises TypeError on a missing entry.
# NOTE(review): the columns are overwritten in place, so this cell is meant to
# be run once per freshly built combined_df_select.
combined_df_select = combined_df_select.replace({pd.NA: np.nan})
# other_dx: 'No' -> 0, anything else (including missing) -> 1
combined_df_select['other_dx'] = [0 if (not pd.isna(x) and x == 'No') else 1 for x in combined_df_select['other_dx'] ]
# prior malignancy: values containing 'no' (this also matches 'not reported') -> 0,
# everything else -> 1. Both branches used to return 1, which made the feature
# constant. Missing stays NaN so the imputation step can fill it.
combined_df_select['prior_malignancy.diagnoses'] = [np.nan if pd.isna(x) else 0 if 'no' in x else 1 for x in combined_df_select['prior_malignancy.diagnoses'] ]
# cancer status: 'WITH TUMOR' -> 1, any other value -> 0, missing stays NaN
combined_df_select['person_neoplasm_cancer_status'] = [np.nan if pd.isna(x) else 1 if x =='WITH TUMOR' else 0 for x in combined_df_select['person_neoplasm_cancer_status'] ]
# tumor stage: stage iii / iv -> 1, everything else (including missing) -> 0
combined_df_select['tumor_stage.diagnoses'] = [0 if pd.isna(x) else 1 if 'stage iii' in x or 'stage iv' in x else 0 for x in combined_df_select['tumor_stage.diagnoses'] ]
# gender: 'male' -> 0, everything else -> 1 (rows without gender were dropped
# when combined_df was built)
combined_df_select['gender.demographic'] = [0 if x == 'male' else 1 for x in combined_df_select['gender.demographic']]

In [None]:
#categoricals
# Collapse fine-grained categories into a few coarse bins before one-hot
# encoding. Every expression tests pd.isna(x) before using `in` / `==` on x,
# because those operations raise TypeError on pd.NA.
# M: missing -> 'M0', any M1 sub-stage -> 'M1', other values kept as they are
combined_df_select['pathologic_M'] = ['M0' if pd.isna(x) else 'M1' if 'M1' in x else x for x in combined_df_select['pathologic_M'] ]
# N: missing / N0 / NX -> 'N0/NX', exactly 'N1' -> 'N1', everything else -> 'N2/N3'
combined_df_select['pathologic_N'] = [ 'N0/NX' if pd.isna(x) or x in ['N0', 'NX'] else 'N1' if x == 'N1' else 'N2/N3' for x in combined_df_select['pathologic_N'] ]
# T: sub-stages collapse to T1..T4; missing and unmatched values fall back to 'T1'
combined_df_select['pathologic_T'] = ['T1' if pd.isna(x) else 'T1' if 'T1' in x else 'T2' if 'T2' in x else 'T3' if 'T3' in x else 'T4' if 'T4' in x else 'T1' for x in combined_df_select['pathologic_T'] ]
# race: keep 'white' and 'not reported'; everything else (including missing) -> 'other'
combined_df_select['race.demographic'] = [x if (not pd.isna(x) and x in ['white', 'not reported']) else 'other' for x in combined_df_select['race.demographic'] ]
# diagnosis: missing -> 'unreported', the three listed diagnoses are kept, the rest -> 'other'
combined_df_select['primary_diagnosis.diagnoses'] = ['unreported' if pd.isna(x) else x if x in ['Squamous cell carcinoma, NOS', 'Adenocarcinoma, NOS', 'Adenocarcinoma with mixed subtypes'] else 'other' for x in combined_df_select['primary_diagnosis.diagnoses'] ]

In [None]:
# OneHotEncode for categorical
features_to_dummy = ['pathologic_M', 'pathologic_N', 'pathologic_T', 'gender.demographic', 'race.demographic']
# Encode every non-numeric column except the sample identifier.
# The columns are passed explicitly because get_dummies() without `columns`
# picks the columns by dtype, and pandas documents string dtype among the
# dtypes it encodes by default. Sample_ID is string-typed, so it could be
# expanded into one indicator column per sample, and the later steps rely on
# column 0 being the identifier (they operate on iloc[:, 1:]).
columns_to_encode = [
    col for col in combined_df_select.columns
    if col != 'Sample_ID' and not pd.api.types.is_numeric_dtype(combined_df_select[col])
]
combined_df_select = pd.get_dummies(combined_df_select, columns=columns_to_encode, prefix_sep='_', drop_first=False)

In [None]:
#Imputation
from sklearn.impute import KNNImputer
# NOTE(review): features_to_impute is not referenced again in the visible
# notebook; the KNN step below runs over every column except the first.
features_to_impute = ['age_at_initial_pathologic_diagnosis', 'number_pack_years_smoked', 'person_neoplasm_cancer_status']
# tobacco_smoking_history is filled with its column median before the KNN step.
combined_df_select['tobacco_smoking_history'] = combined_df_select['tobacco_smoking_history'].fillna(combined_df_select['tobacco_smoking_history'].median())
#for int and float
# Fit a KNNImputer with default settings on all columns after the first and
# write the imputed values back in place.
# NOTE(review): this assumes column 0 is the sample identifier and that every
# remaining column is numeric - confirm after the one-hot step.
combined_df_select.iloc[:,1:] = KNNImputer().fit_transform(combined_df_select.iloc[:,1:].values) 

In [None]:
#log2
# Replace the pack-years column by its base-2 logarithm.
# NOTE(review): np.log2 returns -inf for 0 and NaN for negative inputs; verify
# that the column contains only positive values at this point.
combined_df_select['number_pack_years_smoked'] = np.log2(combined_df_select['number_pack_years_smoked'].values)

In [None]:
# scale 0 - 1
from sklearn.preprocessing import MinMaxScaler
# Fit a MinMaxScaler with default settings on all columns after the first and
# write the scaled values back in place.
# NOTE(review): assumes column 0 is the sample identifier - confirm.
combined_df_select.iloc[:,1:] = MinMaxScaler().fit_transform(combined_df_select.iloc[:,1:].values) 

In [None]:
# save complete df for inspection and further use
# Writes the encoded, imputed and scaled feature frame, including its index, as CSV.
combined_df_select.to_csv(r'Z:\HiWi\Popp\TCGA_NSCLC_2022\LUNG\TCGA_LUNG_clinical_input_features.csv')

In [None]:
# Read the saved feature file back, using its first column (the index that was
# written out) as the index again.
df_ = pd.read_csv(r'Z:\HiWi\Popp\TCGA_NSCLC_2022\LUNG\TCGA_LUNG_clinical_input_features.csv', index_col=0)

In [None]:
# Display the reloaded feature frame.
df_