# Combine clinical and survival data with an overview of which modalities (omics) are present for each sample, in order to select patients

In [None]:
import pandas as pd
import polars as pl
import numpy as np
import bamboolib
from pandas_profiling import ProfileReport

In [None]:
# Read the clinical table into a DataFrame; the first CSV column is used as the index.
url = "data/TCGA_LUNG_clinical.csv"
combined_df = pd.read_csv(filepath_or_buffer=url, index_col=0)
# Leave the frame as the cell's last expression so the notebook displays it.
combined_df

### Note: the `disease_code` column is important

In [None]:
# Optional: restrict the clinical table to a subset of features, or keep all.
# The column selection below is wrapped in a string literal, so it is NOT
# executed and every column of combined_df is kept. Remove the surrounding
# triple quotes to apply the filter.
'''
combined_df = combined_df[['Sample_ID', '_PATIENT', 'age_at_initial_pathologic_diagnosis',
                                 'days_to_new_tumor_event_after_initial_treatment', 'followup_treatment_success',
                                 'primary_therapy_outcome_success', 'radiation_therapy',
                                 'additional_pharmaceutical_therapy', 'additional_radiation_therapy',
                                 'additional_surgery_locoregional_procedure', 
                                 'other_dx', 'eastern_cancer_oncology_group', 'year_of_initial_pathologic_diagnosis',
                                 'gender.demographic', 'OS', 'OS.time']]
'''

In [None]:
# Cohort whose per-modality input files are used below.
cancer = "LUNG"

# Lookup table: cohort name -> {modality name: CSV path}.
# Insertion order of each inner dict is the order in which files are loaded.
modality_files_by_cancer = {
    # single-cohort LUAD inputs
    "LUAD": {
        'Methylation': "data/TCGA_LUAD_Methylation_450.csv",
        'RNA_seq': "data/LUAD_RNA_seq.csv",
        'miRNA': "data/TCGA_LUAD_miRNA.csv",
        'Somatic_Mutation': "data/TCGA_LUAD_mutation2.csv",
        'CNV': "data/TCGA_LUAD_CNV_gene.csv",
        'Protein_Array': "data/LUAD_Protein_Array.csv",
        'Metagenomics': "data/TCGA_LUAD_Metagenomics.csv",  # on patient level
    },
    # combined LUNG inputs
    "LUNG": {
        'Methylation': "data/TCGA_LUNG_Methylation_450.csv",
        'RNA_seq': "data/LUNG_RNA_seq.csv",
        #'miRNA': r'data\TCGA_LUNG_miRNA.csv',
        'Somatic_Mutation': "data/TCGA_LUNG_mutation2.csv",
        'CNV': "data/TCGA_LUNG_CNV_gene.csv",
        'Protein_Array': "data/LUNG_Protein_Array_Gene_Level.csv",
        # metagenomics are on patient level
        #'Metagenomics':r'data\TCGA_LUNG_Metagenomics.csv'
    },
    # breast cancer inputs; also the fallback for any other cohort name
    "BRCA": {
        'Methylation': "data/TCGA_BRCA_Methylation_450.csv",
        'RNA_seq': "data/TCGA_BRCA_RNA_seq.csv",
        'miRNA': "data/TCGA_BRCA_miRNA.csv",
        'Somatic_Mutation': "data/TCGA_BRCA_mutation2.csv",
        'CNV': "data/TCGA_BRCA_CNV_gene.csv",
    },
}

# Any cohort other than LUAD / LUNG resolves to the breast cancer file set.
in_out_dict = modality_files_by_cancer.get(cancer, modality_files_by_cancer["BRCA"])

In [None]:
# Modalities that are counted when samples are later filtered by data availability.
modalities = ['Methylation', 'RNA_seq', 'Somatic_Mutation', 'CNV', 'Protein_Array']

# Sample IDs present in the clinical table.
ids_samples = combined_df['Sample_ID'].to_list()

# For every modality file, add a column to combined_df that is 1 where the
# clinical row's ID also occurs in that file and NaN otherwise.
for name, path in in_out_dict.items():
    print('Loading : ' + name)
    if name == 'Metagenomics':
        # Metagenomics is recorded per patient, so match on the patient ID.
        # The patient IDs are resolved here (the original relied on an
        # `ids_patients` variable whose definition was commented out).
        file_id_col, clinical_id_col = 'Patient_ID', '_PATIENT'
        clinical_ids = combined_df[clinical_id_col].to_list()
    else:
        file_id_col, clinical_id_col = 'Sample_ID', 'Sample_ID'
        clinical_ids = ids_samples
    # polars' read_csv has no `index_col` argument (that is pandas-only) and
    # rejects it. Only the ID column is needed, so read just that column.
    df = pl.read_csv(path, columns=[file_id_col])
    ids = df[file_id_col].to_list()
    match = list(set(ids) & set(clinical_ids))
    combined_df[name] = np.where(combined_df[clinical_id_col].isin(match), 1, np.nan)

In [None]:
print('Before: ' + str(len(combined_df.index)))
# Keep only rows for which at least 50% of the tracked modalities are present.
# The modality columns hold 1 (present) or NaN (absent); the row-wise sum skips
# NaN, so it equals the number of present modalities.
present_count = combined_df[modalities].sum(axis=1)
# The threshold is derived from the modality list instead of being hard-coded.
# For the five modalities above it is 2.5, which selects the same rows as the
# previous `> 2`. Using a separate Series also avoids adding and dropping a
# temporary 'Sum' column, which would clobber a real column of that name.
min_present = len(modalities) / 2
combined_df = combined_df[present_count >= min_present]
print('After: ' + str(len(combined_df.index)))

In [None]:
# pandas profiling report for inspection
profile = ProfileReport(combined_df, title="Pandas Profiling Report")
profile.to_file("overview_report.html")

In [None]:
# Write the overview table to disk (Excel and CSV) for inspection and further use.
combined_df.to_excel(excel_writer="overview_table.xlsx")
combined_df.to_csv(path_or_buf="data/TCGA_LUNG_overview_table.csv")

## Figures

In [None]:
import missingno

# NOTE(review): the original note here reads "clear clinical prior" — presumably
# the clinical columns should be removed before plotting; confirm with the author.
# Draw the missing-value matrix of the overview table.
fig = missingno.matrix(df=combined_df, fontsize=20)
# Leave the result as the cell's last expression so the notebook renders it.
fig

In [None]:
# Fetch the figure behind the missingno plot and save it as a tightly cropped PNG.
fig_copy = fig.get_figure()
fig_copy.savefig(
    'data/Missingno_plot_overview.png',
    bbox_inches='tight',
)