In [1]:
import os
import sys
import glob
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.cm import ScalarMappable
import matplotlib.patches as mpatches
from matplotlib.colors import LogNorm
sys.path.append("../../Utils")
from loaders import HNSCCFeatureHandler

# Input locations, given relative to the directory this notebook runs from.
METADATA_PATH = "../../Supplementary_Tables/ST1/RAW_HNSCC_METADATA_NEW_IMPUTED.csv"
VALID_IDS_PATH = "../../Utils/Lists/cv_ids.txt"
HOLD_IDS_PATH = '../../Utils/Lists/holdout_ids.txt'


def _load_pickle(path):
    """Return the object deserialized from the pickle file at ``path``.

    SECURITY: ``pickle.load`` can execute arbitrary code embedded in the file.
    Only point this at artifacts produced by this project's own pipeline.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Build the feature handler and run its preparation steps. The calls are kept
# in their original order because each one operates on the handler's state.
# NOTE(review): the meaning of the trailing (0, 4) arguments is defined in
# loaders.HNSCCFeatureHandler and is not visible here -- confirm before changing.
hc_data = HNSCCFeatureHandler(METADATA_PATH, VALID_IDS_PATH, HOLD_IDS_PATH)
_ = hc_data.load_feature_to_dataframe("../../Data/*.hg38.frag.interval_mds.tsv", 0, 4)
# The z-score step is still executed; its return value was previously bound to
# `z` and immediately overwritten below, so it is discarded explicitly here.
_ = hc_data.normalize_zscore()
_ = hc_data.merge_feature_metadata()
z = hc_data.batch_correct()

# Precomputed artifacts. `predicted` and `tumor` are consumed as mappings by a
# later cell (via .items()); `truth` is presumably a mapping too -- TODO confirm.
predicted = _load_pickle('predicted_treatment_response_dict.pkl')
truth = _load_pickle('treatment_response_dict.pkl')
tumor = _load_pickle('tumor_fraction_from_median.pkl')

In [2]:
# Build a lookup from 'Patient Number' to the first sample index carrying it,
# using the cross-validation and hold-out frames stacked together.
all_samples_for_lookup = pd.concat([hc_data.data, hc_data.hold_data], axis=0)
s = hc_data.get_metadata_col('Patient Number', df=all_samples_for_lookup)

# reset_index() moves the sample index into a column; the code below relies on
# that column being called 'index'. Only the first row per patient is kept.
switched_df = s.reset_index()
switched_df = switched_df.drop_duplicates(subset='Patient Number', keep='first')

# Series: index = 'Patient Number', values = sample index.
switched = switched_df.set_index('Patient Number')['index']

In [3]:
def _rekey_by_sample(mapping):
    """Return a copy of ``mapping`` whose keys are translated through ``switched``.

    Keys with no entry in ``switched`` are kept unchanged.
    """
    rekeyed = {}
    for key, value in mapping.items():
        rekeyed[switched.get(key, key)] = value
    return rekeyed


predicted = _rekey_by_sample(predicted)
tumor = _rekey_by_sample(tumor)

In [4]:
# Assemble one table of clinical metadata plus model outputs.
#
# The cross-validation and hold-out frames are stacked once and reused for
# every metadata lookup instead of being re-concatenated per column.
# NOTE(review): this assumes get_metadata_col does not modify the frame it is
# given -- confirm in loaders.HNSCCFeatureHandler.
all_samples = pd.concat([hc_data.data, hc_data.hold_data], axis=0)

surv_time = hc_data.get_metadata_col('Survival Months', df=all_samples)
surv_status = hc_data.get_metadata_col('E_Survival', df=all_samples)
treatment_response = hc_data.get_metadata_col('Stratification', df=all_samples)
diagnosis = hc_data.get_metadata_col('Diagnosis', df=all_samples)
age = hc_data.get_metadata_col('Age', df=all_samples)
gender = hc_data.get_metadata_col('Gender', df=all_samples)
smoking = hc_data.get_metadata_col('Smoking', df=all_samples)
alcohol = hc_data.get_metadata_col('Alcohol', df=all_samples)
race = hc_data.get_metadata_col('Race', df=all_samples)
ethnicity = hc_data.get_metadata_col('Ethnicity', df=all_samples)
hpv = hc_data.get_metadata_col('HPV', df=all_samples)
truth_response = hc_data.get_metadata_col('Treatment Response', df=all_samples)
path_response = hc_data.get_metadata_col('Pathological Response', df=all_samples)

relapse_time = hc_data.get_metadata_col('Relapse Months', df=all_samples)
relapse_status = hc_data.get_metadata_col('E_Relapse', df=all_samples)
pdl1_ihc = hc_data.get_metadata_col('PDL1 IHC', df=all_samples)
# Recode the numeric PD-L1 IHC codes into display labels.
pdl1_ihc = pdl1_ihc.replace({0: "0", 1: "1-19", 2: ">20"})

# Restrict the outcome-related series to samples that have a prediction.
predicted_ids = list(predicted)
surv_time = surv_time[surv_time.index.isin(predicted_ids)]
surv_status = surv_status[surv_status.index.isin(predicted_ids)].astype(bool)
treatment_response = treatment_response[treatment_response.index.isin(predicted_ids)]
relapse_time = relapse_time[relapse_time.index.isin(predicted_ids)]
relapse_status = relapse_status[relapse_status.index.isin(predicted_ids)].astype(bool)
pdl1_ihc = pdl1_ihc[pdl1_ihc.index.isin(predicted_ids)]

# Create DataFrame. Each output column name is paired with its series so the
# two can never drift out of positional sync.
column_sources = [
    ('Survival Months', surv_time),
    ('E_Survival', surv_status),
    ('Stratification', treatment_response),
    ("Diagnosis", diagnosis),
    ("Age", age),
    ("Gender", gender),
    ("Smoking", smoking),
    ("Alcohol", alcohol),
    ("HPV", hpv),
    ("Race", race),
    ("Ethnicity", ethnicity),
    ('Relapse Months', relapse_time),
    ('E_Relapse', relapse_status),
    ('PDL1 IHC', pdl1_ihc),
    ("Actual Treatment Response", truth_response),
    ("Pathological Response", path_response),
]
merged_df = pd.concat(
    [series for _name, series in column_sources],
    axis=1,
    keys=[name for name, _series in column_sources],
)

# The column-wise concat is an outer join, so the filtered series pick up NaN
# for samples without a prediction; the event flags are cast back to bool.
# NOTE(review): astype(bool) turns NaN into True. Rows introduced by the join
# are removed by the dropna below (their 'Survival Months' is NaN), but a
# genuinely missing event flag in the metadata would silently become True.
merged_df['E_Survival'] = merged_df['E_Survival'].astype(bool)
merged_df['E_Relapse'] = merged_df['E_Relapse'].astype(bool)
merged_df['Predicted Treatment Response'] = merged_df.index.map(predicted)
merged_df['Tumor Fraction'] = merged_df.index.map(tumor)

# Keep only complete rows; rebinding (rather than inplace=True) leaves any
# previously displayed frame untouched and makes the step explicit.
merged_df = merged_df.dropna()

In [5]:
# Composite label "<predicted response>_<stratification>" for each row.
predicted_response_text = merged_df["Predicted Treatment Response"].astype(str)
stratification_text = merged_df["Stratification"].astype(str)
merged_df["Predicted Treatment Response with Stratification"] = (
    predicted_response_text + "_" + stratification_text
)

In [6]:
# Composite label "<PD-L1 IHC bin>_<stratification>" for each row.
ihc_text = merged_df["PDL1 IHC"].astype(str)
ihc_stratification_text = merged_df["Stratification"].astype(str)
merged_df["IHC with Stratification"] = ihc_text + "_" + ihc_stratification_text

In [7]:
# Composite label "<tumor fraction group>_<stratification>" for each row.
tumor_fraction_text = merged_df["Tumor Fraction"].astype(str)
tumor_stratification_text = merged_df["Stratification"].astype(str)
merged_df["Tumor Fraction with Stratification"] = (
    tumor_fraction_text + "_" + tumor_stratification_text
)

In [8]:
# Show the assembled table (clinical columns, predictions and the composite
# stratification labels) for a visual check before it is exported.
merged_df

Unnamed: 0,Survival Months,E_Survival,Stratification,Diagnosis,Age,Gender,Smoking,Alcohol,HPV,Race,...,Relapse Months,E_Relapse,PDL1 IHC,Actual Treatment Response,Pathological Response,Predicted Treatment Response,Tumor Fraction,Predicted Treatment Response with Stratification,IHC with Stratification,Tumor Fraction with Stratification
Pilot2_3,18.070000,True,High,Oral Cavity Tumor,44,Female,Yes,No,Unknown,White,...,10.450000,True,1-19,Non-Responder,NPR,Non-Responder,Low Tumor Fraction,Non-Responder_High,1-19_High,Low Tumor Fraction_High
Pilot2_6,2.600000,True,High,Oral Cavity Tumor,51,Male,No,No,Unknown,White,...,1.480000,True,1-19,Non-Responder,NPR,Non-Responder,Low Tumor Fraction,Non-Responder_High,1-19_High,Low Tumor Fraction_High
Pilot2_9,48.090000,False,High,Oral Cavity Tumor,71,Female,Yes,No,Unknown,White,...,48.090000,False,1-19,Non-Responder,NPR,Non-Responder,High Tumor Fraction,Non-Responder_High,1-19_High,High Tumor Fraction_High
Pilot2_12,53.622277,False,Intermediate,Larynx Tumor,75,Male,Yes,No,Unknown,White,...,52.010556,False,1-19,Non-Responder,NPR,Non-Responder,Low Tumor Fraction,Non-Responder_Intermediate,1-19_Intermediate,Low Tumor Fraction_Intermediate
Pilot2_14,22.040000,True,High,Oral Cavity Tumor,70,Female,No,No,Unknown,White,...,8.710000,True,1-19,Non-Responder,NPR,Non-Responder,Low Tumor Fraction,Non-Responder_High,1-19_High,Low Tumor Fraction_High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pilot_215,69.970000,False,Intermediate,Oral Cavity Tumor,53,Male,No,Yes,No,White,...,69.970000,False,>20,Responder,Major,Responder,Low Tumor Fraction,Responder_Intermediate,>20_Intermediate,Low Tumor Fraction_Intermediate
Pilot_225,57.130000,False,High,Oral Cavity Tumor,61,Male,Yes,Yes,Unknown,White,...,57.130000,False,0,Responder,Partial,Responder,High Tumor Fraction,Responder_High,0_High,High Tumor Fraction_High
Pilot_231,41.430000,False,Intermediate,Oral Cavity Tumor,56,Male,No,No,Unknown,White,...,41.430000,False,1-19,Responder,Major,Non-Responder,High Tumor Fraction,Non-Responder_Intermediate,1-19_Intermediate,High Tumor Fraction_Intermediate
Pilot_252,64.490000,False,High,Oral Cavity Tumor,72,Female,Yes,Yes,Unknown,Unknown,...,64.490000,False,1-19,Non-Responder,NPR,Non-Responder,High Tumor Fraction,Non-Responder_High,1-19_High,High Tumor Fraction_High


In [9]:
# Write the assembled table to disk, next to this notebook.
OUTPUT_CSV = 'survival_data_imputed.csv'
merged_df.to_csv(OUTPUT_CSV)