This notebook is used to load all the data present in the **Source_Data** folder for the different types of sources.

**Goal**: to get an overview of all the types of data available and to obtain the final merged dataset, which contains all the information about the 393 patients and 353 features. A list of all the features is saved in the file **Merged_Features.txt** and the whole dataset in **su2c_merge_master.csv**.  

### Import Utils and Setup

In [78]:
## Communication drive-colab
from google.colab import drive
import warnings
import os

## Data Structure and Data Analysis
import pandas as pd
import numpy as np

In [79]:
# Mount Google Drive inside the Colab runtime (remount is forced on every run)
drive_mount_point = "/content/drive"
drive.mount(drive_mount_point, force_remount = True)

Mounted at /content/drive


In [80]:
## Silence every warning category (toggle as needed)
warnings.filterwarnings("ignore")

## Show all columns when a DataFrame is displayed
pd.options.display.max_columns = None

## Root folder holding all the source data (must end with "/": sub-paths are built by concatenation)
source_path_data = "/content/drive/MyDrive/Tesi/Code/Source Data/"

### Read Data

In [81]:
## One sub-folder of the source data root per data type
source_path_data_clinical = f"{source_path_data}Clinical/"
source_path_data_exome    = f"{source_path_data}Exome/"
source_path_data_rna      = f"{source_path_data}RNA/"
source_path_data_ref      = f"{source_path_data}Reference/"
source_path_data_int      = f"{source_path_data}Integrative/"
source_path_data_out      = f"{source_path_data}Output/"
source_path_data_tcga     = f"{source_path_data}TCGA/"

#### Read Reference Data

In [82]:
## Load Data
def _read_gene_list(file_name):
    """Read a headerless tab-separated reference file and return its first column as a list."""
    reference_table = pd.read_csv(source_path_data_ref + file_name, sep = "\t", header = None)
    return reference_table.iloc[:, 0].to_list()

panlung_drivers = _read_gene_list("panlung_drivers.txt")
panlung_amps    = _read_gene_list("panlung_amps.txt")
panlung_dels    = _read_gene_list("panlung_dels.txt")

In [83]:
## Check: print each reference gene list followed by its length
for list_label, gene_list in (
    ("Panlung Drivers: ", panlung_drivers),
    ("Panlung Amps: ", panlung_amps),
    ("Panlung Dels: ", panlung_dels),
):
    print(list_label, gene_list)
    print("N: ", len(gene_list))

Panlung Drivers:  ['TP53', 'KRAS', 'KEAP1', 'EGFR', 'CDKN2A', 'NFE2L2', 'SMARCA4', 'PTEN', 'RBM10', 'ARID1A', 'RB1', 'STK11', 'KMT2D', 'NF1', 'BRAF', 'FAT1', 'CMTR2', 'SETD2', 'COL5A2', 'MGA', 'CUL3', 'SMAD4', 'U2AF1', 'RASA1', 'NOTCH1', 'KMT2C', 'ERBB2', 'FBXW7', 'ARHGAP35', 'RIT1', 'ZFP36L1', 'ATM', 'HRAS', 'MET', 'FANCM', 'B2M', 'DSN1', 'ITGBL1', 'KLF5', 'NRAS', 'PIK3CA', 'PRAG1', 'ATF7IP', 'HLA-A', 'CTNNB1', 'APC', 'NCOA6', 'ELL2', 'CREBBP', 'EP300', 'PLXNB2', 'LATS1', 'KLHL5']
N:  53
Panlung Amps:  ['NKX2-1', 'MYC', 'TERT', 'MCL1', 'KRAS', 'CDK4', 'CCND1', 'MDM2', 'MECOM', 'TERC', 'EGFR', 'CCNE1', 'ERBB2', 'CCND3', 'ZNF217', 'TUBD1', 'KAT6A', 'MAPK1', 'MET', 'NSD3', 'FGFR1', 'SOX2', 'PDGFRA', 'KIT', 'KDR', 'MDM2', 'REL', 'BCL11A', 'BCL2L1', 'NFE2L2', 'CDK6', 'IGF1R', 'PTP4A1', 'PHF3', 'MYCL', 'KDM5A']
N:  36
Panlung Dels:  ['CDKN2A', 'PTEN', 'FAT1', 'B2M', 'FOXP1', 'RB1', 'ZMYND11', 'CREBBP', 'ROBO1', 'KMT2C', 'USP22', 'NF1', 'SMAD4', 'KDM6A', 'TRAF3']
N:  15


#### Read Clinical Data

This dataset contains Clinical Annotations for the SU2C-MARK Cohort

In [84]:
## Load the clinical annotation sheet (its first two rows are skipped)
annot_file = "Table_S1_Clinical_Annotations.xlsx"
clinical_data = pd.read_excel(f"{source_path_data_clinical}{annot_file}", skiprows = 2)

## Extend with the supplementary clinical annotations
clinical_data_extra = pd.read_csv(
    f"{source_path_data_clinical}SU2C-MARK_Harmonized_Clinical_Annotations_Supplement_v1.txt",
    sep = "\t",
)

In [85]:
## Check: dimensions and first rows of the clinical annotations
print(f"Shape is:  {clinical_data.shape}")
display(clinical_data.head())

Shape is:  (393, 38)


Unnamed: 0,WES_Cohort_1,WES_Cohort_2,WES_All,RNA_Cohort_1,RNA_Cohort_2,RNA_All,Institution,Harmonized_SU2C_Participant_ID_v2,Harmonized_SU2C_WES_Tumor_Sample_ID_v2,Harmonized_SU2C_WES_Normal_Sample_ID_v2,Harmonized_SU2C_RNA_Tumor_Sample_ID_v2,Pre-treatment_RNA_Sample_QC,Patient_Age_at_Diagnosis,Patient_Sex,Patient_Race,Patient_Smoking_Status,Patient_Smoking_Pack_Years_Harmonized,Histology_Harmonized,Histology_Detail,Initial_Stage,Initial_Stage_Substage,PDL1_TPS,PDL1_TPS_Description,Local_Antibody_Clone,Clinical_Driver,Sequencing_Platform,Advanced_Diagnosis_Date,Line_of_Therapy,Agent_PD1,Agent_PD1_Category,Prior_Platinum,Prior_TKI,Harmonized_PFS_Event,Harmonized_PFS_Days,Harmonized_Confirmed_BOR,Harmonized_BOR_RECIST,Harmonized_OS_Event,Harmonized_OS_Days
0,1.0,,1.0,,,,Cleveland Clinic,SU2CLC-CLE-NIVO1,SU2CLC-CLE-NIVO1-T1,SU2CLC-CLE-NIVO1-N1,,,61,F,0.0,1.0,40.0,Squamous,,3.0,A,,,,,,-39.0,2.0,Nivolumab,PD(L)1,1.0,0.0,1.0,53.0,PD,,1.0,434.0
1,,,,1.0,,1.0,Cleveland Clinic,SU2CLC-CLE-NIVO10,,,SU2CLC-CLE-NIVO10-T1,Flag,55,F,2.0,1.0,30.0,Adeno,,4.0,,,,,,,-321.0,3.0,Nivolumab,PD(L)1,1.0,0.0,1.0,63.0,PD,,1.0,86.0
2,1.0,,1.0,1.0,,1.0,Cleveland Clinic,SU2CLC-CLE-NIVO18,SU2CLC-CLE-NIVO18-T1,SU2CLC-CLE-NIVO18-N1,SU2CLC-CLE-NIVO18-T1,Keep,68,F,0.0,0.0,0.0,Adeno,,2.0,A,,,,EGFR,,-533.0,4.0,Nivolumab,PD(L)1,1.0,1.0,1.0,50.0,PD,,1.0,161.0
3,1.0,,1.0,1.0,,1.0,Cleveland Clinic,SU2CLC-CLE-NIVO19,SU2CLC-CLE-NIVO19-T1,SU2CLC-CLE-NIVO19-N1,SU2CLC-CLE-NIVO19-T1,Keep,57,F,2.0,2.0,15.0,Adeno,,4.0,,,,,,,-35.0,1.0,Nivolumab,PD(L)1,0.0,0.0,1.0,297.0,PR,,1.0,297.0
4,,,,1.0,,1.0,Cleveland Clinic,SU2CLC-CLE-NIVO2,,,SU2CLC-CLE-NIVO2-T1,Keep,63,F,0.0,0.0,0.0,Adeno,,4.0,,,,,,,-262.0,4.0,Nivolumab,PD(L)1,1.0,1.0,1.0,68.0,PD,,1.0,123.0


In [86]:
## Check: dimensions and first rows of the supplementary clinical annotations
print(f"Shape is:  {clinical_data_extra.shape}")
display(clinical_data_extra.head())

Shape is:  (393, 7)


Unnamed: 0,Harmonized_SU2C_Participant_ID_v2,Harmonized_Confirmed_BOR_Bin,Harmonized_Confirmed_BOR_3_Cat,Harmonized_PFS_Months,Harmonized_OS_Months,Advanced_Diagnosis_Date_Positive,Patient_Smoking_Pack_Years_Harmonized_Log
0,SU2CLC-CLE-NIVO1,0.0,PD,1.74113,14.257556,39.0,3.713572
1,SU2CLC-CLE-NIVO10,0.0,PD,2.069645,2.82523,321.0,3.433987
2,SU2CLC-CLE-NIVO18,0.0,PD,1.642576,5.289093,533.0,0.0
3,SU2CLC-CLE-NIVO19,1.0,CR/PR,9.756899,9.756899,35.0,2.772589
4,SU2CLC-CLE-NIVO2,0.0,PD,2.233903,4.040736,262.0,0.0


#### Read Exome Data

In [87]:
## Load Data
su2c_val_harm_dam_pv_bin = pd.read_csv(
    source_path_data_exome + "SU2C-MARK_Harmonized_Validated_Damaging_Pivot_Bin_v1.txt", sep = "\t")

## Keep only the columns named in `panlung_drivers`, then append the sample barcode
## as the last column. `.assign` returns a new DataFrame, so the barcode column is
## never written onto a slice of `su2c_val_harm_dam_pv_bin` (the original in-place
## assignment triggered pandas' SettingWithCopyWarning).
is_driver_column = su2c_val_harm_dam_pv_bin.columns.isin(panlung_drivers)
su2c_val_harm_dam_pv_drivers = su2c_val_harm_dam_pv_bin.loc[:, is_driver_column].assign(
    Tumor_Sample_Barcode = su2c_val_harm_dam_pv_bin["Tumor_Sample_Barcode"]
)

In [88]:
## Check
## Driver genes only
print(f"Shape is:  {su2c_val_harm_dam_pv_drivers.shape}")
display(su2c_val_harm_dam_pv_drivers.head())

Shape is:  (309, 53)


Unnamed: 0,APC,ARHGAP35,ARID1A,ATF7IP,ATM,B2M,BRAF,CDKN2A,CMTR2,COL5A2,CREBBP,CTNNB1,CUL3,DSN1,EGFR,ELL2,EP300,ERBB2,FANCM,FAT1,FBXW7,HLA-A,HRAS,ITGBL1,KEAP1,KLF5,KLHL5,KMT2C,KMT2D,KRAS,LATS1,MET,MGA,NCOA6,NF1,NFE2L2,NOTCH1,NRAS,PIK3CA,PLXNB2,PTEN,RASA1,RB1,RBM10,RIT1,SETD2,SMAD4,SMARCA4,STK11,TP53,U2AF1,ZFP36L1,Tumor_Sample_Barcode
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,SU2CLC-CLE-NIVO1-T1
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,SU2CLC-CLE-NIVO18-T1
2,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,SU2CLC-CLE-NIVO19-T1
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,SU2CLC-CLE-NIVO3-T1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,SU2CLC-CLE-NIVO4-T1


In [89]:
## Check
## All genes in the pivot table
print(f"Shape is:  {su2c_val_harm_dam_pv_bin.shape}")
display(su2c_val_harm_dam_pv_bin.head())

Output hidden; open in https://colab.research.google.com to view.

In [90]:
## Load Data
su2c_tmb_sig_tmb_harm = pd.read_csv(
    source_path_data_exome + "SU2C-MARK_Harmonized_Exome_TMB_Signatures_v1.txt", sep = "\t")

## log10(x + 1) transform of every column, with the tumor sample ID as index.
## The transform is applied to the whole frame at once instead of through
## `DataFrame.applymap`, which is deprecated in recent pandas releases and calls
## a Python lambda once per cell. `add_prefix` returns a new frame with "log_"
## prepended to every column name, so no in-place rename is needed.
su2c_tmb_sig_tmb_harm_log = np.log10(
    su2c_tmb_sig_tmb_harm.set_index("Harmonized_SU2C_WES_Tumor_Sample_ID_v2") + 1
).add_prefix("log_")

In [91]:
## Check: dimensions and first rows of the log-transformed table
print(f"Shape is:  {su2c_tmb_sig_tmb_harm_log.shape}")
display(su2c_tmb_sig_tmb_harm_log.head())

Shape is:  (309, 11)


Unnamed: 0_level_0,log_TMB,log_TMB_clonal,log_TMB_subclonal,log_TMB_indel,log_Neoantigens,log_Neoantigens_clonal,log_Neoantigens_subclonal,log_Subclone_count,log_Aging_Signature,log_Smoking_Signature,log_APOBEC_Signature
Harmonized_SU2C_WES_Tumor_Sample_ID_v2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
SU2CLC-CLE-NIVO1-T1,3.003029,2.338456,2.853698,0.60206,2.190332,1.414973,2.068186,0.69897,2.017033,2.20412,2.868644
SU2CLC-CLE-NIVO18-T1,2.176091,2.127105,0.0,0.778151,1.591065,1.556303,0.0,0.0,1.70757,1.591065,1.278754
SU2CLC-CLE-NIVO19-T1,2.780317,2.729165,1.20412,1.041393,2.100371,2.045323,0.954243,0.30103,1.672098,2.682145,1.69897
SU2CLC-CLE-NIVO3-T1,1.929419,1.653213,1.568202,0.477121,0.845098,0.30103,0.778151,0.60206,1.544068,1.414973,0.954243
SU2CLC-CLE-NIVO4-T1,2.238046,2.029384,1.819544,0.0,1.39794,1.176091,1.0,0.477121,1.919078,1.826075,1.041393


In [92]:
## Load Data
su2c_cnv_gene = pd.read_csv(source_path_data_exome + "SU2C-MARK_Harmonized_Gistic_Gene_v1.txt", sep = "\t")

## Positions of the columns to exclude (the two columns that follow the first one)
columns_to_exclude = [1, 2]

## All genes: one row per remaining column, one column per value of the first column.
## The first column is moved to the index BEFORE transposing, so it becomes the
## column labels directly. Transposing with that column still among the data (and
## then promoting the first row to header) would turn every value into `object`
## dtype, because the labels are strings.
su2c_cnv_gene_filtered = (
    su2c_cnv_gene
    .drop(su2c_cnv_gene.columns[columns_to_exclude], axis = 1)
    .set_index(su2c_cnv_gene.columns[0])
    .T
)

## Consider only drivers genes: rows whose "Gene Symbol" is in the amp/del reference lists
su2c_cnv_gene_sig = su2c_cnv_gene[su2c_cnv_gene["Gene Symbol"].isin(panlung_amps + panlung_dels)]
## Columns from position 3 onwards are transposed and labelled by gene symbol
su2c_cnv_gene_sig_t = su2c_cnv_gene_sig.iloc[:,3:].T
su2c_cnv_gene_sig_t.columns = su2c_cnv_gene_sig["Gene Symbol"]

In [93]:
## Check
## Driver genes only
print(f"Shape is:  {su2c_cnv_gene_sig_t.shape}")
display(su2c_cnv_gene_sig_t.head())

Shape is:  (309, 47)


Gene Symbol,MYCL,MCL1,BCL11A,REL,NFE2L2,FOXP1,ROBO1,MECOM,TERC,SOX2,PDGFRA,KIT,KDR,FAT1,TERT,CCND3,PTP4A1,PHF3,EGFR,CDK6,MET,KMT2C,FGFR1,KAT6A,MYC,CDKN2A,ZMYND11,PTEN,CCND1,KDM5A,KRAS,CDK4,MDM2,RB1,TRAF3,B2M,IGF1R,CREBBP,USP22,NF1,ERBB2,TUBD1,SMAD4,CCNE1,BCL2L1,ZNF217,MAPK1
SU2CLC-UCD-1124-T1,-0.031,0.658,0.022,0.022,0.022,-0.027,-0.027,-0.027,-0.027,-0.027,0.019,0.019,0.019,0.019,0.747,-0.279,-0.279,-0.279,-0.009,-0.009,-0.009,-0.009,0.01,0.01,0.01,0.093,0.0,0.0,0.012,-0.035,-0.035,-0.035,-0.035,-0.076,-0.37,-0.336,0.442,0.229,-0.338,0.314,0.314,0.314,0.145,-0.087,0.015,0.015,-0.393
SU2CLC-UCD-1145-T1,0.244,1.201,-0.137,-0.137,0.026,-0.434,-0.434,0.081,0.081,-0.457,-0.284,-0.284,-0.284,-0.415,1.869,-0.104,-0.104,-0.104,0.031,0.5,0.5,0.5,0.514,0.514,0.514,-0.051,-0.268,0.094,0.243,-0.056,3.657,-0.051,-0.051,-0.224,0.146,-0.128,0.291,0.0,-0.303,0.106,0.106,-0.264,0.78,0.603,0.538,0.71,-0.286
SU2CLC-UCD-1142-T1,0.0,0.0,0.062,0.062,-0.17,-0.234,-0.234,0.438,0.438,0.438,0.269,0.269,0.269,-0.218,-0.166,-0.073,-0.243,-0.243,0.334,-0.07,0.245,0.462,-0.233,0.282,0.282,0.459,0.146,0.448,0.27,-0.292,0.202,0.202,0.202,-0.254,0.02,-0.114,-0.114,0.025,-0.196,0.263,0.263,0.263,-0.247,-0.208,0.322,0.322,-0.201
SU2CLC-COL-1001-T1,0.0,0.0,-0.013,-0.013,-0.013,0.135,0.135,0.129,0.129,0.129,0.014,0.014,0.014,0.014,-0.024,0.027,0.027,0.027,-0.002,-0.002,-0.002,-0.002,-0.004,-0.004,-0.004,-0.095,-0.004,-0.004,0.003,0.0,0.0,0.0,0.0,-0.104,-0.05,0.072,0.072,0.016,0.02,0.02,0.02,0.02,0.01,-0.104,0.014,0.014,-0.117
SU2CLC-MSK-A2001-T1,0.071,0.071,0.0,0.0,0.0,-0.002,-0.002,-0.002,-0.002,-0.002,-0.068,-0.068,-0.068,-0.068,0.092,-0.099,-0.099,-0.099,0.053,0.053,0.053,0.053,-0.039,-0.039,-0.039,-0.15,-0.208,0.048,0.003,0.0,0.0,0.0,0.0,-0.068,-0.18,-0.092,0.024,0.065,0.084,0.084,0.084,0.084,-0.164,-0.052,0.057,0.057,0.124


In [94]:
## Check
## All genes
print(f"Shape is:  {su2c_cnv_gene_filtered.shape}")
display(su2c_cnv_gene_filtered.head())

Output hidden; open in https://colab.research.google.com to view.

In [95]:
## Load Data
su2c_cnv_peak = pd.read_csv(source_path_data_exome + "SU2C-MARK_Harmonized_Gistic_Focal_v1.txt", sep = "\t")

## Keep only the rows flagged "Actual Copy Change Given". An explicit copy is taken
## so that the "Descriptor" column can be rewritten below without assigning onto a
## slice of `su2c_cnv_peak` (which triggers pandas' SettingWithCopyWarning).
is_actual_change = su2c_cnv_peak["Amplitude Threshold"] == "Actual Copy Change Given"
su2c_cnv_peak_actual = su2c_cnv_peak[is_actual_change].copy()

## Prefix each descriptor with "Amp_" when "Unique Name" contains the literal text
## "Amp", and with "Del_" otherwise (vectorized replacement of the row-wise apply).
is_amp_peak = su2c_cnv_peak_actual["Unique Name"].str.contains("Amp", regex = False)
su2c_cnv_peak_actual["Descriptor"] = (
    is_amp_peak.map({True: "Amp_", False: "Del_"}) + su2c_cnv_peak_actual["Descriptor"]
)

## Index by the prefixed descriptor, drop the first 8 remaining columns and transpose
su2c_cnv_peak_actual_t = su2c_cnv_peak_actual.set_index("Descriptor").iloc[:,8:].T

In [96]:
## Check: dimensions and first rows of the transposed focal-peak table
print(f"Shape is:  {su2c_cnv_peak_actual_t.shape}")
display(su2c_cnv_peak_actual_t.head())

Shape is:  (309, 68)


Descriptor,Amp_1q21.3,Amp_2p16.1,Amp_3q26.2,Amp_3q27.1,Amp_4q12,Amp_5p15.33,Amp_6p21.1,Amp_7p15.3,Amp_7p11.2,Amp_7q21.12,Amp_8p11.23,Amp_8q24.21,Amp_9p13.3,Amp_11p13,Amp_11q13.3,Amp_12p12.1,Amp_12q14.1,Amp_12q15,Amp_13q34,Amp_14q13.3,Amp_17q24.2,Amp_19q12,Amp_20q11.21,Amp_20q13.33,Del_1p36.12,Del_1p13.2,Del_2p24.2,Del_2q37.3,Del_3p12.3,Del_4p16.3,Del_4q13.2,Del_4q35.1,Del_5q13.3,Del_5q23.1,Del_6p22.1,Del_6q22.31,Del_7p14.3,Del_8p23.3,Del_8p21.3,Del_9p21.3,Del_9q21.13,Del_9q22.1,Del_9q34.3,Del_10p15.3,Del_10q23.31,Del_10q26.3,Del_11p15.5,Del_11q25,Del_12p13.2,Del_12p11.21,Del_12q21.31,Del_13q14.2,Del_14q23.1,Del_14q24.3,Del_15q11.2,Del_16p13.3,Del_16q24.3,Del_17p13.3,Del_17p12,Del_18q22.1,Del_19p13.3,Del_19p12,Del_19q13.33,Del_20p13,Del_20q11.23,Del_21q11.2,Del_22q11.23,Del_22q13.1
SU2CLC-UCD-1124-T1,0.65788,0.021627,-0.027304,-0.027304,0.019269,0.74698,-0.27929,-0.009234,-0.009234,-0.009234,0.010249,0.010249,0.09315,0.011881,0.011881,-0.034544,-0.034544,-0.034544,-0.076044,1.7206,0.31405,-0.087366,0.015002,0.015002,-0.031433,-0.031433,0.021627,0.021627,-0.027304,0.019269,0.019269,0.019269,0.002244,0.002244,-0.27929,-0.27929,-0.009234,0.010249,0.010249,0.09315,0.09315,0.09315,0.09315,0.0,0.0,0.0,0.011881,0.011881,-0.034544,-0.034544,-0.034544,-0.076044,-0.41264,-0.41264,-0.22968,0.64458,-0.031352,-0.3377,-0.3377,0.14511,-0.087366,-0.087366,-0.087366,0.015002,0.015002,0.87333,-0.39298,-0.39298
SU2CLC-UCD-1145-T1,1.2013,-0.13687,0.080919,-0.45736,-0.28435,1.8686,-0.10409,0.54067,0.03095,0.49973,0.51409,0.51409,-0.050765,0.56191,0.24306,3.6569,-0.051119,-0.051119,0.12512,0.14644,-0.2645,0.60284,0.53842,0.70976,0.24376,0.098972,0.20607,0.026347,-0.43351,-0.28435,-0.28435,-0.41508,-0.3607,-0.3607,-0.10409,-0.10409,0.03095,-0.056476,0.0,-0.050765,-0.050765,-0.050765,-0.050765,-0.26828,0.094115,0.094115,0.59842,-0.21306,-0.14729,0.000191,-0.051119,-0.2245,0.14644,0.14644,0.43314,0.000395,-0.24188,-0.2825,-0.30291,0.78014,-0.16466,-0.2639,0.55253,-0.095104,0.21792,-0.2841,-0.28634,-0.28634
SU2CLC-UCD-1142-T1,0.0,0.062017,0.43839,0.43839,0.26872,-0.16568,-0.07289,0.33397,0.33397,-0.069965,-0.2333,0.28174,0.45899,-0.22517,0.26996,0.20237,0.20237,0.20237,-0.25417,0.019539,0.26302,-0.2079,0.32186,0.32186,-0.18803,0.0,0.062017,-0.17022,-0.23396,0.26872,0.26872,-0.21775,0.006308,0.006308,-0.026327,-0.24324,0.33397,-0.2333,-0.2333,0.45899,-0.22038,-0.22038,-0.17356,0.14628,0.44788,-0.20882,-0.22517,0.26996,-0.29222,0.20237,0.20237,-0.25417,0.019539,0.019539,-0.11443,0.025313,-0.11347,-0.19574,-0.19574,-0.24714,-0.2079,-0.2079,-0.2079,0.32186,0.32186,0.25456,-0.20061,-0.20061
SU2CLC-COL-1001-T1,0.000262,-0.013471,0.12883,0.12883,0.014176,-0.023801,0.027282,-0.001815,-0.001815,-0.001815,-0.004424,-0.004424,-0.094818,0.00317,0.00317,0.0,0.0,0.0,0.094834,-0.049747,0.019636,-0.10439,0.013893,0.013893,0.000262,0.000262,-0.013471,-0.013471,0.13463,0.014176,0.014176,0.014176,-0.023801,-0.023801,0.027282,-0.13096,-0.001815,-0.004424,-0.004424,-0.094818,-0.094818,-0.094818,-0.094818,-0.004232,-0.004232,0.031888,0.00317,0.00317,0.0,0.0,0.0,-0.10438,-0.049747,-0.049747,0.072121,0.016196,0.016196,0.019636,0.019636,0.010373,-0.10439,-0.10439,-0.10439,0.013893,0.013893,-0.056128,-0.11716,-0.11716
SU2CLC-MSK-A2001-T1,0.071134,0.0,-0.001657,-0.001657,-0.068309,0.091945,-0.099142,0.053321,0.053321,0.053321,-0.038626,-0.038626,-0.15018,-0.026788,0.003306,1.9e-05,1.9e-05,1.9e-05,-0.068194,0.67899,0.083857,-0.052048,0.056875,0.056875,-0.046049,0.071134,0.0,0.0,-0.001657,-0.068309,-0.068309,-0.068309,0.091945,0.091945,-0.099142,0.016569,0.053321,-0.038626,-0.038626,-0.15018,0.15507,0.15507,0.15507,-1.2929,0.048449,0.048449,-0.068266,-0.039113,1.9e-05,1.9e-05,1.9e-05,-0.068194,-0.089504,-0.089504,-0.091943,0.064602,0.01292,-0.12141,-0.12141,-0.16377,-0.052048,-0.052048,-0.052048,0.056875,0.056875,-0.13897,-0.074931,-0.074931


In [97]:
## Load data
su2c_cnv_arm = pd.read_csv(
    f"{source_path_data_exome}SU2C-MARK_Harmonized_Gistic_Arm_v1.txt",
    sep = "\t",
)
## Transpose so that each "Chromosome Arm" value becomes a column
su2c_cnv_arm_t = su2c_cnv_arm.set_index("Chromosome Arm").transpose()

In [98]:
## Check: dimensions and first rows of the transposed arm-level table
print(f"Shape is:  {su2c_cnv_arm_t.shape}")
display(su2c_cnv_arm_t.head())

Shape is:  (309, 39)


Chromosome Arm,1p,1q,2p,2q,3p,3q,4p,4q,5p,5q,6p,6q,7p,7q,8p,8q,9p,9q,10p,10q,11p,11q,12p,12q,13q,14q,15q,16p,16q,17p,17q,18p,18q,19p,19q,20p,20q,21q,22q
SU2CLC-UCD-1124-T1,0.0,0.658,0.0,0.0,0.0,0.0,0.0,0.0,0.747,0.0,-0.279,-0.279,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.37,-0.336,0.229,0.0,-0.338,0.314,0.145,0.145,0.0,0.0,0.0,0.0,0.346,-0.393
SU2CLC-UCD-1145-T1,0.244,0.388,0.0,0.0,-0.434,-0.434,-0.284,-0.284,1.019,-0.361,-0.104,-0.104,0.0,0.469,0.0,0.514,0.0,0.0,0.0,0.0,0.134,0.134,0.0,0.0,-0.224,0.146,-0.128,0.0,0.0,-0.283,0.0,-0.3,-0.3,-0.189,0.363,0.538,0.538,-0.22,-0.286
SU2CLC-UCD-1142-T1,0.0,0.0,0.0,0.0,0.0,0.0,0.269,0.269,0.0,0.0,-0.243,-0.243,0.245,0.245,0.282,0.282,0.459,-0.22,0.146,0.146,-0.225,0.27,0.0,0.202,-0.254,0.0,-0.114,0.0,0.0,-0.196,0.263,-0.247,-0.247,-0.208,-0.208,0.322,0.322,0.255,-0.201
SU2CLC-COL-1001-T1,0.0,0.0,0.0,0.0,-0.184,-0.184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.104,-0.104,0.0,0.0,0.0,-0.117
SU2CLC-MSK-A2001-T1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.15,0.155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.139,0.0


In [99]:
## Load the per-sample amplification / deletion totals
su2c_cnv_totals = pd.read_csv(
    f"{source_path_data_exome}SU2C-MARK_Harmonized_Total_Amps_Dels_v1.txt",
    sep = "\t",
)

In [100]:
## Check: dimensions and first rows of the totals table
print(f"Shape is:  {su2c_cnv_totals.shape}")
display(su2c_cnv_totals.head())

Shape is:  (309, 3)


Unnamed: 0,Harmonized_SU2C_WES_Tumor_Sample_ID_v2,Total_amps,Total_dels
0,SU2CLC-UCD-1124-T1,5.0,9.0
1,SU2CLC-UCD-1145-T1,17.0,20.0
2,SU2CLC-UCD-1142-T1,14.0,24.0
3,SU2CLC-COL-1001-T1,2.0,7.0
4,SU2CLC-MSK-A2001-T1,1.0,6.0


In [101]:
## Load the antigen-presentation summary
su2c_hla_summary = pd.read_csv(
    f"{source_path_data_exome}SU2C-MARK_Harmonized_Antigen_Presentation_v1.txt",
    sep = "\t",
)

In [102]:
## Check: dimensions and first rows of the antigen-presentation summary
print(f"Shape is:  {su2c_hla_summary.shape}")
display(su2c_hla_summary.head())

Shape is:  (309, 4)


Unnamed: 0,Harmonized_SU2C_WES_Tumor_Sample_ID_v2,HLA_LOH_present,HLA_hom_present,B2M_altered
0,SU2CLC-CLE-NIVO1-T1,0,0,0
1,SU2CLC-CLE-NIVO18-T1,0,0,0
2,SU2CLC-CLE-NIVO19-T1,0,0,0
3,SU2CLC-CLE-NIVO3-T1,0,0,0
4,SU2CLC-CLE-NIVO4-T1,0,0,0


In [103]:
## Load the MiXCR table
su2c_wes_mixcr_harm = pd.read_csv(
    f"{source_path_data_exome}SU2C-MARK_Harmonized_Mixcr_v1.txt",
    sep = "\t",
)

In [104]:
## Check: dimensions and first rows of the MiXCR table
print(f"Shape is:  {su2c_wes_mixcr_harm.shape}")
display(su2c_wes_mixcr_harm.head())

Shape is:  (309, 5)


Unnamed: 0,Harmonized_SU2C_WES_Tumor_Sample_ID_v2,log_DNA_BCR_burden,log_DNA_TCR_burden,DNA_BCR_entropy,DNA_TCR_entropy
0,SU2CLC-CLE-NIVO1-T1,0.006969,0.012692,1.33,0.76
1,SU2CLC-CLE-NIVO18-T1,0.026353,0.03415,1.67,1.89
2,SU2CLC-CLE-NIVO19-T1,0.011776,0.020402,1.49,1.91
3,SU2CLC-CLE-NIVO3-T1,0.0,0.004533,0.0,0.0
4,SU2CLC-CLE-NIVO4-T1,0.004782,0.011077,1.01,2.11


In [105]:
## Load the purity / ploidy table
su2c_wes_abs_harm = pd.read_csv(
    f"{source_path_data_exome}SU2C-MARK_Harmonized_Absolute_Purity_Ploidy_v1.txt",
    sep = "\t",
)

In [106]:
## Check: dimensions and first rows of the purity / ploidy table
print(f"Shape is:  {su2c_wes_abs_harm.shape}")
display(su2c_wes_abs_harm.head())

Shape is:  (309, 3)


Unnamed: 0,Harmonized_SU2C_WES_Tumor_Sample_ID_v2,Ploidy,Purity
0,SU2CLC-CLE-NIVO1-T1,1.96,0.57
1,SU2CLC-CLE-NIVO18-T1,5.22,0.25
2,SU2CLC-CLE-NIVO19-T1,2.1,0.37
3,SU2CLC-CLE-NIVO3-T1,2.02,0.93
4,SU2CLC-CLE-NIVO4-T1,1.84,0.62


#### Read RNA Data

In [107]:
## Load the tumor-cluster table
su2c_bnmf_harm = pd.read_csv(
    f"{source_path_data_rna}SU2C-MARK_Harmonized_Tumor_Clusters_v1.txt",
    sep = "\t",
)
## Check: dimensions and first rows
print(f"Shape is:  {su2c_bnmf_harm.shape}")
display(su2c_bnmf_harm.head())

Shape is:  (153, 10)


Unnamed: 0,Harmonized_SU2C_RNA_Tumor_Sample_ID_v2,Tumor_cluster,T1_strength,T2_strength,T3_strength,T4_strength,T1_norm,T2_norm,T3_norm,T4_norm
0,SU2CLC-CLE-NIVO10-T1,1,0.154,0.03,0.088,0.0,0.567,0.11,0.323,0.0
1,SU2CLC-CLE-NIVO18-T1,2,0.0,0.375,0.0,0.088,0.0,0.81,0.0,0.19
2,SU2CLC-CLE-NIVO19-T1,1,0.489,0.372,0.132,0.004,0.49,0.374,0.133,0.004
3,SU2CLC-CLE-NIVO2-T1,4,0.0,0.183,0.317,0.505,0.0,0.182,0.316,0.503
4,SU2CLC-CLE-NIVO20-T1,1,0.661,0.068,0.182,0.048,0.689,0.071,0.189,0.05


In [108]:
## Load the integrative-cluster table
su2c_ssbnmf_harm = pd.read_csv(
    f"{source_path_data_rna}SU2C-MARK_Harmonized_Integrative_Clusters_v1.txt",
    sep = "\t",
)
## Check: dimensions and first rows
print(f"Shape is:  {su2c_ssbnmf_harm.shape}")
display(su2c_ssbnmf_harm.head())

Shape is:  (153, 8)


Unnamed: 0,Harmonized_SU2C_RNA_Tumor_Sample_ID_v2,Integrative_cluster,I1_strength,I2_strength,I3_strength,I1_norm,I2_norm,I3_norm
0,SU2CLC-CLE-NIVO10-T1,3,0.107,0.025,0.118,0.428,0.101,0.47
1,SU2CLC-CLE-NIVO18-T1,2,0.02,0.483,0.0,0.04,0.96,0.0
2,SU2CLC-CLE-NIVO19-T1,2,0.0,0.795,0.0,0.0,1.0,0.0
3,SU2CLC-CLE-NIVO2-T1,3,0.006,0.027,0.139,0.036,0.156,0.808
4,SU2CLC-CLE-NIVO20-T1,2,0.0,0.792,0.0,0.0,1.0,0.0


In [109]:
## Load the "HM" curated gene-set table
su2c_is_hm_harm = pd.read_csv(
    f"{source_path_data_rna}SU2C-MARK_Harmonized_Curated_Sets_HM_v1.txt",
    sep = "\t",
)
## Check: dimensions and first rows
print(f"Shape is:  {su2c_is_hm_harm.shape}")
display(su2c_is_hm_harm.head())

Shape is:  (152, 8)


Unnamed: 0,Harmonized_SU2C_RNA_Tumor_Sample_ID_v2,Adenosine (Corvus),Antigen processing machinery (PMID: 27855702),EMT2 (PMID: 27321955),IFNG,Merck/Nanostring 18 gene T cell–inflamed GEP score,NFAT/NR4A1 family T cell dysfunction,TGF-B (Mariathasan Nature 2018)
0,SU2CLC-CLE-NIVO10-T1,-0.34722,-1.781804,-1.511646,-2.566604,-1.651118,-1.074415,-1.277529
1,SU2CLC-CLE-NIVO18-T1,-0.408656,-0.178822,-0.665314,0.322697,-0.179835,-0.578566,-0.553323
2,SU2CLC-CLE-NIVO19-T1,0.670662,0.791995,0.439311,1.339746,0.789172,0.343941,0.595709
3,SU2CLC-CLE-NIVO2-T1,-0.675524,1.059913,0.732698,0.712259,1.031519,1.181614,0.169688
4,SU2CLC-CLE-NIVO20-T1,0.590495,0.876095,-1.018308,0.845423,2.206946,0.182293,-1.137497


In [110]:
## Load the "DH" curated gene-set table
su2c_is_dh_harm = pd.read_csv(
    f"{source_path_data_rna}SU2C-MARK_Harmonized_Curated_Sets_DH_v1.txt",
    sep = "\t",
)
## Check: dimensions and first rows
print(f"Shape is:  {su2c_is_dh_harm.shape}")
display(su2c_is_dh_harm.head())

Shape is:  (152, 15)


Unnamed: 0,Harmonized_SU2C_RNA_Tumor_Sample_ID_v2,B-cells,CD45,CD8 T cells,Cytotoxic cells,DC,Exhausted CD8,Macrophages,Mast cells,Neutrophils,NK CD56dim cells,NK cells,T-cells,Th1 cells,Treg
0,SU2CLC-CLE-NIVO10-T1,-1.20664,-1.168817,-0.312651,-1.818944,-1.751334,-2.229064,-0.770359,-1.133422,-0.629028,-1.843729,-1.627241,-1.332082,-1.610568,-2.223001
1,SU2CLC-CLE-NIVO18-T1,0.219801,0.088713,-0.246769,-0.745221,0.641015,-0.402675,-0.134936,-0.407499,-1.166153,-0.320157,-0.575348,0.312373,-0.036421,0.985207
2,SU2CLC-CLE-NIVO19-T1,0.483138,-0.17636,0.259516,0.160674,0.756281,0.478177,-0.097389,-0.375951,-0.558983,0.78646,-0.463227,0.048374,0.216316,0.538209
3,SU2CLC-CLE-NIVO2-T1,0.206836,0.146816,1.04791,0.44444,0.236922,0.788992,0.614575,-1.097114,0.16112,0.551745,0.556051,0.540561,0.858795,0.002414
4,SU2CLC-CLE-NIVO20-T1,0.610164,0.222164,2.591455,2.203856,-0.176264,2.585199,0.522922,-1.060015,-0.491975,1.089008,1.195594,1.453646,1.429502,1.217274


In [111]:
## Load the "SF" curated gene-set table
su2c_is_sf_harm = pd.read_csv(
    f"{source_path_data_rna}SU2C-MARK_Harmonized_Curated_Sets_SF_v1.txt",
    sep = "\t",
)
## Check: dimensions and first rows
print(f"Shape is:  {su2c_is_sf_harm.shape}")
display(su2c_is_sf_harm.head())

Shape is:  (152, 12)


Unnamed: 0,Harmonized_SU2C_RNA_Tumor_Sample_ID_v2,B-cells,Cytotoxic cells,DC,Exhausted CD8,Exhausted/HS CD8,Lymphocytes,Lymphocytes exhausted/cell cycle,Macrophages/Monocytes,Memory T cells,Plasma,Treg
0,SU2CLC-CLE-NIVO10-T1,-1.259768,-0.842559,-0.433253,-1.422601,-1.774418,-1.350288,-2.821135,-0.297431,-2.075795,-1.973561,-2.756701
1,SU2CLC-CLE-NIVO18-T1,0.710736,0.063517,0.001486,0.039338,-0.141059,0.250194,0.14743,-0.288875,0.147272,1.199513,0.686839
2,SU2CLC-CLE-NIVO19-T1,0.525296,0.338409,0.408307,0.465818,0.409422,-0.066813,0.698741,-0.443927,-0.243725,0.313025,0.444627
3,SU2CLC-CLE-NIVO2-T1,0.768654,0.997207,-0.068318,0.931905,0.860628,1.111493,0.529933,-0.186005,0.663986,0.181901,0.321043
4,SU2CLC-CLE-NIVO20-T1,0.698067,2.250859,-0.619655,2.899739,2.006238,1.680539,2.192263,-0.46479,0.364394,-0.104678,0.926042


In [112]:
## Load the "ZI" curated gene-set table
su2c_is_zi_harm = pd.read_csv(
    f"{source_path_data_rna}SU2C-MARK_Harmonized_Curated_Sets_ZI_v1.txt",
    sep = "\t",
)
## Check: dimensions and first rows
print(f"Shape is:  {su2c_is_zi_harm.shape}")
display(su2c_is_zi_harm.head())

Shape is:  (152, 11)


Unnamed: 0,Harmonized_SU2C_RNA_Tumor_Sample_ID_v2,hMø1,hMø4,hMø5,hMø6,hMø7,hMø8,hMø9,hMono1,hMono2,hMono3
0,SU2CLC-CLE-NIVO10-T1,-1.109117,-1.156333,-2.190216,-1.200923,-1.991091,-0.69999,-0.672497,-2.092137,-2.369909,-0.657237
1,SU2CLC-CLE-NIVO18-T1,0.789262,-0.24114,-0.269748,0.133515,-0.600769,-0.509239,-0.120329,-0.341126,0.238928,-0.473816
2,SU2CLC-CLE-NIVO19-T1,0.766106,1.190017,-0.606071,0.155629,-0.065903,-0.128578,1.258769,-0.274179,-0.066995,0.329074
3,SU2CLC-CLE-NIVO2-T1,-0.930557,0.196371,1.80135,-0.306568,-0.883556,2.732102,1.364896,-0.042298,-0.055184,-0.387409
4,SU2CLC-CLE-NIVO20-T1,0.280883,-0.904412,-0.644487,0.228805,-0.496632,-0.017237,1.022745,0.450452,1.109786,-0.185438


In [113]:
## Load the "ZI Extended" curated gene-set table
su2c_is_zi_ext_harm = pd.read_csv(
    f"{source_path_data_rna}SU2C-MARK_Harmonized_Curated_Sets_ZI_Extended_v1.txt",
    sep = "\t",
)
## Check: dimensions and first rows
print(f"Shape is:  {su2c_is_zi_ext_harm.shape}")
display(su2c_is_zi_ext_harm.head())

Shape is:  (152, 13)


Unnamed: 0,Harmonized_SU2C_RNA_Tumor_Sample_ID_v2,hN1,hN2,hN3,hN5,hDC1,hDC2,hDC3,hpDC,hMast1,hMast2,hT1,hB
0,SU2CLC-CLE-NIVO10-T1,0.661147,-2.14522,-1.628139,-0.734098,-1.319504,-1.62389,-3.014243,-1.367851,-3.070258,-1.403527,-2.090289,-1.497674
1,SU2CLC-CLE-NIVO18-T1,-1.713392,0.527368,-0.722902,-0.381433,-0.283341,-1.638975,-0.397212,-0.145566,0.997473,-0.732677,-0.159027,-0.176729
2,SU2CLC-CLE-NIVO19-T1,-0.109088,2.042201,0.414281,-0.651719,-0.013068,0.592631,0.504079,0.551709,-0.312533,0.795462,0.512162,0.489272
3,SU2CLC-CLE-NIVO2-T1,0.356693,0.243782,0.802809,-0.332503,0.312384,0.651996,0.823568,1.290402,0.151119,1.524649,1.160909,0.155196
4,SU2CLC-CLE-NIVO20-T1,0.075448,-0.281595,-0.529214,-0.571519,-0.604111,-0.802896,0.040392,0.199211,0.340869,-1.427449,1.80006,0.516564


In [117]:
## Load the TPM expression matrix (.gct file; its first two lines are skipped)
su2c_rna_harm = pd.read_csv(
    f"{source_path_data_rna}SU2C-MARK_Harmonized_rnaseqc_tpm_v1.gct",
    skiprows = 2,
    sep = "\t",
)
## Check: dimensions and first rows
print(f"Shape is:  {su2c_rna_harm.shape}")
display(su2c_rna_harm.head())

Shape is:  (57523, 154)


Unnamed: 0,Name,Description,SU2CLC-CLE-NIVO10-T1,SU2CLC-CLE-NIVO18-T1,SU2CLC-CLE-NIVO19-T1,SU2CLC-CLE-NIVO2-T1,SU2CLC-CLE-NIVO20-T1,SU2CLC-CLE-NIVO21-T1,SU2CLC-CLE-NIVO24-T1,SU2CLC-CLE-NIVO3-T1,SU2CLC-CLE-NIVO31-T1,SU2CLC-CLE-NIVO47-T1,SU2CLC-CLE-NIVO5-T1,SU2CLC-CLE-NIVO52-T1,SU2CLC-CLE-NIVO54-T1,SU2CLC-CLE-NIVO61-T1,SU2CLC-CLE-NIVO64-T1,SU2CLC-CLE-NIVO65-T1,SU2CLC-CLE-NIVO9-T1,SU2CLC-COL-1001-T1,SU2CLC-COL-1004-T1,SU2CLC-COL-1005-T1,SU2CLC-COL-1007-T1,SU2CLC-COL-1008-T1,SU2CLC-COL-1010-T1,SU2CLC-COL-1016-T1,SU2CLC-COL-1017-T1,SU2CLC-COL-1018-T1,SU2CLC-COL-1020-T1,SU2CLC-COL-1021-T1,SU2CLC-COL-1022-T1,SU2CLC-COL-1023-T1,SU2CLC-COL-1025-T1,SU2CLC-COL-1026-T1,SU2CLC-COL-1027-T1,SU2CLC-COL-1029-T1,SU2CLC-COL-1031-T1,SU2CLC-COL-1032-T1,SU2CLC-COL-1033-T1,SU2CLC-COL-1034-T1,SU2CLC-COL-1035-T1,SU2CLC-COL-1036-T1,SU2CLC-COL-1037-T1,SU2CLC-COL-1038-T1,SU2CLC-COL-1039-T2,SU2CLC-COL-1041-T1,SU2CLC-COL-1043-T2,SU2CLC-COL-1044-T1,SU2CLC-COL-1047-T1,SU2CLC-DFC-1001-T1,SU2CLC-DFC-1002-T1,SU2CLC-DFC-1003-T1,SU2CLC-DFC-1004-T1,SU2CLC-DFC-1007-T1,SU2CLC-DFC-1012-T1,SU2CLC-DFC-1013-T1,SU2CLC-DFC-1015-T2,SU2CLC-DFC-1016-T1,SU2CLC-DFC-1017-T2,SU2CLC-DFC-1018-T1,SU2CLC-DFC-1019-T1,SU2CLC-DFC-1020-T1,SU2CLC-DFC-1534-T1,SU2CLC-DFC-1535-T1,SU2CLC-DFC-1536-T1,SU2CLC-DFC-1537-T1,SU2CLC-DFC-1538-T1,SU2CLC-DFC-1539-T1,SU2CLC-DFC-DF0032-T1,SU2CLC-DFC-DF0033-T1,SU2CLC-DFC-DF0047-T1,SU2CLC-DFC-DF0107-T1,SU2CLC-DFC-DF0108-T1,SU2CLC-DFC-DF0109-T1,SU2CLC-DFC-DF0112-T1,SU2CLC-DFC-DF0241-T1,SU2CLC-DFC-DF0499-T1,SU2CLC-DFC-DF0510-T1,SU2CLC-DFC-DF0512-T1,SU2CLC-DFC-DF0561-T1,SU2CLC-DFC-DF0668-T1,SU2CLC-DFC-DF0790-T1,SU2CLC-DFC-DF0840-T1,SU2CLC-MDA-1441-T1,SU2CLC-MDA-1442-T1,SU2CLC-MDA-1443-T1,SU2CLC-MDA-1444-T1,SU2CLC-MDA-1561-T1,SU2CLC-MDA-1562-T1,SU2CLC-MDA-1563-T1,SU2CLC-MDA-1564-T1,SU2CLC-MDA-1627-T1,SU2CLC-MDA-1628-T1,SU2CLC-MDA-1629-T1,SU2CLC-MDA-1630-T1,SU2CLC-MDA-1631-T1,SU2CLC-MGH-1044-T1,SU2CLC-MGH-1054-T2,SU2CLC-MGH-1055-T1,SU2CLC-MGH-1135-T2,SU2CLC-MGH-1148-T1,SU2CLC-MGH-1149-T1,SU2CLC-MGH-1
150-T1,SU2CLC-MGH-1151-T1,SU2CLC-MGH-1158-T1,SU2CLC-MGH-1161-T2,SU2CLC-MGH-1163-T1,SU2CLC-MGH-1169-T1,SU2CLC-MGH-1387-T1,SU2CLC-MGH-1388-T1,SU2CLC-MGH-1389-T1,SU2CLC-MGH-1409-T1,SU2CLC-MGH-1411-T1,SU2CLC-MGH-1412-T1,SU2CLC-MGH-1413-T1,SU2CLC-MGH-1414-T1,SU2CLC-MGH-1415-T1,SU2CLC-MGH-1416-T1,SU2CLC-MGH-1417-T1,SU2CLC-MGH-1418-T1,SU2CLC-MGH-1487-T1,SU2CLC-MGH-1488-T1,SU2CLC-MGH-1489-T1,SU2CLC-MGH-1490-T1,SU2CLC-MGH-1492-T1,SU2CLC-MGH-1493-T1,SU2CLC-MGH-1495-T1,SU2CLC-MGH-1498-T1,SU2CLC-MGH-1499-T1,SU2CLC-MGH-1500-T1,SU2CLC-MGH-1501-T1,SU2CLC-MGH-1503-T1,SU2CLC-MGH-1565-T1,SU2CLC-MGH-1567-T1,SU2CLC-MGH-1568-T1,SU2CLC-MGH-1572-T1,SU2CLC-MGH-1573-T1,SU2CLC-MGH-1574-T1,SU2CLC-MGH-1575-T1,SU2CLC-MGH-1576-T1,SU2CLC-MGH-1577-T1,SU2CLC-MSK-1364-T1,SU2CLC-MSK-1365-T1,SU2CLC-MSK-A2009-T1,SU2CLC-MSK-A2013-T1,SU2CLC-MSK-A2014-T1,SU2CLC-MSK-A2060-T1,SU2CLC-MSK-A2075-T1,SU2CLC-UCD-1124-T1,SU2CLC-UCD-1137-T1,SU2CLC-UCD-1142-T1,SU2CLC-UCD-1143-T1,SU2CLC-UCD-1557-T1,SU2CLC-UCD-1560-T1
0,ENSG00000223972.5,DDX11L1,0.0,0.0,0.465325,0.663131,0.038447,0.0,0.036191,0.083753,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.051068,0.06249,0.215254,0.0,0.0,0.010361,0.022345,0.102075,0.177206,0.0,0.0,0.0,0.0,0.0,0.04839,0.232023,0.095474,0.021276,0.0,0.0,0.305082,0.104387,0.146697,0.0,0.052629,0.0,0.0,0.0,0.0,0.267289,0.0,0.039788,0.0,0.0,0.019366,0.076655,1.82734,0.0,0.015823,0.265941,0.0,0.082487,0.062776,0.077214,0.126003,0.042029,0.0,0.590339,0.014534,0.246453,0.14726,0.026488,2.09573,0.198775,0.105858,0.100894,0.291247,1.174,0.0,0.30414,0.0,0.058379,0.082171,0.088797,0.303429,0.026011,0.0,0.039143,0.048613,0.184466,0.035506,0.037625,0.0,0.0,0.162518,0.235454,0.315222,0.242946,0.0,0.010428,0.0,0.0,0.113742,0.357453,0.0,0.0,0.415394,0.0,0.0,0.546949,0.063254,0.0,0.0,0.082087,0.064683,0.019833,0.0,0.01934,0.0,0.177929,0.252702,0.0,0.0,0.293536,0.0,0.0,0.011045,0.048905,0.026322,0.092582,0.627773,0.007175,0.0,0.156942,0.017123,0.013185,0.030421,0.0,0.0,0.016853,0.216556,0.28422,0.073485,0.0,0.0,0.0,0.057543,0.009227,0.0,0.056734,0.018465,0.507545,0.02658,0.236091,0.0,0.15309
1,ENSG00000227232.5,WASH7P,1.72811,13.2271,17.2261,9.34412,10.0231,44.2401,0.511251,9.40064,0.974661,0.18821,3.28665,0.078204,9.96455,1.97826,0.048932,3.66674,3.54148,5.39694,11.016,3.4807,2.93694,14.8632,8.66622,6.10497,4.62261,0.316061,1.96025,15.4649,18.9668,16.2595,4.79543,5.14829,3.41776,5.57385,0.820819,22.3431,4.20866,7.31727,4.82496,12.9355,9.64823,7.17072,26.0181,20.5151,7.39817,6.21194,14.1718,2.35048,11.1866,6.1982,5.24774,7.80157,7.79942,10.9583,4.47044,8.30346,8.53798,8.68651,11.7973,11.1557,3.07454,13.08,6.97064,5.6481,2.31442,9.45786,16.264,4.7284,13.393,12.472,10.0147,15.4708,13.9887,12.3487,4.22703,3.69037,9.72565,2.27414,5.25523,10.1112,5.35152,8.91901,0.800251,3.34793,8.4906,13.1986,4.05822,4.85614,0.135781,33.7534,8.96358,10.0037,3.46988,5.78313,17.2424,4.47277,0.0,6.07485,7.27113,5.16434,1.21686,7.49364,6.18331,6.82442,4.86544,15.2651,2.437,0.0,17.6484,3.04396,2.70804,2.7253,3.20884,6.58175,4.33297,5.31267,4.54339,5.792,0.0,4.95859,0.206602,7.95294,2.12757,5.28827,4.49586,7.51428,10.291,17.516,2.23753,3.51938,6.51995,4.36851,9.29804,9.08591,8.82011,7.29376,6.65042,4.42873,6.58248,6.46323,3.80244,2.17209,9.95161,1.31537,1.37496,7.06737,6.4502,5.49124,4.19865,3.71415,55.4283,4.32528
2,ENSG00000278267.1,MIR6859-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.372059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ENSG00000243485.5,MIR1302-2HG,0.0,0.0,0.0,0.0,0.0,0.0,0.215248,0.0,0.429895,0.027671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.035687,0.0,0.0,0.025079,0.0,0.0,0.0,0.0,0.0,0.0,0.059962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017359,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018447,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.063079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015733,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03189,0.0,0.0,0.0,0.016099,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.188272,0.047259,0.0,0.0,0.0,0.0
4,ENSG00000284332.1,MIR1302-2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [115]:
## The data represent gene expression, so every value in the dataset is expected to be non-negative:
## a gene is either expressed (positive value) or not expressed (value equal to 0).
## Let's verify this and report the outcome either way, so a failed check is never silent.
## iloc[:, 2:] skips the first 2 columns, which are stored in string format.
expression_values = su2c_rna_harm.iloc[:, 2:]
if (expression_values >= 0).all().all():
    print("All values are greater than or equal to 0")
else:
    ## NaN entries also fail the ">= 0" comparison, so they are counted here together with negatives
    n_invalid = int((~(expression_values >= 0)).sum().sum())
    print("WARNING: found", n_invalid, "values that are negative or missing")

## The su2c_rna_harm dataset contains the RNA-seq already transformed with log2foldchange
## NOTE(review): the rows previewed for this table contain values above 40 and no negatives;
## confirm which transformation was actually applied to this file.

All values are greater than or equal to 0


In [116]:
## Read the tab-separated file SU2C-MARK_Harmonized_Limma_All_v1.txt from the RNA folder
su2c_limma_harm = pd.read_csv(
    os.path.join(source_path_data_rna, "SU2C-MARK_Harmonized_Limma_All_v1.txt"),
    sep = "\t",
)
## Quick look: dimensions first, then the first rows
print("Shape is: ", su2c_limma_harm.shape)
display(su2c_limma_harm.head())

Shape is:  (15924, 12)


Unnamed: 0.1,Unnamed: 0,logFC,AveExpr,t,P.Value,adj.P.Val,B,ensembl_gene_id_version,hgnc_symbol,median_log2tpm,gexp_gt1_percent,gexp_gt1_cat
0,1,-0.40337,1.093088,-1.635074,0.104315,0.403476,-4.315271,ENSG00000187634.11,SAMD11,1.117063,0.785124,50-80%
1,2,0.088979,5.565879,1.021167,0.308964,0.632204,-5.437019,ENSG00000188976.10,NOC2L,5.31183,0.983471,80-100%
2,3,-0.019629,3.103165,-0.153923,0.877896,0.954957,-5.669993,ENSG00000187961.13,KLHL17,3.266475,0.983471,80-100%
3,4,0.194845,2.217655,0.922449,0.357906,0.669481,-5.190195,ENSG00000187583.10,PLEKHN1,2.603013,0.958678,80-100%
4,5,0.110187,-1.277853,0.36814,0.713334,0.883703,-5.027598,ENSG00000187642.9,PERM1,0.504215,0.512397,50-80%


In [77]:
## Read the tab-separated file SU2C-MARK_TME_Cluster_GSEA.tsv from the RNA folder
su2c_tme_gsea_harm = pd.read_csv(
    os.path.join(source_path_data_rna, "SU2C-MARK_TME_Cluster_GSEA.tsv"),
    sep = "\t",
)
## Quick look: dimensions first, then the first rows
print("Shape is: ", su2c_tme_gsea_harm.shape)
display(su2c_tme_gsea_harm.head())

Shape is:  (13, 8)


Unnamed: 0,Gene Set Name,# Genes in Gene Set (K),Description,# Genes in Overlap (k),k/K,p-value,FDR q-value,Set
0,HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION,200,Genes defining epithelial-mesenchymal transiti...,3,0.015,0.000421,0.0105,G1
1,HALLMARK_MYOGENESIS,200,Genes involved in development of skeletal musc...,3,0.015,0.000421,0.0105,G1
2,HALLMARK_ALLOGRAFT_REJECTION,200,Genes up-regulated during transplant rejection.,12,0.06,1.0399999999999999e-20,5.18e-19,G2
3,HALLMARK_IL6_JAK_STAT3_SIGNALING,87,Genes up-regulated by IL6 [GeneID=3569] via ST...,5,0.0575,5.22e-09,1.12e-07,G2
4,HALLMARK_INTERFERON_GAMMA_RESPONSE,200,Genes up-regulated in response to IFNG [GeneID...,6,0.03,6.72e-09,1.12e-07,G2


#### Merge Dataset

In [None]:
## Merge dataset
## Start from clinical_data and attach every other table to it, one join at a time:
## - clinical_data_extra is matched on the participant ID (inner join);
## - the next group of tables is matched on the WES tumor sample ID (left joins),
##   either against a column of the right table or against its index;
## - the last group of tables is matched on the RNA tumor sample ID (left joins).
## Some tables get a suffix (_MUT, _CNV, _HM, _DH, _SF, _ZI) appended to ALL their columns
## before joining; when that also renames their key column, the suffixed key is dropped
## right after the join so only the left-hand key remains.
KEY_PARTICIPANT = "Harmonized_SU2C_Participant_ID_v2"
KEY_WES_TUMOR   = "Harmonized_SU2C_WES_Tumor_Sample_ID_v2"
KEY_RNA_TUMOR   = "Harmonized_SU2C_RNA_Tumor_Sample_ID_v2"

su2c_merge_master = (
    clinical_data
    .merge(clinical_data_extra, on = KEY_PARTICIPANT, how = "inner")
    ## --- tables keyed by the WES tumor sample ID ---
    .merge(su2c_val_harm_dam_pv_drivers.rename(columns = lambda col: col + "_MUT"),
           left_on = KEY_WES_TUMOR, right_on = "Tumor_Sample_Barcode_MUT", how = "left")
    .drop(columns = "Tumor_Sample_Barcode_MUT")
    .merge(su2c_cnv_gene_sig_t.rename(columns = lambda col: col + "_CNV"),
           left_on = KEY_WES_TUMOR, right_index = True, how = "left")
    .merge(su2c_cnv_peak_actual_t, left_on = KEY_WES_TUMOR, right_index = True, how = "left")
    .merge(su2c_cnv_arm_t, left_on = KEY_WES_TUMOR, right_index = True, how = "left")
    .merge(su2c_cnv_totals, on = KEY_WES_TUMOR, how = "left")
    .merge(su2c_tmb_sig_tmb_harm, on = KEY_WES_TUMOR, how = "left")
    .merge(su2c_tmb_sig_tmb_harm_log, left_on = KEY_WES_TUMOR, right_index = True, how = "left")
    .merge(su2c_hla_summary, on = KEY_WES_TUMOR, how = "left")
    .merge(su2c_wes_mixcr_harm, on = KEY_WES_TUMOR, how = "left")
    .merge(su2c_wes_abs_harm, on = KEY_WES_TUMOR, how = "left")
    ## --- tables keyed by the RNA tumor sample ID ---
    .merge(su2c_bnmf_harm, on = KEY_RNA_TUMOR, how = "left")
    .merge(su2c_ssbnmf_harm, on = KEY_RNA_TUMOR, how = "left")
    .merge(su2c_is_hm_harm.rename(columns = lambda col: col + "_HM"),
           left_on = KEY_RNA_TUMOR, right_on = KEY_RNA_TUMOR + "_HM", how = "left")
    .drop(columns = KEY_RNA_TUMOR + "_HM")
    .merge(su2c_is_dh_harm.rename(columns = lambda col: col + "_DH"),
           left_on = KEY_RNA_TUMOR, right_on = KEY_RNA_TUMOR + "_DH", how = "left")
    .drop(columns = KEY_RNA_TUMOR + "_DH")
    .merge(su2c_is_sf_harm.rename(columns = lambda col: col + "_SF"),
           left_on = KEY_RNA_TUMOR, right_on = KEY_RNA_TUMOR + "_SF", how = "left")
    .drop(columns = KEY_RNA_TUMOR + "_SF")
    ## Both su2c_is_zi_harm and su2c_is_zi_ext_harm receive the same "_ZI" suffix
    .merge(su2c_is_zi_harm.rename(columns = lambda col: col + "_ZI"),
           left_on = KEY_RNA_TUMOR, right_on = KEY_RNA_TUMOR + "_ZI", how = "left")
    .drop(columns = KEY_RNA_TUMOR + "_ZI")
    .merge(su2c_is_zi_ext_harm.rename(columns = lambda col: col + "_ZI"),
           left_on = KEY_RNA_TUMOR, right_on = KEY_RNA_TUMOR + "_ZI", how = "left")
    .drop(columns = KEY_RNA_TUMOR + "_ZI")
)

In [None]:
## Apply Log transform: log_TMB = log10((TMB + 1) / 33), computed on the whole column at once
## instead of element by element through dropna + apply.
## Rows where TMB is missing stay NaN in log_TMB, because NaN propagates through the arithmetic.
## NOTE(review): 33 is presumably a normalization constant (e.g. covered megabases) — confirm its origin.
su2c_merge_master["log_TMB"] = np.log10((su2c_merge_master["TMB"] + 1) / 33)

In [None]:
## Sanity check on the merged table: report its dimensions, then render it
merged_shape = su2c_merge_master.shape
print("Shape is: ", merged_shape)
display(su2c_merge_master)

Shape is:  (393, 353)


Unnamed: 0,WES_Cohort_1,WES_Cohort_2,WES_All,RNA_Cohort_1,RNA_Cohort_2,RNA_All,Institution,Harmonized_SU2C_Participant_ID_v2,Harmonized_SU2C_WES_Tumor_Sample_ID_v2,Harmonized_SU2C_WES_Normal_Sample_ID_v2,Harmonized_SU2C_RNA_Tumor_Sample_ID_v2,Pre-treatment_RNA_Sample_QC,Patient_Age_at_Diagnosis,Patient_Sex,Patient_Race,Patient_Smoking_Status,Patient_Smoking_Pack_Years_Harmonized,Histology_Harmonized,Histology_Detail,Initial_Stage,Initial_Stage_Substage,PDL1_TPS,PDL1_TPS_Description,Local_Antibody_Clone,Clinical_Driver,Sequencing_Platform,Advanced_Diagnosis_Date,Line_of_Therapy,Agent_PD1,Agent_PD1_Category,Prior_Platinum,Prior_TKI,Harmonized_PFS_Event,Harmonized_PFS_Days,Harmonized_Confirmed_BOR,Harmonized_BOR_RECIST,Harmonized_OS_Event,Harmonized_OS_Days,Harmonized_Confirmed_BOR_Bin,Harmonized_Confirmed_BOR_3_Cat,Harmonized_PFS_Months,Harmonized_OS_Months,Advanced_Diagnosis_Date_Positive,Patient_Smoking_Pack_Years_Harmonized_Log,APC_MUT,ARHGAP35_MUT,ARID1A_MUT,ATF7IP_MUT,ATM_MUT,B2M_MUT,BRAF_MUT,CDKN2A_MUT,CMTR2_MUT,COL5A2_MUT,CREBBP_MUT,CTNNB1_MUT,CUL3_MUT,DSN1_MUT,EGFR_MUT,ELL2_MUT,EP300_MUT,ERBB2_MUT,FANCM_MUT,FAT1_MUT,FBXW7_MUT,HLA-A_MUT,HRAS_MUT,ITGBL1_MUT,KEAP1_MUT,KLF5_MUT,KLHL5_MUT,KMT2C_MUT,KMT2D_MUT,KRAS_MUT,LATS1_MUT,MET_MUT,MGA_MUT,NCOA6_MUT,NF1_MUT,NFE2L2_MUT,NOTCH1_MUT,NRAS_MUT,PIK3CA_MUT,PLXNB2_MUT,PTEN_MUT,RASA1_MUT,RB1_MUT,RBM10_MUT,RIT1_MUT,SETD2_MUT,SMAD4_MUT,SMARCA4_MUT,STK11_MUT,TP53_MUT,U2AF1_MUT,ZFP36L1_MUT,MYCL_CNV,MCL1_CNV,BCL11A_CNV,REL_CNV,NFE2L2_CNV,FOXP1_CNV,ROBO1_CNV,MECOM_CNV,TERC_CNV,SOX2_CNV,PDGFRA_CNV,KIT_CNV,KDR_CNV,FAT1_CNV,TERT_CNV,CCND3_CNV,PTP4A1_CNV,PHF3_CNV,EGFR_CNV,CDK6_CNV,MET_CNV,KMT2C_CNV,FGFR1_CNV,KAT6A_CNV,MYC_CNV,CDKN2A_CNV,ZMYND11_CNV,PTEN_CNV,CCND1_CNV,KDM5A_CNV,KRAS_CNV,CDK4_CNV,MDM2_CNV,RB1_CNV,TRAF3_CNV,B2M_CNV,IGF1R_CNV,CREBBP_CNV,USP22_CNV,NF1_CNV,ERBB2_CNV,TUBD1_CNV,SMAD4_CNV,CCNE1_CNV,BCL2L1_CNV,ZNF217_CNV,MAPK1_CNV,Amp_1q21.3,Amp_2p16.1,Amp_3q26.2,Amp_3q27.1,Amp_4q12,Amp_5p15.33,Amp_6p21.1,Amp_7p15.3,Amp_7p11
.2,Amp_7q21.12,Amp_8p11.23,Amp_8q24.21,Amp_9p13.3,Amp_11p13,Amp_11q13.3,Amp_12p12.1,Amp_12q14.1,Amp_12q15,Amp_13q34,Amp_14q13.3,Amp_17q24.2,Amp_19q12,Amp_20q11.21,Amp_20q13.33,Del_1p36.12,Del_1p13.2,Del_2p24.2,Del_2q37.3,Del_3p12.3,Del_4p16.3,Del_4q13.2,Del_4q35.1,Del_5q13.3,Del_5q23.1,Del_6p22.1,Del_6q22.31,Del_7p14.3,Del_8p23.3,Del_8p21.3,Del_9p21.3,Del_9q21.13,Del_9q22.1,Del_9q34.3,Del_10p15.3,Del_10q23.31,Del_10q26.3,Del_11p15.5,Del_11q25,Del_12p13.2,Del_12p11.21,Del_12q21.31,Del_13q14.2,Del_14q23.1,Del_14q24.3,Del_15q11.2,Del_16p13.3,Del_16q24.3,Del_17p13.3,Del_17p12,Del_18q22.1,Del_19p13.3,Del_19p12,Del_19q13.33,Del_20p13,Del_20q11.23,Del_21q11.2,Del_22q11.23,Del_22q13.1,1p,1q,2p,2q,3p,3q,4p,4q,5p,5q,6p,6q,7p,7q,8p,8q,9p,9q,10p,10q,11p,11q,12p,12q,13q,14q,15q,16p,16q,17p,17q,18p,18q,19p,19q,20p,20q,21q,22q,Total_amps,Total_dels,TMB,TMB_clonal,TMB_subclonal,TMB_indel,Neoantigens,Neoantigens_clonal,Neoantigens_subclonal,Subclone_count,Aging_Signature,Smoking_Signature,APOBEC_Signature,log_TMB,log_TMB_clonal,log_TMB_subclonal,log_TMB_indel,log_Neoantigens,log_Neoantigens_clonal,log_Neoantigens_subclonal,log_Subclone_count,log_Aging_Signature,log_Smoking_Signature,log_APOBEC_Signature,HLA_LOH_present,HLA_hom_present,B2M_altered,log_DNA_BCR_burden,log_DNA_TCR_burden,DNA_BCR_entropy,DNA_TCR_entropy,Ploidy,Purity,Tumor_cluster,T1_strength,T2_strength,T3_strength,T4_strength,T1_norm,T2_norm,T3_norm,T4_norm,Integrative_cluster,I1_strength,I2_strength,I3_strength,I1_norm,I2_norm,I3_norm,Adenosine (Corvus)_HM,Antigen processing machinery (PMID: 27855702)_HM,EMT2 (PMID: 27321955)_HM,IFNG_HM,Merck/Nanostring 18 gene T cell–inflamed GEP score_HM,NFAT/NR4A1 family T cell dysfunction_HM,TGF-B (Mariathasan Nature 2018)_HM,B-cells_DH,CD45_DH,CD8 T cells_DH,Cytotoxic cells_DH,DC_DH,Exhausted CD8_DH,Macrophages_DH,Mast cells_DH,Neutrophils_DH,NK CD56dim cells_DH,NK cells_DH,T-cells_DH,Th1 cells_DH,Treg_DH,B-cells_SF,Cytotoxic cells_SF,DC_SF,Exhausted CD8_SF,Exhausted/HS 
CD8_SF,Lymphocytes_SF,Lymphocytes exhausted/cell cycle_SF,Macrophages/Monocytes_SF,Memory T cells_SF,Plasma_SF,Treg_SF,hMø1_ZI,hMø4_ZI,hMø5_ZI,hMø6_ZI,hMø7_ZI,hMø8_ZI,hMø9_ZI,hMono1_ZI,hMono2_ZI,hMono3_ZI,hN1_ZI,hN2_ZI,hN3_ZI,hN5_ZI,hDC1_ZI,hDC2_ZI,hDC3_ZI,hpDC_ZI,hMast1_ZI,hMast2_ZI,hT1_ZI,hB_ZI
0,1.0,,1.0,,,,Cleveland Clinic,SU2CLC-CLE-NIVO1,SU2CLC-CLE-NIVO1-T1,SU2CLC-CLE-NIVO1-N1,,,61,F,0.0,1.0,40.0,Squamous,,3.0,A,,,,,,-39.0,2.0,Nivolumab,PD(L)1,1.0,0.0,1.0,53.0,PD,,1.0,434.0,0.0,PD,1.741130,14.257556,39.0,3.713572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.000,1.102,0.062,0.062,0.062,-0.499,-0.499,1.483,1.483,1.483,0.303,0.303,0.303,-0.516,-0.024,0.060,0.060,0.060,-0.392,0.041,0.041,0.041,3.657,2.330,-0.071,-1.119,0.076,-0.530,3.657,0.022,0.039,0.039,0.039,-0.540,0.383,0.015,0.052,-0.245,-0.152,-0.169,-0.169,0.422,0.611,-0.065,0.159,1.307,0.628,1.102400,0.062280,1.483000,1.483000,0.302680,-0.024435,0.060325,-0.392410,-0.392410,0.040848,3.656900,-0.070848,3.484100,-0.515530,3.656900,0.039399,0.039399,0.039399,-0.540090,0.678760,0.421620,-0.064586,0.158530,0.435950,0.000000,0.000000,0.062280,0.004463,-0.499330,-0.319080,0.302680,-0.515950,-0.024435,-0.024435,0.060325,0.060325,-0.392410,-0.537220,-0.537220,-1.118700,-0.514420,-0.514420,-0.218200,0.075966,-0.530320,-0.341060,-0.565330,0.258290,0.021668,0.039399,0.039399,-0.540090,0.383090,0.383090,0.014847,-0.001968,-0.001968,-0.199220,-0.151890,0.611190,-0.064586,-0.064586,-0.064586,-0.038524,0.15853,-0.280340,0.627900,0.627900,0.000,0.000,0.000,0.000,-0.499,-0.322,-0.516,-0.516,0.000,0.000,0.000,0.000,-0.392,0.000,0.000,0.000,-1.061,-1.061,0.00,0.00,0.258,0.258,0.000,0.000,-0.540,0.383,0.000,0.000,0.000,-0.152,-0.152,0.611,0.611,0.000,0.000,0.000,0.159,-0.280,0.628,17.0,17.0,1006.0,217.0,713.0,3.0,154.0,25.0,116.0,4.0,103.0,159.0,738.0,1.484516,2.338456,2.853698,0.602060,2.190332,1.414973,2.068186,0.698970,2.017033,2.204120,2.868644,0.0,0.0,0.0,0.006969,0.012692,1.33,0.76,1.96,0.57,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,1.0,,1.0,Cleveland Clinic,SU2CLC-CLE-NIVO10,,,SU2CLC-CLE-NIVO10-T1,Flag,55,F,2.0,1.0,30.0,Adeno,,4.0,,,,,,,-321.0,3.0,Nivolumab,PD(L)1,1.0,0.0,1.0,63.0,PD,,1.0,86.0,0.0,PD,2.069645,2.825230,321.0,3.433987,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.154,0.030,0.088,0.000,0.567,0.110,0.323,0.000,3.0,0.107,0.025,0.118,0.428,0.101,0.470,-0.347220,-1.781804,-1.511646,-2.566604,-1.651118,-1.074415,-1.277529,-1.206640,-1.168817,-0.312651,-1.818944,-1.751334,-2.229064,-0.770359,-1.133422,-0.629028,-1.843729,-1.627241,-1.332082,-1.610568,-2.223001,-1.259768,-0.842559,-0.433253,-1.422601,-1.774418,-1.350288,-2.821135,-0.297431,-2.075795,-1.973561,-2.756701,-1.109117,-1.156333,-2.190216,-1.200923,-1.991091,-0.699990,-0.672497,-2.092137,-2.369909,-0.657237,0.661147,-2.145220,-1.628139,-0.734098,-1.319504,-1.623890,-3.014243,-1.367851,-3.070258,-1.403527,-2.090289,-1.497674
2,1.0,,1.0,1.0,,1.0,Cleveland Clinic,SU2CLC-CLE-NIVO18,SU2CLC-CLE-NIVO18-T1,SU2CLC-CLE-NIVO18-N1,SU2CLC-CLE-NIVO18-T1,Keep,68,F,0.0,0.0,0.0,Adeno,,2.0,A,,,,EGFR,,-533.0,4.0,Nivolumab,PD(L)1,1.0,1.0,1.0,50.0,PD,,1.0,161.0,0.0,PD,1.642576,5.289093,533.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.116,0.116,-0.042,-0.042,-0.042,-0.181,-0.181,-0.181,-0.181,-0.181,0.000,0.000,0.000,0.000,0.646,0.488,0.488,0.488,0.661,-0.088,-0.088,-0.088,-0.342,-0.091,0.745,-0.135,-0.130,-0.130,3.657,0.557,0.557,0.242,3.657,-0.114,-0.207,-0.095,-0.095,0.118,0.244,0.244,0.244,0.244,0.116,0.437,1.485,1.465,-0.019,0.116220,-0.041975,-0.181390,-0.181390,0.000000,0.646490,0.487670,0.661270,0.661270,-0.087793,-0.341770,0.745020,-0.135280,0.088148,3.656900,0.556850,0.242370,3.656900,-0.114360,3.656900,0.244260,0.437340,1.484800,1.178600,0.116220,0.116220,-0.041975,-0.041975,-0.181390,0.000000,0.000000,0.000000,0.646490,0.646490,0.507170,-0.136960,0.661270,-0.341770,-0.341770,-0.135280,-0.135280,-0.135280,-0.135280,-0.129940,-0.129940,-0.129940,0.088148,-0.141480,0.556850,0.556850,0.138690,-0.114360,-0.082246,-0.207160,-0.194130,0.117750,0.082629,0.244260,0.244260,0.115950,0.437340,0.437340,0.437340,1.529800,1.46530,0.633740,-0.018694,-0.018694,0.116,0.116,0.000,0.000,-0.181,-0.181,0.000,0.000,0.646,0.646,0.351,-0.137,0.661,0.000,0.000,0.000,-0.135,-0.135,-0.13,-0.13,0.372,0.372,0.268,0.268,-0.114,-0.207,0.000,0.000,0.000,0.244,0.244,0.116,0.116,0.437,0.437,1.485,1.485,0.634,0.000,20.0,15.0,149.0,133.0,0.0,5.0,38.0,35.0,0.0,0.0,50.0,38.0,18.0,0.657577,2.127105,0.000000,0.778151,1.591065,1.556303,0.000000,0.000000,1.707570,1.591065,1.278754,0.0,0.0,0.0,0.026353,0.034150,1.67,1.89,5.22,0.25,2.0,0.000,0.375,0.000,0.088,0.000,0.810,0.000,0.190,2.0,0.020,0.483,0.000,0.040,0.960,0.000,-0.408656,-0.178822,-0.665314,0.322697,-0.179
835,-0.578566,-0.553323,0.219801,0.088713,-0.246769,-0.745221,0.641015,-0.402675,-0.134936,-0.407499,-1.166153,-0.320157,-0.575348,0.312373,-0.036421,0.985207,0.710736,0.063517,0.001486,0.039338,-0.141059,0.250194,0.147430,-0.288875,0.147272,1.199513,0.686839,0.789262,-0.241140,-0.269748,0.133515,-0.600769,-0.509239,-0.120329,-0.341126,0.238928,-0.473816,-1.713392,0.527368,-0.722902,-0.381433,-0.283341,-1.638975,-0.397212,-0.145566,0.997473,-0.732677,-0.159027,-0.176729
3,1.0,,1.0,1.0,,1.0,Cleveland Clinic,SU2CLC-CLE-NIVO19,SU2CLC-CLE-NIVO19-T1,SU2CLC-CLE-NIVO19-N1,SU2CLC-CLE-NIVO19-T1,Keep,57,F,2.0,2.0,15.0,Adeno,,4.0,,,,,,,-35.0,1.0,Nivolumab,PD(L)1,0.0,0.0,1.0,297.0,PR,,1.0,297.0,1.0,CR/PR,9.756899,9.756899,35.0,2.772589,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.039,0.378,-0.020,-0.020,-0.020,-0.031,-0.031,-0.031,-0.031,-0.031,-0.162,-0.162,-0.162,-0.162,0.169,0.130,-0.122,-0.122,0.351,0.351,0.351,0.580,0.269,0.269,0.494,-0.074,0.036,0.036,0.752,0.044,0.044,0.044,0.044,-0.437,0.142,0.002,0.002,0.040,0.097,0.097,0.097,0.097,-0.092,0.080,-0.277,0.133,0.089,0.378210,-0.020453,-0.031149,-0.031149,-0.161520,0.168510,0.130160,0.351330,0.351330,0.351330,0.268890,0.493940,-0.074444,0.000000,0.752350,0.043510,0.043510,0.043510,-0.273010,-0.056310,0.096980,0.079868,-0.277210,0.132980,0.158660,-0.039221,-0.020453,-0.020453,-0.031149,0.174020,-0.161520,-0.161520,-0.104930,-0.104930,-0.055515,-0.404420,0.351330,-0.406440,-0.406440,-0.074444,-0.074444,-0.074444,0.202210,0.036173,0.036173,0.036173,0.000000,0.752350,0.043510,0.043510,0.043510,-0.437390,-0.056310,-0.056310,0.002389,0.039680,0.039680,0.096980,0.096980,-0.092204,0.079868,0.079868,0.079868,-0.434540,-0.12557,-0.064216,0.089011,0.089011,0.000,0.378,0.000,0.000,0.000,0.000,-0.162,-0.162,-0.105,-0.105,0.000,0.000,0.351,0.351,0.269,0.269,0.000,0.000,0.00,0.00,0.000,0.752,0.000,0.000,-0.273,0.000,0.000,0.000,0.000,0.000,0.000,-0.452,0.000,0.000,0.000,-0.435,-0.435,0.000,0.000,10.0,10.0,602.0,535.0,15.0,10.0,125.0,110.0,8.0,1.0,46.0,480.0,49.0,1.261803,2.729165,1.204120,1.041393,2.100371,2.045323,0.954243,0.301030,1.672098,2.682145,1.698970,0.0,0.0,0.0,0.011776,0.020402,1.49,1.91,2.10,0.37,1.0,0.489,0.372,0.132,0.004,0.490,0.374,0.133,0.004,2.0,0.000,0.795,0.000,0.000,1.000,0.000,0.670662,0.791995,0.439311,1.339746,0.7891
72,0.343941,0.595709,0.483138,-0.176360,0.259516,0.160674,0.756281,0.478177,-0.097389,-0.375951,-0.558983,0.786460,-0.463227,0.048374,0.216316,0.538209,0.525296,0.338409,0.408307,0.465818,0.409422,-0.066813,0.698741,-0.443927,-0.243725,0.313025,0.444627,0.766106,1.190017,-0.606071,0.155629,-0.065903,-0.128578,1.258769,-0.274179,-0.066995,0.329074,-0.109088,2.042201,0.414281,-0.651719,-0.013068,0.592631,0.504079,0.551709,-0.312533,0.795462,0.512162,0.489272
4,,,,1.0,,1.0,Cleveland Clinic,SU2CLC-CLE-NIVO2,,,SU2CLC-CLE-NIVO2-T1,Keep,63,F,0.0,0.0,0.0,Adeno,,4.0,,,,,,,-262.0,4.0,Nivolumab,PD(L)1,1.0,1.0,1.0,68.0,PD,,1.0,123.0,0.0,PD,2.233903,4.040736,262.0,0.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,0.000,0.183,0.317,0.505,0.000,0.182,0.316,0.503,3.0,0.006,0.027,0.139,0.036,0.156,0.808,-0.675524,1.059913,0.732698,0.712259,1.031519,1.181614,0.169688,0.206836,0.146816,1.047910,0.444440,0.236922,0.788992,0.614575,-1.097114,0.161120,0.551745,0.556051,0.540561,0.858795,0.002414,0.768654,0.997207,-0.068318,0.931905,0.860628,1.111493,0.529933,-0.186005,0.663986,0.181901,0.321043,-0.930557,0.196371,1.801350,-0.306568,-0.883556,2.732102,1.364896,-0.042298,-0.055184,-0.387409,0.356693,0.243782,0.802809,-0.332503,0.312384,0.651996,0.823568,1.290402,0.151119,1.524649,1.160909,0.155196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388,1.0,,1.0,,,,Yale University,SU2CLC-YAL-2523,SU2CLC-YAL-2523-T1,SU2CLC-YAL-2523-N1,,,63,M,0.0,1.0,40.0,Squamous,,3.0,A,0.00,,22C3,,Illumina,-545.0,3.0,Atezolizumab,PD(L)1,1.0,0.0,1.0,36.0,PD,0.43,1.0,51.0,0.0,PD,1.182654,1.675427,545.0,3.713572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.499,0.363,0.859,0.859,0.400,-0.491,-0.638,1.309,1.309,1.309,0.418,0.418,0.418,-0.454,2.473,0.000,-0.536,-0.536,0.396,1.144,-0.428,-0.491,-0.055,-0.055,0.368,-0.423,-0.085,0.000,0.301,0.384,1.630,0.314,0.314,0.001,-0.050,-0.454,-0.373,-0.245,0.379,0.379,0.379,0.423,-0.483,-0.107,0.387,0.329,-0.105,0.363180,0.859080,1.309200,1.309200,0.418300,2.472800,0.000101,0.315000,0.395570,1.144300,-0.054842,0.367540,2.068200,-0.427340,0.300730,1.629900,0.313510,0.313510,0.001034,0.023263,0.422540,-0.107370,0.387250,-0.050835,-0.440330,-0.357420,0.421610,0.346760,-0.637610,-0.223640,2.322700,-0.454480,-0.997190,-0.520500,-1.292900,-0.482220,0.395570,-0.054842,-0.054842,-0.422660,0.054511,0.048242,0.435510,-0.084937,0.000000,-0.084165,-1.292900,-0.247950,-0.343750,0.412690,0.313510,0.001034,0.009124,-0.050284,-1.292900,-0.039661,-0.039661,0.378540,0.378540,-0.505000,-0.252910,-0.107370,-0.080948,0.387250,-1.29290,0.341450,-0.104860,-0.104860,-0.463,0.504,0.422,0.422,1.303,1.303,0.000,0.000,0.171,-0.451,-1.293,-1.293,0.396,0.396,0.000,0.368,0.340,0.340,0.00,0.00,-0.380,0.000,0.314,0.314,0.000,0.000,-0.427,0.000,0.000,0.379,0.379,-0.452,-0.452,-0.107,-0.107,0.387,0.000,0.341,-0.105,23.0,20.0,711.0,455.0,240.0,18.0,79.0,49.0,29.0,1.0,39.0,479.0,176.0,1.333966,2.658965,2.382017,1.278754,1.903090,1.698970,1.477121,0.301030,1.602060,2.681241,2.247973,0.0,0.0,0.0,0.051362,0.085596,2.13,3.67,3.23,0.59,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
389,1.0,,1.0,,,,Yale University,SU2CLC-YAL-2525,SU2CLC-YAL-2525-T1,SU2CLC-YAL-2525-N1,,,40,F,0.0,1.0,40.0,Adeno,,3.0,A,,,22C3,KRAS,Illumina,-1427.0,6.0,Nivolumab,PD(L)1,1.0,1.0,1.0,93.0,PD,,1.0,206.0,0.0,PD,3.055191,6.767411,1427.0,3.713572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.035,0.465,0.008,0.008,0.353,-0.360,-0.360,0.422,0.422,0.422,-0.012,-0.012,-0.012,-0.012,2.613,-0.092,-0.035,-0.035,1.040,0.983,0.850,0.722,1.786,1.732,1.852,0.001,-0.003,-0.003,0.852,0.177,0.008,-0.030,-0.074,0.118,-0.405,-0.367,-0.351,0.422,-0.420,0.374,0.374,0.374,0.008,-0.001,0.603,0.427,-0.301,0.464590,0.008197,0.421530,0.421530,-0.011749,2.612700,-0.092095,1.557600,1.040400,0.982710,1.785700,1.852500,0.000794,-0.054825,0.851760,0.008099,-0.030136,-0.074246,0.117700,2.650900,0.374320,-0.000987,0.603370,0.386720,-1.292900,0.013320,0.008197,-0.033220,-0.359960,-0.323100,-0.011749,-0.011749,-0.057092,-0.043784,-1.292900,-0.034918,1.610800,-0.115790,-0.759720,0.000794,0.041843,-0.020513,0.010569,-0.002912,-0.002912,-0.002912,-1.213700,-0.371500,0.017357,-0.019135,-0.074246,0.117700,-0.383290,-0.404500,-1.292900,0.421570,0.421570,-0.393020,-0.411780,0.007820,-0.299150,-0.000987,-0.000987,-0.371380,-0.62763,-0.390730,-0.114880,-1.292900,0.000,0.711,0.000,0.000,-0.343,-0.034,0.000,0.000,2.613,0.000,0.000,0.000,0.880,0.880,0.000,1.776,0.000,0.000,0.00,0.00,0.000,0.000,0.000,0.000,0.118,-0.405,-0.367,0.422,0.422,-0.356,0.018,0.000,0.000,0.000,0.000,0.029,0.400,-0.391,0.000,22.0,19.0,576.0,454.0,105.0,8.0,121.0,92.0,23.0,1.0,42.0,396.0,25.0,1.242662,2.658011,2.025306,0.954243,2.086360,1.968483,1.380211,0.301030,1.633468,2.598791,1.414973,0.0,1.0,0.0,0.011574,0.052419,2.08,3.35,3.52,0.48,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
390,1.0,,1.0,,,,Yale University,SU2CLC-YAL-2532,SU2CLC-YAL-2532-T1,SU2CLC-YAL-2532-N1,,,75,M,0.0,1.0,25.0,Other,NSCLC NOS poorly differentiated,4.0,,0.90,,22C3,,Illumina,-19.0,1.0,Nivolumab + Ipilimumab,PD(L)1 + CTLA4,0.0,0.0,0.0,1305.0,PR,-0.80,0.0,1438.0,1.0,CR/PR,42.871222,47.240473,19.0,3.258097,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.022,-0.082,0.026,0.026,0.026,0.004,0.004,0.004,0.004,0.004,0.037,0.037,0.037,0.037,-0.014,0.020,0.020,0.020,0.426,0.426,0.246,0.246,-0.283,0.155,0.119,-0.029,-0.438,-0.015,-0.046,0.273,0.273,-0.019,-0.019,-0.021,-0.010,-0.107,-0.107,-0.143,-0.049,-0.049,-0.049,-0.049,0.018,-0.176,-0.054,-0.075,-0.078,-0.082343,0.026133,0.004388,0.004388,0.037238,-0.014198,-0.087166,0.392910,0.426020,0.426020,-0.283170,0.119000,-0.029492,-0.012031,-0.045722,0.273060,-0.019099,-0.019099,-0.020656,-0.009630,-0.048981,-0.175690,-0.054332,-0.074526,0.087009,0.014220,0.026133,-0.047740,0.004388,0.037238,0.037238,0.037238,0.018911,0.018911,-1.292900,0.020027,0.426020,-0.283170,-0.283170,-0.029492,-0.029492,0.000000,-0.137620,-0.438150,-0.015318,-0.053158,-1.287100,-0.045722,0.273060,0.273060,-0.019099,-0.020656,-0.009630,-0.009630,-1.292900,-0.142680,-0.142680,-0.143710,-0.143710,0.018094,-0.420070,-0.175690,-0.175690,-0.054332,-1.29290,-0.136330,-0.078053,-0.191560,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.426,0.426,0.119,0.119,0.000,0.000,0.00,0.00,0.000,0.000,0.273,0.000,0.000,0.000,-0.107,-0.143,-0.143,0.000,0.000,0.000,0.000,-0.176,-0.176,0.000,0.000,-0.136,0.000,5.0,17.0,780.0,703.0,52.0,8.0,169.0,146.0,18.0,1.0,58.0,449.0,216.0,1.374137,2.847573,1.724276,0.954243,2.230449,2.167317,1.278754,0.301030,1.770852,2.653213,2.336460,0.0,0.0,0.0,0.000000,0.176959,0.00,3.96,2.07,0.25,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
391,1.0,,1.0,,,,Yale University,SU2CLC-YAL-2537,SU2CLC-YAL-2537-T1,SU2CLC-YAL-2537-N1,,,56,F,0.0,1.0,12.0,Adeno,,4.0,,,,22C3,KRAS,Illumina,-937.0,4.0,Pembrolizumab,PD(L)1,1.0,0.0,1.0,60.0,PD,0.27,1.0,116.0,0.0,PD,1.971091,3.810775,937.0,2.564949,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.029,0.284,0.279,0.279,0.279,0.015,0.015,0.009,0.009,0.009,0.017,0.017,0.017,0.017,0.005,-0.221,0.496,0.496,0.008,0.008,0.008,-0.018,0.267,0.267,0.267,-0.012,-0.000,-0.000,-0.022,-0.004,0.000,0.222,0.222,-0.001,-0.007,0.007,0.007,-0.005,-0.042,-0.042,-0.042,-0.042,0.003,-0.071,0.015,-0.250,-0.033,0.284290,0.278690,0.008917,0.008917,0.017340,0.004573,-0.323960,-0.022876,0.007820,0.007820,0.266620,0.266620,-0.011685,-0.021615,-0.021615,0.000000,0.221610,0.221610,-0.000705,-0.011577,-0.042047,-0.071320,0.015282,0.968810,-1.292900,0.010665,0.278690,0.278690,0.015433,-0.297400,0.017340,0.017340,0.004573,-0.009241,-1.292900,-0.224260,0.007820,0.266620,0.266620,-0.011685,-0.011685,-0.011685,-0.011685,-0.000032,-0.000032,-0.000032,-0.021615,-0.021615,-0.132610,0.000000,0.221610,-0.000705,-0.011577,-1.292900,-1.292900,-0.004943,-0.004943,-0.042047,-0.042047,0.002879,-0.338520,-0.071320,-0.071320,-0.118490,-0.34282,-0.289990,-0.033154,-0.057959,0.000,0.000,0.279,0.279,0.000,0.000,0.000,0.000,0.000,0.000,0.000,-0.224,0.000,0.000,0.267,0.267,0.000,0.000,0.00,0.00,0.000,0.000,0.000,0.166,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,8.0,11.0,291.0,197.0,69.0,5.0,98.0,59.0,30.0,1.0,3.0,272.0,0.0,0.946869,2.296665,1.845098,0.778151,1.995635,1.778151,1.491362,0.301030,0.602060,2.436163,0.000000,0.0,1.0,0.0,0.001617,0.050430,0.00,3.20,2.16,0.28,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
## List of features (computed once and reused below)
feature_names = su2c_merge_master.columns.to_list()
print("Number of features: ", len(feature_names))
print(feature_names)

## Save the list of features, written as the Python repr of the list.
## UTF-8 is requested explicitly because some column names contain non-ASCII characters
## (e.g. "hMø1_ZI"), so the file content must not depend on the platform default encoding.
with open("/content/drive/MyDrive/Tesi/Code/Personal_Code/Saved/Features/Merged_Features.txt", "w", encoding = "utf-8") as output:
    output.write(str(feature_names))

## Save the whole dataset.
## to_csv writes directly to the path, so the full CSV text is never built in memory first.
## The index is still written as the first (unnamed) column, as before.
su2c_merge_master.to_csv("/content/drive/MyDrive/Tesi/Code/Personal_Code/Saved/Data/su2c_merge_master.csv", encoding = "utf-8")

Number of features:  353
['WES_Cohort_1', 'WES_Cohort_2', 'WES_All', 'RNA_Cohort_1', 'RNA_Cohort_2', 'RNA_All', 'Institution', 'Harmonized_SU2C_Participant_ID_v2', 'Harmonized_SU2C_WES_Tumor_Sample_ID_v2', 'Harmonized_SU2C_WES_Normal_Sample_ID_v2', 'Harmonized_SU2C_RNA_Tumor_Sample_ID_v2', 'Pre-treatment_RNA_Sample_QC', 'Patient_Age_at_Diagnosis', 'Patient_Sex', 'Patient_Race', 'Patient_Smoking_Status', 'Patient_Smoking_Pack_Years_Harmonized', 'Histology_Harmonized', 'Histology_Detail', 'Initial_Stage', 'Initial_Stage_Substage', 'PDL1_TPS', 'PDL1_TPS_Description', 'Local_Antibody_Clone', 'Clinical_Driver', 'Sequencing_Platform', 'Advanced_Diagnosis_Date', 'Line_of_Therapy', 'Agent_PD1', 'Agent_PD1_Category', 'Prior_Platinum', 'Prior_TKI', 'Harmonized_PFS_Event', 'Harmonized_PFS_Days', 'Harmonized_Confirmed_BOR', 'Harmonized_BOR_RECIST', 'Harmonized_OS_Event', 'Harmonized_OS_Days', 'Harmonized_Confirmed_BOR_Bin', 'Harmonized_Confirmed_BOR_3_Cat', 'Harmonized_PFS_Months', 'Harmonized_OS_

#### Read TCGA Data

In [None]:
## Read the harmonized TCGA clinical table (tab-separated text file)
tcga_clinical_harm = pd.read_csv(
    filepath_or_buffer = source_path_data_tcga + "TCGA_Clinical_Harmonized.txt",
    sep = "\t",
)
## Sanity check: print the dimensions and show the first five rows
print("Shape is: ", tcga_clinical_harm.shape)
display(tcga_clinical_harm.head(n = 5))

Shape is:  (1097, 110)


Unnamed: 0,Tumor_Sample_Barcode,Study ID,Patient ID,Sample ID,Diagnosis Age,Neoplasm Disease Stage American Joint Committee on Cancer Code.1,Neoplasm American Joint Committee on Cancer Clinical Distant Metastasis M Stage,American Joint Committee on Cancer Metastasis Stage Code,American Joint Committee on Cancer Lymph Node Stage Code.1,American Joint Committee on Cancer Lymph Node Stage Code,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code,Neoplasm Disease Stage American Joint Committee on Cancer Code,American Joint Committee on Cancer Publication Version Type,American Joint Committee on Cancer Tumor Stage Code,ALK Analysis Type,ALK Translocation Status,ALK Translocation Variant,Cancer Type,Cancer Type Detailed,Carbon monoxide diffusion dlco,Days to Sample Collection.,Last Alive Less Initial Pathologic Diagnosis Date Calculated Day Value,days_to_patient_progression_free,Days to Sample Procurement,days_to_tumor_progression,Disease Free (Months),Disease Free Status,Disease code,Performance Status,Ethnicity Category,Lymphomatous Extranodal Site Involvement Indicator,Fev1 fvc ratio postbroncholiator,Fev1 fvc ratio prebroncholiator,Fev1 percent ref postbroncholiator,Fev1 percent ref prebroncholiator,Form completion date,Fraction Genome Altered,Neoplasm Histologic Type Name,Prior immunologic disease type,Prior immunologic disease other,Neoadjuvant Therapy Type Administered Prior To Resection Text,Prior Cancer Diagnosis Occurence,Prior infectious disease,HIV Status,ICD-10 Classification,"International Classification of Diseases for Oncology, Third Edition ICD-O-3 Histology Code","International Classification of Diseases for Oncology, Third Edition ICD-O-3 Site Code",Informed consent verified,Year Cancer Initial Diagnosis,Is FFPE,Karnofsky Performance Score,Kras gene analysis indicator,KRAS Mutation,Kras mutation identified type,Primary Tumor Laterality,Location lung parenchyma,Longest Dimension,First Pathologic Diagnosis Biospecimen Acquisition 
Method Type,First Pathologic Diagnosis Biospecimen Acquisition Other Method Type,First Pathologic Diagnosis Biospecimen Acquisition Method Type.1,Mutation Count,Mutation Status,Mutation Type,New Neoplasm Event Post Initial Therapy Indicator,Number of lymphnodes positive by ihc,Oct embedded,Oncotree Code,Overall Survival (Months),Overall Survival Status,Specimen Collection Method,Other Patient ID,Other Sample ID,Pathology Report File Name,Pathology report uuid,Performance Status Assessment Timepoint Category,Adjuvant Postoperative Pharmaceutical Therapy Administered Indicator,Lymph node location positive pathology name,Primary Other Site of Disease Name,Patient Primary Tumor Site,Project code,Tissue Prospective Collection Indicator,Pulmonary function test indicator,Race Category,Did patient start adjuvant postoperative radiotherapy?,Surgical Margin Resection Status,Tissue Retrospective Collection Indicator,Number of Samples Per Patient,Sample Initial Weight,Sample Type,Sample type id,Sex,Shortest Dimension,Person Cigarette Smoking History Pack Year Value,Started Smoking Year,Stopped Smoking Year,Somatic Status,Specimen Current Weight,Specimen Freezing Means,Specimen Second Longest Dimension,Stage Other,Adjuvant Postoperative Targeted Therapy Administered Indicator,Time between clamping and freezing,Time between excision and freezing,Tissue Source Site,Patient Smoking History Category,Primary Therapy Outcome Success Type,Tumor Site,Person Neoplasm Status,Vial number,Tumor Tissue Site
0,TCGA-05-4244-01,luad_tcga,TCGA-05-4244,TCGA-05-4244-01,70.0,,,M1,,,N2,Stage IV,6th,T2,,,,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,,,0.0,,,,0.0,0:DiseaseFree,,,,,,,,,7/22/10,0.4565,Lung Adenocarcinoma,,,No,No,,,C34.3,8140/3,C34.3,YES,2009.0,NO,,,,,,Peripheral Lung,1.1,,,,,,,,,,LUAD,0.0,0:LIVING,,34040b83-7e8a-4264-a551-b16621843e28,bac0b02d-ac3b-4784-b8bf-180eadd548a3,TCGA-05-4244.3a844132-f813-4d8e-8f7d-dbae0b23d...,3a844132-f813-4d8e-8f7d-dbae0b23d7fd,,,,,R-Lower,,NO,,,,RX,YES,1,,Primary,1,MALE,0.3,38.0,,,Matched,,,0.9,,,,,5,4.0,,Lung,TUMOR FREE,A,
1,TCGA-05-4245-01,luad_tcga,TCGA-05-4245,TCGA-05-4245-01,81.0,,,M0,,,N2,Stage IIIA,6th,T2,,,,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,,,0.0,,,,10.97,1:Recurred/Progressed,,,,,,,,,7/22/10,,Lung Adenocarcinoma,,,No,Yes,,,C34.1,8140/3,C34.1,YES,2009.0,NO,,,,,,,0.9,,,,,,,,,,LUAD,23.98,0:LIVING,,03d09c05-49ab-4ba6-a8d7-e7ccf71fafd2,c9dd827b-e0ae-4807-adbf-1a0bf2aeb52f,TCGA-05-4245.902fe548-5b93-49c9-81db-2af4a4a88...,902fe548-5b93-49c9-81db-2af4a4a88f3c,,,,,L-Upper,,NO,,,,R2,YES,1,,Primary,1,Male,0.5,32.0,,,Matched,,,0.7,,,,,5,4.0,,Lung,TUMOR FREE,A,
2,TCGA-05-4249-01,luad_tcga,TCGA-05-4249,TCGA-05-4249-01,67.0,,,M0,,,N0,Stage IB,6th,T2,,,,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,,,0.0,,,,50.03,0:DiseaseFree,,,,,,,,,7/22/10,0.2221,Lung Adenocarcinoma,,,No,No,,,C34.3,8140/3,C34.3,YES,2007.0,NO,,,,,,Peripheral Lung,1.4,,,,288.0,,,,,,LUAD,50.03,0:LIVING,,4addf05f-3668-4b3f-a17f-c0227329ca52,80f196fe-1eaf-40cb-ab58-c84795acc5c7,TCGA-05-4249.7e920317-d5c2-4160-9e2b-ef0101eb5...,7e920317-d5c2-4160-9e2b-ef0101eb5a23,,,,,R-Lower,,NO,,,,R0,YES,1,,Primary,1,Male,0.4,52.0,,,Matched,,,0.8,,,,,5,3.0,,Lung,TUMOR FREE,A,
3,TCGA-05-4250-01,luad_tcga,TCGA-05-4250,TCGA-05-4250-01,79.0,,,M0,,,N1,Stage IIIA,6th,T3,,,,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,,,0.0,,,,,,,,,,,,,,7/22/10,0.2362,Lung Adenocarcinoma,,,No,No,,,C34.3,8140/3,C34.3,YES,2007.0,NO,,,,,,,1.2,,,,,,,,,,LUAD,3.98,1:DECEASED,,f98ecd8a-b878-4f53-b911-20cd8e17281c,8f274178-7a8e-46b6-8d2c-900338bbb946,TCGA-05-4250.5574f2f8-f247-40e6-a285-7793edcf5...,5574f2f8-f247-40e6-a285-7793edcf5358,,,,,R-Lower,,NO,,,,R2,YES,1,,Primary,1,Female,0.3,47.0,,,Matched,,,0.7,,,,,5,4.0,,Lung,,A,
4,TCGA-05-4382-01,luad_tcga,TCGA-05-4382,TCGA-05-4382-01,68.0,,,M0,,,N0,Stage IB,6th,T2,,,,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,,,0.0,,,,10.97,1:Recurred/Progressed,,,,,,,,,7/22/10,0.0854,Lung Adenocarcinoma,,,No,Yes,,,C34.1,8255/3,C34.1,YES,2009.0,NO,,,,,,,0.9,,,,1273.0,,,,,,LUAD,19.94,0:LIVING,,3434b91a-c05f-460f-a078-7b1bb6e7085d,cce6d71f-369e-467f-bd7e-03d20e97b7f3,TCGA-05-4382.952c0f32-1472-49e1-8334-b0f1de4ac...,952c0f32-1472-49e1-8334-b0f1de4ac921,,,,,L-Upper,,NO,,,,R0,YES,1,,Primary,1,Male,0.3,62.0,,,Matched,,,0.8,,,,,5,4.0,,Lung,TUMOR FREE,A,


In [None]:
## Read the harmonized TCGA histology / NMF table (tab-separated text file)
tcga_clinical_hist = pd.read_csv(
    filepath_or_buffer = source_path_data_tcga + "TCGA_Histology_NMF_Harmonized.txt",
    sep = "\t",
)
## Sanity check: print the dimensions and show the first five rows
print("Shape is: ", tcga_clinical_hist.shape)
display(tcga_clinical_hist.head(n = 5))

Shape is:  (1082, 10)


Unnamed: 0,Tumor_Sample_Barcode,B_cluster,B1_strength,B2_strength,B3_strength,B4_strength,B1_norm,B2_norm,B3_norm,B4_norm
0,TCGA-05-4244-01,2,0.1009016,0.662309,5e-324,1e-323,0.1322067,0.867793,5e-324,1.5e-323
1,TCGA-05-4249-01,2,3.9470040000000004e-255,0.793846,5e-324,1.5e-323,4.972003e-255,1.0,5e-324,2e-323
2,TCGA-05-4250-01,1,0.4153831,0.363432,0.0005013239,5e-324,0.5330095,0.466347,0.0006432867,5e-324
3,TCGA-05-4382-01,2,0.1867183,0.570932,1.3279509999999999e-306,1e-323,0.2464439,0.753556,1.752723e-306,1.5e-323
4,TCGA-05-4384-01,2,9.001624e-256,0.798828,5e-324,1e-323,1.1268540000000001e-255,1.0,5e-324,1.5e-323


In [None]:
## Warning: this cell is slow, reading the file takes about 2 minutes

## Read the harmonized TCGA RNA table (tab-separated text file)
tcga_rna_log = pd.read_csv(
    filepath_or_buffer = source_path_data_tcga + "TCGA_RNA_Harmonized_log.txt",
    sep = "\t",
)
## Sanity check: print the dimensions and show the first five rows
print("Shape is: ", tcga_rna_log.shape)
display(tcga_rna_log.head(n = 5))

Output hidden; open in https://colab.research.google.com to view.