# Clinical Data Processing

## Where is the data?

In [3]:
# Directory locations used throughout the notebook (relative to the notebook file).
input_path = '../Data/Intermediate_Files/'
clinicaldata_path = '../Data/Raw_Data/Clinical_Data/'
# `merge_index_1031` reads this directory under the name `clinical_data_path`;
# bind both spellings to the same value so that lookup cannot raise NameError.
clinical_data_path = clinicaldata_path
output_path = '../Data/Processed_Data/'

## Load Methyl Data

In [4]:
import pandas as pd

# Load the methylation table from the intermediate-files directory.
# NOTE(review): unpickling can execute arbitrary code; only load files produced by this project.
df_methyl = pd.read_pickle(
    input_path+'2_MethylData_Processing_Output.pkl')

# Report the table dimensions: shape[1] is the column count, shape[0] the row count.
print(
    f' Dataset (df) contains {df_methyl.shape[1]} columns (mC sites) and {df_methyl.shape[0]} rows (samples).')


 Dataset (df) contains 333249 columns (mC sites) and 3357 rows (samples).


In [5]:
# Samples per batch; dropna=False also counts rows with a missing batch label.
df_methyl['Batch'].value_counts(dropna=False)

Batch
GSE49031          933
GSE190931         581
GSE124413         495
GSE159907         316
GDC_TARGET-AML    314
GDC_TCGA-AML      194
GSE152710         166
GSE147667         153
GDC_TARGET-ALL    141
GSE133986          64
Name: count, dtype: int64

## Add Labels/Clinical Outcome Data

In [6]:
# Import functions to clean up clinical data
from FM_Functions.Clinical_Data_CleanUp import *

In [29]:
def merge_index_1031(filepath1 = '/TARGET/TARGET-AML/TARGET_AML_ClinicalData_AML1031_20221108.xlsx',
                     filepath2 = '../Data/Raw_Data/Methyl_Array_EPIC/GSE190931/sample_sheet_meta_data.pkl',
                     clinical_data_path = '../Data/Raw_Data/Clinical_Data/'):
    """Merge the AAML1031 clinical spreadsheet with the GSE190931 sample sheet.

    Parameters
    ----------
    filepath1 : str
        Path of the clinical Excel file, relative to `clinical_data_path`.
    filepath2 : str
        Path of the pickled sample-sheet metadata.
    clinical_data_path : str
        Directory that `filepath1` is resolved against. This used to be read
        from an undefined global (the notebook defines `clinicaldata_path`),
        so it is now an explicit keyword with the same directory as default.

    Returns
    -------
    pandas.DataFrame
        One row per sample-sheet entry (right join on `Patient_ID`), indexed
        by `Sample_ID`; clinical columns are NaN for patients that are not in
        the spreadsheet.
    """
    # Join directory and file with exactly one separator, whichever side
    # (or both) already carries a slash.
    clinical_file = clinical_data_path.rstrip('/') + '/' + filepath1.lstrip('/')

    # Load clinical data files
    labels_1031 = pd.read_excel(clinical_file)
    meta        = pd.read_pickle(filepath2)

    # Third dash-separated field of `TARGET USI` is the patient identifier
    labels_1031['Patient_ID'] = labels_1031['TARGET USI'].str.split('-').str[2]

    # Last line of `description`, second-to-last underscore-separated token
    meta['Patient_ID'] = meta['description'].str.split('\n').str[-1].str.split('_').str[-2]

    # Index by `Patient_ID`, drop the final column, keep only the relevant columns
    meta = meta.set_index('Patient_ID').iloc[:,:-1][['fusion','timepoint','Sample_ID']]

    labels_1031 = labels_1031.set_index('Patient_ID')

    # Right join keeps every sample-sheet row; re-index by `Sample_ID`
    labels_1031 = labels_1031.join(meta, how='right').set_index('Sample_ID')

    return labels_1031

# Build the AAML1031 label table using the default file locations.
labels_1031 = merge_index_1031()

In [177]:
def merge_index_0531(dir = '../Data/Raw_Data/Clinical_Data/TARGET/TARGET-AML/',
                    filepath1 = 'TARGET_AML_ClinicalData_Discovery_20221108.xlsx',
                    filepath2 = 'TARGET_AML_ClinicalData_Validation_20221108.xlsx',
                    filepath3 = 'TARGET_AML_ClinicalData_LowDepthRNAseq_20221108.xlsx',
                    filepath4 = '../Data/Raw_Data/Methyl_Array_EPIC/GSE124413/sample_sheet_meta_data.pkl',
                    filepath5 = '../Data/Raw_Data/Methyl_Array_EPIC/GSE124413/GSE124413_series_matrix.csv'):
    """Merge the AAML0531 clinical spreadsheets with the GSE124413 sample metadata.

    Parameters
    ----------
    dir : str
        Directory prefix for the three clinical Excel files (plain string
        concatenation, so it must end with a path separator).
    filepath1, filepath2 : str
        Discovery and Validation clinical spreadsheets; they are stacked and
        de-duplicated on `TARGET USI`.
    filepath3 : str
        Spreadsheet that supplies the `Gene Fusion.1` column.
    filepath4 : str
        Pickled sample sheet; must provide `GSM_ID` and `Sample_ID`.
    filepath5 : str
        Series-matrix CSV; transposed so that each sample becomes a row.

    Returns
    -------
    pandas.DataFrame
        One row per sample in the series matrix (right join on `Patient_ID`),
        indexed by `Sample_ID`.
    """
    # Local import: no visible notebook cell imports numpy, so relying on a
    # global `np` only works if another import happens to leak it.
    import numpy as np

    # Load all clinical data files for 0531
    labels_0531_1 = pd.read_excel(dir + filepath1, index_col=0)
    labels_0531_2 = pd.read_excel(dir + filepath2, index_col=0)
    labels_0531_3 = pd.read_excel(dir + filepath3, index_col=0)
    meta          = pd.read_pickle(filepath4)
    meta_matrix   = pd.read_csv(filepath5)

    # Stack Discovery and Validation; the index becomes the `TARGET USI` column
    labels_0531 = pd.concat([labels_0531_1, labels_0531_2], axis=0, join='outer').reset_index()

    def remove_the_duplicate_samples_with_more_nulls(df):
        '''
        Drop duplicated `TARGET USI` rows, keeping the row with the fewest nulls.
        '''
        # Treat the textual placeholders as missing before counting nulls
        df = df.replace({'NA': np.nan, 'unknown': np.nan, 'Unknown': np.nan})
        df['nan_count'] = df.isnull().sum(axis=1)

        # Stable sort: rows with equal null counts keep their input order, so
        # the row kept by `drop_duplicates` is reproducible between runs.
        df = df.sort_values('nan_count', kind='stable')

        # Keep the first (least-null) row of every `TARGET USI`
        df = df.drop_duplicates(subset= 'TARGET USI', keep='first')

        return df.drop(columns='nan_count')

    def clean_meta(meta_matrix, meta):
        '''
        Turn the series matrix into one row per sample and attach `Sample_ID`.
        '''
        # Transpose so that each original column becomes a row
        transposed_meta = meta_matrix.T.reset_index()

        # Last space-separated token of the original column label
        transposed_meta['new_index'] = transposed_meta['index'].str.rsplit(" ", n=1, expand=True)[1]

        # Promote the first row to header. The appended `new_index` column is
        # last and has no usable header cell, so name it by position instead
        # of relying on that cell being exactly `None`.
        header = transposed_meta.iloc[0].tolist()
        header[-1] = 'Patient_ID'
        transposed_meta.columns = header
        transposed_meta = transposed_meta.drop(transposed_meta.index[0])

        # Rename the accession column and index by it
        transposed_meta = transposed_meta.rename(columns={'!Sample_geo_accession':'GSM_ID'}).set_index('GSM_ID')

        # Join with meta DataFrame, select columns and index by patient
        meta_cleaned = meta.set_index('GSM_ID').join(transposed_meta)[['Patient_ID', '!Sample_characteristics_ch1','Sample_ID']].reset_index().set_index('Patient_ID')

        # Five names are assigned, so this assumes `!Sample_characteristics_ch1`
        # appears three times in the matrix (group, age, sex) — TODO confirm.
        meta_cleaned.columns = ['GSM_ID', 'Sample Type', 'age','sex', 'Sample_ID']

        return meta_cleaned

    # Adjust and clean metadata
    meta_cleaned = clean_meta(meta_matrix, meta)

    # Remove duplicate samples by keeping the row with fewer NaNs
    labels_0531 = remove_the_duplicate_samples_with_more_nulls(labels_0531)

    # Index by `TARGET USI` and bring in `Gene Fusion.1`
    labels_0531 = labels_0531.set_index('TARGET USI').join(labels_0531_3[['Gene Fusion.1']], how='outer')

    # Second and third dash-separated fields of `TARGET USI`
    labels_0531['Tumor Code'] = labels_0531.index.str.split('-').str[1]
    labels_0531['Patient_ID'] = labels_0531.index.str.split('-').str[2]

    # One row per patient (first occurrence wins)
    labels_0531 = labels_0531.drop_duplicates(subset='Patient_ID', keep='first').set_index('Patient_ID')

    # Right join keeps every sample present in the series matrix
    labels_0531 = labels_0531.join(meta_cleaned, how='right')

    # drop columns that are not needed
    labels_0531 = labels_0531.drop(columns=['age', 'sex', 'GSM_ID'])

    # Rename values in `Sample Type` column
    labels_0531['Sample Type'] = labels_0531['Sample Type'].replace({'group: tumor': 'Diagnosis',
                                                                    'group: normal': 'Bone Marrow Normal'})

    # Set index to `Sample_ID` to match methylation samples
    labels_0531 = labels_0531.set_index('Sample_ID')

    return labels_0531


In [178]:
# Build the AAML0531 label table using the default file locations.
labels_0531 = merge_index_0531()

In [179]:
# Display the merged AAML0531 labels (indexed by Sample_ID).
labels_0531

Unnamed: 0_level_0,Gender,Race,Ethnicity,Age at Diagnosis in Days,First Event,Event Free Survival Time in Days,Vital Status,Overall Survival Time in Days,Year of Diagnosis,Year of Last Follow Up,...,Chloroma Site of Relapse/Induction Failure,Cytogenetic Site of Relapse/Induction Failure,Other Site of Relapse/Induction Failure,Gene Fusion,Gemtuzumab ozogamicin treatment,Refractory Timepoint sent for Induction Failure Project,Comment,Gene Fusion.1,Tumor Code,Sample Type
Sample_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200989060017_R08C01,Female,White,Not Hispanic or Latino,4155.0,Relapse,611.0,Dead,1377.0,2007.0,2011.0,...,Yes,No,No,DEK-NUP214,NO Gentuzumab ozogamicin treatment,,"Validation,TMA",DEK-NUP214,20,Diagnosis
201005010039_R06C01,Male,White,Not Hispanic or Latino,5859.0,Censored,1763.0,Alive,1763.0,2010.0,2014.0,...,Not Done,Not Done,Not Done,KMT2A-MLLT3,Gentuzumab ozogamicin treatment,,"Validation,TMA",KMT2A-MLLT3,20,Diagnosis
201005010157_R08C01,Male,White,Not Hispanic or Latino,1539.0,Relapse,500.0,Alive,1590.0,2010.0,2014.0,...,No,No,No,,NO Gentuzumab ozogamicin treatment,,"Validation,TMA",none,20,Diagnosis
201005010131_R03C01,Female,White,Hispanic or Latino,4567.0,Relapse,478.0,Dead,599.0,2007.0,2008.0,...,No,No,No,NUP98-HOXA9,NO Gentuzumab ozogamicin treatment,,Validation,NUP98-HOXA9,20,Diagnosis
201005010102_R07C01,Male,White,Not Hispanic or Latino,618.0,Censored,2803.0,Alive,2803.0,2007.0,2014.0,...,Not Done,Not Done,Not Done,RUNX1-CBFA2T2,NO Gentuzumab ozogamicin treatment,,"Discovery,Validation,TMA",RUNX1-CBFA2T2,20,Diagnosis
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200989060017_R04C01,Male,Asian,Not Hispanic or Latino,4957.0,Censored,1653.0,Alive,1653.0,2009.0,2013.0,...,Not Done,Not Done,Not Done,CBFB-MYH11,NO Gentuzumab ozogamicin treatment,,"Validation,TMA",CBFB-MYH11,20,Diagnosis
200973410086_R03C01,Female,White,Not Hispanic or Latino,3981.0,Censored,1754.0,Alive,1754.0,2009.0,2014.0,...,Not Done,Not Done,Not Done,,Gentuzumab ozogamicin treatment,,"Validation,TMA",none,20,Diagnosis
200973410095_R04C01,Female,Asian,Not Hispanic or Latino,4269.0,Induction Failure,71.0,Alive,2386.0,2008.0,2014.0,...,No,No,No,,Gentuzumab ozogamicin treatment,,"Validation,TMA",none,20,Diagnosis
200991620024_R03C01,Male,White,Not Hispanic or Latino,4072.0,Relapse,180.0,Dead,418.0,2008.0,2009.0,...,No,No,No,,NO Gentuzumab ozogamicin treatment,,TMA,none,20,Diagnosis


In [147]:
# Show rows whose Patient_ID repeats an earlier row. `duplicated()` already
# returns a boolean mask, so no comparison with True is needed.
# NOTE(review): `labels` is not assigned in any visible cell; this cell only runs
# with leftover kernel state and will fail after Restart & Run All — confirm or remove.
labels[labels['Patient_ID'].duplicated()]

Unnamed: 0_level_0,Gender,Race,Ethnicity,Age at Diagnosis in Days,First Event,Event Free Survival Time in Days,Vital Status,Overall Survival Time in Days,Year of Diagnosis,Year of Last Follow Up,...,Chloroma Site of Relapse/Induction Failure,Cytogenetic Site of Relapse/Induction Failure,Other Site of Relapse/Induction Failure,Gene Fusion,Gemtuzumab ozogamicin treatment,Refractory Timepoint sent for Induction Failure Project,Comment,Gene Fusion.1,Patient_ID,Tumor Code
TARGET USI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TARGET-21-PARBTV,Male,White,Not Hispanic or Latino,6273.0,Induction Failure,77.0,Dead,444.0,2007.0,2008.0,...,No,No,No,,NO Gentuzumab ozogamicin treatment,EOI I,This case is also found in the AML validation ...,,PARBTV,21
TARGET-21-PARHRS,Male,White,Not Hispanic or Latino,3636.0,Induction Failure,82.0,Alive,2616.0,2007.0,2014.0,...,No,No,No,,NO Gentuzumab ozogamicin treatment,EOI II,This case is also found in the AML validation ...,,PARHRS,21
TARGET-21-PARLSL,Male,White,Not Hispanic or Latino,4456.0,Censored,1077.0,Alive,1077.0,2007.0,2010.0,...,Not Done,Not Done,Not Done,,Gentuzumab ozogamicin treatment,EOI I,This case is also found in the AML validation ...,,PARLSL,21
TARGET-21-PARNAW,Female,White,Not Hispanic or Latino,430.0,Induction Failure,66.0,Dead,457.0,2007.0,2009.0,...,No,No,No,,NO Gentuzumab ozogamicin treatment,EOI I,This case is also found in the AML validation ...,,PARNAW,21
TARGET-21-PARZIA,Female,Asian,Not Hispanic or Latino,6089.0,Death without Remission,77.0,Dead,77.0,2008.0,2008.0,...,Not Done,Not Done,Not Done,,Gentuzumab ozogamicin treatment,EOI II,This case is also found in the AML validation ...,,PARZIA,21
TARGET-21-PASDXR,Male,White,Not Hispanic or Latino,3440.0,Induction Failure,33.0,Dead,276.0,2008.0,2009.0,...,No,No,No,NUP98-NSD1,Gentuzumab ozogamicin treatment,EOI I,This case is also found in the AML validation ...,,PASDXR,21
TARGET-21-PASFHK,Male,White,Hispanic or Latino,2164.0,Induction Failure,85.0,Alive,1655.0,2008.0,2013.0,...,No,No,No,,NO Gentuzumab ozogamicin treatment,EOI I,"Induction Failure, TMA",none,PASFHK,21
TARGET-21-PASFJJ,Female,White,Not Hispanic or Latino,3836.0,Censored,2205.0,Alive,2205.0,2008.0,2014.0,...,Not Done,Not Done,Not Done,,Gentuzumab ozogamicin treatment,EOI I,This case is also found in the AML validation ...,,PASFJJ,21
TARGET-21-PASFLG,Male,White,Hispanic or Latino,3971.0,Induction Failure,69.0,Dead,694.0,2008.0,2010.0,...,No,No,No,NUP98-NSD1,NO Gentuzumab ozogamicin treatment,EOI I,This case is also found in the AML validation ...,,PASFLG,21
TARGET-21-PASHMK,Male,White,Not Hispanic or Latino,3532.0,Induction Failure,76.0,Alive,1968.0,2008.0,2014.0,...,No,No,No,,,,Validation,,PASHMK,21


In [21]:
# Display the methylation rows that belong to batch GSE190931.
df_methyl[df_methyl['Batch'] == 'GSE190931']

IlmnID,Batch,cg00000109,cg00000236,cg00000292,cg00000363,cg00000622,cg00000658,cg00000714,cg00000721,cg00000734,...,ch.9.83519450F,ch.9.837340R,ch.9.84051654F,ch.9.84078312F,ch.9.86947500F,ch.9.88862796F,ch.9.90287778F,ch.9.90621653R,ch.9.93402636R,ch.9.98463211R
202897270043_R01C01,GSE190931,0.935,0.893,0.880,0.361,0.012,0.897,0.360,0.958,0.075,...,0.022,0.039,0.045,0.038,0.044,0.039,0.047,0.032,0.028,0.031
202897270043_R03C01,GSE190931,0.967,0.882,0.885,0.510,0.013,0.868,0.384,0.968,0.070,...,0.021,0.055,0.028,0.045,0.039,0.032,0.023,0.025,0.023,0.025
202897270043_R04C01,GSE190931,0.880,0.938,0.751,0.636,0.015,0.823,0.433,0.921,0.073,...,0.031,0.055,0.032,0.088,0.040,0.056,0.038,0.035,0.031,0.031
202897270043_R05C01,GSE190931,0.874,0.651,0.326,0.133,0.015,0.870,0.107,0.755,0.087,...,0.048,0.295,0.034,0.121,0.046,0.047,0.046,0.034,0.037,0.065
202897270043_R06C01,GSE190931,0.926,0.779,0.707,0.166,0.013,0.867,0.244,0.959,0.076,...,0.025,0.061,0.041,0.030,0.039,0.049,0.029,0.029,0.027,0.036
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202915410140_R03C01,GSE190931,0.952,0.860,0.824,0.245,0.015,0.867,0.347,0.965,0.064,...,0.022,0.049,0.030,0.026,0.035,0.033,0.037,0.030,0.026,0.030
202915410140_R04C01,GSE190931,0.929,0.956,0.806,0.609,0.014,0.877,0.116,0.962,0.059,...,0.025,0.054,0.033,0.033,0.037,0.030,0.038,0.025,0.021,0.028
202915410140_R05C01,GSE190931,0.940,0.951,0.769,0.687,0.015,0.862,0.251,0.961,0.066,...,0.024,0.039,0.029,0.029,0.041,0.029,0.036,0.030,0.026,0.029
202915410140_R06C01,GSE190931,0.936,0.836,0.828,0.204,0.013,0.857,0.324,0.967,0.065,...,0.022,0.032,0.031,0.032,0.037,0.026,0.029,0.029,0.024,0.028


In [5]:
import pandas as pd

# For every cohort: merge/index the raw clinical files, then clean the result.
labels_aml02 = clean_aml02(merge_index_aml02())
labels_aml08 = clean_aml08(merge_index_aml08())
labels_aml05 = clean_aml05(merge_index_aml05())
labels_cog = clean_cog(merge_index_cog())
labels_beataml = clean_beataml(merge_index_beataml())
labels_amltcga = clean_amltcga(merge_index_amltcga())
labels_nordic_all = clean_nordic_all(merge_index_nordic_all())
labels_mds_taml = clean_mds_taml(merge_index_mds_taml())
labels_all_graal = clean_all_graal(merge_index_all_graal())
labels_target_all = clean_target_all(merge_index_target_all())

# Stack all cohorts row-wise, keeping the union of their columns.
labels_combined = pd.concat(
    [
        labels_aml02,
        labels_aml08,
        labels_aml05,
        labels_cog,
        labels_beataml,
        labels_amltcga,
        labels_nordic_all,
        labels_mds_taml,
        labels_all_graal,
        labels_target_all,
    ],
    axis=0,
    join='outer',
)


In [6]:

# Keep only the label rows whose index also appears in the methylation table.
df = labels_combined[labels_combined.index.isin(df_methyl.index)]

# Tag the AML0531 (GSE124413) control samples as 'Bone Marrow Normal'.
df_ = label_control_samples(df_methyl, df)


In [7]:
# Attach the batch label to the clinical table; the inner join keeps only samples present in both.
df2 = df_methyl[['Batch']].join(df_, how='inner')

In [10]:
# Inspect the TCGA-AML label index.
# NOTE(review): these IDs end in `_noid` and do not resemble the IDs shown for
# df_methyl.index; that may be why GDC_TCGA-AML is missing after the inner join — confirm.
labels_amltcga.index

Index(['b93cb62a-a7dc-406d-8482-6b51a92ea3c3_noid',
       'fb4c9803-3690-4f6a-9402-72a4f36d64d1_noid',
       '2fcda6a9-813b-41b2-aae4-ca42c9986287_noid',
       'ada38f3e-8020-4394-9e7c-50d06dd04769_noid',
       'e78ff499-037b-450a-ac04-6fb3a9e124a4_noid',
       '9aedadd8-98e2-4cff-b1a3-64506bf1d95c_noid',
       'eefba73e-bf2c-4cac-bd29-8ab3d392aa5b_noid',
       '7b4ce492-ff40-4bf1-b3e8-75ff83e8746d_noid',
       '865cda60-ec30-4562-b681-0e90737a97ca_noid',
       '7cf64c74-47e6-4913-b91b-25198e36b7c8_noid',
       ...
       '92c7e4d6-eb53-46c5-9721-5edccd60bcc9_noid',
       '66392380-a9fe-45b7-9191-60993f3f77f4_noid',
       '478f29b1-6604-4e75-a492-47c384799f44_noid',
       'e49a1768-bf1e-49c0-bca3-4e076928281c_noid',
       'c9943335-86a8-4286-891c-c2331d75dc0d_noid',
       'd3988699-70d6-43e1-b84b-9e38b4d2d2b1_noid',
       '898a092e-89fe-4010-afea-14c605f99481_noid',
       'c9b92f8f-4599-47d1-9d12-31e42166a091_noid',
       'e08f84fe-0013-4734-9966-cd734e6fedc5_noid',
 

In [9]:
# Inspect the sample identifiers used as the methylation table's index.
df_methyl.index

Index(['202897270043_R01C01', '202897270043_R03C01', '202897270043_R04C01',
       '202897270043_R05C01', '202897270043_R06C01', '202897270043_R07C01',
       '202897270043_R08C01', '202897270045_R03C01', '202897270045_R04C01',
       '202897270045_R06C01',
       ...
       '201233760140_R05C01', '201233760140_R06C01', '201233760140_R07C01',
       '201233760140_R08C01', '203724130020_R01C01', '203724130020_R02C01',
       '203724130020_R03C01', '203724130020_R04C01', '203724130020_R05C01',
       '203724130020_R06C01'],
      dtype='object', length=3357)

In [8]:
# (rows, columns) of the batch + clinical table.
df2.shape

(3136, 2348)

In [11]:
# Samples per batch in the full methylation table (before joining with labels).
df_methyl['Batch'].value_counts(dropna=False)

Batch
GSE49031          933
GSE190931         581
GSE124413         495
GSE159907         316
GDC_TARGET-AML    314
GDC_TCGA-AML      194
GSE152710         166
GSE147667         153
GDC_TARGET-ALL    141
GSE133986          64
Name: count, dtype: int64

In [12]:
# Samples per batch after the inner join; compare with the counts for df_methyl.
df2['Batch'].value_counts(dropna=False)

Batch
GSE49031          933
GSE190931         581
GSE124413         495
GSE159907         316
GDC_TARGET-AML    287
GSE152710         166
GSE147667         153
GDC_TARGET-ALL    141
GSE133986          64
Name: count, dtype: int64

In [25]:
# Shapes of the methylation table and of the labelled clinical table, side by side.
df_methyl.shape, df_.shape

((3357, 333249), (3136, 2149))

In [20]:
# Difference in row counts between the methylation table and the clinical table.
df_methyl.shape[0] - df_.shape[0]

221

In [11]:
# Samples per clinical trial; dropna=False also counts rows with no trial assigned.
df_['Clinical Trial'].value_counts(dropna=False)

Clinical Trial
NOPHO ALL92-2000            933
AAML0531                    628
AAML1031                    581
Beat AML Consortium         316
CETLAM SMD-09 (MDS-tAML)    166
French GRAALL 2003–2005     153
TARGET ALL                  141
AAML03P1                     72
Japanese AML05               64
CCG2961                      41
NaN                          41
Name: count, dtype: int64

In [14]:
# Sample-type breakdown of the MDS/tAML labels.
labels_mds_taml['Sample Type'].value_counts(dropna=False)    

Sample Type
Diagnosis                  73
3-5M                       24
> or equal to 12M          20
6-11M                      19
<3M                        17
Control (Healthy Donor)    10
Post Transplant             3
Name: count, dtype: int64

In [9]:
# Sample-type breakdown of the combined clinical table.
df_['Sample Type'].value_counts(dropna=False)

Sample Type
Diagnosis                                            2032
Relapse                                               252
Bone Marrow Normal                                    145
Likely Diagnosis                                      141
Unknown (TARGET-ALL)                                  141
Primary Blood Derived Cancer - Bone Marrow             90
normal bone marrow                                     82
Recurrent Blood Derived Cancer - Bone Marrow           58
normal CD3+ cells                                      25
normal CD19+ cells                                     25
3-5M                                                   24
Primary Blood Derived Cancer - Peripheral Blood        22
> or equal to 12M                                      20
6-11M                                                  19
<3M                                                    17
Healthy Thymus                                         12
Control (Healthy Donor)                                10
Bl

In [16]:
# Samples per batch in the methylation table (same check as earlier in the notebook).
df_methyl['Batch'].value_counts(dropna=False)

Batch
GSE49031          933
GSE190931         581
GSE124413         495
GSE159907         316
GDC_TARGET-AML    314
GDC_TCGA-AML      194
GSE152710         166
GSE147667         153
GDC_TARGET-ALL    141
GSE133986          64
Name: count, dtype: int64

In [15]:
# Samples per clinical trial (same check as earlier in the notebook).
df_['Clinical Trial'].value_counts(dropna=False)

Clinical Trial
NOPHO ALL92-2000            933
AAML0531                    628
AAML1031                    581
Beat AML Consortium         316
CETLAM SMD-09 (MDS-tAML)    166
French GRAALL 2003–2005     153
TARGET ALL                  141
AAML03P1                     72
Japanese AML05               64
CCG2961                      41
NaN                          41
Name: count, dtype: int64

In [9]:
# Write the COG labels to an Excel file in the output directory.
labels_cog.to_excel(output_path+'labels_cog.xlsx')

In [11]:
# Write the BeatAML labels to an Excel file in the output directory.
labels_beataml.to_excel(output_path+'labels_beataml.xlsx')

In [13]:
# Write the TCGA-AML labels to an Excel file in the output directory.
labels_amltcga.to_excel(output_path+'labels_amltcga.xlsx')

In [14]:
# Write the Nordic ALL labels to an Excel file in the output directory.
labels_nordic_all.to_excel(output_path+'labels_nordic_all.xlsx')

In [15]:
# Write the MDS/tAML labels to an Excel file in the output directory.
labels_mds_taml.to_excel(output_path+'labels_mds_taml.xlsx')

In [16]:
# `labels_tcell_all_graal` is never assigned in this notebook; the GRAALL labels
# are stored as `labels_all_graal`. The output filename is kept unchanged.
labels_all_graal.to_excel(output_path+'labels_tcell_all_graal.xlsx')

## MDS_tAML

In [437]:
# Load the GSE152710 sample sheet, drop its last column and index it by Sample_ID.
meta = pd.read_pickle('../Data/Raw_Data/Methyl_Array_450k/GSE152710/sample_sheet_meta_data.pkl').iloc[:,:-1].set_index('Sample_ID')

## Nordic ALL

In [188]:
# Karyotype column from the paper's supplementary clinical table
paper = pd.read_excel(
    '../Data/Raw_Data/Clinical_Data/Nordic_ALL/PMID_25729447_Supp_Clinical_Data.xlsx',
    sheet_name='Table S7- Verification summary',
    header=2,
    index_col=0,
)
paper = paper[['Karyotyping at diagnosisc']]

# GSE49031 sample sheet without its last column, indexed by `Sample_ID`
meta = pd.read_pickle('../Data/Raw_Data/Methyl_Array_450k/GSE49031/sample_sheet_meta_data.pkl')
meta = meta.iloc[:, :-1].set_index('Sample_ID')

# Keep only the last whitespace-separated word of `title`
meta['title'] = meta['title'].str.split().str[-1]

# Index by `title` to attach the paper column, then go back to `Sample_ID`
meta = (
    meta.reset_index()
        .set_index('title')
        .join(paper)
        .reset_index()
        .set_index('Sample_ID')
)



## Tcell_ALL_GRAAL

In [430]:
# Load the GSE147667 sample sheet, drop its last column and index it by Sample_ID.
meta = pd.read_pickle('../Data/Raw_Data/Methyl_Array_EPIC/GSE147667/sample_sheet_meta_data.pkl').iloc[:,:-1].set_index('Sample_ID')

## GDC TARGET ALL

In [464]:
# Load the GDC case-selection JSON export
json_clinical_demographic = pd.read_json('../Data/Raw_Data/Methyl_Array_EPIC/GDC_TARGET-ALL/clinical.cases_selection.2023-05-12.json',
                            orient='values')

# Flatten the nested `demographic` records (rows without one are dropped first)
json_clinical_demographic = pd.json_normalize(json_clinical_demographic['demographic'].dropna())

# Keep the last dash-separated term of `submitter_id`
json_clinical_demographic['submitter_id'] = json_clinical_demographic['submitter_id'].str.split('-').str[-1]

# Of that term, keep the part before the first `_`
json_clinical_demographic['submitter_id'] = json_clinical_demographic['submitter_id'].str.split('_').str[0]

# Rename `submitter_id` to `Patient_ID`
json_clinical_demographic = json_clinical_demographic.rename(columns={'submitter_id':'Patient_ID'})

# Index by `demographic_id` and keep only the `Patient_ID` column (a Series)
json_clinical_demographic = json_clinical_demographic.set_index('demographic_id')['Patient_ID']

# Load the GDC clinical TSV, using its first column as index
clinical_tsv = pd.read_csv('../Data/Raw_Data/Methyl_Array_EPIC/GDC_TARGET-ALL/clinical.tsv', 
                            sep='\t', index_col=0)

# Last dash-separated term of `case_submitter_id` is the patient identifier
clinical_tsv['Patient_ID'] = clinical_tsv['case_submitter_id'].str.split('-').str[-1]

# Keep only the `Patient_ID` column (a Series)
clinical_tsv = clinical_tsv['Patient_ID']

# Stack both Patient_ID series (their index values come from different ID columns)
clinical = pd.concat([clinical_tsv, json_clinical_demographic], axis=0, join='outer')

# Move the previous index into a column and index by `Patient_ID`
clinical = clinical.reset_index().set_index('Patient_ID')

# Load the cohort sheet from the paper's supplementary workbook
paper = pd.read_excel('../Data/Raw_Data/Clinical_Data/ALL_P3_TARGET/41586_2018_436_MOESM4_ESM.xlsx',
                      sheet_name='ST2 Cohort', index_col=0)

# Right join: keep every row of the paper table and attach the GDC identifiers
labels_alltarget = clinical.join(paper, how='right')

In [449]:
# Load the GDC TARGET-ALL sample sheet and index it by Sentrix_ID.
meta = pd.read_pickle('../Data/Raw_Data/Methyl_Array_EPIC/GDC_TARGET-ALL/sample_sheet_meta_data.pkl').set_index('Sentrix_ID')

## TCGA AML

In [105]:
def merge_index_amltcga():
    """Join the GDC TCGA-AML case list with the NEJM 2013 supplementary table.

    Returns a DataFrame indexed by `case_id` with one row per distinct
    `case_submitter_id`; the paper's columns are attached via a left join on
    the numeric TCGA patient ID.
    """
    # GDC clinical export: one row per distinct submitter id
    gdc_cases = pd.read_csv('../Data/Raw_Data/Methyl_Array_450k/GDC_TCGA-AML/clinical.tsv',
                            sep='\t', index_col=0)
    gdc_cases = gdc_cases[['case_submitter_id']].drop_duplicates()

    # The last four characters of the submitter id are the TCGA patient ID
    gdc_cases['TCGA Patient ID'] = gdc_cases['case_submitter_id'].str[-4:]
    gdc_cases = gdc_cases.reset_index().set_index('TCGA Patient ID').sort_index()

    # Paper table: second column as index, first data row skipped
    paper_table = pd.read_excel(
        '../Data/Raw_Data/Clinical_Data/TCGA_LAML/SuppTable01_NEJM2013_TCGA_AML.Paper_Mutation data.xlsx',
        index_col=1)
    paper_table = paper_table.iloc[1:, :].sort_index()

    # Both indexes must be integers for the join keys to match
    paper_table.index = paper_table.index.astype(int)
    gdc_cases.index = gdc_cases.index.astype(int)

    # Left join keeps every GDC case; then re-index by `case_id`
    merged = gdc_cases.join(paper_table, how='left')
    return merged.reset_index().set_index('case_id')


## BeatAML Clinical Data

In [39]:
import pandas as pd
def merge_index_beataml ():
    """Join the GSE159907 sample sheet with the BeatAML clinical spreadsheet.

    Returns a DataFrame indexed by `Sample_ID` that holds `LLS_SampleID`,
    `tissue`, `disease_state` and every column of the clinical spreadsheet
    (left join, so samples without a clinical record keep NaN values).
    """
    # `.copy()` detaches the column slice from the loaded frame; assigning a new
    # column into the bare `iloc` slice is the chained-assignment pattern pandas
    # reports as SettingWithCopyWarning.
    meta = pd.read_pickle('../Data/Raw_Data/Methyl_Array_EPIC/GSE159907/sample_sheet_meta_data.pkl').iloc[:,:-1].copy()

    # Create a new column with only the content inside [] from column 'Sample_Name'
    meta['LLS_SampleID'] = meta['Sample_Name'].str.extract(r"\[(.*?)\]", expand=False)

    # Set the index to the new column
    meta1 = meta[['tissue','disease_state','LLS_SampleID','Sample_ID']].set_index('LLS_SampleID')

    # Read in the clinical data, indexed by its fourth column
    meta2 = pd.read_excel('../Data/Raw_Data/Clinical_Data/BeatAML/BEAT_AML_Raw clinical data_702.Samples.Vizome.xlsx', index_col=3)

    # Left join on the LLS sample id, then re-index by `Sample_ID`
    labels_beataml = meta1.join(meta2, how='left').reset_index().set_index('Sample_ID')

    return labels_beataml

# Pass the merged table to the cleaner, as every other clean_* helper in this
# notebook is called; the original call supplied no input table.
labels_beataml = clean_beataml(merge_index_beataml())

## Remove Samples based on Certain Clinical Features

### Remove Relapse Samples

In [5]:
# Drop every sample whose type marks relapsed / recurrent disease.
df1 = df_.loc[
    ~df_['Sample Type'].isin([
        'Relapse',
        'Recurrent Blood Derived Cancer - Bone Marrow',
        'Recurrent Blood Derived Cancer - Peripheral Blood',
    ])
]

# Report how many rows the filter removed.
print(f'Out of {df_.shape[0]} samples, {df_.shape[0]-df1.shape[0]} matched, yielding {df1.shape[0]} samples after filtering')


Out of 1762 samples, 248 matched, yielding 1514 samples after filtering


### Remove Control/Normal Samples

In [6]:
# Drop normal / control samples.
df2 = df1.loc[~df1['Sample Type'].isin(['Bone Marrow Normal', 'Blood Derived Normal'])]

# Report how many rows the filter removed.
print(f'Out of {df1.shape[0]} samples, {df1.shape[0]-df2.shape[0]} matched, yielding {df2.shape[0]} samples after filtering')


Out of 1514 samples, 154 matched, yielding 1360 samples after filtering


### Remove Duplicate Samples

In [7]:
# One row per patient: when a Patient_ID repeats, keep its last occurrence.
df3 = df2.drop_duplicates(subset='Patient_ID', keep='last')

# Report how many rows the filter removed.
print(f'Out of {df2.shape[0]} samples, {df2.shape[0]-df3.shape[0]} matched, yielding {df3.shape[0]} samples after filtering')


Out of 1360 samples, 14 matched, yielding 1346 samples after filtering


## Save Files

In [8]:
output = df3.join(df_methyl,how='left') # Join clinical data with methyl data

# `output` holds df3's columns first, followed by df_methyl's columns. The split
# point is df3's column count plus one, so the first df_methyl column stays on
# the clinical side.
# NOTE(review): presumably that column is `Batch` (y_train['Batch'] is read
# later) — confirm it is the first column of df_methyl.
x = output.iloc[:,df3.shape[1]+1:] # Select only methyl data
y = output.iloc[:,0:df3.shape[1]+1] # Select only clinical data

In [9]:
# Split train and test by clinical trial: AML02 and AML08 form the validation
# (test) cohort, every other row (including rows with no trial) goes to training.
y_train = y[~y['Clinical Trial'].isin(['AML02', 'AML08'])]
y_test = y[y['Clinical Trial'].isin(['AML02', 'AML08'])]

# Select the methylation rows that match each clinical subset
x_train = x.loc[y_train.index]
x_test = x.loc[y_test.index]

# Samples are rows and 5mC sites are columns, so shape[0] counts samples and
# shape[1] counts sites; the messages label them accordingly.
print(
    f"Discovery dataset (train) contains {x_train.shape[1]} columns (5mC sites) and {x_train.shape[0]} rows (samples)")
print(
    f"\n{y_train['Clinical Trial'].value_counts(dropna=False).to_string()}\n")
print(
    f"Validation dataset (test) contains {x_test.shape[1]} columns (5mC sites) and {x_test.shape[0]} rows (samples).")
print(f"\n{y_test['Clinical Trial'].value_counts(dropna=False).to_string()}\n")


Discovery dataset (train) contains 310545 rows (5mC sites) and 1142 columns (samples)

AAML1031    520
AAML0531    508
AML05        64
AAML03P1     36
CCG2961      14

Validation dataset (test) contains 310545 rows (5mC sites) and 204 columns (samples).

AML02    162
AML08     42



## Batch Correction with pyCombat

- __pyCombat__: a Python tool for batch effects correction in high-throughput molecular data using empirical Bayes methods

- __Github__: [https://epigenelabs.github.io/pyComBat/](https://epigenelabs.github.io/pyComBat/)

- __Implementation Paper__: [bioRxiv](https://doi.org/10.1101/2020.03.17.995431)

- __Original Paper__: [Biostatistics](https://pubmed.ncbi.nlm.nih.gov/16632515/)

In [10]:
from combat.pycombat import pycombat

# Correct batch effects in the training dataset
# The matrix is transposed for the call and the result is transposed back.
# NOTE(review): presumably pycombat expects features as rows and samples as
# columns — confirm against the pyComBat documentation.
# NOTE(review): only x_train is corrected here; x_test is left as-is — confirm this is intended.
x_train2 = pycombat(x_train.T, y_train['Batch']).T

print('Succesfully corrected batch effects in the training dataset.')


Found 4 batches.
Adjusting for 0 covariate(s) or covariate level(s).
Standardizing Data across genes.
Fitting L/S model and finding priors.
Finding parametric adjustments.
Adjusting the Data
Succesfully corrected batch effects in the training dataset.


In [11]:
# Remove the AML05 trial from the training labels and keep the matching
# batch-corrected methylation rows.
y_train = y_train[~y_train['Clinical Trial'].isin(['AML05'])]
x_train3 = x_train2.loc[y_train.index]


# Samples are rows and 5mC sites are columns, so shape[0] counts samples and
# shape[1] counts sites; the messages label them accordingly.
print(
    f"Discovery dataset (train) contains {x_train3.shape[1]} columns (5mC sites) and {x_train3.shape[0]} rows (samples)")
print(
    f"\n{y_train['Clinical Trial'].value_counts(dropna=False).to_string()}\n")
print(
    f"Validation dataset (test) contains {x_test.shape[1]} columns (5mC sites) and {x_test.shape[0]} rows (samples).")
print(f"\n{y_test['Clinical Trial'].value_counts(dropna=False).to_string()}\n")

Discovery dataset (train) contains 310545 rows (5mC sites) and 1078 columns (samples)

AAML1031    520
AAML0531    508
AAML03P1     36
CCG2961      14

Validation dataset (test) contains 310545 rows (5mC sites) and 204 columns (samples).

AML02    162
AML08     42



In [12]:
x_train3.to_pickle(output_path+'x_train.pkl') # Save methyl data
y_train.to_csv(output_path+'y_train.csv') # Save clinical data

# The message names the files that were actually written above.
# NOTE(review): x_test / y_test are not saved in this cell — confirm they are written elsewhere.
print(
    f'Successfuly saved methyl data in x_train.pkl and clinical data in y_train.csv.\nPath: {output_path}')

Successfuly saved methyl data in x.pkl and clinical data in y.csv.
Path: ../Data/Processed_Data/


### Save Control and Relapse Data Separately

In [13]:
# Control samples: only 'Bone Marrow Normal' is selected here.
# NOTE(review): the control-removal filter earlier also lists 'Blood Derived Normal' — confirm the difference is intended.
controls = df_[df_['Sample Type'].isin(['Bone Marrow Normal'])]

# Relapse samples: same three sample types as the relapse-removal filter.
relapse = df_[df_['Sample Type'].isin(['Relapse', 'Recurrent Blood Derived Cancer - Bone Marrow',
                                       'Recurrent Blood Derived Cancer - Peripheral Blood'])]

# Merge control and relapse samples
# NOTE(review): `names` labels the levels of a hierarchical index, which is only
# built when `keys` is passed; no `keys` are given here, so it likely has no effect — confirm.
t = pd.concat([controls, relapse], axis=0, join='outer',
              names=['Control', 'Relapse'])

# Join clinical data with methyl data (right join keeps exactly the rows of `t`)
t2 = df_methyl.join(t, how='right')

# Save merged control and relapse samples
t2.to_pickle(output_path+'control_relapse.pkl')

print(
    f'Successfuly saved {controls.shape[0]} control samples and {relapse.shape[0]} relapse samples.\nPath: {output_path}')


Successfuly saved 147 control samples and 248 relapse samples.
Path: ../Data/Processed_Data/


## Watermark

In [14]:
%load_ext watermark

In [15]:
# produce a list of the loaded modules
%watermark -v -p pandas

Python implementation: CPython
Python version       : 3.10.10
IPython version      : 8.3.0

pandas: 1.5.3

