# Clinical Data Processing

## Where is the data?

In [1]:
# Directory holding the outputs of earlier pipeline steps (read below with pd.read_pickle).
input_path = '../Data/Intermediate_Files/'
# Directory of raw clinical data.
# NOTE(review): not referenced in the visible cells, which hard-code their own clinical paths.
clinicaldata_path = '../Data/Raw_Data/Clinical_Data/'
# Directory that receives every file this notebook saves.
output_path = '../Data/Processed_Data/'

## Load Methyl Data

In [2]:
import pandas as pd

# Load the methylation table; the file name suggests it is the output of
# pipeline step 2 (presumably the previous notebook - confirm).
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this project.
methyl_pickle = input_path+'2_MethylData_Processing_Output.pkl'
df_methyl = pd.read_pickle(methyl_pickle)

# .T.reset_index(level=0, names='Batch')

# Report the table dimensions. The column count includes the non-site 'Batch' column.
print(
    f' Dataset (df) contains {df_methyl.shape[1]} columns (mC sites) and {df_methyl.shape[0]} rows (samples).')


 Dataset (df) contains 333249 columns (mC sites) and 3357 rows (samples).


In [3]:
# Number of samples per source dataset; dropna=False also counts samples with a missing batch label.
df_methyl['Batch'].value_counts(dropna=False)

Batch
GSE49031          933
GSE190931         581
GSE124413         495
GSE159907         316
GDC_TARGET-AML    314
GDC_TCGA-AML      194
GSE152710         166
GSE147667         153
GDC_TARGET-ALL    141
GSE133986          64
Name: count, dtype: int64

## Add Labels/Clinical Outcome Data

In [4]:
# Import functions to clean up clinical data
from FM_Functions.Clinical_Data_CleanUp import *

# Load the raw clinical tables, one per cohort. The helper comes from the star import above;
# presumably each returned frame is indexed by sample ID - TODO confirm against FM_Functions.
labels_cog, labels_aml02, labels_aml08, labels_aml05 = combine_and_index_clinicaldata()
# Apply the cohort-specific clean-up helper to each table (each name is rebound to the cleaned frame)
labels_aml02 = clean_aml02(labels_aml02)
labels_aml08 = clean_aml08(labels_aml08)
labels_cog   = clean_cog(labels_cog)
labels_aml05 = clean_aml05(labels_aml05)

# Stack the four cohorts row-wise; join='outer' keeps the union of their columns,
# so a column missing from a cohort is filled with NaN for that cohort's rows.
df = pd.concat([labels_aml02, labels_aml08, labels_cog,
               labels_aml05], axis=0, join='outer')

# Remove samples that are not in the methyl dataset
# df = df.loc[df.index.isin(df_methyl.index)]

# Label control samples from the AML0531 clinical trial (GSE124413) as 'Bone Marrow Normal'

def label_control_samples(df_methyl, df):
    """
    Label control samples from the AML0531 clinical trial (GSE124413) as
    'Bone Marrow Normal' and append them to the clinical trial samples.

    A GSE124413 sample is treated as a control when its index label is absent
    from the clinical table `df`.

    Parameters
    ----------
    df_methyl : pandas.DataFrame
        Methylation table indexed by sample, with a 'Batch' column.
    df : pandas.DataFrame
        Clinical labels indexed by sample.

    Returns
    -------
    pandas.DataFrame
        `df` followed by one row per control sample. Only 'Sample Type' is
        populated on those rows; every other column is NaN. Neither input is
        modified.
    """
    # Work on the index only: filtering the full methylation matrix just to
    # produce a one-column label frame is needlessly expensive.
    ids_0531 = df_methyl.index[df_methyl['Batch'].isin(['GSE124413'])]
    control_ids = ids_0531[~ids_0531.isin(df.index)]

    # Build the label frame directly instead of assigning a column on a
    # filtered slice, which raises SettingWithCopyWarning.
    control_0531 = pd.DataFrame(
        {'Sample Type': 'Bone Marrow Normal'}, index=control_ids)

    return pd.concat([df, control_0531], axis=0, join='outer')

# df_ = the clinical labels in df plus one 'Bone Marrow Normal' row for every
# GSE124413 sample that has no clinical record.
df_ = label_control_samples(df_methyl, df)


In [5]:
# Shape of the combined clinical labels BEFORE the control rows are appended (df, not df_).
# NOTE(review): execution counts in this notebook are not sequential, so recorded outputs may be stale.
df.shape

(2282, 291)

## MDS_tAML

In [437]:
# Sample sheet for GSE152710: drop the last column and index by Sample_ID.
meta = (
    pd.read_pickle('../Data/Raw_Data/Methyl_Array_450k/GSE152710/sample_sheet_meta_data.pkl')
    .iloc[:, :-1]
    .set_index('Sample_ID')
)

## Nordic ALL

In [188]:
# Sample sheet for GSE49031 (last column dropped)
nordic_sheet = pd.read_pickle(
    '../Data/Raw_Data/Methyl_Array_450k/GSE49031/sample_sheet_meta_data.pkl').iloc[:, :-1]

# Karyotype column from the paper's supplementary table, indexed by its first column
paper = pd.read_excel('../Data/Raw_Data/Clinical_Data/Nordic_ALL/PMID_25729447_Supp_Clinical_Data.xlsx',
                      index_col=0, header=2, sheet_name='Table S7- Verification summary')[['Karyotyping at diagnosisc']]

meta = (
    nordic_sheet
    .set_index('Sample_ID')
    # keep only the last whitespace-separated word of `title`; it is the join key
    .assign(title=lambda sheet: sheet['title'].str.split().str[-1])
    # index by `title` so the sheet lines up with the paper table
    .reset_index()
    .set_index('title')
    .join(paper)
    # back to one row per array, indexed by `Sample_ID`
    .reset_index()
    .set_index('Sample_ID')
)



## Tcell_ALL_GRAAL

In [430]:
# Sample sheet for GSE147667: drop the last column and index by Sample_ID.
meta = (
    pd.read_pickle('../Data/Raw_Data/Methyl_Array_EPIC/GSE147667/sample_sheet_meta_data.pkl')
    .iloc[:, :-1]
    .set_index('Sample_ID')
)

## GDC TARGET ALL

In [442]:
# Load the GDC case export (JSON)
json_clinical_demographic = pd.read_json('../Data/Raw_Data/Methyl_Array_EPIC/GDC_TARGET-ALL/clinical.cases_selection.2023-05-12.json',
                            orient='values')

# Flatten the nested `demographic` records, then derive `Patient_ID` from
# `submitter_id`: take the LAST '-'-separated token and keep the part before
# the first '_'. The result is a Series of Patient_ID indexed by `demographic_id`.
json_clinical_demographic = (
    pd.json_normalize(json_clinical_demographic['demographic'].dropna())
    .assign(Patient_ID=lambda demo: demo['submitter_id']
            .str.split('-').str[-1]
            .str.split('_').str[0])
    .set_index('demographic_id')['Patient_ID']
)

# Load the GDC clinical table (TSV) and reduce it to a Patient_ID Series:
# the last '-'-separated token of `case_submitter_id`, indexed by the file's first column.
clinical_tsv = (
    pd.read_csv('../Data/Raw_Data/Methyl_Array_EPIC/GDC_TARGET-ALL/clinical.tsv', 
                            sep='\t', index_col=0)['case_submitter_id']
    .str.split('-').str[-1]
    .rename('Patient_ID')
)

# Stack both Patient_ID Series and re-index by Patient_ID; the former index
# (the TSV's first column / demographic_id) becomes an ordinary column.
clinical = (
    pd.concat([clinical_tsv, json_clinical_demographic], axis=0, join='outer')
    .reset_index()
    .set_index('Patient_ID')
)

# Load the cohort table from the paper's supplementary file
paper = pd.read_excel('../Data/Raw_Data/Clinical_Data/ALL_P3_TARGET/41586_2018_436_MOESM4_ESM.xlsx',
                      sheet_name='ST2 Cohort', index_col=0)

# Right join: keep every row of the paper table and attach the GDC identifiers where they match
labels_alltarget = clinical.join(paper, how='right')

In [449]:
# Sample sheet for GDC TARGET-ALL, indexed by Sentrix_ID (no column is dropped here).
meta = (
    pd.read_pickle('../Data/Raw_Data/Methyl_Array_EPIC/GDC_TARGET-ALL/sample_sheet_meta_data.pkl')
    .set_index('Sentrix_ID')
)

In [456]:
# Inner-join the GDC Patient_ID Series with the sample sheet on their indexes.
# NOTE(review): the recorded output shows no rows, so the two indexes do not
# appear to share any values - confirm which identifier links them.
clinical_tsv.to_frame().join(meta, how='inner')

Unnamed: 0,Patient_ID,Sentrix_Position,Sample_Group,Sample_Name,Sample_Plate,Sample_Type,Sub_Type,Sample_Well,Pool_ID,GSM_ID,Control,Sample_ID


In [450]:
# Display the GDC TARGET-ALL sample sheet (indexed by Sentrix_ID) for inspection.
meta

Unnamed: 0_level_0,Sentrix_Position,Sample_Group,Sample_Name,Sample_Plate,Sample_Type,Sub_Type,Sample_Well,Pool_ID,GSM_ID,Control,Sample_ID
Sentrix_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
a19169b4-294c-437f-957f-fbf6f6a30bbc,noid,,Sample_1,,Unknown,,,,,False,a19169b4-294c-437f-957f-fbf6f6a30bbc_noid
c82c30f6-045e-432c-a561-215f1ce5f81a,noid,,Sample_2,,Unknown,,,,,False,c82c30f6-045e-432c-a561-215f1ce5f81a_noid
f3c62e33-2e54-41c7-a1b1-354314300ec5,noid,,Sample_3,,Unknown,,,,,False,f3c62e33-2e54-41c7-a1b1-354314300ec5_noid
7cb9cfd2-4ee8-4017-a7e4-b7827fc56e9b,noid,,Sample_4,,Unknown,,,,,False,7cb9cfd2-4ee8-4017-a7e4-b7827fc56e9b_noid
561e02de-8c81-4331-a8f6-f7798de22f8c,noid,,Sample_5,,Unknown,,,,,False,561e02de-8c81-4331-a8f6-f7798de22f8c_noid
...,...,...,...,...,...,...,...,...,...,...,...
b57ee3da-cda3-432f-8ff0-9f8d2b29d5a1,noid,,Sample_137,,Unknown,,,,,False,b57ee3da-cda3-432f-8ff0-9f8d2b29d5a1_noid
b2667850-1833-4f9a-af89-3441596905bc,noid,,Sample_138,,Unknown,,,,,False,b2667850-1833-4f9a-af89-3441596905bc_noid
5cf29dcd-09bb-468c-9bea-b7a6d1e98370,noid,,Sample_139,,Unknown,,,,,False,5cf29dcd-09bb-468c-9bea-b7a6d1e98370_noid
2a09228b-6833-4050-8255-3294cd16640b,noid,,Sample_140,,Unknown,,,,,False,2a09228b-6833-4050-8255-3294cd16640b_noid


## TCGA AML

In [105]:
def merge_index_amltcga():
    """
    Merge the GDC TCGA-AML case list with the NEJM 2013 supplementary table.

    The two sources are matched on the TCGA patient number, taken as the last
    four characters of `case_submitter_id` and compared as integers.

    Returns
    -------
    pandas.DataFrame
        One row per unique `case_submitter_id`, indexed by `case_id`, with the
        supplementary-table columns attached (left join, so cases without a
        match keep NaN in those columns). Assumes the first column of
        clinical.tsv is named `case_id`.
    """
    # GDC cases: keep only the submitter id, one row per distinct value
    gdc_cases = (
        pd.read_csv('../Data/Raw_Data/Methyl_Array_450k/GDC_TCGA-AML/clinical.tsv', 
                    sep='\t', index_col=0)[['case_submitter_id']]
        .drop_duplicates()
    )

    # the last 4 characters of case_submitter_id are the TCGA patient number
    gdc_cases['TCGA Patient ID'] = gdc_cases['case_submitter_id'].str[-4:]
    gdc_cases = gdc_cases.reset_index().set_index('TCGA Patient ID').sort_index()

    # NEJM 2013 supplementary table, indexed by its second column; the first data row is skipped
    nejm_table = pd.read_excel('../Data/Raw_Data/Clinical_Data/TCGA_LAML/SuppTable01_NEJM2013_TCGA_AML.Paper_Mutation data.xlsx',
                        index_col=1)
    nejm_table = nejm_table.iloc[1:, :].sort_index()

    # compare both keys as integers so that e.g. '0012' matches 12
    nejm_table.index = nejm_table.index.astype(int)
    gdc_cases.index = gdc_cases.index.astype(int)

    merged = gdc_cases.join(nejm_table, how='left')

    # hand back the table keyed by the GDC case_id
    return merged.reset_index().set_index('case_id')


## BeatAML Clinical Data

In [39]:
import pandas as pd
def merge_index_beataml():
    """
    Attach the BeatAML clinical spreadsheet to the GSE159907 sample sheet.

    The join key is the text inside square brackets in `Sample_Name`, matched
    against the spreadsheet's fourth column (used as its index).

    Returns
    -------
    pandas.DataFrame
        Indexed by `Sample_ID`, holding `LLS_SampleID`, `tissue`,
        `disease_state` and the clinical columns (left join, so arrays without
        a clinical record keep NaN in those columns).
    """
    # sample sheet without its last column
    sheet = pd.read_pickle('../Data/Raw_Data/Methyl_Array_EPIC/GSE159907/sample_sheet_meta_data.pkl').iloc[:, :-1]

    # pull the bracketed identifier out of 'Sample_Name' (non-greedy match)
    lls_ids = sheet['Sample_Name'].str.extract(r"\[(.*?)\]", expand=False)

    array_meta = (
        sheet.assign(LLS_SampleID=lls_ids)
        [['tissue', 'disease_state', 'LLS_SampleID', 'Sample_ID']]
        .set_index('LLS_SampleID')
    )

    # clinical spreadsheet, indexed by its fourth column
    vizome = pd.read_excel('../Data/Raw_Data/Clinical_Data/BeatAML/BEAT_AML_Raw clinical data_702.Samples.Vizome.xlsx', index_col=3)

    return array_meta.join(vizome, how='left').reset_index().set_index('Sample_ID')

# NOTE(review): merge_index_beataml() defined above is never called here, and
# clean_beataml is not defined in the visible cells - presumably it comes from the
# star import of FM_Functions.Clinical_Data_CleanUp. Confirm this is the intended call.
labels_beataml = clean_beataml()

## Remove Samples based on Certain Clinical Features

### Remove Relapse Samples

In [5]:
# Sample types that mark relapse / recurrent disease
relapse_types = ['Relapse', 'Recurrent Blood Derived Cancer - Bone Marrow',
                 'Recurrent Blood Derived Cancer - Peripheral Blood']

# Keep every row whose 'Sample Type' is not a relapse type (rows with a missing type are kept)
is_relapse = df_['Sample Type'].isin(relapse_types)
df1 = df_[~is_relapse]

print(
    f'Out of {df_.shape[0]} samples, {df_.shape[0]-df1.shape[0]} matched, yielding {df1.shape[0]} samples after filtering')


Out of 1762 samples, 248 matched, yielding 1514 samples after filtering


### Remove Control/Normal Samples

In [6]:
# Sample types that mark healthy control material
normal_types = ['Bone Marrow Normal', 'Blood Derived Normal']

# Keep every row whose 'Sample Type' is not a control type
is_normal = df1['Sample Type'].isin(normal_types)
df2 = df1[~is_normal]

print(
    f'Out of {df1.shape[0]} samples, {df1.shape[0]-df2.shape[0]} matched, yielding {df2.shape[0]} samples after filtering')


Out of 1514 samples, 154 matched, yielding 1360 samples after filtering


### Remove Duplicate Samples

In [7]:
# Keep one row per patient: of several rows sharing a Patient_ID, only the last survives.
# NOTE(review): rows with a missing Patient_ID count as duplicates of one another,
# so at most one of them is kept - confirm this is intended.
is_repeat = df2['Patient_ID'].duplicated(keep='last')
df3 = df2[~is_repeat]

print(
    f'Out of {df2.shape[0]} samples, {df2.shape[0]-df3.shape[0]} matched, yielding {df3.shape[0]} samples after filtering')


Out of 1360 samples, 14 matched, yielding 1346 samples after filtering


## Save Files

In [8]:
# Join clinical data with methyl data. The left join keeps every clinical row;
# a sample that is absent from df_methyl gets NaN in all methyl columns.
output = df3.join(df_methyl, how='left')

# The joined frame is laid out as [clinical columns | methyl columns]. 'Batch' is
# expected to be the FIRST methyl column and is kept with the labels, hence the +1.
# Verify that assumption instead of trusting the offset: if df_methyl's column order
# ever changes, a CpG column would otherwise slip into y and 'Batch' into x.
n_clinical = df3.shape[1]
assert output.columns[n_clinical] == 'Batch', (
    f"expected 'Batch' right after the clinical columns, found {output.columns[n_clinical]!r}")

x = output.iloc[:, n_clinical + 1:]  # Select only methyl data
y = output.iloc[:, :n_clinical + 1]  # Select only clinical data (plus 'Batch')

In [9]:
# Split train and test by clinical trial: AML02 and AML08 form the validation
# (test) cohort; every other row, including rows with a missing
# 'Clinical Trial', goes to the discovery (train) cohort.
y_train = y[~y['Clinical Trial'].isin(['AML02', 'AML08'])]
# y_train = y_train[y_train['Sample Type'].isin(['Diagnosis',
#        'Primary Blood Derived Cancer - Bone Marrow', 'Bone Marrow Normal',
#        'Primary Blood Derived Cancer - Peripheral Blood',
#        'Blood Derived Normal'])]

y_test = y[y['Clinical Trial'].isin(['AML02', 'AML08'])]

# Select the methyl rows that belong to each label set
x_train = x.loc[y_train.index]
x_test = x.loc[y_test.index]

# x_train = pd.concat([x_train, ctrl_x], axis=0)
# y_train = pd.concat([y_train, ctrl_y], axis=0,keys=['Diagnosis','Control'], names=['sample_type'])


# x_* hold samples in rows and 5mC sites in columns, so shape[1] is the number
# of sites (columns) and shape[0] the number of samples (rows). The messages
# previously had the two labels swapped.
print(
    f"Discovery dataset (train) contains {x_train.shape[1]} columns (5mC sites) and {x_train.shape[0]} rows (samples)")
print(
    f"\n{y_train['Clinical Trial'].value_counts(dropna=False).to_string()}\n")
print(
    f"Validation dataset (test) contains {x_test.shape[1]} columns (5mC sites) and {x_test.shape[0]} rows (samples).")
print(f"\n{y_test['Clinical Trial'].value_counts(dropna=False).to_string()}\n")


Discovery dataset (train) contains 310545 rows (5mC sites) and 1142 columns (samples)

AAML1031    520
AAML0531    508
AML05        64
AAML03P1     36
CCG2961      14

Validation dataset (test) contains 310545 rows (5mC sites) and 204 columns (samples).

AML02    162
AML08     42



## Batch Correction with pyCombat

- __pyCombat__: a Python tool for batch effects correction in high-throughput molecular data using empirical Bayes methods

- __Github__: [https://epigenelabs.github.io/pyComBat/](https://epigenelabs.github.io/pyComBat/)

- __Implementation Paper__: [bioRxiv](https://doi.org/10.1101/2020.03.17.995431)

- __Original Paper__: [Biostatistics](https://pubmed.ncbi.nlm.nih.gov/16632515/)

In [10]:
from combat.pycombat import pycombat

# Correct batch effects in the training dataset, using the source dataset
# ('Batch') of each training sample as the batch label. x_train is transposed
# on the way in and the corrected matrix transposed back, so x_train2 has the
# same orientation as x_train (samples in rows).
# NOTE(review): x_test is not passed through pycombat in the visible cells - confirm that is intended.
batch_labels = y_train['Batch']
x_train2 = pycombat(x_train.T, batch_labels).T

print('Succesfully corrected batch effects in the training dataset.')


Found 4 batches.
Adjusting for 0 covariate(s) or covariate level(s).
Standardizing Data across genes.
Fitting L/S model and finding priors.
Finding parametric adjustments.
Adjusting the Data
Succesfully corrected batch effects in the training dataset.


In [11]:
# Drop the AML05 cohort from the discovery set. This happens AFTER batch
# correction, so AML05 samples still took part in the pycombat fit above.
# NOTE(review): presumably deliberate - confirm.
y_train = y_train[~y_train['Clinical Trial'].isin(['AML05'])]
x_train3 = x_train2.loc[y_train.index]


# Samples are rows and 5mC sites are columns: shape[1] counts sites, shape[0]
# counts samples. The messages previously had the two labels swapped.
print(
    f"Discovery dataset (train) contains {x_train3.shape[1]} columns (5mC sites) and {x_train3.shape[0]} rows (samples)")
print(
    f"\n{y_train['Clinical Trial'].value_counts(dropna=False).to_string()}\n")
print(
    f"Validation dataset (test) contains {x_test.shape[1]} columns (5mC sites) and {x_test.shape[0]} rows (samples).")
print(f"\n{y_test['Clinical Trial'].value_counts(dropna=False).to_string()}\n")

Discovery dataset (train) contains 310545 rows (5mC sites) and 1078 columns (samples)

AAML1031    520
AAML0531    508
AAML03P1     36
CCG2961      14

Validation dataset (test) contains 310545 rows (5mC sites) and 204 columns (samples).

AML02    162
AML08     42



In [12]:
# Persist the discovery (training) set.
# NOTE(review): x_test / y_test are not saved in the visible cells - confirm they are written elsewhere.
x_train3.to_pickle(output_path+'x_train.pkl') # Save methyl data
y_train.to_csv(output_path+'y_train.csv') # Save clinical data

# Report the file names that were actually written (the message used to say x.pkl / y.csv).
print(
    f'Successfuly saved methyl data in x_train.pkl and clinical data in y_train.csv.\nPath: {output_path}')

Successfuly saved methyl data in x.pkl and clinical data in y.csv.
Path: ../Data/Processed_Data/


### Save Control and Relapse Data Separately

In [13]:
# Control rows.
# NOTE(review): only 'Bone Marrow Normal' is selected here, whereas the control filter
# earlier in the notebook also removes 'Blood Derived Normal' - confirm the difference is intended.
controls = df_[df_['Sample Type'].isin(['Bone Marrow Normal'])]

# Relapse / recurrent rows (the same sample types that were filtered out of the main dataset)
relapse = df_[df_['Sample Type'].isin(['Relapse', 'Recurrent Blood Derived Cancer - Bone Marrow',
                                       'Recurrent Blood Derived Cancer - Peripheral Blood'])]

# Merge control and relapse samples into one table, keeping the flat sample index.
# The former `names=[...]` argument was dropped: without `keys=` it has no effect,
# and adding keys would create a MultiIndex that no longer joins with df_methyl.
t = pd.concat([controls, relapse], axis=0, join='outer')

# Join clinical data with methyl data; the right join keeps every control/relapse row
t2 = df_methyl.join(t, how='right')

# Save merged control and relapse samples
t2.to_pickle(output_path+'control_relapse.pkl')

print(
    f'Successfuly saved {controls.shape[0]} control samples and {relapse.shape[0]} relapse samples.\nPath: {output_path}')


Successfuly saved 147 control samples and 248 relapse samples.
Path: ../Data/Processed_Data/


## Watermark

In [14]:
%load_ext watermark

In [15]:
# produce a list of the loaded modules
%watermark -v -p pandas

Python implementation: CPython
Python version       : 3.10.10
IPython version      : 8.3.0

pandas: 1.5.3

