In [1]:
import methylcheck
from pathlib import Path

# --- Directories holding the pre-processed methylation data, one per cohort ---
filepath1 = Path('../Data/Raw_Data/COG_AAML0531_AAML03P1_GSE124413')  # COG AAML0531 / AAML03P1
filepath2 = Path('../Data/Raw_Data/COG_AAML1031_GSE190931/')  # COG AAML1031
filepath3 = Path('../Data/Raw_Data/AML05_JapaneseTrial_GSE133986/')  # Japanese AML05 trial
filepath4 = Path('../Data/Raw_Data/GDC_TARGET_AML_Methyl450K')  # TARGET 450k (Genomic Data Commons)
filepath5 = Path('../Data/Raw_Data/LambaPrivate_StJude_AML02_AML08_Methyl450k')  # St. Jude AML02 / AML08 450k

# --- Load every cohort with methylcheck.load, in the same order as the paths above ---
betas1 = methylcheck.load(filepath1)  # AAML0531
betas2 = methylcheck.load(filepath2)  # AAML1031
betas3 = methylcheck.load(filepath3)  # Japanese AML05
betas4 = methylcheck.load(filepath4)  # TARGET 450k
betas5 = methylcheck.load(filepath5)  # St. Jude AML02 / AML08

Files:   0%|                                                                                                          | 0/1 [00:00<?, ?it/s]

Files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.54s/it]

Files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.55s/it]




INFO:methylcheck.load_processed:loaded data (865859, 500) from 1 pickled files (1.25s)


Files:   0%|                                                                                                          | 0/3 [00:00<?, ?it/s]

Files:  33%|████████████████████████████████▋                                                                 | 1/3 [00:01<00:03,  1.64s/it]

Files:  67%|█████████████████████████████████████████████████████████████████▎                                | 2/3 [00:02<00:01,  1.45s/it]

Files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.04s/it]

Files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.17s/it]




INFO:methylcheck.load_processed:loaded data (865859, 1039) from 3 pickled files (2.953s)


Files:   0%|                                                                                                          | 0/1 [00:00<?, ?it/s]

Files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.65it/s]

Files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.63it/s]


INFO:methylcheck.load_processed:loaded data (865859, 64) from 1 pickled files (0.25s)


Files:   0%|                                                                                                          | 0/1 [00:00<?, ?it/s]

Files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.67it/s]

Files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.66it/s]


INFO:methylcheck.load_processed:loaded data (485512, 317) from 1 pickled files (0.5s)


Files:   0%|                                                                                                          | 0/1 [00:00<?, ?it/s]

Files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.67it/s]

Files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.66it/s]


INFO:methylcheck.load_processed:loaded data (485512, 324) from 1 pickled files (0.516s)


In [2]:
import pandas as pd

# Cohort label -> loaded frame. Insertion order determines the column order of the
# merged frame, and each label becomes the outer level of the column index.
cohort_frames = {
    'GSE124413_AAML0531': betas1,
    'GSE190931_AAML1031': betas2,
    'GSE133986_AML05': betas3,
    'GDC_TARGET_AML': betas4,
    'StJude_AML02_AML08': betas5,
}

# join='inner' keeps only the rows (probes) shared by every cohort.
betas = pd.concat(
    list(cohort_frames.values()),
    keys=list(cohort_frames.keys()),
    join='inner',
    axis=1,
)

n_sites, n_cols = betas.shape
print(f' Dataset (df) contains {n_sites} rows (mC sites) and {n_cols} columns (samples).')

 Dataset (df) contains 452453 rows (mC sites) and 2244 columns (samples).


In [3]:
def exclude_suboptimal_probes(betas, annotation_path='../Data/UnreliableProbesList_Zhou2016/EPIC.anno.GRCh38.tsv'):
    '''Remove probes listed as sub-optimal according to:

    Zhou, W., Laird, P. W. & Shen, H.. Comprehensive characterization,
    annotation and innovative use of Infinium DNA methylation BeadChip probes.
    Nucleic Acids Research gkw967 (2016).
    doi:10.1093/nar/gkw967

    For the .tsv file containing the annotated probes, download the paper's
    supplementary material.

    Parameters
    ----------
    betas : pandas.DataFrame
        Frame whose index holds probe IDs; rows are filtered, columns are untouched.
    annotation_path : str or path-like, optional
        Tab-separated annotation file whose first column is the probe ID and which
        contains a 'MASK.general' column. Defaults to the location this notebook
        has always used, so existing calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The rows of `betas` whose probe ID is not flagged in 'MASK.general'.
    '''
    zhou2016_probes = pd.read_csv(annotation_path, sep='\t', index_col=0)

    # Explicit comparison with True is kept on purpose: any entry that is not
    # literally True (e.g. missing) is treated as "not masked" instead of raising.
    is_masked = zhou2016_probes['MASK.general'] == True
    unreliable_probes = zhou2016_probes[is_masked].index

    betas_ = betas[~betas.index.isin(unreliable_probes)]

    n_total = betas.shape[0]
    n_kept = betas_.shape[0]
    print(f'Of {n_total} probes, {n_total - n_kept} matched, yielding {n_kept} after filtering')
    return betas_

# Step 1: drop the probes flagged in the Zhou et al. (2016) 'MASK.general' annotation
# from the merged beta matrix.
df1 = exclude_suboptimal_probes(betas)

Of 452453 probes, 47382 matched, yielding 405071 after filtering


In [4]:
# Step 2: filter sex-linked and control probes (no_sex / no_control) using the '450k'
# array definition; verbose=True makes methylcheck print how many probes were removed.
df2 = methylcheck.exclude_sex_control_probes(df1, '450k', no_sex=True, no_control=True, verbose=True)

450k: Removed 9570 sex-linked probes from 2244 samples. 395501 probes remaining.


In [5]:
# The AAML1031 cohort was processed in three batches because of its size, so each
# batch directory has to be referenced separately here.

filepath2_1 = Path('../Data/Raw_Data/COG_AAML1031_GSE190931/GPL21145_1')
filepath2_2 = Path('../Data/Raw_Data/COG_AAML1031_GSE190931/GPL21145_2')
filepath2_3 = Path('../Data/Raw_Data/COG_AAML1031_GSE190931/GPL21145_3')

# The calls below are intentionally left commented out.
# NOTE(review): presumably they were run once to produce the *_QC_Report.xlsx files
# that a later cell reads from these same directories — confirm before re-enabling.
# # AAML0531 QC Report
# methylcheck.controls_report(filepath=filepath1)
# # AAML1031 QC Report
# methylcheck.controls_report(filepath=filepath2_1)
# methylcheck.controls_report(filepath=filepath2_2)
# methylcheck.controls_report(filepath=filepath2_3)
# # AML05 QC Report
# methylcheck.controls_report(filepath=filepath3)
# # TARGET450K QC Report
# methylcheck.controls_report(filepath=filepath4)
# # StJude AML02 and 08 QC Report
# methylcheck.controls_report(filepath=filepath5)

In [6]:
# Read the per-cohort QC spreadsheets (one per AAML1031 batch); the first spreadsheet
# column becomes the index of each table.
qc_table1 = pd.read_excel(f'{filepath1}/GSE124413_QC_Report.xlsx', index_col=0)
qc_table2_1 = pd.read_excel(f'{filepath2_1}/GPL21145_1_QC_Report.xlsx', index_col=0)
qc_table2_2 = pd.read_excel(f'{filepath2_2}/GPL21145_2_QC_Report.xlsx', index_col=0)
qc_table2_3 = pd.read_excel(f'{filepath2_3}/GPL21145_3_QC_Report.xlsx', index_col=0)
qc_table3 = pd.read_excel(f'{filepath3}/AML05_Japanese_Trial_QC_Report.xlsx', index_col=0)
qc_table4 = pd.read_excel(f'{filepath4}/TARGET450k_GDC_QC_Report.xlsx', index_col=0)
qc_table5 = pd.read_excel(f'{filepath5}/StJude_AML02_AML08_QC_Report.xlsx', index_col=0)

# Stack all reports row-wise, skipping the first row of every report.
# NOTE(review): the first row is presumably a non-sample header/summary row — confirm.
qc_table = pd.concat(
    [
        report.iloc[1:]
        for report in (
            qc_table1,
            qc_table2_1,
            qc_table2_2,
            qc_table2_3,
            qc_table3,
            qc_table4,
            qc_table5,
        )
    ],
    axis=0,
)

In [7]:
# Rows whose 'Result' text mentions 'pval', reduced to the two columns that
# explain the outcome.
is_pval_failure = qc_table['Result'].str.contains('pval')
qc_failed = qc_table.loc[is_pval_failure, ['Result', 'Why Failed']]
qc_failed

Unnamed: 0_level_0,Result,Why Failed
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1
200991620022_R03C01,FAIL (pval),"Non-polymorphic Green, Non-polymorphic Red"
200973410088_R01C01,FAIL (pval),"Specificity I Red, Non-polymorphic Red"
200973410178_R04C01,FAIL (pval),Non-polymorphic Red
200973410077_R07C01,FAIL (pval),Non-polymorphic Red
202897370026_R03C01,FAIL (pval),"Staining Green, Staining Red, Extension Red, B..."
...,...,...
8784233006_R02C01,FAIL (pval),"Staining Red, Extension Red, Bisulfite Convers..."
8784233006_R03C01,FAIL (pval),"Extension Red, Specificity I Green, Non-polymo..."
8784233006_R04C02,FAIL (pval),"Extension Red, Bisulfite Conversion I Green C/..."
9379082097_R05C01,FAIL (pval),"Staining Green, Staining Red, Extension Green,..."


In [8]:
# Sample IDs live on the second level (level=1) of the column index; the first
# level is the cohort label added when the cohorts were concatenated.
failed_sample_ids = qc_failed.index.tolist()
df3 = df2.drop(columns=failed_sample_ids, level=1)

n_samples_dropped = df2.shape[1] - df3.shape[1]
print(f'{n_samples_dropped} sample(s) removed because: (pOOBAH ≤ 0.05) > 80% probes')

364 sample(s) removed because: (pOOBAH ≤ 0.05) > 80% probes


In [9]:
def probe_cutoff(qc_betas, threshold):
    '''Drop probes (rows) that have too few non-missing values across samples.

    Parameters
    ----------
    qc_betas : pandas.DataFrame
        Frame with one row per probe and one column per sample.
    threshold : float
        Fraction of columns that must be non-missing for a row to be kept.
        The required count is int(threshold * number_of_columns), i.e. the
        product is truncated towards zero.

    Returns
    -------
    pandas.DataFrame
        `qc_betas` without the rows that fall below the required count.
    '''
    min_non_missing = int(threshold * qc_betas.shape[1])
    qc_betas2 = qc_betas.dropna(axis=0, thresh=min_non_missing)

    n_removed = qc_betas.shape[0] - qc_betas2.shape[0]
    # Derive the reported percentage from `threshold` instead of hard-coding 5%,
    # so the message stays truthful for any cutoff. ':g' hides float noise
    # (e.g. 5.000000000000004 is printed as 5).
    max_missing_pct = (1 - threshold) * 100
    print(f'{n_removed} probe(s) removed because of >{max_missing_pct:g}% missing values')
    return qc_betas2

# Keep only probes that have a value in at least int(0.95 * n_samples) of the
# remaining samples.
df4 = probe_cutoff(df3, threshold=0.95)

84956 probe(s) removed because of >5% missing values


In [10]:
# Fill the remaining gaps by interpolating down each column (axis=0); the second,
# backward pass handles gaps the first pass leaves at the top of a column.
# Values are then rounded to three decimals.
df5 = (
    df4
    .interpolate(axis=0)
    .interpolate(axis=0, limit_direction='backward')
    .round(3)
)

n_probes_final, n_samples_final = df5.shape
print(f' Interpolated dataset contains {n_probes_final} rows (mC probes) and {n_samples_final} columns (samples).')

 Interpolated dataset contains 310545 rows (mC probes) and 1880 columns (samples).


In [11]:
# Destination directory for the processed dataset (trailing slash is part of the value).
output_path = '../Data/Processed_Data/Methyl_Array_Processed/'

# Persist the final frame as a pickle inside that directory.
pickle_file = f'{output_path}2_MethylData_Processing_Output.pkl'
df5.to_pickle(pickle_file)

print(f'Successfuly saved processed methyl dataset.\nPath: {output_path}')


Successfuly saved processed methyl dataset.
Path: ../Data/Processed_Data/Methyl_Array_Processed/


In [12]:
# Load the `watermark` IPython extension so the %watermark magic is available below.
%load_ext watermark

In [13]:
# Record the Python/IPython versions (-v) and the versions of the listed packages (-p)
# used for this run, for reproducibility.
%watermark -v -p methylcheck,pandas

Python implementation: CPython
Python version       : 3.7.9
IPython version      : 7.34.0

methylcheck: 0.8.5
pandas     : 1.3.5

