## Preprocess pan-cancer methylation data

Load the downloaded data and curate sample IDs.

In [1]:
import os
import pandas as pd

import mpmp.config as cfg

### Load and process methylation data

In [4]:
# The manifest is read first because it records the filename of each raw
# data file; its first column is used as the row index.
manifest_path = os.path.join(cfg.data_dir, 'manifest.tsv')
manifest_df = pd.read_csv(manifest_path, sep='\t', index_col=0)
manifest_df.head(1)

Unnamed: 0_level_0,id,filename,md5,size
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mirna_sample,55d9bf6f-0712-4315-b588-e6f8e295018e,PanCanAtlas_miRNA_sample_information_list.txt,02bb56712be34bcd58c50d90387aebde,553408


In [5]:
# Look up the raw methylation filename in the manifest, then read that
# tab-separated file using its first column as the row index.
methylation_filename = manifest_df.loc['methylation', 'filename']
tcga_methylation_df = pd.read_csv(
    os.path.join(cfg.raw_data_dir, methylation_filename),
    sep='\t',
    index_col=0,
)

# report the dimensions and preview the top-left corner of the raw matrix
print(tcga_methylation_df.shape)
tcga_methylation_df.iloc[0:5, 0:5]

(22601, 12039)


Unnamed: 0_level_0,TCGA-02-0001-01C-01D-0186-05,TCGA-02-0003-01A-01D-0186-05,TCGA-02-0006-01B-01D-0186-05,TCGA-02-0007-01A-01D-0186-05,TCGA-02-0009-01A-01D-0186-05
Composite Element REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cg00000292,0.846378,0.732044,0.678917,0.846951,0.855192
cg00003994,0.066406,0.131451,0.089675,0.040882,0.0661
cg00005847,0.330927,0.839389,0.469196,0.355053,0.605179
cg00007981,0.026121,0.039102,0.021105,0.039487,0.025252
cg00008493,0.925941,0.934192,0.924509,0.930292,0.926921


In [6]:
# Discard every probe (row) that has a missing value, then flip the matrix
# so that it is samples x probes, with both axes in sorted order.
tcga_methylation_df = (
    tcga_methylation_df
    .dropna(axis='index')
    .T
    .sort_index(axis='index')
    .sort_index(axis='columns')
)

# after the transpose the rows are samples, so label the row index accordingly
tcga_methylation_df.index.name = 'sample_id'

In [7]:
# Shorten each sample ID to its first 15 characters so that it maps onto the
# clinical information, then keep only the first row for each shortened ID;
# this removes multiple samples measured on the same tumor.
truncated_ids = tcga_methylation_df.index.str[:15]
tcga_methylation_df.index = truncated_ids
is_first_occurrence = ~tcga_methylation_df.index.duplicated(keep='first')
tcga_methylation_df = tcga_methylation_df.loc[is_first_occurrence, :]

In [8]:
# report the dimensions after filtering and deduplication, and preview the
# top-left corner of the samples x probes matrix
print(tcga_methylation_df.shape)
tcga_methylation_df.iloc[0:5, 0:5]

(11985, 11882)


Composite Element REF,cg00005847,cg00008493,cg00012199,cg00012386,cg00012792
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TCGA-01-0628-11,0.568261,0.953532,0.035959,0.021465,0.024005
TCGA-01-0630-11,0.620646,0.959144,0.037645,0.016915,0.028272
TCGA-01-0631-11,0.43055,0.963075,0.036071,0.01738,0.023571
TCGA-01-0633-11,0.416541,0.959544,0.03871,0.025398,0.021352
TCGA-01-0636-11,0.695484,0.961997,0.038338,0.019061,0.017349


In [9]:
# Write the processed matrix as a gzip-compressed, tab-separated file.
# Floats are formatted with '%.3g', i.e. 3 significant digits.
tcga_methylation_df.to_csv(
    cfg.methylation_data,
    sep='\t',
    compression='gzip',
    float_format='%.3g',
)