## Download multiple modalities of pan-cancer data from TCGA

The data is accessed directly from the [Genomic Data Commons](https://gdc.cancer.gov/about-data/publications/pancanatlas) (GDC).

In [1]:
import hashlib
import os
from urllib.request import urlretrieve

import pandas as pd

import mpmp.config as cfg

First, we load a manifest file containing the GDC API ID and filename for each relevant file, as well as the md5 checksum used to verify that the complete, uncorrupted file was downloaded.

The manifest included in this GitHub repo was downloaded from https://gdc.cancer.gov/node/971 on December 1, 2020.

In [2]:
# The manifest is a tab-separated file; its first column becomes the row index,
# so each file's entry can be looked up by name with `.loc`.
manifest_path = os.path.join(cfg.data_dir, 'manifest.tsv')
manifest_df = pd.read_csv(manifest_path, sep='\t', index_col=0)
manifest_df.head()

Unnamed: 0_level_0,id,filename,md5,size
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mirna_sample,55d9bf6f-0712-4315-b588-e6f8e295018e,PanCanAtlas_miRNA_sample_information_list.txt,02bb56712be34bcd58c50d90387aebde,553408
methylation,d82e2c44-89eb-43d9-b6d3-712732bf6a53,jhu-usc.edu_PANCAN_merged_HumanMethylation27_H...,5cec086f0b002d17befef76a3241e73b,5022150019
rppa,fcbb373e-28d4-4818-92f3-601ede3da5e1,TCGA-RPPA-pancan-clean.txt,e2b914c7ecd369589275d546d9555b05,18901234
rna_seq,3586c0da-64d0-4b74-a449-5ff4d9136611,EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2....,02e72c33071307ff6570621480d3c90b,1882540959
mirna,1c6174d9-8ffb-466e-b5ee-07b204c15cf8,pancanMiRs_EBadjOnProtocolPlatformWithoutRepsW...,c7501dc3c505ca172a6a05b611bd11c3,67167640


### Download gene expression data

In [3]:
# Create the raw data directory if needed; exist_ok avoids a separate
# existence check (and the race between checking and creating).
os.makedirs(cfg.raw_data_dir, exist_ok=True)

rnaseq_id, rnaseq_filename = manifest_df.loc['rna_seq'].id, manifest_df.loc['rna_seq'].filename
# Use HTTPS so the file is not fetched over an unencrypted connection.
url = 'https://api.gdc.cancer.gov/data/{}'.format(rnaseq_id)
exp_filepath = os.path.join(cfg.raw_data_dir, rnaseq_filename)

if not os.path.exists(exp_filepath):
    # Download to a temporary '.part' file and move it into place only after
    # urlretrieve returns, so an interrupted download is never mistaken for a
    # complete file by the existence check on a later run.
    partial_filepath = exp_filepath + '.part'
    urlretrieve(url, partial_filepath)
    os.replace(partial_filepath, exp_filepath)
else:
    print('Downloaded data file already exists, skipping download')

Downloaded data file already exists, skipping download


In [4]:
# Compute the md5 checksum in pure Python rather than shelling out to
# `md5sum`, so the check does not depend on an external command or on shell
# quoting of the file path.
md5_hasher = hashlib.md5()
with open(exp_filepath, 'rb') as f:
    # Hash the file in 1 MiB chunks so it is never loaded into memory at once.
    for chunk in iter(lambda: f.read(1 << 20), b''):
        md5_hasher.update(chunk)
md5_sum = md5_hasher.hexdigest()

print('{}  {}'.format(md5_sum, exp_filepath))
assert md5_sum == manifest_df.loc['rna_seq'].md5, (
    'md5 mismatch for {}; delete the file and re-run the download cell'.format(exp_filepath)
)

02e72c33071307ff6570621480d3c90b  /home/jake/research/mpmp/data/raw/EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv


### Download DNA methylation data

In [5]:
me_id, me_filename = manifest_df.loc['methylation'].id, manifest_df.loc['methylation'].filename
# Use HTTPS so the file is not fetched over an unencrypted connection.
url = 'https://api.gdc.cancer.gov/data/{}'.format(me_id)
me_filepath = os.path.join(cfg.raw_data_dir, me_filename)

if not os.path.exists(me_filepath):
    # Download to a temporary '.part' file and move it into place only after
    # urlretrieve returns, so an interrupted download is never mistaken for a
    # complete file by the existence check on a later run.
    partial_filepath = me_filepath + '.part'
    urlretrieve(url, partial_filepath)
    os.replace(partial_filepath, me_filepath)
else:
    print('Downloaded data file already exists, skipping download')

Downloaded data file already exists, skipping download


In [6]:
# Compute the md5 checksum in pure Python rather than shelling out to
# `md5sum`, so the check does not depend on an external command or on shell
# quoting of the file path.
md5_hasher = hashlib.md5()
with open(me_filepath, 'rb') as f:
    # Hash the file in 1 MiB chunks so it is never loaded into memory at once.
    for chunk in iter(lambda: f.read(1 << 20), b''):
        md5_hasher.update(chunk)
md5_sum = md5_hasher.hexdigest()

print('{}  {}'.format(md5_sum, me_filepath))
assert md5_sum == manifest_df.loc['methylation'].md5, (
    'md5 mismatch for {}; delete the file and re-run the download cell'.format(me_filepath)
)

5cec086f0b002d17befef76a3241e73b  /home/jake/research/mpmp/data/raw/jhu-usc.edu_PANCAN_merged_HumanMethylation27_HumanMethylation450.betaValue_whitelisted.tsv


### Download RPPA data

In [7]:
rppa_id, rppa_filename = manifest_df.loc['rppa'].id, manifest_df.loc['rppa'].filename
# Use HTTPS so the file is not fetched over an unencrypted connection.
url = 'https://api.gdc.cancer.gov/data/{}'.format(rppa_id)
rppa_filepath = os.path.join(cfg.raw_data_dir, rppa_filename)

if not os.path.exists(rppa_filepath):
    # Download to a temporary '.part' file and move it into place only after
    # urlretrieve returns, so an interrupted download is never mistaken for a
    # complete file by the existence check on a later run.
    partial_filepath = rppa_filepath + '.part'
    urlretrieve(url, partial_filepath)
    os.replace(partial_filepath, rppa_filepath)
else:
    print('Downloaded data file already exists, skipping download')

Downloaded data file already exists, skipping download


In [8]:
# Compute the md5 checksum in pure Python rather than shelling out to
# `md5sum`, so the check does not depend on an external command or on shell
# quoting of the file path.
md5_hasher = hashlib.md5()
with open(rppa_filepath, 'rb') as f:
    # Hash the file in 1 MiB chunks so it is never loaded into memory at once.
    for chunk in iter(lambda: f.read(1 << 20), b''):
        md5_hasher.update(chunk)
md5_sum = md5_hasher.hexdigest()

print('{}  {}'.format(md5_sum, rppa_filepath))
assert md5_sum == manifest_df.loc['rppa'].md5, (
    'md5 mismatch for {}; delete the file and re-run the download cell'.format(rppa_filepath)
)

e2b914c7ecd369589275d546d9555b05  /home/jake/research/mpmp/data/raw/TCGA-RPPA-pancan-clean.txt


### Download miRNA data

In [9]:
mirna_id, mirna_filename = manifest_df.loc['mirna'].id, manifest_df.loc['mirna'].filename
# Use HTTPS so the file is not fetched over an unencrypted connection.
url = 'https://api.gdc.cancer.gov/data/{}'.format(mirna_id)
mirna_filepath = os.path.join(cfg.raw_data_dir, mirna_filename)

if not os.path.exists(mirna_filepath):
    # Download to a temporary '.part' file and move it into place only after
    # urlretrieve returns, so an interrupted download is never mistaken for a
    # complete file by the existence check on a later run.
    partial_filepath = mirna_filepath + '.part'
    urlretrieve(url, partial_filepath)
    os.replace(partial_filepath, mirna_filepath)
else:
    print('Downloaded data file already exists, skipping download')

Downloaded data file already exists, skipping download


In [10]:
# Compute the md5 checksum in pure Python rather than shelling out to
# `md5sum`, so the check does not depend on an external command or on shell
# quoting of the file path.
md5_hasher = hashlib.md5()
with open(mirna_filepath, 'rb') as f:
    # Hash the file in 1 MiB chunks so it is never loaded into memory at once.
    for chunk in iter(lambda: f.read(1 << 20), b''):
        md5_hasher.update(chunk)
md5_sum = md5_hasher.hexdigest()

print('{}  {}'.format(md5_sum, mirna_filepath))
assert md5_sum == manifest_df.loc['mirna'].md5, (
    'md5 mismatch for {}; delete the file and re-run the download cell'.format(mirna_filepath)
)

c7501dc3c505ca172a6a05b611bd11c3  /home/jake/research/mpmp/data/raw/pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.csv
