## Download multiple modalities of pan-cancer data from TCGA

This notebook is modified from the original in [mpmp](https://github.com/greenelab/mpmp/blob/5852824c1b00921a298be0cdebb253a272ea0d39/00_download_data/0_download_data.ipynb) and is used under the terms of the BSD-3-Clause

The data is accessed directly from the [Genome Data Commons](https://gdc.cancer.gov/about-data/publications/pancanatlas).

NOTE: this download script uses the `md5sum` shell utility to verify file hashes. This script was developed and tested on a Linux machine, and `md5sum` commands may have to be changed to work on other platforms.

In [2]:
import os
import pandas as pd
from urllib.request import urlretrieve

First, we load a manifest file containing the GDC API ID and filename for each relevant file, as well as the md5 checksum to make sure the whole/uncorrupted file was downloaded.

In [3]:
data_dir = '../data'
manifest_df = pd.read_csv(os.path.join(data_dir, 'manifest.tsv'),
                          sep='\t', index_col=0)
manifest_df['name'] = 'unused'
manifest_df.loc[manifest_df['filename'].str.contains('RNASeq'), 'name'] = 'rna_seq'
manifest_df.loc[manifest_df['filename'].str.contains('PUBLIC.maf'), 'name'] = 'mutations'
manifest_df = manifest_df.reset_index()
manifest_df = manifest_df.set_index('name')
manifest_df.head()

Unnamed: 0_level_0,id,filename,md5,size
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
unused,7d4c0344-f018-4ab0-949a-09815f483480,merge_merged_reals.tar.gz,ff8bf50dcd3a314162af71d1b8e538b6,388646603
unused,0f4f5701-7b61-41ae-bda9-2805d1ca9781,TCGA_mastercalls.abs_segtabs.fixed.txt,585c8793730f294d7bf0144566bb37fa,253061161
unused,1a7d7be8-675d-4e60-a105-19d4121bdebf,merged_sample_quality_annotations.tsv,05ddd2270fb1fb24fbdc2fe9bf7384e5,8463670
unused,55d9bf6f-0712-4315-b588-e6f8e295018e,PanCanAtlas_miRNA_sample_information_list.txt,02bb56712be34bcd58c50d90387aebde,553408
unused,d82e2c44-89eb-43d9-b6d3-712732bf6a53,jhu-usc.edu_PANCAN_merged_HumanMethylation27_H...,5cec086f0b002d17befef76a3241e73b,5022150019


### Download gene expression data

In [17]:
os.makedirs(data_dir, exist_ok=True)
    
rnaseq_id, rnaseq_filename = manifest_df.loc['rna_seq'].id, manifest_df.loc['rna_seq'].filename
url = 'http://api.gdc.cancer.gov/data/{}'.format(rnaseq_id)
exp_filepath = os.path.join(data_dir, rnaseq_filename)

if not os.path.exists(exp_filepath) and not os.path.exists(os.path.join(data_dir, 'tcga_expression.tsv')):
    urlretrieve(url, exp_filepath)
else:
    print('Downloaded data file already exists, skipping download')

Downloaded data file already exists, skipping download


In [15]:
!mv $exp_filepath $data_dir/tcga_expression.tsv

mv: cannot stat '../data/EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv': No such file or directory


In [16]:
md5_sum = !md5sum $data_dir/tcga_expression.tsv
print(md5_sum[0])
assert md5_sum[0].split(' ')[0] == manifest_df.loc['rna_seq'].md5

02e72c33071307ff6570621480d3c90b  ../data/tcga_expression.tsv


### Download mutation data

In [7]:
base_url = "https://github.com/greenelab/pancancer/raw/{}/data/pancan_mutation_freeze.tsv.gz"
commit = "2a0683b68017fb226f4053e63415e4356191734f"  
    
url = base_url.format(commit)
exp_filepath = os.path.join(data_dir, 'mutations.tsv.gz')

if not os.path.exists(exp_filepath):
    urlretrieve(url, exp_filepath)
else:
    print('Downloaded data file already exists, skipping download')

In [8]:
!gunzip $exp_filepath

## Prototype data processing

In [9]:
import pandas as pd

In [24]:
expression_df = pd.read_csv(os.path.join(data_dir, 'tcga_expression.tsv'), sep='\t', index_col=0)

In [25]:
mutation_df = pd.read_csv(os.path.join(data_dir, 'mutations.tsv'), sep='\t', index_col=0)

In [26]:
expression_df

Unnamed: 0_level_0,TCGA-OR-A5J1-01A-11R-A29S-07,TCGA-OR-A5J2-01A-11R-A29S-07,TCGA-OR-A5J3-01A-11R-A29S-07,TCGA-OR-A5J5-01A-11R-A29S-07,TCGA-OR-A5J6-01A-31R-A29S-07,TCGA-OR-A5J7-01A-11R-A29S-07,TCGA-OR-A5J8-01A-11R-A29S-07,TCGA-OR-A5J9-01A-11R-A29S-07,TCGA-OR-A5JA-01A-11R-A29S-07,TCGA-OR-A5JB-01A-11R-A29S-07,...,TCGA-CG-4449-01A-01R-1157-13,TCGA-CG-4462-01A-01R-1157-13,TCGA-CG-4465-01A-01R-1157-13,TCGA-CG-4466-01A-01R-1157-13,TCGA-CG-4469-01A-01R-1157-13,TCGA-CG-4472-01A-01R-1157-13,TCGA-CG-4474-01A-02R-1157-13,TCGA-CG-4475-01A-01R-1157-13,TCGA-CG-4476-01A-01R-1157-13,TCGA-CG-4477-01A-01R-1157-13
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
?|100130426,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,,,,,,,,,,
?|100133144,3.2661,2.6815,1.7301,0.0000,0.0000,1.1673,1.4422,0.0000,4.4556,7.1293,...,4.358154,5.676995,5.219350,14.846708,20.115492,6.997533,18.311906,12.057112,18.628740,17.874417
?|100134869,3.9385,8.9948,6.5650,1.5492,4.4709,6.0529,2.2876,1.3599,5.0581,0.0000,...,2.656360,3.342794,2.423442,5.055287,11.626054,13.654193,7.417109,11.585177,11.482418,14.919338
?|10357,149.1350,81.0777,86.4879,53.9117,66.9063,103.5060,94.9316,78.1955,69.2389,155.7090,...,633.299781,294.018042,686.569179,563.573453,1039.307597,639.238135,742.479964,506.336449,712.452165,703.713324
?|10431,2034.1000,1304.9300,1054.6600,2350.8900,1257.9900,1866.4300,995.0270,1762.1200,1213.5300,2005.5700,...,1202.538277,644.002317,1181.884532,663.885074,647.530395,1297.152549,1152.909807,1375.495774,971.893874,1736.988111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A|440590,0.4803,31.4052,0.5925,11.6189,7.8240,85.4392,0.4144,2.3799,1.0571,165.7570,...,20.923873,1.839530,2.916935,239.014921,1.845753,3.268489,17.164493,3.756246,0.301440,217.431795
ZYG11B|79699,648.4150,1166.0200,806.3990,553.8340,795.8120,520.6580,556.1540,913.1870,805.4970,784.6720,...,1322.386301,1025.213701,814.306556,907.845035,953.276441,905.046317,757.811259,927.963540,845.677334,859.078048
ZYX|7791,1841.0200,3059.9900,2655.6100,2367.9300,708.0710,855.1940,10924.6000,2122.1600,1939.2200,4050.3500,...,2783.898049,4960.431833,3447.701267,978.304677,2789.057736,3359.241568,4264.469081,3103.609391,3302.569055,2497.814797
ZZEF1|23140,1157.5400,1895.9900,1482.4500,1140.2000,796.3710,897.7140,1095.7300,1003.6200,904.8630,370.2800,...,1284.992478,2054.896390,2420.047163,1302.821382,1119.313995,1740.926312,2702.668453,1370.141309,1915.477072,1247.130940


In [35]:
expression_df[expression_df.isna().sum(axis=1) > 0]

Unnamed: 0_level_0,TCGA-OR-A5J1-01A-11R-A29S-07,TCGA-OR-A5J2-01A-11R-A29S-07,TCGA-OR-A5J3-01A-11R-A29S-07,TCGA-OR-A5J5-01A-11R-A29S-07,TCGA-OR-A5J6-01A-31R-A29S-07,TCGA-OR-A5J7-01A-11R-A29S-07,TCGA-OR-A5J8-01A-11R-A29S-07,TCGA-OR-A5J9-01A-11R-A29S-07,TCGA-OR-A5JA-01A-11R-A29S-07,TCGA-OR-A5JB-01A-11R-A29S-07,...,TCGA-CG-4449-01A-01R-1157-13,TCGA-CG-4462-01A-01R-1157-13,TCGA-CG-4465-01A-01R-1157-13,TCGA-CG-4466-01A-01R-1157-13,TCGA-CG-4469-01A-01R-1157-13,TCGA-CG-4472-01A-01R-1157-13,TCGA-CG-4474-01A-02R-1157-13,TCGA-CG-4475-01A-01R-1157-13,TCGA-CG-4476-01A-01R-1157-13,TCGA-CG-4477-01A-01R-1157-13
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
?|100130426,0.0000,0.0000,0.0000,0.0000,0.000,0.0000,0.0000,0.000,0.0000,0.0,...,,,,,,,,,,
?|136542,0.0000,0.0000,0.0000,0.0000,0.000,0.0000,0.0000,0.000,0.0000,0.0,...,,,,,,,,,,
?|26823,1.4409,0.0000,0.5925,0.7746,0.000,0.0000,1.6577,0.000,2.1142,0.0,...,0.489337,0.527482,0.087451,1.201876,0.087451,0.513451,0.497339,0.087451,1.303840,0.087451
?|280660,0.0000,0.0000,0.0000,0.0000,0.000,0.0000,0.0000,0.000,0.0000,0.0,...,,,,,,,,,,
?|317712,0.0000,0.0000,0.0000,0.0000,0.000,0.0000,0.0000,0.000,0.0000,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZPBP2|124626,0.0000,0.4026,0.0000,0.0000,0.000,0.0000,0.4144,0.000,0.0000,0.0,...,,,,,,,,,,
ZPLD1|131368,0.0000,1.6105,0.5925,0.0000,0.000,0.0000,7.0452,0.680,0.0000,0.0,...,,,,,,,,,,
ZSCAN1|284312,0.9606,12.0789,20.7377,83.6561,7.824,15.2427,1.2433,52.017,67.1247,0.0,...,,,,,,,,,,
ZSCAN4|201516,0.4803,0.8053,0.0000,0.0000,0.000,0.4011,0.0000,0.000,0.5285,0.0,...,14.353376,26.811432,0.211351,4.840275,-0.094774,1.633818,4.861571,1.116053,12.214718,2.242279


In [45]:
expression_df.loc[:,expression_df.isna().sum() == 0]

Unnamed: 0_level_0,TCGA-OR-A5J1-01A-11R-A29S-07,TCGA-OR-A5J2-01A-11R-A29S-07,TCGA-OR-A5J3-01A-11R-A29S-07,TCGA-OR-A5J5-01A-11R-A29S-07,TCGA-OR-A5J6-01A-31R-A29S-07,TCGA-OR-A5J7-01A-11R-A29S-07,TCGA-OR-A5J8-01A-11R-A29S-07,TCGA-OR-A5J9-01A-11R-A29S-07,TCGA-OR-A5JA-01A-11R-A29S-07,TCGA-OR-A5JB-01A-11R-A29S-07,...,TCGA-ZH-A8Y6-01A-11R-A41I-07,TCGA-ZH-A8Y8-01A-51R-A41I-07,TCGA-ZU-A8S4-01A-11R-A41I-07,TCGA-ZU-A8S4-11A-11R-A41I-07,TCGA-IC-A6RF-01A-13R-A336-31,TCGA-IC-A6RF-11A-21R-A336-31,TCGA-L5-A88T-01A-11R-A354-31,TCGA-CG-4460-01A-01R-1157-13,TCGA-CG-5716-01A-21R-1802-13,TCGA-HF-7131-01A-11R-2055-13
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
?|100130426,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.3752,0.0000,0.0000,0.4145
?|100133144,3.2661,2.6815,1.7301,0.0000,0.0000,1.1673,1.4422,0.0000,4.4556,7.1293,...,12.0874,12.8354,1.4409,1.7594,13.5173,31.5470,100.4940,19.6298,28.6589,5.2112
?|100134869,3.9385,8.9948,6.5650,1.5492,4.4709,6.0529,2.2876,1.3599,5.0581,0.0000,...,7.8019,15.9712,2.5477,1.4988,7.4984,33.0939,103.2220,15.7466,32.3721,10.4642
?|10357,149.1350,81.0777,86.4879,53.9117,66.9063,103.5060,94.9316,78.1955,69.2389,155.7090,...,139.1880,61.8354,132.5240,74.5696,15.9720,17.6796,12.6227,107.8940,0.0000,0.6967
?|10431,2034.1000,1304.9300,1054.6600,2350.8900,1257.9900,1866.4300,995.0270,1762.1200,1213.5300,2005.5700,...,783.1880,975.3090,1309.2400,976.3780,1221.4400,1017.6800,584.5020,1095.0200,983.5630,935.2960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A|440590,0.4803,31.4052,0.5925,11.6189,7.8240,85.4392,0.4144,2.3799,1.0571,165.7570,...,0.3753,1.6461,0.4986,48.8732,32.2242,6.6298,23.4923,4.1135,2.5697,0.3483
ZYG11B|79699,648.4150,1166.0200,806.3990,553.8340,795.8120,520.6580,556.1540,913.1870,805.4970,784.6720,...,947.1810,1271.6000,726.4120,2652.1900,827.7410,849.7240,906.7320,804.6070,568.5520,780.6320
ZYX|7791,1841.0200,3059.9900,2655.6100,2367.9300,708.0710,855.1940,10924.6000,2122.1600,1939.2200,4050.3500,...,1859.8400,3953.9100,10561.6000,2042.9000,3931.3500,7283.9800,6919.7100,2491.9800,6013.1600,10838.6000
ZZEF1|23140,1157.5400,1895.9900,1482.4500,1140.2000,796.3710,897.7140,1095.7300,1003.6200,904.8630,370.2800,...,2264.0000,957.6130,998.6290,715.7210,3106.4100,2848.0700,3294.8800,1650.3500,3154.3400,2024.9100


In [52]:
example_df = nonan_df.iloc[:4,:4]
example_df

Unnamed: 0_level_0,TCGA-OR-A5J1-01A-11R-A29S-07,TCGA-OR-A5J2-01A-11R-A29S-07,TCGA-OR-A5J3-01A-11R-A29S-07,TCGA-OR-A5J5-01A-11R-A29S-07
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
?|100130426,0.0,0.0,0.0,0.0
?|100133144,3.2661,2.6815,1.7301,0.0
?|100134869,3.9385,8.9948,6.565,1.5492
?|10357,149.135,81.0777,86.4879,53.9117


In [55]:
example_df.div(example_df.sum(), axis=1)

Unnamed: 0_level_0,TCGA-OR-A5J1-01A-11R-A29S-07,TCGA-OR-A5J2-01A-11R-A29S-07,TCGA-OR-A5J3-01A-11R-A29S-07,TCGA-OR-A5J5-01A-11R-A29S-07
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
?|100130426,0.0,0.0,0.0,0.0
?|100133144,0.020891,0.02891,0.018253,0.0
?|100134869,0.025192,0.096975,0.069263,0.027933
?|10357,0.953917,0.874115,0.912483,0.972067


In [27]:
mutation_df

Unnamed: 0_level_0,5S_rRNA,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,...,ZYX,ZZEF1,ZZZ3,hsa-mir-1199,hsa-mir-150,hsa-mir-3171,hsa-mir-466,hsa-mir-5195,hsa-mir-6080,hsa-mir-7162
SAMPLE_BARCODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-02-0047-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-02-0055-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-02-2483-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-02-2485-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-02-2486-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-ZS-A9CF-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-ZS-A9CG-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-ZT-A8OM-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-ZU-A8S4-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
