In [1]:
from pathlib import Path

import pandas as pd

# Compact console display: at most 5 rows and 10 columns per frame, cell text
# cut at 10 characters, output wrapped at 100 characters. set_option accepts
# several (option, value) pairs in a single call.
pd.set_option(
    "display.max_rows", 5,
    "display.max_columns", 10,
    "display.max_colwidth", 10,
    "display.width", 100,
)

In [None]:
# Local destination directory for the downloaded raw data. It is created up
# front (including missing parents); exist_ok=True makes re-running harmless.
data_dir_dst = Path("data") / "download"
data_dir_dst.mkdir(parents=True, exist_ok=True)

In [3]:
# Remote locations of the three TCGA LUAD tables on the Xena hub.

# Expression matrix: RSEM log2(x+1) per gene × sample (gzip-compressed).
expression_url = (
    'https://tcga-xena-hub.s3.us-east-1.amazonaws.com/download/'
    'TCGA.LUAD.sampleMap%2FHiSeqV2.gz'
)
# Clinical matrix.
clinical_url = (
    'https://tcga-xena-hub.s3.us-east-1.amazonaws.com/download/'
    'TCGA.LUAD.sampleMap%2FLUAD_clinicalMatrix'
)
# Survival table.
survival_url = (
    'https://tcga-xena-hub.s3.us-east-1.amazonaws.com/download/'
    'survival%2FLUAD_survival.txt'
)

# download raw data

In [4]:
# Read the tab-separated expression matrix from expression_url; the first
# column becomes the row index. read_table defaults to a tab separator.
expression = pd.read_table(expression_url, index_col=0)
# Quick sanity check: dimensions, then the first rows.
print(expression.shape, expression.head(), sep='\n')

(20530, 576)
           TCGA-69-7978-01  TCGA-62-8399-01  TCGA-78-7539-01  TCGA-50-5931-11  TCGA-73-4658-01  \
sample                                                                                           
ARHGEF10L     9.9898          10.4257           9.6264           8.6835           9.2078         
HIF3A         4.2598          11.6239           9.1362           9.4824           5.0288         
RNF17         0.4181           0.0000           1.1231           0.8221           0.0000         
RNF10        10.3657          11.5489          11.6692          11.7341          11.6209         
RNF11        11.1718          11.0200          10.4679          11.6787          11.3414         

           ...  TCGA-55-7727-01  TCGA-91-6831-01  TCGA-MN-A4N4-01  TCGA-55-8302-01  \
sample     ...                                                                       
ARHGEF10L  ...     7.2428           8.8388           9.9341          10.1696         
HIF3A      ...     7.5416           3.5613

In [5]:
# Read the tab-separated survival table from survival_url, keeping the default
# integer row index. read_table defaults to a tab separator.
survival = pd.read_table(survival_url)
# Quick sanity check: dimensions, then the first rows.
print(survival.shape, survival.head(), sep='\n')

(641, 11)
      sample   _PATIENT  OS  OS.time  DSS  ...  DFI  DFI.time  PFI  PFI.time  Redaction
0  TCGA-0...  TCGA-0...   0      0.0  0.0  ...  NaN       NaN    0       0.0        NaN
1  TCGA-0...  TCGA-0...   0   1523.0  0.0  ...  NaN       NaN    0    1523.0        NaN
2  TCGA-0...  TCGA-0...   1    121.0  NaN  ...  NaN       NaN    0     121.0        NaN
3  TCGA-0...  TCGA-0...   0    607.0  0.0  ...  1.0     334.0    1     334.0        NaN
4  TCGA-0...  TCGA-0...   0    426.0  0.0  ...  NaN       NaN    1     183.0        NaN

[5 rows x 11 columns]


In [6]:
# Read the tab-separated clinical matrix from clinical_url, keeping the default
# integer row index. read_table defaults to a tab separator.
clinical = pd.read_table(clinical_url)
# Quick sanity check: dimensions, then the first rows.
print(clinical.shape, clinical.head(), sep='\n')

(706, 148)
    sampleID  ABSOLUTE_Ploidy  ABSOLUTE_Purity  AKT1 ALK_translocation  ...  \
0  TCGA-0...        NaN              NaN         NaN        NaN         ...   
1  TCGA-0...       3.77             0.46        none        NaN         ...   
2  TCGA-0...        NaN              NaN         NaN        NaN         ...   
3  TCGA-0...        NaN              NaN        none        NaN         ...   
4  TCGA-0...       2.04             0.48        none        NaN         ...   

  _GENOMIC_ID_TCGA_LUAD_RPPA _GENOMIC_ID_TCGA_LUAD_exp_HiSeqV2_percentile  \
0  59881f...                  e6a101...                                     
1  5b46a2...                  d1a8d8...                                     
2  3f069d...                  bba933...                                     
3        NaN                  e4177b...                                     
4  5e735c...                  7d6cf8...                                     

  _GENOMIC_ID_TCGA_LUAD_mutation_broad_gene _GENOMI

# save raw data

In [None]:

# Persist each raw table as a snappy-compressed Parquet file inside
# data_dir_dst, in the order expression, clinical, survival.
raw_tables = (
    (expression, 'expression.parquet'),
    (clinical, 'clinical.parquet'),
    (survival, 'survival.parquet'),
)
for table, filename in raw_tables:
    table.to_parquet(data_dir_dst.joinpath(filename), compression='snappy')