In [26]:
library(tidyverse)
library(TCGAbiolinks)
library(HDF5Array)

## Constants

In [2]:
projects <- c("TCGA-CESC", "TCGA-OV", "TCGA-UCS", "TCGA-UCEC")
# projects <- c("TCGA-CESC", "TCGA-OV")
data_root <- "../../../../../mnt/d/TCGA"
project_dirs <- sapply(
    projects, 
    (function(p) paste0(data_root, "/", p, "-", "TCGAbiolinks"))
)

## Functions

In [12]:
rna_seq_query <- function(p) {
    return(GDCquery(
        project = p,
        data.category = "Transcriptome Profiling",
        data.type = "Gene Expression Quantification",
        workflow.type = "HTSeq - Counts"
    ))
}

## Download and save data
No loops used here since API is really finicky and quits abruptly

In [17]:
proj_idx <- 1
q <- rna_seq_query(projects[proj_idx])
GDCdownload(q, method = "api", directory = data_root, files.per.chunk = 10)
data <- GDCprepare(q, directory = data_root)

--------------------------------------
o GDCquery: Searching in GDC database
--------------------------------------
Genome of reference: hg38
--------------------------------------------
oo Accessing GDC. This might take a while...
--------------------------------------------
ooo Project: TCGA-CESC
--------------------
oo Filtering results
--------------------
ooo By data.type
ooo By workflow.type
----------------
oo Checking data
----------------
ooo Check if there are duplicated cases
ooo Check if there results for the query
-------------------
o Preparing output
-------------------
Downloading data for project TCGA-CESC
Of the 309 files for download 309 already exist.
All samples have been already downloaded




Starting to add information to samples
 => Add clinical information to samples
Add FFPE information. More information at: 
=> https://cancergenome.nih.gov/cancersselected/biospeccriteria 
=> http://gdac.broadinstitute.org/runs/sampleReports/latest/FPPP_FFPE_Cases.html
 => Adding subtype information to samples
cesc subtype information from:doi:10.1038/nature21386
Accessing www.ensembl.org to get gene information
Downloading genome information (try:0) Using: Human genes (GRCh38.p13)
Cache found
From the 60483 genes we couldn't map 3990


In [31]:
data
saveHDF5SummarizedExperiment(data, dir = paste0(data_root, "/", "saved_RSE_objects"), prefix = sub("-", "_", projects[proj_idx]))

class: RangedSummarizedExperiment 
dim: 56493 309 
metadata(1): data_release
assays(1): HTSeq - Counts
rownames(56493): ENSG00000000003 ENSG00000000005 ... ENSG00000281912
  ENSG00000281920
rowData names(3): ensembl_gene_id external_gene_name
  original_ensembl_gene_id
colnames(309): TCGA-JX-A3Q8-01A-11R-A21T-07
  TCGA-C5-A1BK-01B-11R-A13Y-07 ... TCGA-EK-A2R8-01A-21R-A18M-07
  TCGA-JW-A5VK-01A-11R-A28H-07
colData names(131): sample patient ... subtype_GEXP.APOBEC3H.164668
  subtype_patient

ERROR: Error in .check_and_delete_files(rds_path, h5_path, replace): Files "../../../../../mnt/d/TCGA/saved_RSE_objects/TCGA_CESCse.rds"
  and/or "../../../../../mnt/d/TCGA/saved_RSE_objects/TCGA_CESCassays.h5"
  already exist. Use a different 'prefix' or use 'replace=TRUE' if you
  really want to replace them.


In [32]:
proj_idx <- 2
q <- rna_seq_query(projects[proj_idx])
GDCdownload(q, method = "api", directory = data_root, files.per.chunk = 10)
data <- GDCprepare(q, directory = data_root)

--------------------------------------
o GDCquery: Searching in GDC database
--------------------------------------
Genome of reference: hg38
--------------------------------------------
oo Accessing GDC. This might take a while...
--------------------------------------------
ooo Project: TCGA-OV
--------------------
oo Filtering results
--------------------
ooo By data.type
ooo By workflow.type
----------------
oo Checking data
----------------
ooo Check if there are duplicated cases
ooo Check if there results for the query
-------------------
o Preparing output
-------------------
Downloading data for project TCGA-OV
Of the 379 files for download 379 already exist.
All samples have been already downloaded




Starting to add information to samples
 => Add clinical information to samples
Add FFPE information. More information at: 
=> https://cancergenome.nih.gov/cancersselected/biospeccriteria 
=> http://gdac.broadinstitute.org/runs/sampleReports/latest/FPPP_FFPE_Cases.html
 => Adding subtype information to samples
Accessing www.ensembl.org to get gene information
Downloading genome information (try:0) Using: Human genes (GRCh38.p13)
Cache found
From the 60483 genes we couldn't map 3990


In [36]:
data
saveHDF5SummarizedExperiment(data, dir = paste0(data_root, "/", "saved_RSE_objects"), prefix = sub("-", "_", projects[proj_idx]))

class: RangedSummarizedExperiment 
dim: 56493 379 
metadata(1): data_release
assays(1): HTSeq - Counts
rownames(56493): ENSG00000000003 ENSG00000000005 ... ENSG00000281912
  ENSG00000281920
rowData names(3): ensembl_gene_id external_gene_name
  original_ensembl_gene_id
colnames(379): TCGA-5X-AA5U-01A-11R-A406-31
  TCGA-25-2400-01A-01R-1569-13 ... TCGA-13-1498-01A-01R-1565-13
  TCGA-31-1953-01A-01R-1568-13
colData names(63): sample patient ... name is_ffpe

In [41]:
proj_idx <- 3
q <- rna_seq_query(projects[proj_idx])
GDCdownload(q, method = "api", directory = data_root, files.per.chunk = 10)
data <- GDCprepare(q, directory = data_root)

--------------------------------------
o GDCquery: Searching in GDC database
--------------------------------------
Genome of reference: hg38
--------------------------------------------
oo Accessing GDC. This might take a while...
--------------------------------------------
ooo Project: TCGA-UCS
--------------------
oo Filtering results
--------------------
ooo By data.type
ooo By workflow.type
----------------
oo Checking data
----------------
ooo Check if there are duplicated cases
ooo Check if there results for the query
-------------------
o Preparing output
-------------------
Downloading data for project TCGA-UCS
Of the 56 files for download 56 already exist.
All samples have been already downloaded




Starting to add information to samples
 => Add clinical information to samples
Add FFPE information. More information at: 
=> https://cancergenome.nih.gov/cancersselected/biospeccriteria 
=> http://gdac.broadinstitute.org/runs/sampleReports/latest/FPPP_FFPE_Cases.html
 => Adding subtype information to samples
ucs subtype information from:doi:10.1016/j.ccell.2017.02.010
Accessing www.ensembl.org to get gene information
Downloading genome information (try:0) Using: Human genes (GRCh38.p13)
Cache found
From the 60483 genes we couldn't map 3990


In [42]:
data
saveHDF5SummarizedExperiment(data, dir = paste0(data_root, "/", "saved_RSE_objects"), prefix = sub("-", "_", projects[proj_idx]))

class: RangedSummarizedExperiment 
dim: 56493 56 
metadata(1): data_release
assays(1): HTSeq - Counts
rownames(56493): ENSG00000000003 ENSG00000000005 ... ENSG00000281912
  ENSG00000281920
rowData names(3): ensembl_gene_id external_gene_name
  original_ensembl_gene_id
colnames(56): TCGA-N8-A4PO-01A-11R-A28V-07 TCGA-N5-A4RA-01A-11R-A28V-07
  ... TCGA-N9-A4Q7-01A-11R-A28V-07 TCGA-NG-A4VW-01A-11R-A28V-07
colData names(128): sample patient ... subtype_histologic.subtype
  subtype_Estimated_Carc_percent

In [43]:
proj_idx <- 4
q <- rna_seq_query(projects[proj_idx])
GDCdownload(q, method = "api", directory = data_root, files.per.chunk = 10)
data <- GDCprepare(q, directory = data_root)

--------------------------------------
o GDCquery: Searching in GDC database
--------------------------------------
Genome of reference: hg38
--------------------------------------------
oo Accessing GDC. This might take a while...
--------------------------------------------
ooo Project: TCGA-UCEC
--------------------
oo Filtering results
--------------------
ooo By data.type
ooo By workflow.type
----------------
oo Checking data
----------------
ooo Check if there are duplicated cases
ooo Check if there results for the query
-------------------
o Preparing output
-------------------
Downloading data for project TCGA-UCEC
Of the 587 files for download 587 already exist.
All samples have been already downloaded




Starting to add information to samples
 => Add clinical information to samples
Add FFPE information. More information at: 
=> https://cancergenome.nih.gov/cancersselected/biospeccriteria 
=> http://gdac.broadinstitute.org/runs/sampleReports/latest/FPPP_FFPE_Cases.html
 => Adding subtype information to samples
ucec subtype information from:doi:10.1038/nature12113
Accessing www.ensembl.org to get gene information
Downloading genome information (try:0) Using: Human genes (GRCh38.p13)
Cache found
From the 60483 genes we couldn't map 3990


In [44]:
data
saveHDF5SummarizedExperiment(data, dir = paste0(data_root, "/", "saved_RSE_objects"), prefix = sub("-", "_", projects[proj_idx]))

class: RangedSummarizedExperiment 
dim: 56493 587 
metadata(1): data_release
assays(1): HTSeq - Counts
rownames(56493): ENSG00000000003 ENSG00000000005 ... ENSG00000281912
  ENSG00000281920
rowData names(3): ensembl_gene_id external_gene_name
  original_ensembl_gene_id
colnames(587): TCGA-AX-A06H-01A-11R-A118-07
  TCGA-A5-A0GN-01A-11R-A040-07 ... TCGA-FI-A2F4-01A-11R-A17B-07
  TCGA-AX-A3G7-01A-12R-A213-07
colData names(82): sample patient ... subtype_cna_cluster_k4
  subtype_mrna_expression_cluster