In [2]:
library(tidyverse)
library(TCGAbiolinks)
library(HDF5Array)
library(SummarizedExperiment)

# Custom package
library(rutils)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Loading required package: DelayedArray
Loading required package: stats4
Loading required package: matrixStats

Attaching package: ‘matrixStats’

The following object is masked from ‘package:dplyr’:

    count

Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:dplyr’:

    co

# Constants

In [3]:
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")
projects <- c(
    "TCGA-CESC", "TCGA-OV", "TCGA-UCS", "TCGA-UCEC", "TCGA-BRCA", "TCGA-COAD", "TCGA-LGG", "TCGA-TGCT", "TCGA-PRAD"
)
project_paths <- unlist(map(projects, function(prj) paste0(dirs$data_dir, "/", prj)))
biolinks_dir <- "tcga_biolinks_downloads"
RSE_objects_dir <- "saved_RSE_objects"
TCGA_dest_dir <- paste0(dirs$data_dir, "/", biolinks_dir)
RSE_objects_dest_dir <- paste0(dirs$data_dir, "/", RSE_objects_dir)

# Functions

In [4]:
rna_seq_query <- function(p) {
    return(GDCquery(
        project = p,
        data.category = "Transcriptome Profiling",
        data.type = "Gene Expression Quantification",
        workflow.type = "HTSeq - Counts"
    ))
}


prep_and_save_count_data <- function(rses, label_field, dest_dir, dest_subdir) {
    id_symbol_map <- as_tibble(rowData(rses[[1]]))
    
    # Does the matrix data directory exist? If not, create it
    dir.create(paste0(dest_dir, "/", dest_subdir))
    
    for (n in names(rses)) {
        counts_df <- assays(rses[[n]])[["HTSeq - Counts"]] %>%
            as_tibble(rownames = "ensembl_gene_id") %>%
            inner_join(id_symbol_map, by = "ensembl_gene_id") %>%
            dplyr::select(ensembl_gene_id, external_gene_name, everything()) %>%
            dplyr::select(-original_ensembl_gene_id)
        
        condition_labels <- rses[[n]][[label_field]]
        coldata_df <- as_tibble(colnames(counts_df)[-c(1:2)]) %>%
            dplyr::rename(sample_name = value) %>%
            mutate(condition = condition_labels) %>%
            mutate(project = n)
        
        write_tsv(counts_df, path = paste0(dest_dir, "/", dest_subdir, "/", n, "_counts.tsv"))
        write_tsv(coldata_df, path = paste0(dest_dir, "/", dest_subdir, "/", n, "_coldata.tsv"))
    }
}

# Download and save data
No loops used here since API is really finicky and quits abruptly

In [None]:
proj_idx <- 1
q <- rna_seq_query(projects[proj_idx])
GDCdownload(q, method = "api", directory = TCGA_dest_dir, files.per.chunk = 10)
data <- GDCprepare(q, directory = TCGA_dest_dir)
data
saveHDF5SummarizedExperiment(data, dir = RSE_objects_dest_dir, prefix = paste0(projects[proj_idx], "_RNA_"))

In [None]:
proj_idx <- 2
q <- rna_seq_query(projects[proj_idx])
GDCdownload(q, method = "api", directory = TCGA_dest_dir, files.per.chunk = 10)
data <- GDCprepare(q, directory = TCGA_dest_dir)
data
saveHDF5SummarizedExperiment(data, dir = RSE_objects_dest_dir, prefix = paste0(projects[proj_idx], "_RNA_"))

In [None]:
proj_idx <- 3
q <- rna_seq_query(projects[proj_idx])
GDCdownload(q, method = "api", directory = TCGA_dest_dir, files.per.chunk = 10)
data <- GDCprepare(q, directory = TCGA_dest_dir)
data
saveHDF5SummarizedExperiment(data, dir = RSE_objects_dest_dir, prefix = paste0(projects[proj_idx], "_RNA_"))

In [None]:
proj_idx <- 4
q <- rna_seq_query(projects[proj_idx])
GDCdownload(q, method = "api", directory = TCGA_dest_dir, files.per.chunk = 10)
data <- GDCprepare(q, directory = TCGA_dest_dir)
data
saveHDF5SummarizedExperiment(data, dir = RSE_objects_dest_dir, prefix = paste0(projects[proj_idx], "_RNA_"))

In [None]:
proj_idx <- 5
q <- rna_seq_query(projects[proj_idx])
GDCdownload(q, method = "api", directory = TCGA_dest_dir, files.per.chunk = 10)
data <- GDCprepare(q, directory = TCGA_dest_dir)
data
saveHDF5SummarizedExperiment(data, dir = RSE_objects_dest_dir, prefix = paste0(projects[proj_idx], "_RNA_"))

In [5]:
proj_idx <- 6
q <- rna_seq_query(projects[proj_idx])
GDCdownload(q, method = "api", directory = TCGA_dest_dir, files.per.chunk = 10)
data <- GDCprepare(q, directory = TCGA_dest_dir)
data
saveHDF5SummarizedExperiment(data, dir = RSE_objects_dest_dir, prefix = paste0(projects[proj_idx], "_RNA_"))

--------------------------------------
o GDCquery: Searching in GDC database
--------------------------------------
Genome of reference: hg38
--------------------------------------------
oo Accessing GDC. This might take a while...
--------------------------------------------
ooo Project: TCGA-COAD
--------------------
oo Filtering results
--------------------
ooo By data.type
ooo By workflow.type
----------------
oo Checking data
----------------
ooo Check if there are duplicated cases
ooo Check if there results for the query
-------------------
o Preparing output
-------------------
Downloading data for project TCGA-COAD
Of the 521 files for download 520 already exist.
We will download only those that are missing ones.
GDCdownload will download: 248.513 KB
Downloading chunk 1 of 1 (1 files, size = 248.513 KB) as b39f2bfa-5d4b-4bbe-9870-a93efff7bc5c.htseq.counts.gz




Starting to add information to samples
 => Add clinical information to samples
Add FFPE information. More information at: 
=> https://cancergenome.nih.gov/cancersselected/biospeccriteria 
=> http://gdac.broadinstitute.org/runs/sampleReports/latest/FPPP_FFPE_Cases.html
 => Adding subtype information to samples
coad subtype information from:doi:10.1038/nature11252
Accessing www.ensembl.org to get gene information
Downloading genome information (try:0) Using: Human genes (GRCh38.p13)
“`select_()` is deprecated as of dplyr 0.7.0.
Please use `select()` instead.
“`filter_()` is deprecated as of dplyr 0.7.0.
Please use `filter()` instead.
See vignette('programming') for more help
From the 60483 genes we couldn't map 4026


class: RangedSummarizedExperiment 
dim: 56457 521 
metadata(1): data_release
assays(1): HTSeq - Counts
rownames(56457): ENSG00000000003 ENSG00000000005 ... ENSG00000281912
  ENSG00000281920
rowData names(3): ensembl_gene_id external_gene_name
  original_ensembl_gene_id
colnames(521): TCGA-AA-3664-01A-01R-0905-07
  TCGA-AZ-4313-01A-01R-1410-07 ... TCGA-AZ-6603-01A-11R-1839-07
  TCGA-AZ-5403-01A-01R-1653-07
colData names(112): sample patient ...
  subtype_vascular_invasion_present subtype_vital_status

In [9]:
proj_idx <- 7
q <- rna_seq_query(projects[proj_idx])
GDCdownload(q, method = "api", directory = TCGA_dest_dir, files.per.chunk = 10)
data <- GDCprepare(q, directory = TCGA_dest_dir)
data
saveHDF5SummarizedExperiment(data, dir = RSE_objects_dest_dir, prefix = paste0(projects[proj_idx], "_RNA_"))

--------------------------------------
o GDCquery: Searching in GDC database
--------------------------------------
Genome of reference: hg38
--------------------------------------------
oo Accessing GDC. This might take a while...
--------------------------------------------
ooo Project: TCGA-LGG
--------------------
oo Filtering results
--------------------
ooo By data.type
ooo By workflow.type
----------------
oo Checking data
----------------
ooo Check if there are duplicated cases
ooo Check if there results for the query
-------------------
o Preparing output
-------------------
Downloading data for project TCGA-LGG
GDCdownload will download 529 files. A total of 135.012244 MB
Downloading chunk 1 of 53 (10 files, size = 2.553462 MB) as Wed_Sep_16_14_14_59_2020_0.tar.gz


Downloading: 2.5 MB     

Downloading chunk 2 of 53 (10 files, size = 2.551612 MB) as Wed_Sep_16_14_14_59_2020_1.tar.gz


Downloading: 2.5 MB       

Downloading chunk 3 of 53 (10 files, size = 2.562883 MB) as Wed_Sep_16_14_14_59_2020_2.tar.gz


Downloading: 2.6 MB     

Downloading chunk 4 of 53 (10 files, size = 2.547223 MB) as Wed_Sep_16_14_14_59_2020_3.tar.gz


Downloading: 2.5 MB       

Downloading chunk 5 of 53 (10 files, size = 2.54444 MB) as Wed_Sep_16_14_14_59_2020_4.tar.gz


Downloading: 2.5 MB       

Downloading chunk 6 of 53 (10 files, size = 2.554891 MB) as Wed_Sep_16_14_14_59_2020_5.tar.gz


Downloading: 2.5 MB       

Downloading chunk 7 of 53 (10 files, size = 2.55139 MB) as Wed_Sep_16_14_14_59_2020_6.tar.gz


Downloading: 2.5 MB       

Downloading chunk 8 of 53 (10 files, size = 2.556823 MB) as Wed_Sep_16_14_14_59_2020_7.tar.gz


Downloading: 2.6 MB       

Downloading chunk 9 of 53 (10 files, size = 2.56154 MB) as Wed_Sep_16_14_14_59_2020_8.tar.gz


Downloading: 2.6 MB       

Downloading chunk 10 of 53 (10 files, size = 2.555624 MB) as Wed_Sep_16_14_14_59_2020_9.tar.gz


Downloading: 2.5 MB       

Downloading chunk 11 of 53 (10 files, size = 2.545672 MB) as Wed_Sep_16_14_14_59_2020_10.tar.gz


Downloading: 2.5 MB       

Downloading chunk 12 of 53 (10 files, size = 2.54691 MB) as Wed_Sep_16_14_14_59_2020_11.tar.gz


Downloading: 2.5 MB       

Downloading chunk 13 of 53 (10 files, size = 2.544719 MB) as Wed_Sep_16_14_14_59_2020_12.tar.gz


Downloading: 2.5 MB       

Downloading chunk 14 of 53 (10 files, size = 2.540425 MB) as Wed_Sep_16_14_14_59_2020_13.tar.gz


Downloading: 2.5 MB       

Downloading chunk 15 of 53 (10 files, size = 2.552946 MB) as Wed_Sep_16_14_14_59_2020_14.tar.gz


Downloading: 2.5 MB       

Downloading chunk 16 of 53 (10 files, size = 2.543307 MB) as Wed_Sep_16_14_14_59_2020_15.tar.gz


Downloading: 2.5 MB       

Downloading chunk 17 of 53 (10 files, size = 2.547333 MB) as Wed_Sep_16_14_14_59_2020_16.tar.gz


Downloading: 2.5 MB     

Downloading chunk 18 of 53 (10 files, size = 2.550702 MB) as Wed_Sep_16_14_14_59_2020_17.tar.gz


Downloading: 2.5 MB       

Downloading chunk 19 of 53 (10 files, size = 2.554965 MB) as Wed_Sep_16_14_14_59_2020_18.tar.gz


Downloading: 2.5 MB       

Downloading chunk 20 of 53 (10 files, size = 2.558458 MB) as Wed_Sep_16_14_14_59_2020_19.tar.gz


Downloading: 2.6 MB       

Downloading chunk 21 of 53 (10 files, size = 2.55863 MB) as Wed_Sep_16_14_14_59_2020_20.tar.gz


Downloading: 2.6 MB       

Downloading chunk 22 of 53 (10 files, size = 2.555254 MB) as Wed_Sep_16_14_14_59_2020_21.tar.gz


Downloading: 2.5 MB       

Downloading chunk 23 of 53 (10 files, size = 2.551099 MB) as Wed_Sep_16_14_14_59_2020_22.tar.gz


Downloading: 2.5 MB       

Downloading chunk 24 of 53 (10 files, size = 2.552923 MB) as Wed_Sep_16_14_14_59_2020_23.tar.gz


Downloading: 2.5 MB       

Downloading chunk 25 of 53 (10 files, size = 2.562103 MB) as Wed_Sep_16_14_14_59_2020_24.tar.gz


Downloading: 2.6 MB       

Downloading chunk 26 of 53 (10 files, size = 2.560123 MB) as Wed_Sep_16_14_14_59_2020_25.tar.gz


Downloading: 2.6 MB       

Downloading chunk 27 of 53 (10 files, size = 2.560513 MB) as Wed_Sep_16_14_14_59_2020_26.tar.gz


Downloading: 2.6 MB     

Downloading chunk 28 of 53 (10 files, size = 2.558727 MB) as Wed_Sep_16_14_14_59_2020_27.tar.gz


Downloading: 2.6 MB     

Downloading chunk 29 of 53 (10 files, size = 2.549162 MB) as Wed_Sep_16_14_14_59_2020_28.tar.gz


Downloading: 2.5 MB       

Downloading chunk 30 of 53 (10 files, size = 2.565726 MB) as Wed_Sep_16_14_14_59_2020_29.tar.gz


Downloading: 2.6 MB       

Downloading chunk 31 of 53 (10 files, size = 2.551593 MB) as Wed_Sep_16_14_14_59_2020_30.tar.gz


Downloading: 2.5 MB       

Downloading chunk 32 of 53 (10 files, size = 2.550646 MB) as Wed_Sep_16_14_14_59_2020_31.tar.gz


Downloading: 2.5 MB       

Downloading chunk 33 of 53 (10 files, size = 2.54004 MB) as Wed_Sep_16_14_14_59_2020_32.tar.gz


Downloading: 2.5 MB       

Downloading chunk 34 of 53 (10 files, size = 2.55052 MB) as Wed_Sep_16_14_14_59_2020_33.tar.gz


Downloading: 2.5 MB     

Downloading chunk 35 of 53 (10 files, size = 2.539203 MB) as Wed_Sep_16_14_14_59_2020_34.tar.gz


Downloading: 2.5 MB       

Downloading chunk 36 of 53 (10 files, size = 2.552885 MB) as Wed_Sep_16_14_14_59_2020_35.tar.gz


Downloading: 2.5 MB       

Downloading chunk 37 of 53 (10 files, size = 2.538432 MB) as Wed_Sep_16_14_14_59_2020_36.tar.gz


Downloading: 2.5 MB       

Downloading chunk 38 of 53 (10 files, size = 2.551544 MB) as Wed_Sep_16_14_14_59_2020_37.tar.gz


Downloading: 2.5 MB       

Downloading chunk 39 of 53 (10 files, size = 2.552924 MB) as Wed_Sep_16_14_14_59_2020_38.tar.gz


Downloading: 2.5 MB       

Downloading chunk 40 of 53 (10 files, size = 2.557947 MB) as Wed_Sep_16_14_14_59_2020_39.tar.gz


Downloading: 2.6 MB       

Downloading chunk 41 of 53 (10 files, size = 2.558295 MB) as Wed_Sep_16_14_14_59_2020_40.tar.gz


Downloading: 2.6 MB       

Downloading chunk 42 of 53 (10 files, size = 2.55779 MB) as Wed_Sep_16_14_14_59_2020_41.tar.gz


Downloading: 2.6 MB       

Downloading chunk 43 of 53 (10 files, size = 2.551233 MB) as Wed_Sep_16_14_14_59_2020_42.tar.gz


Downloading: 2.5 MB       

Downloading chunk 44 of 53 (10 files, size = 2.541086 MB) as Wed_Sep_16_14_14_59_2020_43.tar.gz


Downloading: 2.5 MB       

Downloading chunk 45 of 53 (10 files, size = 2.552716 MB) as Wed_Sep_16_14_14_59_2020_44.tar.gz


Downloading: 2.5 MB       

Downloading chunk 46 of 53 (10 files, size = 2.548563 MB) as Wed_Sep_16_14_14_59_2020_45.tar.gz


Downloading: 2.5 MB       

Downloading chunk 47 of 53 (10 files, size = 2.555844 MB) as Wed_Sep_16_14_14_59_2020_46.tar.gz


Downloading: 2.6 MB       

Downloading chunk 48 of 53 (10 files, size = 2.558855 MB) as Wed_Sep_16_14_14_59_2020_47.tar.gz


Downloading: 2.6 MB       

Downloading chunk 49 of 53 (10 files, size = 2.558757 MB) as Wed_Sep_16_14_14_59_2020_48.tar.gz


Downloading: 2.6 MB       

Downloading chunk 50 of 53 (10 files, size = 2.549805 MB) as Wed_Sep_16_14_14_59_2020_49.tar.gz


Downloading: 2.5 MB     

Downloading chunk 51 of 53 (10 files, size = 2.536534 MB) as Wed_Sep_16_14_14_59_2020_50.tar.gz


Downloading: 2.5 MB       

Downloading chunk 52 of 53 (10 files, size = 2.561019 MB) as Wed_Sep_16_14_14_59_2020_51.tar.gz


Downloading: 2.6 MB       

Downloading chunk 53 of 53 (9 files, size = 2.296428 MB) as Wed_Sep_16_14_14_59_2020_52.tar.gz




Starting to add information to samples
 => Add clinical information to samples
Add FFPE information. More information at: 
=> https://cancergenome.nih.gov/cancersselected/biospeccriteria 
=> http://gdac.broadinstitute.org/runs/sampleReports/latest/FPPP_FFPE_Cases.html
 => Adding subtype information to samples
lgg subtype information from:doi:10.1016/j.cell.2015.12.028
Accessing www.ensembl.org to get gene information
Downloading genome information (try:0) Using: Human genes (GRCh38.p13)
Cache found
From the 60483 genes we couldn't map 4026


class: RangedSummarizedExperiment 
dim: 56457 529 
metadata(1): data_release
assays(1): HTSeq - Counts
rownames(56457): ENSG00000000003 ENSG00000000005 ... ENSG00000281912
  ENSG00000281920
rowData names(3): ensembl_gene_id external_gene_name
  original_ensembl_gene_id
colnames(529): TCGA-R8-A73M-01A-11R-A32Q-07
  TCGA-HT-7471-01A-11R-2256-07 ... TCGA-QH-A6X3-01A-21R-A32Q-07
  TCGA-P5-A5F0-01A-11R-A28M-07
colData names(113): sample patient ...
  subtype_Telomere.length.estimate.in.blood.normal..Kb.
  subtype_Telomere.length.estimate.in.tumor..Kb.

In [5]:
proj_idx <- 8
q <- rna_seq_query(projects[proj_idx])
GDCdownload(q, method = "api", directory = TCGA_dest_dir, files.per.chunk = 10)
data <- GDCprepare(q, directory = TCGA_dest_dir)
data
saveHDF5SummarizedExperiment(data, dir = RSE_objects_dest_dir, prefix = paste0(projects[proj_idx], "_RNA_"))

--------------------------------------
o GDCquery: Searching in GDC database
--------------------------------------
Genome of reference: hg38
--------------------------------------------
oo Accessing GDC. This might take a while...
--------------------------------------------
ooo Project: TCGA-TGCT
--------------------
oo Filtering results
--------------------
ooo By data.type
ooo By workflow.type
----------------
oo Checking data
----------------
ooo Check if there are duplicated cases
ooo Check if there results for the query
-------------------
o Preparing output
-------------------
Downloading data for project TCGA-TGCT
GDCdownload will download 156 files. A total of 39.5905 MB
Downloading chunk 1 of 16 (10 files, size = 2.537164 MB) as Thu_Sep_17_15_21_06_2020_0.tar.gz


Downloading: 2.5 MB     

Downloading chunk 2 of 16 (10 files, size = 2.537302 MB) as Thu_Sep_17_15_21_06_2020_1.tar.gz


Downloading: 2.5 MB       

Downloading chunk 3 of 16 (10 files, size = 2.536912 MB) as Thu_Sep_17_15_21_06_2020_2.tar.gz


Downloading: 2.5 MB       

Downloading chunk 4 of 16 (10 files, size = 2.547079 MB) as Thu_Sep_17_15_21_06_2020_3.tar.gz


Downloading: 2.5 MB       

Downloading chunk 5 of 16 (10 files, size = 2.536 MB) as Thu_Sep_17_15_21_06_2020_4.tar.gz


Downloading: 2.5 MB       

Downloading chunk 6 of 16 (10 files, size = 2.544117 MB) as Thu_Sep_17_15_21_06_2020_5.tar.gz


Downloading: 2.5 MB       

Downloading chunk 7 of 16 (10 files, size = 2.534699 MB) as Thu_Sep_17_15_21_06_2020_6.tar.gz


Downloading: 2.5 MB       

Downloading chunk 8 of 16 (10 files, size = 2.548464 MB) as Thu_Sep_17_15_21_06_2020_7.tar.gz


Downloading: 2.5 MB       

Downloading chunk 9 of 16 (10 files, size = 2.52637 MB) as Thu_Sep_17_15_21_06_2020_8.tar.gz


Downloading: 2.5 MB       

Downloading chunk 10 of 16 (10 files, size = 2.538284 MB) as Thu_Sep_17_15_21_06_2020_9.tar.gz


Downloading: 2.5 MB       

Downloading chunk 11 of 16 (10 files, size = 2.53732 MB) as Thu_Sep_17_15_21_06_2020_10.tar.gz


Downloading: 2.5 MB       

Downloading chunk 12 of 16 (10 files, size = 2.546517 MB) as Thu_Sep_17_15_21_06_2020_11.tar.gz


Downloading: 2.5 MB       

Downloading chunk 13 of 16 (10 files, size = 2.523885 MB) as Thu_Sep_17_15_21_06_2020_12.tar.gz


Downloading: 2.5 MB       

Downloading chunk 14 of 16 (10 files, size = 2.543499 MB) as Thu_Sep_17_15_21_06_2020_13.tar.gz


Downloading: 2.5 MB     

Downloading chunk 15 of 16 (10 files, size = 2.534547 MB) as Thu_Sep_17_15_21_06_2020_14.tar.gz


Downloading: 2.5 MB       

Downloading chunk 16 of 16 (6 files, size = 1.518341 MB) as Thu_Sep_17_15_21_06_2020_15.tar.gz




Starting to add information to samples
 => Add clinical information to samples
Add FFPE information. More information at: 
=> https://cancergenome.nih.gov/cancersselected/biospeccriteria 
=> http://gdac.broadinstitute.org/runs/sampleReports/latest/FPPP_FFPE_Cases.html
 => Adding subtype information to samples
Accessing www.ensembl.org to get gene information
Downloading genome information (try:0) Using: Human genes (GRCh38.p13)
“`select_()` is deprecated as of dplyr 0.7.0.
Please use `select()` instead.
“`filter_()` is deprecated as of dplyr 0.7.0.
Please use `filter()` instead.
See vignette('programming') for more help
From the 60483 genes we couldn't map 4026


class: RangedSummarizedExperiment 
dim: 56457 156 
metadata(1): data_release
assays(1): HTSeq - Counts
rownames(56457): ENSG00000000003 ENSG00000000005 ... ENSG00000281912
  ENSG00000281920
rowData names(3): ensembl_gene_id external_gene_name
  original_ensembl_gene_id
colnames(156): TCGA-2G-AAFZ-01A-11R-A430-07
  TCGA-2G-AAF4-01A-11R-A430-07 ... TCGA-XY-A9T9-01A-11R-A431-07
  TCGA-2G-AAL5-01A-11R-A430-07
colData names(72): sample patient ... days_to_death is_ffpe

In [7]:
proj_idx <- 9
q <- rna_seq_query(projects[proj_idx])
GDCdownload(q, method = "api", directory = TCGA_dest_dir, files.per.chunk = 10)
data <- GDCprepare(q, directory = TCGA_dest_dir)
data
saveHDF5SummarizedExperiment(data, dir = RSE_objects_dest_dir, prefix = paste0(projects[proj_idx], "_RNA_"))

--------------------------------------
o GDCquery: Searching in GDC database
--------------------------------------
Genome of reference: hg38
--------------------------------------------
oo Accessing GDC. This might take a while...
--------------------------------------------
ooo Project: TCGA-PRAD
--------------------
oo Filtering results
--------------------
ooo By data.type
ooo By workflow.type
----------------
oo Checking data
----------------
ooo Check if there are duplicated cases
ooo Check if there results for the query
-------------------
o Preparing output
-------------------
Downloading data for project TCGA-PRAD
Of the 551 files for download 550 already exist.
We will download only those that are missing ones.
GDCdownload will download: 252.892 KB
Downloading chunk 1 of 1 (1 files, size = 252.892 KB) as ba57cbaf-b52c-402b-b72a-f0df0e086369.htseq.counts.gz




Starting to add information to samples
 => Add clinical information to samples
Add FFPE information. More information at: 
=> https://cancergenome.nih.gov/cancersselected/biospeccriteria 
=> http://gdac.broadinstitute.org/runs/sampleReports/latest/FPPP_FFPE_Cases.html
 => Adding subtype information to samples
prad subtype information from:doi:10.1016/j.cell.2015.10.025
Accessing www.ensembl.org to get gene information
Downloading genome information (try:0) Using: Human genes (GRCh38.p13)
Cache found
From the 60483 genes we couldn't map 4026


class: RangedSummarizedExperiment 
dim: 56457 551 
metadata(1): data_release
assays(1): HTSeq - Counts
rownames(56457): ENSG00000000003 ENSG00000000005 ... ENSG00000281912
  ENSG00000281920
rowData names(3): ensembl_gene_id external_gene_name
  original_ensembl_gene_id
colnames(551): TCGA-HC-A6AN-01A-11R-A30B-07
  TCGA-G9-6498-01A-12R-A311-07 ... TCGA-CH-5772-01A-11R-1580-07
  TCGA-ZG-A9L2-01A-31R-A41O-07
colData names(152): sample patient ... subtype_AR_V7_ratio
  subtype_AR_V7_presence

# Save data in count matrix + coldata format

Save data in a format ready for DESeq/clustering/etc.

In [8]:
data_ls <- load_RSE_objects(RSE_objects_dest_dir, projects, paste0(projects, "_RNA_"))

In [9]:
prep_and_save_count_data(
    rses = data_ls,
    label_field = "definition",
    dest_dir = dirs$data_dir,
    dest_subdir = "TCGA_RNA_matrix_count_data"
)

“'/mnt/d/fogg_lab_gyn_cancer_data/TCGA_RNA_matrix_count_data' already exists”