In [27]:
library(tidyverse)
library(TCGAbiolinks)
library(HDF5Array)
library(SummarizedExperiment)

# Custom package
library(rutils)

Loading required package: GenomicRanges
Loading required package: GenomeInfoDb
Loading required package: Biobase
Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.


Attaching package: ‘Biobase’

The following objects are masked from ‘package:matrixStats’:

    anyMissing, rowMedians



# Constants

In [2]:
# Development directories, read from the paths file one level above this notebook
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")

# TCGA project identifiers handled here, and one data path per project
projects <- c("TCGA-CESC", "TCGA-OV", "TCGA-UCS", "TCGA-UCEC")
project_paths <- paste0(dirs$data_dir, "/", projects)

# Sub-directory names, followed by their full locations under the data directory
biolinks_dir <- "tcga_biolinks_downloads"
RSE_objects_dir <- "saved_RSE_objects"
TCGA_dest_dir <- paste0(dirs$data_dir, "/", biolinks_dir)
RSE_objects_dest_dir <- paste0(dirs$data_dir, "/", RSE_objects_dir)

# Functions

In [32]:
# Build the GDC query for one project's gene expression quantification files,
# restricted to the "HTSeq - Counts" workflow.
#
# p: project identifier, forwarded to GDCquery() as `project`.
#
# Returns the value produced by GDCquery().
rna_seq_query <- function(p) {
    GDCquery(
        project       = p,
        data.category = "Transcriptome Profiling",
        data.type     = "Gene Expression Quantification",
        workflow.type = "HTSeq - Counts"
    )
}


# Load several objects saved with a file prefix inside one directory.
#
# dir:      directory forwarded to loadHDF5SummarizedExperiment() as `dir`.
# projects: character vector of names used as keys of the returned list.
# prefixes: character vector parallel to `projects`; prefixes[i] is the file
#           prefix loaded and stored under projects[i].
#
# Returns a list holding one loaded object per entry of `projects`, named by
# project. Stops with an error when `projects` and `prefixes` differ in length,
# because the two vectors are paired element by element.
load_RSE_objects <- function(dir, projects, prefixes) {
    if (length(projects) != length(prefixes)) {
        stop(
            "`projects` and `prefixes` must have the same length (got ",
            length(projects), " and ", length(prefixes), ")",
            call. = FALSE
        )
    }

    data_ls <- list()
    for (i in seq_along(projects)) {
        data_ls[[projects[i]]] <- loadHDF5SummarizedExperiment(dir = dir, prefix = prefixes[i])
    }
    data_ls
}


# Write one counts table and one sample table (TSV) per experiment.
#
# rses:        named list of experiment objects. Each name is used in the
#              output file names and as the value of the `project` column.
# label_field: name of the per-sample field, read as rse[[label_field]], whose
#              values become the `condition` column.
# dest_dir:    parent directory for the output.
# dest_subdir: sub-directory of `dest_dir` that receives the files; it is
#              created (including missing parents) when it does not exist.
#
# For every name n in `rses`, two files are written into the output directory:
#   <n>_counts.tsv  - ensembl_gene_id, external_gene_name, then one column
#                     per sample taken from the "HTSeq - Counts" assay
#   <n>_coldata.tsv - sample_name, condition, project
#
# Returns NULL invisibly. Stops when `rses` is empty or has unnamed elements,
# or when `label_field` is absent from an experiment.
prep_and_save_count_data <- function(rses, label_field, dest_dir, dest_subdir) {
    if (length(rses) == 0) {
        stop("`rses` must contain at least one experiment", call. = FALSE)
    }
    rse_names <- names(rses)
    if (is.null(rse_names) || any(is.na(rse_names) | !nzchar(rse_names))) {
        stop("every element of `rses` must be named", call. = FALSE)
    }

    # The gene annotation is taken from the first experiment only and reused
    # for all of them.
    # NOTE(review): assumes every experiment carries the same rowData - confirm.
    id_symbol_map <- as_tibble(rowData(rses[[1]]))

    # Create the output directory only when it is missing, so reruns do not warn
    # and a missing parent directory does not make the creation fail.
    out_dir <- file.path(dest_dir, dest_subdir)
    if (!dir.exists(out_dir)) {
        dir.create(out_dir, recursive = TRUE)
    }

    for (n in rse_names) {
        rse <- rses[[n]]

        counts_df <- assays(rse)[["HTSeq - Counts"]] %>%
            as_tibble(rownames = "ensembl_gene_id") %>%
            inner_join(id_symbol_map, by = "ensembl_gene_id") %>%
            dplyr::select(ensembl_gene_id, external_gene_name, everything()) %>%
            dplyr::select(-original_ensembl_gene_id)

        # A missing field comes back as NULL, which would silently drop the
        # `condition` column, so fail loudly instead.
        condition_labels <- rse[[label_field]]
        if (is.null(condition_labels)) {
            stop(
                "field `", label_field, "` not found in experiment `", n, "`",
                call. = FALSE
            )
        }

        # The first two columns are the gene identifiers; the rest are samples.
        coldata_df <- tibble(
            sample_name = colnames(counts_df)[-c(1:2)],
            condition = condition_labels,
            project = n
        )

        # The destination is passed positionally because the name of this
        # argument differs between readr versions (`path` vs `file`).
        write_tsv(counts_df, file.path(out_dir, paste0(n, "_counts.tsv")))
        write_tsv(coldata_df, file.path(out_dir, paste0(n, "_coldata.tsv")))
    }

    invisible(NULL)
}

# Download and save data
No loops are used here because the API is finicky and can quit abruptly, so each project is downloaded in its own cell.

In [10]:
# Project 1: query, download in chunks of 10 files, assemble, display and save
proj_idx <- 1
q <- rna_seq_query(projects[proj_idx])
GDCdownload(
    q,
    method = "api",
    directory = TCGA_dest_dir,
    files.per.chunk = 10
)
data <- GDCprepare(q, directory = TCGA_dest_dir)
data
saveHDF5SummarizedExperiment(
    data,
    dir = RSE_objects_dest_dir,
    prefix = paste0(projects[proj_idx], "_RNA_")
)

--------------------------------------
o GDCquery: Searching in GDC database
--------------------------------------
Genome of reference: hg38
--------------------------------------------
oo Accessing GDC. This might take a while...
--------------------------------------------
ooo Project: TCGA-CESC
--------------------
oo Filtering results
--------------------
ooo By data.type
ooo By workflow.type
----------------
oo Checking data
----------------
ooo Check if there are duplicated cases
ooo Check if there results for the query
-------------------
o Preparing output
-------------------
Downloading data for project TCGA-CESC
GDCdownload will download 309 files. A total of 77.948874 MB
Downloading chunk 1 of 31 (10 files, size = 2.525687 MB) as Tue_Aug_18_16_52_15_2020_0.tar.gz


Downloading: 2.5 MB       

Downloading chunk 2 of 31 (10 files, size = 2.527227 MB) as Tue_Aug_18_16_52_15_2020_1.tar.gz


Downloading: 2.5 MB       

Downloading chunk 3 of 31 (10 files, size = 2.516641 MB) as Tue_Aug_18_16_52_15_2020_2.tar.gz


Downloading: 2.5 MB       

Downloading chunk 4 of 31 (10 files, size = 2.510196 MB) as Tue_Aug_18_16_52_15_2020_3.tar.gz


Downloading: 2.5 MB       

Downloading chunk 5 of 31 (10 files, size = 2.515677 MB) as Tue_Aug_18_16_52_15_2020_4.tar.gz


Downloading: 2.5 MB       

Downloading chunk 6 of 31 (10 files, size = 2.514275 MB) as Tue_Aug_18_16_52_15_2020_5.tar.gz


Downloading: 2.5 MB       

Downloading chunk 7 of 31 (10 files, size = 2.524145 MB) as Tue_Aug_18_16_52_15_2020_6.tar.gz


Downloading: 2.5 MB       

Downloading chunk 8 of 31 (10 files, size = 2.537913 MB) as Tue_Aug_18_16_52_15_2020_7.tar.gz


Downloading: 2.5 MB     

Downloading chunk 9 of 31 (10 files, size = 2.51407 MB) as Tue_Aug_18_16_52_15_2020_8.tar.gz


Downloading: 2.5 MB       

Downloading chunk 10 of 31 (10 files, size = 2.518752 MB) as Tue_Aug_18_16_52_15_2020_9.tar.gz


Downloading: 2.5 MB       

Downloading chunk 11 of 31 (10 files, size = 2.521472 MB) as Tue_Aug_18_16_52_15_2020_10.tar.gz


Downloading: 2.5 MB       

Downloading chunk 12 of 31 (10 files, size = 2.525212 MB) as Tue_Aug_18_16_52_15_2020_11.tar.gz


Downloading: 2.5 MB       

Downloading chunk 13 of 31 (10 files, size = 2.52909 MB) as Tue_Aug_18_16_52_15_2020_12.tar.gz


Downloading: 2.5 MB     

Downloading chunk 14 of 31 (10 files, size = 2.512235 MB) as Tue_Aug_18_16_52_15_2020_13.tar.gz


Downloading: 2.5 MB       

Downloading chunk 15 of 31 (10 files, size = 2.526529 MB) as Tue_Aug_18_16_52_15_2020_14.tar.gz


Downloading: 2.5 MB       

Downloading chunk 16 of 31 (10 files, size = 2.523593 MB) as Tue_Aug_18_16_52_15_2020_15.tar.gz


Downloading: 2.5 MB       

Downloading chunk 17 of 31 (10 files, size = 2.526281 MB) as Tue_Aug_18_16_52_15_2020_16.tar.gz


Downloading: 2.5 MB       

Downloading chunk 18 of 31 (10 files, size = 2.531817 MB) as Tue_Aug_18_16_52_15_2020_17.tar.gz


Downloading: 2.5 MB       

Downloading chunk 19 of 31 (10 files, size = 2.503543 MB) as Tue_Aug_18_16_52_15_2020_18.tar.gz


Downloading: 2.5 MB       

Downloading chunk 20 of 31 (10 files, size = 2.518505 MB) as Tue_Aug_18_16_52_15_2020_19.tar.gz


Downloading: 2.5 MB       

Downloading chunk 21 of 31 (10 files, size = 2.513516 MB) as Tue_Aug_18_16_52_15_2020_20.tar.gz


Downloading: 2.5 MB     

Downloading chunk 22 of 31 (10 files, size = 2.531634 MB) as Tue_Aug_18_16_52_15_2020_21.tar.gz


Downloading: 2.5 MB       

Downloading chunk 23 of 31 (10 files, size = 2.526956 MB) as Tue_Aug_18_16_52_15_2020_22.tar.gz


Downloading: 2.5 MB       

Downloading chunk 24 of 31 (10 files, size = 2.515253 MB) as Tue_Aug_18_16_52_15_2020_23.tar.gz


Downloading: 2.5 MB       

Downloading chunk 25 of 31 (10 files, size = 2.526321 MB) as Tue_Aug_18_16_52_15_2020_24.tar.gz


Downloading: 2.5 MB       

Downloading chunk 26 of 31 (10 files, size = 2.523639 MB) as Tue_Aug_18_16_52_15_2020_25.tar.gz


Downloading: 2.5 MB       

Downloading chunk 27 of 31 (10 files, size = 2.530583 MB) as Tue_Aug_18_16_52_15_2020_26.tar.gz


Downloading: 2.5 MB       

Downloading chunk 28 of 31 (10 files, size = 2.545826 MB) as Tue_Aug_18_16_52_15_2020_27.tar.gz


Downloading: 2.5 MB       

Downloading chunk 29 of 31 (10 files, size = 2.527371 MB) as Tue_Aug_18_16_52_15_2020_28.tar.gz


Downloading: 2.5 MB       

Downloading chunk 30 of 31 (10 files, size = 2.51555 MB) as Tue_Aug_18_16_52_15_2020_29.tar.gz


Downloading: 2.5 MB       

Downloading chunk 31 of 31 (9 files, size = 2.269365 MB) as Tue_Aug_18_16_52_15_2020_30.tar.gz




Starting to add information to samples
 => Add clinical information to samples
Add FFPE information. More information at: 
=> https://cancergenome.nih.gov/cancersselected/biospeccriteria 
=> http://gdac.broadinstitute.org/runs/sampleReports/latest/FPPP_FFPE_Cases.html
 => Adding subtype information to samples
cesc subtype information from:doi:10.1038/nature21386
Accessing www.ensembl.org to get gene information
Downloading genome information (try:0) Using: Human genes (GRCh38.p13)
“`select_()` is deprecated as of dplyr 0.7.0.
Please use `select()` instead.
“`filter_()` is deprecated as of dplyr 0.7.0.
Please use `filter()` instead.
See vignette('programming') for more help
From the 60483 genes we couldn't map 3990


class: RangedSummarizedExperiment 
dim: 56493 309 
metadata(1): data_release
assays(1): HTSeq - Counts
rownames(56493): ENSG00000000003 ENSG00000000005 ... ENSG00000281912
  ENSG00000281920
rowData names(3): ensembl_gene_id external_gene_name
  original_ensembl_gene_id
colnames(309): TCGA-MY-A5BF-01A-11R-A26T-07
  TCGA-EA-A3HU-01A-11R-A213-07 ... TCGA-EK-A2R8-01A-21R-A18M-07
  TCGA-JW-A5VK-01A-11R-A28H-07
colData names(131): sample patient ... subtype_GEXP.APOBEC3H.164668
  subtype_patient

In [11]:
# Project 2: query, download in chunks of 10 files, assemble, display and save
proj_idx <- 2
q <- rna_seq_query(projects[proj_idx])
GDCdownload(
    q,
    method = "api",
    directory = TCGA_dest_dir,
    files.per.chunk = 10
)
data <- GDCprepare(q, directory = TCGA_dest_dir)
data
saveHDF5SummarizedExperiment(
    data,
    dir = RSE_objects_dest_dir,
    prefix = paste0(projects[proj_idx], "_RNA_")
)

--------------------------------------
o GDCquery: Searching in GDC database
--------------------------------------
Genome of reference: hg38
--------------------------------------------
oo Accessing GDC. This might take a while...
--------------------------------------------
ooo Project: TCGA-OV
--------------------
oo Filtering results
--------------------
ooo By data.type
ooo By workflow.type
----------------
oo Checking data
----------------
ooo Check if there are duplicated cases
ooo Check if there results for the query
-------------------
o Preparing output
-------------------
Downloading data for project TCGA-OV
GDCdownload will download 379 files. A total of 97.709866 MB
Downloading chunk 1 of 38 (10 files, size = 2.575505 MB) as Tue_Aug_18_17_32_22_2020_0.tar.gz


Downloading: 2.6 MB     

Downloading chunk 2 of 38 (10 files, size = 2.58518 MB) as Tue_Aug_18_17_32_22_2020_1.tar.gz


Downloading: 2.6 MB       

Downloading chunk 3 of 38 (10 files, size = 2.546433 MB) as Tue_Aug_18_17_32_22_2020_2.tar.gz


Downloading: 2.5 MB     

Downloading chunk 4 of 38 (10 files, size = 2.545048 MB) as Tue_Aug_18_17_32_22_2020_3.tar.gz


Downloading: 2.5 MB       

Downloading chunk 5 of 38 (10 files, size = 2.598134 MB) as Tue_Aug_18_17_32_22_2020_4.tar.gz


Downloading: 2.6 MB     

Downloading chunk 6 of 38 (10 files, size = 2.586813 MB) as Tue_Aug_18_17_32_22_2020_5.tar.gz


Downloading: 2.6 MB     

Downloading chunk 7 of 38 (10 files, size = 2.572175 MB) as Tue_Aug_18_17_32_22_2020_6.tar.gz


Downloading: 2.6 MB       

Downloading chunk 8 of 38 (10 files, size = 2.571343 MB) as Tue_Aug_18_17_32_22_2020_7.tar.gz


Downloading: 2.6 MB       

Downloading chunk 9 of 38 (10 files, size = 2.58495 MB) as Tue_Aug_18_17_32_22_2020_8.tar.gz


Downloading: 2.6 MB     

Downloading chunk 10 of 38 (10 files, size = 2.560346 MB) as Tue_Aug_18_17_32_22_2020_9.tar.gz


Downloading: 2.6 MB       

Downloading chunk 11 of 38 (10 files, size = 2.585419 MB) as Tue_Aug_18_17_32_22_2020_10.tar.gz


Downloading: 2.6 MB       

Downloading chunk 12 of 38 (10 files, size = 2.573362 MB) as Tue_Aug_18_17_32_22_2020_11.tar.gz


Downloading: 2.6 MB     

Downloading chunk 13 of 38 (10 files, size = 2.572696 MB) as Tue_Aug_18_17_32_22_2020_12.tar.gz


Downloading: 2.6 MB     

Downloading chunk 14 of 38 (10 files, size = 2.560171 MB) as Tue_Aug_18_17_32_22_2020_13.tar.gz


Downloading: 2.6 MB     

Downloading chunk 15 of 38 (10 files, size = 2.593542 MB) as Tue_Aug_18_17_32_22_2020_14.tar.gz


Downloading: 2.6 MB     

Downloading chunk 16 of 38 (10 files, size = 2.558508 MB) as Tue_Aug_18_17_32_22_2020_15.tar.gz


Downloading: 2.6 MB       

Downloading chunk 17 of 38 (10 files, size = 2.598926 MB) as Tue_Aug_18_17_32_22_2020_16.tar.gz


Downloading: 2.6 MB     

Downloading chunk 18 of 38 (10 files, size = 2.568633 MB) as Tue_Aug_18_17_32_22_2020_17.tar.gz


Downloading: 2.6 MB       

Downloading chunk 19 of 38 (10 files, size = 2.57442 MB) as Tue_Aug_18_17_32_22_2020_18.tar.gz


Downloading: 2.6 MB       

Downloading chunk 20 of 38 (10 files, size = 2.565395 MB) as Tue_Aug_18_17_32_22_2020_19.tar.gz


Downloading: 2.6 MB       

Downloading chunk 21 of 38 (10 files, size = 2.575067 MB) as Tue_Aug_18_17_32_22_2020_20.tar.gz


Downloading: 2.6 MB     

Downloading chunk 22 of 38 (10 files, size = 2.582597 MB) as Tue_Aug_18_17_32_22_2020_21.tar.gz


Downloading: 2.6 MB       

Downloading chunk 23 of 38 (10 files, size = 2.5891 MB) as Tue_Aug_18_17_32_22_2020_22.tar.gz


Downloading: 2.6 MB       

Downloading chunk 24 of 38 (10 files, size = 2.577738 MB) as Tue_Aug_18_17_32_22_2020_23.tar.gz


Downloading: 2.6 MB       

Downloading chunk 25 of 38 (10 files, size = 2.583447 MB) as Tue_Aug_18_17_32_22_2020_24.tar.gz


Downloading: 2.6 MB     

Downloading chunk 26 of 38 (10 files, size = 2.593591 MB) as Tue_Aug_18_17_32_22_2020_25.tar.gz


Downloading: 2.6 MB     

Downloading chunk 27 of 38 (10 files, size = 2.609177 MB) as Tue_Aug_18_17_32_22_2020_26.tar.gz


Downloading: 2.6 MB       

Downloading chunk 28 of 38 (10 files, size = 2.565686 MB) as Tue_Aug_18_17_32_22_2020_27.tar.gz


Downloading: 2.6 MB       

Downloading chunk 29 of 38 (10 files, size = 2.577027 MB) as Tue_Aug_18_17_32_22_2020_28.tar.gz


Downloading: 2.6 MB       

Downloading chunk 30 of 38 (10 files, size = 2.581507 MB) as Tue_Aug_18_17_32_22_2020_29.tar.gz


Downloading: 2.6 MB       

Downloading chunk 31 of 38 (10 files, size = 2.612201 MB) as Tue_Aug_18_17_32_22_2020_30.tar.gz


Downloading: 2.6 MB       

Downloading chunk 32 of 38 (10 files, size = 2.565794 MB) as Tue_Aug_18_17_32_22_2020_31.tar.gz


Downloading: 2.6 MB     

Downloading chunk 33 of 38 (10 files, size = 2.5781 MB) as Tue_Aug_18_17_32_22_2020_32.tar.gz


Downloading: 2.6 MB       

Downloading chunk 34 of 38 (10 files, size = 2.546885 MB) as Tue_Aug_18_17_32_22_2020_33.tar.gz


Downloading: 2.5 MB     

Downloading chunk 35 of 38 (10 files, size = 2.573288 MB) as Tue_Aug_18_17_32_22_2020_34.tar.gz


Downloading: 2.6 MB     

Downloading chunk 36 of 38 (10 files, size = 2.590288 MB) as Tue_Aug_18_17_32_22_2020_35.tar.gz


Downloading: 2.6 MB     

Downloading chunk 37 of 38 (10 files, size = 2.600758 MB) as Tue_Aug_18_17_32_22_2020_36.tar.gz


Downloading: 2.6 MB     

Downloading chunk 38 of 38 (9 files, size = 2.330616 MB) as Tue_Aug_18_17_32_22_2020_37.tar.gz




Starting to add information to samples
 => Add clinical information to samples
Add FFPE information. More information at: 
=> https://cancergenome.nih.gov/cancersselected/biospeccriteria 
=> http://gdac.broadinstitute.org/runs/sampleReports/latest/FPPP_FFPE_Cases.html
 => Adding subtype information to samples
Accessing www.ensembl.org to get gene information
Downloading genome information (try:0) Using: Human genes (GRCh38.p13)
Cache found
From the 60483 genes we couldn't map 3990


class: RangedSummarizedExperiment 
dim: 56493 379 
metadata(1): data_release
assays(1): HTSeq - Counts
rownames(56493): ENSG00000000003 ENSG00000000005 ... ENSG00000281912
  ENSG00000281920
rowData names(3): ensembl_gene_id external_gene_name
  original_ensembl_gene_id
colnames(379): TCGA-04-1338-01A-01R-1564-13
  TCGA-13-0797-01A-01R-1564-13 ... TCGA-13-1498-01A-01R-1565-13
  TCGA-31-1953-01A-01R-1568-13
colData names(63): sample patient ... releasable is_ffpe

In [12]:
# Project 3: query, download in chunks of 10 files, assemble, display and save
proj_idx <- 3
q <- rna_seq_query(projects[proj_idx])
GDCdownload(
    q,
    method = "api",
    directory = TCGA_dest_dir,
    files.per.chunk = 10
)
data <- GDCprepare(q, directory = TCGA_dest_dir)
data
saveHDF5SummarizedExperiment(
    data,
    dir = RSE_objects_dest_dir,
    prefix = paste0(projects[proj_idx], "_RNA_")
)

--------------------------------------
o GDCquery: Searching in GDC database
--------------------------------------
Genome of reference: hg38
--------------------------------------------
oo Accessing GDC. This might take a while...
--------------------------------------------
ooo Project: TCGA-UCS
--------------------
oo Filtering results
--------------------
ooo By data.type
ooo By workflow.type
----------------
oo Checking data
----------------
ooo Check if there are duplicated cases
ooo Check if there results for the query
-------------------
o Preparing output
-------------------
Downloading data for project TCGA-UCS
GDCdownload will download 56 files. A total of 14.221611 MB
Downloading chunk 1 of 6 (10 files, size = 2.536693 MB) as Tue_Aug_18_17_38_02_2020_0.tar.gz


Downloading: 2.5 MB     

Downloading chunk 2 of 6 (10 files, size = 2.543023 MB) as Tue_Aug_18_17_38_02_2020_1.tar.gz


Downloading: 2.5 MB       

Downloading chunk 3 of 6 (10 files, size = 2.556204 MB) as Tue_Aug_18_17_38_02_2020_2.tar.gz


Downloading: 2.5 MB       

Downloading chunk 4 of 6 (10 files, size = 2.528308 MB) as Tue_Aug_18_17_38_02_2020_3.tar.gz


Downloading: 2.5 MB       

Downloading chunk 5 of 6 (10 files, size = 2.537284 MB) as Tue_Aug_18_17_38_02_2020_4.tar.gz


Downloading: 2.5 MB     

Downloading chunk 6 of 6 (6 files, size = 1.520099 MB) as Tue_Aug_18_17_38_02_2020_5.tar.gz




Starting to add information to samples
 => Add clinical information to samples
Add FFPE information. More information at: 
=> https://cancergenome.nih.gov/cancersselected/biospeccriteria 
=> http://gdac.broadinstitute.org/runs/sampleReports/latest/FPPP_FFPE_Cases.html
 => Adding subtype information to samples
ucs subtype information from:doi:10.1016/j.ccell.2017.02.010
Accessing www.ensembl.org to get gene information
Downloading genome information (try:0) Using: Human genes (GRCh38.p13)
Cache found
From the 60483 genes we couldn't map 3990


class: RangedSummarizedExperiment 
dim: 56493 56 
metadata(1): data_release
assays(1): HTSeq - Counts
rownames(56493): ENSG00000000003 ENSG00000000005 ... ENSG00000281912
  ENSG00000281920
rowData names(3): ensembl_gene_id external_gene_name
  original_ensembl_gene_id
colnames(56): TCGA-N5-A4RD-01A-11R-A28V-07 TCGA-NA-A5I1-01A-21R-A28V-07
  ... TCGA-N9-A4Q7-01A-11R-A28V-07 TCGA-NG-A4VW-01A-11R-A28V-07
colData names(128): sample patient ... subtype_histologic.subtype
  subtype_Estimated_Carc_percent

In [13]:
# Project 4: query, download in chunks of 10 files, assemble, display and save
proj_idx <- 4
q <- rna_seq_query(projects[proj_idx])
GDCdownload(
    q,
    method = "api",
    directory = TCGA_dest_dir,
    files.per.chunk = 10
)
data <- GDCprepare(q, directory = TCGA_dest_dir)
data
saveHDF5SummarizedExperiment(
    data,
    dir = RSE_objects_dest_dir,
    prefix = paste0(projects[proj_idx], "_RNA_")
)

--------------------------------------
o GDCquery: Searching in GDC database
--------------------------------------
Genome of reference: hg38
--------------------------------------------
oo Accessing GDC. This might take a while...
--------------------------------------------
ooo Project: TCGA-UCEC
--------------------
oo Filtering results
--------------------
ooo By data.type
ooo By workflow.type
----------------
oo Checking data
----------------
ooo Check if there are duplicated cases
ooo Check if there results for the query
-------------------
o Preparing output
-------------------
Downloading data for project TCGA-UCEC
GDCdownload will download 587 files. A total of 145.520507 MB
Downloading chunk 1 of 59 (10 files, size = 2.489185 MB) as Tue_Aug_18_17_39_25_2020_0.tar.gz


Downloading: 2.5 MB       

Downloading chunk 2 of 59 (10 files, size = 2.488039 MB) as Tue_Aug_18_17_39_25_2020_1.tar.gz


Downloading: 2.5 MB       

Downloading chunk 3 of 59 (10 files, size = 2.491505 MB) as Tue_Aug_18_17_39_25_2020_2.tar.gz


Downloading: 2.5 MB       

Downloading chunk 4 of 59 (10 files, size = 2.494092 MB) as Tue_Aug_18_17_39_25_2020_3.tar.gz


Downloading: 2.5 MB       

Downloading chunk 5 of 59 (10 files, size = 2.49011 MB) as Tue_Aug_18_17_39_25_2020_4.tar.gz


Downloading: 2.5 MB       

Downloading chunk 6 of 59 (10 files, size = 2.482287 MB) as Tue_Aug_18_17_39_25_2020_5.tar.gz


Downloading: 2.5 MB       

Downloading chunk 7 of 59 (10 files, size = 2.465764 MB) as Tue_Aug_18_17_39_25_2020_6.tar.gz


Downloading: 2.5 MB       

Downloading chunk 8 of 59 (10 files, size = 2.490375 MB) as Tue_Aug_18_17_39_25_2020_7.tar.gz


Downloading: 2.5 MB       

Downloading chunk 9 of 59 (10 files, size = 2.457023 MB) as Tue_Aug_18_17_39_25_2020_8.tar.gz


Downloading: 2.4 MB       

Downloading chunk 10 of 59 (10 files, size = 2.466789 MB) as Tue_Aug_18_17_39_25_2020_9.tar.gz


Downloading: 2.5 MB       

Downloading chunk 11 of 59 (10 files, size = 2.470066 MB) as Tue_Aug_18_17_39_25_2020_10.tar.gz


Downloading: 2.5 MB       

Downloading chunk 12 of 59 (10 files, size = 2.514454 MB) as Tue_Aug_18_17_39_25_2020_11.tar.gz


Downloading: 2.5 MB       

Downloading chunk 13 of 59 (10 files, size = 2.432327 MB) as Tue_Aug_18_17_39_25_2020_12.tar.gz


Downloading: 2.4 MB       

Downloading chunk 14 of 59 (10 files, size = 2.476156 MB) as Tue_Aug_18_17_39_25_2020_13.tar.gz


Downloading: 2.5 MB       

Downloading chunk 15 of 59 (10 files, size = 2.506444 MB) as Tue_Aug_18_17_39_25_2020_14.tar.gz


Downloading: 2.5 MB       

Downloading chunk 16 of 59 (10 files, size = 2.473501 MB) as Tue_Aug_18_17_39_25_2020_15.tar.gz


Downloading: 2.5 MB       

Downloading chunk 17 of 59 (10 files, size = 2.471143 MB) as Tue_Aug_18_17_39_25_2020_16.tar.gz


Downloading: 2.5 MB       

Downloading chunk 18 of 59 (10 files, size = 2.463949 MB) as Tue_Aug_18_17_39_25_2020_17.tar.gz


Downloading: 2.5 MB       

Downloading chunk 19 of 59 (10 files, size = 2.478317 MB) as Tue_Aug_18_17_39_25_2020_18.tar.gz


Downloading: 2.5 MB       

Downloading chunk 20 of 59 (10 files, size = 2.47264 MB) as Tue_Aug_18_17_39_25_2020_19.tar.gz


Downloading: 2.5 MB       

Downloading chunk 21 of 59 (10 files, size = 2.48047 MB) as Tue_Aug_18_17_39_25_2020_20.tar.gz


Downloading: 2.5 MB       

Downloading chunk 22 of 59 (10 files, size = 2.484934 MB) as Tue_Aug_18_17_39_25_2020_21.tar.gz


Downloading: 2.5 MB       

Downloading chunk 23 of 59 (10 files, size = 2.481727 MB) as Tue_Aug_18_17_39_25_2020_22.tar.gz


Downloading: 2.5 MB       

Downloading chunk 24 of 59 (10 files, size = 2.471209 MB) as Tue_Aug_18_17_39_25_2020_23.tar.gz


Downloading: 2.5 MB       

Downloading chunk 25 of 59 (10 files, size = 2.502933 MB) as Tue_Aug_18_17_39_25_2020_24.tar.gz


Downloading: 2.5 MB       

Downloading chunk 26 of 59 (10 files, size = 2.4838 MB) as Tue_Aug_18_17_39_25_2020_25.tar.gz


Downloading: 2.5 MB       

Downloading chunk 27 of 59 (10 files, size = 2.504043 MB) as Tue_Aug_18_17_39_25_2020_26.tar.gz


Downloading: 2.5 MB       

Downloading chunk 28 of 59 (10 files, size = 2.455745 MB) as Tue_Aug_18_17_39_25_2020_27.tar.gz


Downloading: 2.4 MB       

Downloading chunk 29 of 59 (10 files, size = 2.482125 MB) as Tue_Aug_18_17_39_25_2020_28.tar.gz


Downloading: 2.5 MB       

Downloading chunk 30 of 59 (10 files, size = 2.502247 MB) as Tue_Aug_18_17_39_25_2020_29.tar.gz


Downloading: 2.5 MB       

Downloading chunk 31 of 59 (10 files, size = 2.453237 MB) as Tue_Aug_18_17_39_25_2020_30.tar.gz


Downloading: 2.4 MB       

Downloading chunk 32 of 59 (10 files, size = 2.479071 MB) as Tue_Aug_18_17_39_25_2020_31.tar.gz


Downloading: 2.5 MB       

Downloading chunk 33 of 59 (10 files, size = 2.488107 MB) as Tue_Aug_18_17_39_25_2020_32.tar.gz


Downloading: 2.5 MB       

Downloading chunk 34 of 59 (10 files, size = 2.484246 MB) as Tue_Aug_18_17_39_25_2020_33.tar.gz


Downloading: 2.5 MB       

Downloading chunk 35 of 59 (10 files, size = 2.462607 MB) as Tue_Aug_18_17_39_25_2020_34.tar.gz


Downloading: 2.4 MB       

Downloading chunk 36 of 59 (10 files, size = 2.498881 MB) as Tue_Aug_18_17_39_25_2020_35.tar.gz


Downloading: 2.5 MB       

Downloading chunk 37 of 59 (10 files, size = 2.495367 MB) as Tue_Aug_18_17_39_25_2020_36.tar.gz


Downloading: 2.5 MB       

Downloading chunk 38 of 59 (10 files, size = 2.481796 MB) as Tue_Aug_18_17_39_25_2020_37.tar.gz


Downloading: 2.5 MB     

Downloading chunk 39 of 59 (10 files, size = 2.456723 MB) as Tue_Aug_18_17_39_25_2020_38.tar.gz


Downloading: 2.4 MB       

Downloading chunk 40 of 59 (10 files, size = 2.486268 MB) as Tue_Aug_18_17_39_25_2020_39.tar.gz


Downloading: 2.5 MB     

Downloading chunk 41 of 59 (10 files, size = 2.479749 MB) as Tue_Aug_18_17_39_25_2020_40.tar.gz


Downloading: 2.5 MB       

Downloading chunk 42 of 59 (10 files, size = 2.464356 MB) as Tue_Aug_18_17_39_25_2020_41.tar.gz


Downloading: 2.5 MB       

Downloading chunk 43 of 59 (10 files, size = 2.471826 MB) as Tue_Aug_18_17_39_25_2020_42.tar.gz


Downloading: 2.5 MB       

Downloading chunk 44 of 59 (10 files, size = 2.472143 MB) as Tue_Aug_18_17_39_25_2020_43.tar.gz


Downloading: 2.5 MB       

Downloading chunk 45 of 59 (10 files, size = 2.478434 MB) as Tue_Aug_18_17_39_25_2020_44.tar.gz


Downloading: 2.5 MB       

Downloading chunk 46 of 59 (10 files, size = 2.462342 MB) as Tue_Aug_18_17_39_25_2020_45.tar.gz


Downloading: 2.4 MB       

Downloading chunk 47 of 59 (10 files, size = 2.461523 MB) as Tue_Aug_18_17_39_25_2020_46.tar.gz


Downloading: 2.4 MB       

Downloading chunk 48 of 59 (10 files, size = 2.49576 MB) as Tue_Aug_18_17_39_25_2020_47.tar.gz


Downloading: 2.5 MB       

Downloading chunk 49 of 59 (10 files, size = 2.471159 MB) as Tue_Aug_18_17_39_25_2020_48.tar.gz


Downloading: 2.5 MB     

Downloading chunk 50 of 59 (10 files, size = 2.490721 MB) as Tue_Aug_18_17_39_25_2020_49.tar.gz


Downloading: 2.5 MB       

Downloading chunk 51 of 59 (10 files, size = 2.464443 MB) as Tue_Aug_18_17_39_25_2020_50.tar.gz


Downloading: 2.5 MB       

Downloading chunk 52 of 59 (10 files, size = 2.485916 MB) as Tue_Aug_18_17_39_25_2020_51.tar.gz


Downloading: 2.5 MB       

Downloading chunk 53 of 59 (10 files, size = 2.473838 MB) as Tue_Aug_18_17_39_25_2020_52.tar.gz


Downloading: 2.5 MB       

Downloading chunk 54 of 59 (10 files, size = 2.486046 MB) as Tue_Aug_18_17_39_25_2020_53.tar.gz


Downloading: 2.5 MB       

Downloading chunk 55 of 59 (10 files, size = 2.466714 MB) as Tue_Aug_18_17_39_25_2020_54.tar.gz


Downloading: 2.5 MB       

Downloading chunk 56 of 59 (10 files, size = 2.494336 MB) as Tue_Aug_18_17_39_25_2020_55.tar.gz


Downloading: 2.5 MB       

Downloading chunk 57 of 59 (10 files, size = 2.487951 MB) as Tue_Aug_18_17_39_25_2020_56.tar.gz


Downloading: 2.5 MB       

Downloading chunk 58 of 59 (10 files, size = 2.477349 MB) as Tue_Aug_18_17_39_25_2020_57.tar.gz


Downloading: 2.5 MB       

Downloading chunk 59 of 59 (7 files, size = 1.726199 MB) as Tue_Aug_18_17_39_25_2020_58.tar.gz




Starting to add information to samples
 => Add clinical information to samples
Add FFPE information. More information at: 
=> https://cancergenome.nih.gov/cancersselected/biospeccriteria 
=> http://gdac.broadinstitute.org/runs/sampleReports/latest/FPPP_FFPE_Cases.html
 => Adding subtype information to samples
ucec subtype information from:doi:10.1038/nature12113
Accessing www.ensembl.org to get gene information
Downloading genome information (try:0) Using: Human genes (GRCh38.p13)
Cache found
From the 60483 genes we couldn't map 3990


class: RangedSummarizedExperiment 
dim: 56493 587 
metadata(1): data_release
assays(1): HTSeq - Counts
rownames(56493): ENSG00000000003 ENSG00000000005 ... ENSG00000281912
  ENSG00000281920
rowData names(3): ensembl_gene_id external_gene_name
  original_ensembl_gene_id
colnames(587): TCGA-B5-A1MV-01A-31R-A14D-07
  TCGA-D1-A174-01A-11R-A12I-07 ... TCGA-FI-A2F4-01A-11R-A17B-07
  TCGA-AX-A3G7-01A-12R-A213-07
colData names(82): sample patient ... subtype_cna_cluster_k4
  subtype_mrna_expression_cluster

# Save data in count matrix + coldata format

Save data in a format ready for DESeq/clustering/etc.

In [22]:
# Reload every saved experiment; each one was stored under "<project>_RNA_"
data_ls <- load_RSE_objects(
    dir = RSE_objects_dest_dir,
    projects = projects,
    prefixes = paste0(projects, "_RNA_")
)

In [34]:
# Write the per-project counts and sample tables into the
# "TCGA_RNA_matrix_count_data" sub-directory of the data directory, using the
# "definition" field of each experiment as the sample condition label.
prep_and_save_count_data(
    rses = data_ls,
    label_field = "definition",
    dest_dir = dirs$data_dir,
    dest_subdir = "TCGA_RNA_matrix_count_data"
)