In [1]:
library(tidyverse)
library(TCGAbiolinks)
library(DESeq2)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Loading required package: S4Vectors
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects

## Constants

In [75]:
# TCGA projects this notebook can analyse; `proj_idx` below selects one.
projects <- c("TCGA-UCEC", "TCGA-CESC")
# Root folder under which each project's download directory lives.
data_root <- "../../../../../mnt/d/TCGA"
# Date stamp embedded in the download directory names. It defaults to today;
# replace it with a fixed string (e.g. "2020-07-16") to reuse files that were
# downloaded on an earlier day, otherwise the download and prepare steps would
# look in a different folder each day.
download_date <- as.character(Sys.Date())
# One directory per project, named "<date>-<project>-TCGAbiolinks".
# vapply() guarantees a (named) character vector even for an empty `projects`.
project_dirs <- vapply(
    projects,
    function(p) file.path(data_root, paste0(download_date, "-", p, "-", "TCGAbiolinks")),
    character(1)
)
# Sample definitions that are lumped together as tumor samples.
tumor_levels <- c("Primary solid Tumor", "Metastatic")
# tumor_levels <- c("Primary solid Tumor")
# Sample definitions that are treated as healthy samples.
healthy_levels <- c("Solid Tissue Normal")
# Labels the two groups are renamed to.
tumor_def <- "Tumor"
healthy_def <- "Healthy"
# Index into `projects` / `project_dirs` of the project to analyse.
proj_idx <- 2

## Run Query

In [3]:
# Search GDC for the gene expression quantification files (HTSeq counts) of
# the project selected by `proj_idx`.
# NOTE(review): newer GDC data releases may no longer offer the
# "HTSeq - Counts" workflow — confirm against the current TCGAbiolinks
# documentation before re-running.
query <- GDCquery(
    project = projects[proj_idx],
    data.category = "Transcriptome Profiling",
    data.type = "Gene Expression Quantification",
    workflow.type = "HTSeq - Counts"
)

--------------------------------------
o GDCquery: Searching in GDC database
--------------------------------------
Genome of reference: hg38
--------------------------------------------
oo Accessing GDC. This might take a while...
--------------------------------------------
ooo Project: TCGA-CESC
--------------------
oo Filtering results
--------------------
ooo By data.type
ooo By workflow.type
----------------
oo Checking data
----------------
ooo Check if there are duplicated cases
ooo Check if there results for the query
-------------------
o Preparing output
-------------------


## Download the data found by the query

In [4]:
# Download every file matched by `query` through the GDC API into the selected
# project's directory, 10 files per chunk.
GDCdownload(query, method = "api", directory = project_dirs[proj_idx], files.per.chunk = 10)

Downloading data for project TCGA-CESC
GDCdownload will download 309 files. A total of 77.948874 MB
Downloading chunk 1 of 31 (10 files, size = 2.5169 MB) as Thu_Jul_16_12_14_40_2020_0.tar.gz


Downloading: 2.5 MB     

Downloading chunk 2 of 31 (10 files, size = 2.509254 MB) as Thu_Jul_16_12_14_40_2020_1.tar.gz


Downloading: 2.5 MB       

Downloading chunk 3 of 31 (10 files, size = 2.515677 MB) as Thu_Jul_16_12_14_40_2020_2.tar.gz


Downloading: 2.5 MB     

Downloading chunk 4 of 31 (10 files, size = 2.518157 MB) as Thu_Jul_16_12_14_40_2020_3.tar.gz


Downloading: 2.5 MB       

Downloading chunk 5 of 31 (10 files, size = 2.511126 MB) as Thu_Jul_16_12_14_40_2020_4.tar.gz


Downloading: 2.5 MB       

Downloading chunk 6 of 31 (10 files, size = 2.524145 MB) as Thu_Jul_16_12_14_40_2020_5.tar.gz


Downloading: 2.5 MB     

Downloading chunk 7 of 31 (10 files, size = 2.537913 MB) as Thu_Jul_16_12_14_40_2020_6.tar.gz


Downloading: 2.5 MB     

Downloading chunk 8 of 31 (10 files, size = 2.51407 MB) as Thu_Jul_16_12_14_40_2020_7.tar.gz


Downloading: 2.5 MB     

Downloading chunk 9 of 31 (10 files, size = 2.522695 MB) as Thu_Jul_16_12_14_40_2020_8.tar.gz


Downloading: 2.5 MB       

Downloading chunk 10 of 31 (10 files, size = 2.529265 MB) as Thu_Jul_16_12_14_40_2020_9.tar.gz


Downloading: 2.5 MB     

Downloading chunk 11 of 31 (10 files, size = 2.527341 MB) as Thu_Jul_16_12_14_40_2020_10.tar.gz


Downloading: 2.5 MB       

Downloading chunk 12 of 31 (10 files, size = 2.520964 MB) as Thu_Jul_16_12_14_40_2020_11.tar.gz


Downloading: 2.5 MB       

Downloading chunk 13 of 31 (10 files, size = 2.525687 MB) as Thu_Jul_16_12_14_40_2020_12.tar.gz


Downloading: 2.5 MB       

Downloading chunk 14 of 31 (10 files, size = 2.527227 MB) as Thu_Jul_16_12_14_40_2020_13.tar.gz


Downloading: 2.5 MB       

Downloading chunk 15 of 31 (10 files, size = 2.516641 MB) as Thu_Jul_16_12_14_40_2020_14.tar.gz


Downloading: 2.5 MB       

Downloading chunk 16 of 31 (10 files, size = 2.527569 MB) as Thu_Jul_16_12_14_40_2020_15.tar.gz


Downloading: 2.5 MB       

Downloading chunk 17 of 31 (10 files, size = 2.516499 MB) as Thu_Jul_16_12_14_40_2020_16.tar.gz


Downloading: 2.5 MB     

Downloading chunk 18 of 31 (10 files, size = 2.525649 MB) as Thu_Jul_16_12_14_40_2020_17.tar.gz


Downloading: 2.5 MB     

Downloading chunk 19 of 31 (10 files, size = 2.528622 MB) as Thu_Jul_16_12_14_40_2020_18.tar.gz


Downloading: 2.5 MB       

Downloading chunk 20 of 31 (10 files, size = 2.526459 MB) as Thu_Jul_16_12_14_40_2020_19.tar.gz


Downloading: 2.5 MB       

Downloading chunk 21 of 31 (10 files, size = 2.505514 MB) as Thu_Jul_16_12_14_40_2020_20.tar.gz


Downloading: 2.5 MB       

Downloading chunk 22 of 31 (10 files, size = 2.557378 MB) as Thu_Jul_16_12_14_40_2020_21.tar.gz


Downloading: 2.6 MB       

Downloading chunk 23 of 31 (10 files, size = 2.530486 MB) as Thu_Jul_16_12_14_40_2020_22.tar.gz


Downloading: 2.5 MB       

Downloading chunk 24 of 31 (10 files, size = 2.520495 MB) as Thu_Jul_16_12_14_40_2020_23.tar.gz


Downloading: 2.5 MB       

Downloading chunk 25 of 31 (10 files, size = 2.511265 MB) as Thu_Jul_16_12_14_40_2020_24.tar.gz


Downloading: 2.5 MB     

Downloading chunk 26 of 31 (10 files, size = 2.516215 MB) as Thu_Jul_16_12_14_40_2020_25.tar.gz


Downloading: 2.5 MB       

Downloading chunk 27 of 31 (10 files, size = 2.531861 MB) as Thu_Jul_16_12_14_40_2020_26.tar.gz


Downloading: 2.5 MB       

Downloading chunk 28 of 31 (10 files, size = 2.529356 MB) as Thu_Jul_16_12_14_40_2020_27.tar.gz


Downloading: 2.5 MB       

Downloading chunk 29 of 31 (10 files, size = 2.514211 MB) as Thu_Jul_16_12_14_40_2020_28.tar.gz


Downloading: 2.5 MB       

Downloading chunk 30 of 31 (10 files, size = 2.520868 MB) as Thu_Jul_16_12_14_40_2020_29.tar.gz


Downloading: 2.5 MB       

Downloading chunk 31 of 31 (9 files, size = 2.269365 MB) as Thu_Jul_16_12_14_40_2020_30.tar.gz


Downloading: 2.3 MB       

## Prepare data for analysis

In [97]:
# Read the downloaded files from the selected project's directory into a
# single object (printed further down as a RangedSummarizedExperiment) with
# the samples as columns.
data <- GDCprepare(query, directory = project_dirs[proj_idx])



Starting to add information to samples
 => Add clinical information to samples
Add FFPE information. More information at: 
=> https://cancergenome.nih.gov/cancersselected/biospeccriteria 
=> http://gdac.broadinstitute.org/runs/sampleReports/latest/FPPP_FFPE_Cases.html
 => Adding subtype information to samples
cesc subtype information from:doi:10.1038/nature21386
Accessing www.ensembl.org to get gene information
Downloading genome information (try:0) Using: Human genes (GRCh38.p13)
Cache found
From the 60483 genes we couldn't map 3990


### Only want two levels

Lumping "Metastatic" and "Primary solid Tumor" together into a single "Tumor" level; "Solid Tissue Normal" is relabelled "Healthy".

In [98]:
# Compute both masks from the original labels before anything is overwritten.
tumor_mask <- data$definition %in% tumor_levels
healthy_mask <- data$definition %in% healthy_levels
# Relabel tumor samples first, then healthy samples, in a single assignment.
data$definition <- replace(
    replace(data$definition, tumor_mask, tumor_def),
    healthy_mask,
    healthy_def
)
# data$definition <- as.factor(data$definition)
# Show which labels remain after lumping.
unique(data$definition)

### Filter out samples which are not levels of interest

In [99]:
# TRUE for samples carrying one of the two lumped labels.
level_mask <- data$definition %in% c(healthy_def, tumor_def)
# Samples are stored as columns, so the selection goes in the column index.
data <- data[, which(level_mask)]

In [100]:
# Print a summary of the filtered experiment object (dimensions, assays,
# row and column metadata names).
data

class: RangedSummarizedExperiment 
dim: 56493 309 
metadata(1): data_release
assays(1): HTSeq - Counts
rownames(56493): ENSG00000000003 ENSG00000000005 ... ENSG00000281912
  ENSG00000281920
rowData names(3): ensembl_gene_id external_gene_name
  original_ensembl_gene_id
colnames(309): TCGA-JX-A3Q8-01A-11R-A21T-07
  TCGA-C5-A1BK-01B-11R-A13Y-07 ... TCGA-EK-A2R8-01A-21R-A18M-07
  TCGA-JW-A5VK-01A-11R-A28H-07
colData names(131): sample patient ... subtype_GEXP.APOBEC3H.164668
  subtype_patient

## Differential expression analysis with DESeq2

In [101]:
# Turn the grouping variable into a factor with the healthy label as the
# explicit reference level. Left as character, DESeqDataSet() converts it
# itself (with a warning) and the reference level is whichever label sorts
# first alphabetically.
# Assumes `data` has already been restricted to the two lumped labels; any
# other value would become NA here.
data$definition <- factor(data$definition, levels = c(healthy_def, tumor_def))
# Build the DESeq2 data set with a one-factor design: tumor vs. healthy.
ddsSE <- DESeqDataSet(data, design = ~ definition)

renaming the first element in assays to 'counts'
converting counts to integer mode
“some variables in design formula are characters, converting to factors”

In [102]:
# Print a summary of the DESeqDataSet built from the filtered samples.
ddsSE

class: DESeqDataSet 
dim: 56493 309 
metadata(2): data_release version
assays(1): counts
rownames(56493): ENSG00000000003 ENSG00000000005 ... ENSG00000281912
  ENSG00000281920
rowData names(3): ensembl_gene_id external_gene_name
  original_ensembl_gene_id
colnames(309): TCGA-JX-A3Q8-01A-11R-A21T-07
  TCGA-C5-A1BK-01B-11R-A13Y-07 ... TCGA-EK-A2R8-01A-21R-A18M-07
  TCGA-JW-A5VK-01A-11R-A28H-07
colData names(131): sample patient ... subtype_GEXP.APOBEC3H.164668
  subtype_patient

In [103]:
# Run the DESeq2 pipeline on the data set: size factors, dispersion
# estimates, then model fitting and testing.
ddsSESeq <- DESeq(ddsSE)

estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 4519 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing


In [105]:
# Show the names of the estimated effects (coefficients) of the fitted model.
resultsNames(ddsSESeq)

In [108]:
# Extract the tumor-vs-healthy comparison for the `definition` factor.
# The contrast is c(factor, numerator, denominator), so a positive
# log2FoldChange means higher expression in tumor samples.
# The group labels come from the notebook constants rather than repeated
# string literals, so they cannot drift apart from the relabelling step.
res <- results(ddsSESeq, contrast = c("definition", tumor_def, healthy_def))

In [120]:
# Display the results as a tibble, keeping the gene identifiers (the row
# names) in a "geneID" column.
as_tibble(res, rownames = "geneID")

geneID,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ENSG00000000003,3476.033426,0.321858005,0.4442807,0.724447462,4.687911e-01,6.388746e-01
ENSG00000000005,2.267135,-3.883578781,1.7394600,-2.232634716,2.557304e-02,8.950413e-02
ENSG00000000419,2638.597733,0.385043922,0.2700207,1.425979104,1.538744e-01,3.091112e-01
ENSG00000000457,910.186204,0.294998354,0.2780415,1.060986776,2.886959e-01,4.687732e-01
ENSG00000000460,878.032328,2.482991328,0.3159528,7.858741846,3.880110e-15,4.890596e-13
ENSG00000000938,514.668255,-1.165766106,0.5907400,-1.973399733,4.845004e-02,1.409502e-01
ENSG00000000971,3950.819717,0.254351771,0.8636045,0.294523455,7.683579e-01,8.618625e-01
ENSG00000001036,3845.645354,0.511220332,0.3921115,1.303762662,1.923145e-01,3.583954e-01
ENSG00000001084,4025.114912,1.217425649,0.8198118,1.485006200,1.375422e-01,2.862656e-01
ENSG00000001167,1842.209860,0.680668798,0.2938898,2.316068120,2.055455e-02,7.664212e-02


In [119]:
# Preview the first rows of the gene annotation stored with the data set.
head(as_tibble(rowData(ddsSE)))

ensembl_gene_id,external_gene_name,original_ensembl_gene_id
<chr>,<chr>,<chr>
ENSG00000000003,TSPAN6,ENSG00000000003.13
ENSG00000000005,TNMD,ENSG00000000005.5
ENSG00000000419,DPM1,ENSG00000000419.11
ENSG00000000457,SCYL3,ENSG00000000457.12
ENSG00000000460,C1orf112,ENSG00000000460.15
ENSG00000000938,FGR,ENSG00000000938.11
