# Preliminary TCGA DGE analysis
### Primary concerns about this analysis:
1. Low statistical power: healthy condition has few samples
2. Missing data: ovarian cancer cohort has no healthy samples

In [1]:
library(tidyverse)
library(TCGAbiolinks)
library(DESeq2)
library(BiocParallel)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Loading required package: S4Vectors
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects

## Take advantage of parallelization in DESeq2 functions

In [2]:
# Use every core reported by the machine as a worker for the BiocParallel
# backend; DESeq2 calls made with `parallel = TRUE` pick up this registration.
n_cores <- parallel::detectCores()
BiocParallel::register(BiocParallel::MulticoreParam(workers = n_cores))

## Constants

In [255]:
# TCGA projects known to this notebook; `proj_idx` selects the one to analyse.
projects <- c("TCGA-UCEC", "TCGA-CESC")
proj_idx <- 2

# One download directory per project, named
# <data_root>/<today's date>-<project>-TCGAbiolinks and keyed by project id.
# NOTE(review): the date is part of the path, so the directory changes daily.
data_root <- "../../../../../mnt/d/TCGA"
project_dirs <- setNames(
    paste0(data_root, "/", as.character(Sys.Date()), "-", projects, "-", "TCGAbiolinks"),
    projects
)

# Sample-type labels found in the `definition` column and the two labels they
# are collapsed into.
tumor_levels <- c("Primary solid Tumor", "Metastatic")
tumor_def <- "Tumor"
healthy_levels <- c("Solid Tissue Normal")
healthy_def <- "Healthy"
# max_padj <- 0.05
# min_l2fc <- log2(2)

## Functions

In [262]:
#' Collapse the sample-type labels in `d$definition` into two levels.
#'
#' Every entry of `d$definition` found in `old_tumor_levels` is replaced by
#' `new_tumor_level`, and every entry found in `old_healthy_levels` by
#' `new_healthy_level`. Optionally, columns of `d` whose label is neither of
#' the two new levels are dropped.
#'
#' @param d Object exposing a `definition` vector through `$` and supporting
#'   `d[, mask]` column subsetting (in this notebook, the object returned by
#'   `GDCprepare`). NOTE(review): assumes `definition` is character, not
#'   factor; assigning an unseen level to a factor would produce NA.
#' @param old_tumor_levels Labels to be relabelled as `new_tumor_level`.
#' @param old_healthy_levels Labels to be relabelled as `new_healthy_level`.
#' @param new_tumor_level Replacement label for tumor samples.
#' @param new_healthy_level Replacement label for healthy samples.
#' @param drop_remaining If `TRUE`, keep only the columns whose label is one
#'   of the two new levels.
#' @return The relabelled (and, if requested, subsetted) object.
consolidate_levels <- function(d, old_tumor_levels, old_healthy_levels, new_tumor_level, new_healthy_level, drop_remaining = TRUE) {
    # Compute both masks before relabelling so the two replacements cannot
    # interfere with each other.
    tumor_mask <- d$definition %in% old_tumor_levels
    healthy_mask <- d$definition %in% old_healthy_levels
    d$definition[tumor_mask] <- new_tumor_level
    d$definition[healthy_mask] <- new_healthy_level

    if (drop_remaining) {
        level_mask <- d$definition %in% c(new_tumor_level, new_healthy_level)
        d <- d[, level_mask]
    }

    # Return the object explicitly: ending on the `if` made the function
    # return NULL whenever drop_remaining was FALSE.
    d
}


#' Keep the genes that pass the significance and fold-change thresholds.
#'
#' A row is kept when `padj <= max_padj` and `log2FoldChange >= min_l2fc`.
#' The fold-change test is one-sided, so only genes with a sufficiently
#' positive log2 fold change are retained. Rows where either comparison is
#' `NA` are dropped.
#'
#' @param df Table with `padj` and `log2FoldChange` columns.
#' @param max_padj Largest adjusted p-value to keep (inclusive).
#' @param min_l2fc Smallest log2 fold change to keep (inclusive).
#' @return The rows of `df` that satisfy both thresholds.
filter_DGE_res <- function(df, max_padj = 0.05, min_l2fc = log2(2)) {
    padj_mask <- df$padj <= max_padj
    l2fc_mask <- df$log2FoldChange >= min_l2fc
    # which() keeps only positions that are TRUE. Indexing with the raw
    # logical mask would turn every NA comparison into an all-NA row.
    keep_idx <- which(padj_mask & l2fc_mask)
    df[keep_idx, ]
}

## Run Query & download

In [5]:
# Query GDC for the HTSeq raw-count gene expression files of the project
# selected by `proj_idx`.
query <- GDCquery(
    project = projects[proj_idx],
    data.category = "Transcriptome Profiling",
    data.type = "Gene Expression Quantification",
    workflow.type = "HTSeq - Counts"
)

--------------------------------------
o GDCquery: Searching in GDC database
--------------------------------------
Genome of reference: hg38
--------------------------------------------
oo Accessing GDC. This might take a while...
--------------------------------------------
ooo Project: TCGA-CESC
--------------------
oo Filtering results
--------------------
ooo By data.type
ooo By workflow.type
----------------
oo Checking data
----------------
ooo Check if there are duplicated cases
ooo Check if there results for the query
-------------------
o Preparing output
-------------------


In [6]:
# Download the queried files through the GDC API into the selected project's
# directory, requesting 10 files per chunk.
GDCdownload(query, method = "api", directory = project_dirs[proj_idx], files.per.chunk = 10)

Downloading data for project TCGA-CESC
Of the 309 files for download 309 already exist.
All samples have been already downloaded


## Prepare data for analysis

In [263]:
# Load the downloaded files from the project directory into a single object
# (counts plus sample and gene annotation).
data <- GDCprepare(query, directory = project_dirs[proj_idx])



Starting to add information to samples
 => Add clinical information to samples
Add FFPE information. More information at: 
=> https://cancergenome.nih.gov/cancersselected/biospeccriteria 
=> http://gdac.broadinstitute.org/runs/sampleReports/latest/FPPP_FFPE_Cases.html
 => Adding subtype information to samples
cesc subtype information from:doi:10.1038/nature21386
Accessing www.ensembl.org to get gene information
Downloading genome information (try:0) Using: Human genes (GRCh38.p13)
Cache found
From the 60483 genes we couldn't map 3990


In [277]:
data

class: RangedSummarizedExperiment 
dim: 56493 309 
metadata(1): data_release
assays(1): HTSeq - Counts
rownames(56493): ENSG00000000003 ENSG00000000005 ... ENSG00000281912
  ENSG00000281920
rowData names(3): ensembl_gene_id external_gene_name
  original_ensembl_gene_id
colnames(309): TCGA-JX-A3Q8-01A-11R-A21T-07
  TCGA-C5-A1BK-01B-11R-A13Y-07 ... TCGA-EK-A2R8-01A-21R-A18M-07
  TCGA-JW-A5VK-01A-11R-A28H-07
colData names(131): sample patient ... subtype_GEXP.APOBEC3H.164668
  subtype_patient

### Only want two levels

Lump "Metastatic" and "Primary solid Tumor" together as "Tumor", relabel "Solid Tissue Normal" as "Healthy", and filter out samples from any other level

In [278]:
# Collapse the sample-type labels into `tumor_def` / `healthy_def` and drop
# samples that belong to neither group.
data_consolidated <- consolidate_levels(
    data,
    old_tumor_levels = tumor_levels,
    old_healthy_levels = healthy_levels,
    new_tumor_level = tumor_def,
    new_healthy_level = healthy_def,
    drop_remaining = TRUE
)

In [285]:
unique(data_consolidated$definition)
data_consolidated

class: RangedSummarizedExperiment 
dim: 56493 309 
metadata(1): data_release
assays(1): HTSeq - Counts
rownames(56493): ENSG00000000003 ENSG00000000005 ... ENSG00000281912
  ENSG00000281920
rowData names(3): ensembl_gene_id external_gene_name
  original_ensembl_gene_id
colnames(309): TCGA-JX-A3Q8-01A-11R-A21T-07
  TCGA-C5-A1BK-01B-11R-A13Y-07 ... TCGA-EK-A2R8-01A-21R-A18M-07
  TCGA-JW-A5VK-01A-11R-A28H-07
colData names(131): sample patient ... subtype_GEXP.APOBEC3H.164668
  subtype_patient

In [287]:
sum(data_consolidated$definition == tumor_def)
sum(data_consolidated$definition == healthy_def)

## Prep data for DGE analysis

In [288]:
# Build the DESeq2 data set with the two-level `definition` column as the only
# design term. `definition` is character here, so DESeq2 converts it to a
# factor (see the message printed below).
ddsSE <- DESeqDataSet(data_consolidated, design = ~ definition)

renaming the first element in assays to 'counts'
converting counts to integer mode
“some variables in design formula are characters, converting to factors”

In [289]:
ddsSE

class: DESeqDataSet 
dim: 56493 309 
metadata(2): data_release version
assays(1): counts
rownames(56493): ENSG00000000003 ENSG00000000005 ... ENSG00000281912
  ENSG00000281920
rowData names(3): ensembl_gene_id external_gene_name
  original_ensembl_gene_id
colnames(309): TCGA-JX-A3Q8-01A-11R-A21T-07
  TCGA-C5-A1BK-01B-11R-A13Y-07 ... TCGA-EK-A2R8-01A-21R-A18M-07
  TCGA-JW-A5VK-01A-11R-A28H-07
colData names(131): sample patient ... subtype_GEXP.APOBEC3H.164668
  subtype_patient

### Filter out genes that are not expressed in both the tumor and the healthy condition
I.e., we keep only genes that are expressed in the healthy AND the tumor conditions

In [290]:
# Per-sample logical flags marking which condition each column belongs to.
healthy_cond_mask <- ddsSE$definition == healthy_def
tumor_cond_mask <- ddsSE$definition == tumor_def
# Number of samples per condition (tumor first, then healthy).
sum(tumor_cond_mask)
sum(healthy_cond_mask)

In [291]:
# A gene counts as expressed in a condition when its counts, summed over that
# condition's samples, are greater than zero. The sample masks are the
# `tumor_cond_mask` / `healthy_cond_mask` vectors defined in the previous cell
# (this cell previously referenced `*_condition_mask`, which is never defined).
tumor_cond_expr_mask <- rowSums(counts(ddsSE[, tumor_cond_mask])) > 0
healthy_cond_expr_mask <- rowSums(counts(ddsSE[, healthy_cond_mask])) > 0
# Keep only genes expressed in both conditions.
expr_mask <- tumor_cond_expr_mask & healthy_cond_expr_mask
# Genes expressed in tumor, in healthy, and in both.
sum(tumor_cond_expr_mask)
sum(healthy_cond_expr_mask)
sum(expr_mask)

In [292]:
# Keep only the rows (genes) flagged in `expr_mask`.
ddsSE <- ddsSE[expr_mask, ]

In [293]:
ddsSE

class: DESeqDataSet 
dim: 35549 309 
metadata(2): data_release version
assays(1): counts
rownames(35549): ENSG00000000003 ENSG00000000005 ... ENSG00000281912
  ENSG00000281920
rowData names(3): ensembl_gene_id external_gene_name
  original_ensembl_gene_id
colnames(309): TCGA-JX-A3Q8-01A-11R-A21T-07
  TCGA-C5-A1BK-01B-11R-A13Y-07 ... TCGA-EK-A2R8-01A-21R-A18M-07
  TCGA-JW-A5VK-01A-11R-A28H-07
colData names(131): sample patient ... subtype_GEXP.APOBEC3H.164668
  subtype_patient

## Perform DGE analysis

In [294]:
# Run the DESeq2 pipeline (size factors, dispersions, model fit and test; see
# the log below), parallelised over the registered BiocParallel backend.
ddsSESeq <- DESeq(ddsSE, parallel = TRUE)

estimating size factors
estimating dispersions
gene-wise dispersion estimates: 16 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 16 workers
-- replacing outliers and refitting for 3297 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing


In [295]:
ddsSESeq

class: DESeqDataSet 
dim: 35549 309 
metadata(2): data_release version
assays(6): counts mu ... replaceCounts replaceCooks
rownames(35549): ENSG00000000003 ENSG00000000005 ... ENSG00000281912
  ENSG00000281920
rowData names(26): ensembl_gene_id external_gene_name ... maxCooks
  replace
colnames(309): TCGA-JX-A3Q8-01A-11R-A21T-07
  TCGA-C5-A1BK-01B-11R-A13Y-07 ... TCGA-EK-A2R8-01A-21R-A18M-07
  TCGA-JW-A5VK-01A-11R-A28H-07
colData names(133): sample patient ... sizeFactor replaceable

In [296]:
resultsNames(ddsSESeq)

In [297]:
# Extract the tumor-vs-healthy comparison with Benjamini-Hochberg adjusted
# p-values. The contrast is built from `tumor_def` / `healthy_def` rather than
# repeated string literals, so it always matches the labels assigned during
# level consolidation. Tumor is the numerator, so a positive log2FoldChange
# means higher expression in tumor.
res <- results(
    ddsSESeq,
    contrast = c("definition", tumor_def, healthy_def),
    pAdjustMethod = "BH",
    parallel = TRUE
)

## Combine results with external gene IDs

In [298]:
# Results as a tibble, with the row names (Ensembl gene ids) moved into a
# `geneID` column.
res_df <- res %>% as_tibble(rownames = "geneID")
head(res_df)
# Per-gene annotation stored on the fitted data set, also as a tibble.
ddSESeq_row_data_df <- as_tibble(rowData(ddsSESeq))
head(ddSESeq_row_data_df)

geneID,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ENSG00000000003,3476.033426,0.3218579,0.4444177,0.7242239,0.4689283,0.6487193
ENSG00000000005,2.267135,-3.8836735,1.7357334,-2.2374828,0.0252548,0.09194335
ENSG00000000419,2638.597733,0.3850439,0.2701783,1.4251477,0.1541145,0.3193678
ENSG00000000457,910.186204,0.2949981,0.2782027,1.060371,0.2889758,0.4799699
ENSG00000000460,878.032328,2.4829943,0.3161081,7.8548899,4.001234e-15,4.944582e-13
ENSG00000000938,514.668255,-1.1657661,0.5908316,-1.9730936,0.0484849,0.1465515


ensembl_gene_id,external_gene_name,original_ensembl_gene_id,baseMean,baseVar,allZero,dispGeneEst,dispGeneIter,dispFit,dispersion,⋯,SE_definition_Tumor_vs_Healthy,WaldStatistic_Intercept,WaldStatistic_definition_Tumor_vs_Healthy,WaldPvalue_Intercept,WaldPvalue_definition_Tumor_vs_Healthy,betaConv,betaIter,deviance,maxCooks,replace
<chr>,<chr>,<chr>,<dbl>,<dbl>,<lgl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>,<dbl>,<dbl>,<dbl>,<lgl>
ENSG00000000003,TSPAN6,ENSG00000000003.13,3476.033426,3940164.0,False,0.2800062,15,0.8832757,0.2815563,⋯,0.4444177,25.87686,0.7242239,1.213545e-147,0.4689283,True,3,5466.125,0.342619187,False
ENSG00000000005,TNMD,ENSG00000000005.5,2.267135,80.77287,False,4.296538,13,1.9727695,4.2612391,⋯,1.7357334,2.81675,-2.2374828,0.004851226,0.0252548,True,6,1018.913,0.019389739,False
ENSG00000000419,DPM1,ENSG00000000419.11,2638.597733,755866.7,False,0.1075142,14,0.8835014,0.1036992,⋯,0.2701783,40.85256,1.4251477,0.0,0.1541145,True,3,5025.976,0.001500538,False
ENSG00000000457,SCYL3,ENSG00000000457.12,910.186204,92188.68,False,0.1099086,14,0.8852802,0.1091296,⋯,0.2782027,34.45117,1.060371,4.325215e-260,0.2889758,True,3,4385.427,0.194833296,False
ENSG00000000460,C1orf112,ENSG00000000460.15,878.032328,115671.7,False,0.1408486,14,0.8853796,0.1363742,⋯,0.3161081,23.22452,7.8548899,2.574325e-119,4.001234e-15,True,3,4420.954,0.276946722,False
ENSG00000000938,FGR,ENSG00000000938.11,514.668255,193824.4,False,0.49605,15,0.8873671,0.4973794,⋯,0.5908316,17.27355,-1.9730936,7.442642e-67,0.0484849,True,4,4407.542,0.144241292,False


### Make sure gene ordering is preserved between results and ddSESeq row data

In [299]:
# Gene names are attached to the results by position, so stop here unless both
# tables list exactly the same gene ids in the same order. The length check
# guards against silent recycling; isTRUE() treats an NA comparison as failure.
gene_order_ok <- length(ddSESeq_row_data_df$ensembl_gene_id) == length(res_df$geneID) &&
    isTRUE(all(ddSESeq_row_data_df$ensembl_gene_id == res_df$geneID))
stopifnot(gene_order_ok)
gene_order_ok

In [300]:
# Attach the gene identifiers and names (matched by row position) and move the
# three identifier columns to the front; `geneID` is replaced by
# `ensembl_gene_id`.
res_genes_df <- res_df %>%
    mutate(
        ensembl_gene_id = geneID,
        original_ensembl_gene_id = ddSESeq_row_data_df$original_ensembl_gene_id,
        external_gene_name = ddSESeq_row_data_df$external_gene_name
    ) %>%
    select(
        ensembl_gene_id,
        original_ensembl_gene_id,
        external_gene_name,
        everything(),
        -geneID
    )

In [301]:
dim(res_genes_df)

## Create DEG list

### Drop rows (genes) with NA values

In [302]:
# Flag every gene that has at least one missing value in any column, then keep
# only the fully populated rows.
na_mask <- !complete.cases(res_genes_df)
res_genes_df <- res_genes_df[!na_mask, ]

No NAs left?

In [303]:
sum(rowSums(is.na(res_genes_df))) == 0

How many rows left?

In [304]:
nrow(res_genes_df)

### Apply final filters (adj. $p$-values, $\log_2$ fold-change)

In [305]:
# Final DEG list: adjusted p-value <= 0.05 and log2 fold change >= 1 (at least
# two-fold). filter_DGE_res() applies a one-sided fold-change test, so only
# genes with higher expression in tumor are retained.
DEG1_df <- filter_DGE_res(res_genes_df, max_padj = 0.05, min_l2fc = log2(2))

How many rows left?

In [306]:
nrow(DEG1_df)