In [1]:
library(tidyverse)
library(TCGAbiolinks)
library(DESeq2)
library(BiocParallel)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Loading required package: S4Vectors
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects

In [2]:
# Register a multicore BiocParallel back-end for the parallel DESeq2 calls.
# detectCores() may return NA when the core count cannot be determined, so
# fall back to a single worker instead of passing NA to MulticoreParam().
n_cores <- max(1L, detectCores(), na.rm = TRUE)
BiocParallel::register(MulticoreParam(n_cores))

## Functions

In [3]:
# Drop every row (gene) whose counts sum to zero across all samples.
#
# d: an object accepted by DESeq2::counts() that supports row subsetting.
# Returns `d` restricted to the rows with a non-zero total count.
filter_unexpressed <- function(d) {
    gene_totals <- rowSums(DESeq2::counts(d))
    d[gene_totals != 0, ]
}

## Set Constants

In [4]:
# Root directory holding one sub-directory per unified TCGA/GTEx dataset.
data_dir <- "../../../../../mnt/d/unified_TCGA_GTEx"
# Names of the dataset sub-directories available for analysis.
dsets <- c("unified_cervical_data", "unified_uterine_data")
# file.path() is vectorised, so this yields one path per entry of `dsets`.
dset_paths <- file.path(data_dir, dsets)
# Index into `dsets` / `dset_paths` selecting the dataset analysed below.
dset_idx <- 1L

## Load in data
The data must be loaded in the form that `DESeqDataSetFromMatrix()` expects:
- `counts` must have gene identifiers as its `rownames()` attribute
- `counts` must contain integer values
- `coldata` must have sample IDs as its `rownames()` attribute
- we must verify that ordering is consistent between `counts` and `coldata`

In [5]:
# Read the count table for the selected dataset: drop the Entrez ID column,
# round every numeric column to whole numbers, and use the gene symbols as
# row names.
counts <- read_tsv(file.path(dset_paths[dset_idx], "counts.tsv")) %>%
    select(-"Entrez_Gene_Id") %>%
    mutate(across(where(is.numeric), round)) %>%
    column_to_rownames(var = "Hugo_Symbol")
# Read the sample annotations and use the sample IDs as row names.
coldata <- read_tsv(file.path(dset_paths[dset_idx], "coldata.tsv")) %>%
    column_to_rownames(var = "sample_name")
# Fail fast if the samples in `coldata` and `counts` differ in number, name
# or order. identical() avoids the silent recycling that `==` performs on
# vectors of unequal length, and stopifnot() halts instead of only printing.
stopifnot(identical(rownames(coldata), colnames(counts)))

Parsed with column specification:
cols(
  .default = col_double(),
  Hugo_Symbol = col_character()
)
See spec(...) for full column specifications.
Parsed with column specification:
cols(
  sample_name = col_character(),
  condition = col_character(),
  data_source = col_character()
)


## Build a `DESeqDataSet` from the loaded data

In [6]:
# Bundle the count matrix and the sample annotations into a DESeqDataSet,
# modelling expression as a function of the `condition` column.
dds <- DESeqDataSetFromMatrix(
    countData = counts,
    colData = coldata,
    design = ~condition
)

converting counts to integer mode
“some variables in design formula are characters, converting to factors”

In [7]:
# Show the summary of the freshly built dataset (dimensions, assays, names).
dds

class: DESeqDataSet 
dim: 20242 272 
metadata(1): version
assays(1): counts
rownames(20242): FAM208A RADIL ... SNRNP48 C9orf171
rowData names(0):
colnames(272): GTEX-S32W-1626-SM-4AD6G GTEX-S32W-1526-SM-4AD6Z ...
  TCGA-C5-A1MN-01A-11R-A14Y-07 TCGA-Q1-A6DT-01A-11R-A32P-07
colData names(2): condition data_source

In [8]:
# Number of samples annotated with each condition label.
sum("healthy" == coldata[["condition"]])
sum("tumor" == coldata[["condition"]])

## Filter out genes which are not expressed in any sample (in either condition)

In [9]:
# Remove the genes whose counts sum to zero over all samples.
dds_filtered <- filter_unexpressed(dds)

In [10]:
# Show the filtered dataset to see how many genes remain.
dds_filtered

class: DESeqDataSet 
dim: 20018 272 
metadata(1): version
assays(1): counts
rownames(20018): FAM208A RADIL ... SNRNP48 C9orf171
rowData names(0):
colnames(272): GTEX-S32W-1626-SM-4AD6G GTEX-S32W-1526-SM-4AD6Z ...
  TCGA-C5-A1MN-01A-11R-A14Y-07 TCGA-Q1-A6DT-01A-11R-A32P-07
colData names(2): condition data_source

## Run DGE Analysis

In [11]:
# Run the DESeq2 analysis on the filtered dataset; `parallel = TRUE` asks
# DESeq() to spread the work over BiocParallel workers.
ddsSeq <- DESeq(dds_filtered, parallel = TRUE)

estimating size factors
estimating dispersions
gene-wise dispersion estimates: 16 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 16 workers
-- replacing outliers and refitting for 2411 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing


In [12]:
# List the names of the results available from the fitted object.
resultsNames(ddsSeq)

In [13]:
# Extract the tumor-vs-healthy comparison for the `condition` factor
# ("tumor" is the numerator and "healthy" the denominator of the fold change).
# `alpha` is set to 0.05 because that is the adjusted p-value cutoff applied
# to these results later in the notebook; results() otherwise optimises its
# independent filtering for the default cutoff of 0.1.
res <- results(
    ddsSeq,
    contrast = c("condition", "tumor", "healthy"),
    alpha = 0.05,
    pAdjustMethod = "BH",
    parallel = TRUE
)

## Create DEG list

In [14]:
# Build the DEG table: convert the results to a tibble with gene IDs in a
# `geneID` column, discard rows containing any NA, keep genes with
# padj < 0.05 and an absolute fold change above 1.5, then sort by padj.
res_df <- res %>%
    as_tibble(rownames = "geneID") %>%
    drop_na() %>%
    filter(abs(log2FoldChange) > log2(1.5), padj < 0.05) %>%
    arrange(padj)

In [15]:
# Number of genes that passed the significance and fold-change filters.
nrow(res_df)

In [16]:
# Preview the DEG table (sorted by adjusted p-value).
res_df

geneID,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
CDKN2A,8775.4426,7.912382,0.3076639,25.71762,7.426004e-146,1.457650e-141
FEN1,4805.3256,3.632869,0.1431294,25.38171,4.015187e-142,3.940705e-138
STIL,1779.3558,4.136840,0.1732805,23.87366,5.752708e-126,2.822998e-122
UBE2T,1778.2153,3.914805,0.1639108,23.88375,4.518911e-126,2.822998e-122
AURKA,2231.1018,4.469992,0.1889426,23.65794,9.779743e-124,3.839332e-120
PLK1,3501.7326,3.918527,0.1676075,23.37919,6.959381e-121,2.276761e-117
AUNIP,532.5266,4.884569,0.2159897,22.61482,3.097787e-113,8.686637e-110
SPAG5,3375.0600,3.457421,0.1534461,22.53182,2.024435e-112,4.967204e-109
MCM2,12535.6169,3.984153,0.1769401,22.51696,2.831489e-112,6.175477e-109
CKS2,3142.6215,4.336748,0.1943330,22.31606,2.580452e-110,5.065170e-107


In [23]:
# Create the output directory only when it is missing, so that re-running
# this cell does not emit an "already exists" warning while genuine creation
# failures are still reported.
if (!dir.exists(file.path(data_dir, "analysis"))) {
    dir.create(file.path(data_dir, "analysis"))
}
# Destination file for the DEG table of the selected dataset.
# NOTE(review): the file name concatenates the dataset name and "demo.tsv"
# with no separator (e.g. "unified_cervical_datademo.tsv") - confirm whether
# a "_" was intended before renaming, as other code may read this path.
deg_dest <- file.path(
    data_dir, "analysis", paste0(dsets[dset_idx], "demo.tsv")
)


“'../../../../../mnt/d/unified_TCGA_GTEx/analysis' already exists”

In [22]:
# Save the DEG table as a tab-separated file at `deg_dest`.
write_tsv(res_df, deg_dest)