In [7]:
library(tidyverse)
library(DESeq2)
library(BiocParallel)

In [8]:
# Register a multicore BiocParallel backend sized to the machine.
# detectCores() belongs to the base `parallel` package; no library(parallel)
# call is visible in this notebook, so the function is namespace-qualified
# rather than relying on another package having attached it.
n_cores <- parallel::detectCores()
# detectCores() may return NA when the core count cannot be determined;
# fall back to a single worker instead of passing NA on.
if (is.na(n_cores)) {
    n_cores <- 1L
}
BiocParallel::register(MulticoreParam(n_cores))

## Constants

In [9]:
# Read the root data directory from the first line of ../dev_paths.txt.
# Handing the path straight to readLines() lets R open and close the file
# itself, so no connection is left dangling if the read fails.
data_dir <- readLines("../dev_paths.txt", n = 1)
# An empty file yields character(0), which paste()/paste0() would silently
# treat as "" and produce paths rooted at "/". Fail here instead.
if (length(data_dir) != 1 || !nzchar(data_dir)) {
    stop("../dev_paths.txt must contain the data directory on its first line", call. = FALSE)
}

# Datasets available for analysis and the directory of each one.
dsets <- c("unified_cervical_data")
dset_paths <- paste(data_dir, dsets, sep = "/")
# Output directory for analysis results and location of the matrisome gene list.
analysis_dir <- paste(data_dir, "analysis", sep = "/")
matrisome_list <- paste(data_dir, "matrisome", "matrisome_hs_masterlist.tsv", sep = "/")
# Index into dsets / dset_paths selecting the dataset used by the cells below.
dset_idx <- 1

## Functions

In [10]:
#' Run DESeq2 on a dataset and return the unfiltered results table.
#'
#' Results are produced without outlier replacement, Cook's-distance
#' filtering or independent filtering, since filtering should happen
#' downstream.
#' See docs: https://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#how-can-i-get-unfiltered-deseq2-results
#'
#' @param dds A DESeqDataSet whose design contains the factor named in
#'   `contrast`.
#' @param contrast Character vector passed to `results()`: the factor name,
#'   the numerator level and the denominator level. Defaults to tumor vs.
#'   healthy on `condition`.
#' @param parallel Passed to both `DESeq()` and `results()`; when `TRUE` they
#'   use the registered BiocParallel backend.
#' @return A tibble of the results table, with the row names of the results
#'   stored in a `geneID` column.
run_DESeq_and_get_results <- function(dds,
                                      contrast = c("condition", "tumor", "healthy"),
                                      parallel = TRUE) {
    # minReplicatesForReplace = Inf switches off outlier replacement.
    dds_seq <- DESeq(dds, minReplicatesForReplace = Inf, parallel = parallel)
    res <- results(
        dds_seq,
        contrast = contrast,
        pAdjustMethod = "BH",
        cooksCutoff = FALSE,
        independentFiltering = FALSE,
        parallel = parallel
    )
    as_tibble(res, rownames = "geneID")
}

## Read in data

In [11]:
# Count matrix: drop the Entrez ID column, round every numeric column to
# whole numbers and use the gene symbols as row names.
counts <- read_tsv(paste0(dset_paths[dset_idx], "/counts.tsv")) %>%
    select(-"Entrez_Gene_Id") %>%
    mutate_if(is.numeric, round, 0) %>%
    column_to_rownames(var = "Hugo_Symbol")
# Sample annotations, with sample names as row names.
coldata <- read_tsv(paste0(dset_paths[dset_idx], "/coldata.tsv")) %>%
    column_to_rownames(var = "sample_name")
# The rows of coldata must line up with the columns of counts, in the same
# order. Stop immediately if they do not, rather than only displaying the
# outcome; identical() also handles differing lengths, where == would recycle.
if (!identical(rownames(coldata), colnames(counts))) {
    stop("Sample names in coldata.tsv do not match the columns of counts.tsv (same names, same order required)", call. = FALSE)
}

Parsed with column specification:
cols(
  .default = col_double(),
  Hugo_Symbol = col_character()
)
See spec(...) for full column specifications.
Parsed with column specification:
cols(
  sample_name = col_character(),
  condition = col_character(),
  data_source = col_character()
)


In [12]:
# Number of samples per condition: healthy first, then tumor.
sum(coldata[["condition"]] == "healthy")
sum(coldata[["condition"]] == "tumor")

## RUN DGE Analysis

- `dds_naive`: measure the effect of `condition`
- `dds_informed`: measure the effect of `condition`, controlling for `data_source` (batch effect)

See docs: https://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#quick-start

In [13]:
# Naive model: expression explained by condition alone.
dds_naive <- DESeqDataSetFromMatrix(counts, coldata, design = ~ condition)

# Informed model: condition effect while controlling for data_source.
dds_informed <- DESeqDataSetFromMatrix(counts, coldata, design = ~ data_source + condition)

converting counts to integer mode
“some variables in design formula are characters, converting to factors”converting counts to integer mode
“some variables in design formula are characters, converting to factors”

In [14]:
# Fit each design and collect its unfiltered results table.
dge_res_naive_df <- dds_naive %>% run_DESeq_and_get_results()
dge_res_informed_df <- dds_informed %>% run_DESeq_and_get_results()

estimating size factors
estimating dispersions
gene-wise dispersion estimates: 16 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 16 workers
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 16 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 16 workers


### How many significant DEGs? (adjusted p-value < 0.05)

In [15]:
# Keep genes whose adjusted p-value (padj) is below 0.05; rows where padj is
# NA are dropped by filter().
sig_dge_res_naive_df <- dge_res_naive_df %>% dplyr::filter(padj < 0.05)
sig_dge_res_informed_df <- dge_res_informed_df %>% dplyr::filter(padj < 0.05)
# Number of significant genes under the naive and the informed design.
nrow(sig_dge_res_naive_df)
nrow(sig_dge_res_informed_df)

## Save results

In [16]:
# Save the complete (unfiltered) results of the informed model to the
# analysis directory, named after the selected dataset.
write_tsv(
    dge_res_informed_df,
    paste(analysis_dir, paste0(dsets[dset_idx], "_unfiltered_DESeq_results.tsv"), sep = "/")
)