In [1]:
suppressMessages(library(tidyverse))     # tidyverse will pull in ggplot2, readr, other useful libraries
suppressMessages(library(magrittr))      # provides the %>% operator
suppressMessages(library("genefilter"))
suppressMessages(library(DESeq2))
suppressMessages(library(tximport))
suppressMessages(library("readr"))
suppressMessages(library("tximportData"))
suppressMessages(library(data.table))

suppressMessages(library(matrixStats))
suppressMessages(library(org.Hs.eg.db))
suppressMessages(library(topGO))

suppressMessages(library(pheatmap))
suppressMessages(library(gplots))
suppressMessages(library(matrixStats))

# Session-wide option: base data-frame constructors keep strings as character
# vectors instead of converting them to factors.
options(stringsAsFactors = FALSE)

# Gene annotation key, read with data.table::fread(). Later cells use its
# gene_id, gene_name, gene_type and ftcount_id columns.
annotation <- fread(file = "../../../0_metadata/gencode.biotype.name.key.tsv")

“package ‘magrittr’ was built under R version 4.1.3”


---
## Load in Meta Data

In [2]:
# Cell-level constants ---------------------------------------------------
# Diagnosis labels of the two groups being compared, and the RDS file that
# the results are saved to.
GROUPS <- c("COVID-19", "Control_Non-inflammatory")
output_file <- "2_ALL-results.rds"

# Sample sheet -----------------------------------------------------------
# Read the cfRNA metadata table and keep only rows whose
# sample_group_matched value is 2, 4 or 31.
meta_data <- filter(
    read.delim("../../../0_metadata/cfrna.tsv"),
    sample_group_matched %in% c(2, 4, 31)
)

# `samples` is the working copy; expGroup mirrors the Diagnosis column and is
# the variable used for grouping downstream.
samples <- meta_data
samples$expGroup <- meta_data$Diagnosis

# Display the number of samples per group.
table(samples$expGroup)


Control_Non-inflammatory                 COVID-19 
                      13                       13 

---

In [3]:
##------------------------------------
# Load the gene-level count table (plain read.delim; tximport is not used here)
sample_ids <- unique(samples$cfrna_file_id)   # Get sample ids that pass qc

counts <- read.delim("../../../1_sample-output/cfrna_ftcounts.txt")
rownames(counts) <- counts$Geneid

# Keep only the columns belonging to the selected samples
counts <- counts[, sample_ids]

##------------------------------------
# Remove the genes listed in genelist.hs.tsv (per the original note: ChrX,
# ChrY, ChrM and RB genes -- TODO confirm against the file) plus globin genes
globin_symbols <- c('HBA1','HBA2','HBB','HBBP1','HBD',
     'HBE1','HBG1','HBG2','HBM','HBQ1',
     'HBZ','HBZP1')

# Translate globin gene symbols into gene ids via the annotation table.
# `globin` must hold the ids themselves: the previous version ended this
# pipeline with length(), which reduced it to a single count, so no globin
# gene was ever excluded below.
globin <- annotation %>%
    filter(gene_name %in% globin_symbols) %>%
    pull(gene_id)

# NOTE(review): read.delim() defaults to header = TRUE, so the first line of
# the file is consumed as a header and replaced by these names -- confirm the
# file really has a header row. Only the second column is used below.
gene.list <- read.delim("../../../0_metadata/genelist.hs.tsv",col.names = c("type,","ENSMBL","gene_symbol"))

# Strip everything from the first "." onwards (version suffix) from the count
# row names, and apply the same stripping to the globin ids so that both
# sides of the comparison use the same id format.
gene.ids <- gsub("\\..*", "", rownames(counts))
exclude.ids <- c(gene.list[,2], gsub("\\..*", "", globin))

exclude.idx <- gene.ids %in% exclude.ids

counts <- counts[!exclude.idx,]


##------------------------------------
# Construct DESeq Data Set
# Counts are rounded because DESeq2 requires integer counts. The design has
# no intercept (+ 0), i.e. one coefficient per expGroup level.
dds <- DESeqDataSetFromMatrix(round(counts),
                                colData = samples,
                                design = ~ expGroup + 0)

##------------------------------------
# Add Gene metadata
# Reorder the annotation rows to follow the row order of dds (genes without
# an annotation entry become NA rows).
annotation <- annotation[match(rownames(dds), annotation$gene_id),]

# Sanity check that annotation and dds rows line up. Previously the result
# was only displayed; now a misalignment (FALSE or NA) also raises a warning.
ids_aligned <- all(rownames(dds) == annotation$ftcount_id)
if (!isTRUE(ids_aligned)) {
    warning("Row names of dds do not all equal annotation$ftcount_id; ",
            "gene metadata may be misaligned.", call. = FALSE)
}
ids_aligned

mcols(dds) <- cbind(mcols(dds), annotation)


##------------------------------------
# Re-factor
# Fix the level order to GROUPS (COVID-19 first, control second).
dds$expGroup <- factor(dds$expGroup, levels = GROUPS)

##------------------------------------
# Pre-filter
# keep <- rowSums(counts(dds)) >= 10
# dds <- dds[keep,]

##------------------------------------
# Differential analysis (DESeq2)
dds <- DESeq(dds)

##------------------------------------
# Results
# contrast = c(factor, numerator, denominator): log2 fold changes are
# GROUPS[1] (COVID-19) relative to GROUPS[2] (Control_Non-inflammatory).
res <- results(dds, alpha = 0.05, contrast = c("expGroup", GROUPS))

res$gene_name <- mcols(dds)$gene_name
res$gene_type <- mcols(dds)$gene_type

##------------------------------------
# Save
# Bundle the fitted DESeqDataSet and the results table into one RDS file.
rslts <- list()
rslts[['dds']] <- dds
rslts[['res']] <- res
saveRDS(rslts, file = output_file)

# data.frame(res) %>% rownames_to_column(var="GeneID") %>% write.table("./DESeq2_output_tables/cf_covid-cntrl_paired_DESeq.tsv",sep="\t",row.names = FALSE)

converting counts to integer mode

“some variables in design formula are characters, converting to factors”
  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters



estimating size factors

  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters

final dispersion estimates

  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters

fitting model and testing

  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
