In [1]:
library(tidyverse)
library(DESeq2)
library(BiocParallel)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Loading required package: S4Vectors
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects

In [2]:
# Register a forked multi-core backend so that DESeq2 calls made with
# parallel = TRUE use one worker per core reported by detectCores().
n_cores <- parallel::detectCores()
multicore_backend <- BiocParallel::MulticoreParam(workers = n_cores)
BiocParallel::register(multicore_backend)

## Constants

In [3]:
# Root directory of the unified TCGA/GTEx data (relative path).
data_dir <- "../../../../../mnt/d/unified_TCGA_GTEx"
# Names of the dataset sub-directories found under data_dir.
dsets <- c("unified_cervical_data")
# file.path() is vectorised over dsets and joins components with "/",
# so no explicit map()/unlist()/paste0() is needed.
dset_paths <- file.path(data_dir, dsets)
analysis_dir <- file.path(data_dir, "analysis")
matrisome_list <- file.path(data_dir, "matrisome", "matrisome_hs_masterlist.tsv")
# Position in dsets / dset_paths of the dataset used by the cells below.
dset_idx <- 1

## Functions

In [4]:
run_DESeq_and_get_results <- function(dds) {
    # Fit DESeq2 on `dds` and return the tumor-vs-healthy results table as a
    # tibble, with the result row names stored in a `geneID` column.
    #
    # Outlier replacement (minReplicatesForReplace = Inf), the Cook's distance
    # cutoff and independent filtering are all switched off here because any
    # filtering is meant to happen downstream.
    # See docs: https://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#how-can-i-get-unfiltered-deseq2-results
    fitted_dds <- DESeq(dds, minReplicatesForReplace = Inf, parallel = TRUE)

    # Contrast on the `condition` variable: "tumor" against "healthy",
    # with Benjamini-Hochberg adjusted p-values.
    tumor_vs_healthy <- results(
        fitted_dds,
        contrast = c("condition", "tumor", "healthy"),
        pAdjustMethod = "BH",
        cooksCutoff = FALSE,
        independentFiltering = FALSE,
        parallel = TRUE
    )

    as_tibble(tumor_vs_healthy, rownames = "geneID")
}

## Read in data

In [5]:
# Count matrix: drop the Entrez ID column, round every numeric column to whole
# numbers, and move the gene symbols into the row names.
counts <- read_tsv(file.path(dset_paths[dset_idx], "counts.tsv")) %>%
    select(-"Entrez_Gene_Id") %>%
    mutate_if(is.numeric, round, 0) %>%
    column_to_rownames(var = "Hugo_Symbol")
# Sample annotation table, with sample names moved into the row names.
coldata <- read_tsv(file.path(dset_paths[dset_idx], "coldata.tsv")) %>%
    column_to_rownames(var = "sample_name")
# The rows of coldata must line up one-to-one, in order, with the columns of
# counts. identical() also catches a length mismatch, which an element-wise
# `==` would recycle over. Fail loudly rather than only printing the result.
samples_aligned <- identical(rownames(coldata), colnames(counts))
if (!samples_aligned) {
    stop("Row names of coldata do not match column names of counts.", call. = FALSE)
}
samples_aligned

Parsed with column specification:
cols(
  .default = col_double(),
  Hugo_Symbol = col_character()
)
See spec(...) for full column specifications.
Parsed with column specification:
cols(
  sample_name = col_character(),
  condition = col_character(),
  data_source = col_character()
)


In [6]:
# Number of samples in each condition group.
condition_labels <- coldata[["condition"]]
sum(condition_labels == "healthy")
sum(condition_labels == "tumor")

## Run DGE analysis

- `dds_naive`: measures the effect of `condition` alone
- `dds_informed`: measures the effect of `condition` while controlling for `data_source` (batch effect)

See docs: https://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#quick-start

In [7]:
# Both data sets share the same counts and sample annotation; only the design
# formula differs, so build them through one small constructor.
make_dds <- function(design_formula) {
    DESeqDataSetFromMatrix(
        countData = counts,
        colData = coldata,
        design = design_formula
    )
}

# Effect of condition only.
dds_naive <- make_dds(~ condition)

# Effect of condition with data_source included as an additional term.
dds_informed <- make_dds(~ data_source + condition)

converting counts to integer mode
“some variables in design formula are characters, converting to factors”converting counts to integer mode
“some variables in design formula are characters, converting to factors”

In [8]:
# Fit each design and collect its tumor-vs-healthy results table.
dge_res_naive_df <- dds_naive %>% run_DESeq_and_get_results()
dge_res_informed_df <- dds_informed %>% run_DESeq_and_get_results()

estimating size factors
estimating dispersions
gene-wise dispersion estimates: 16 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 16 workers
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 16 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 16 workers


In [13]:
# Keep genes whose adjusted p-value is below the significance threshold, then
# report how many remain under each design.
padj_cutoff <- 0.05
sig_dge_res_naive_df <- dge_res_naive_df %>%
    dplyr::filter(padj < padj_cutoff)
sig_dge_res_informed_df <- dge_res_informed_df %>%
    dplyr::filter(padj < padj_cutoff)
nrow(sig_dge_res_naive_df)
nrow(sig_dge_res_informed_df)

## Join with matrisome list

### Load and prepare the matrisome list

In [10]:
# Read the matrisome master list; quote = "" turns off quote handling.
matrisome_df <- read_tsv(matrisome_list, quote = "")
# Normalise headers to lower snake case. gsub() replaces every space (sub()
# would stop after the first one) and tolower() is already vectorised, so the
# result is a plain character vector rather than a list.
colnames(matrisome_df) <- tolower(gsub(" ", "_", colnames(matrisome_df), fixed = TRUE))
# Record missing divisions BEFORE filtering: dplyr::filter() drops rows whose
# condition evaluates to NA, so the same check made after the filter below
# could never report TRUE.
any_missing_division <- any(is.na(matrisome_df$division))
# Put gene_symbol first and drop entries whose division is "Retired"
# (rows with a missing division are dropped by this filter as well).
matrisome_df <- select(matrisome_df, gene_symbol, everything()) %>%
    dplyr::filter(division != "Retired")
nrow(matrisome_df)
head(matrisome_df)
paste0("Any genes missing division entries? ", any_missing_division)

Parsed with column specification:
cols(
  Division = col_character(),
  Category = col_character(),
  `Gene Symbol` = col_character(),
  `Gene Name` = col_character(),
  Synonyms = col_character(),
  HGNC_IDs = col_double(),
  `HGNC_IDs Links` = col_double(),
  UniProt_IDs = col_character(),
  Refseq_IDs = col_character(),
  Orthology = col_character(),
  Notes = col_character()
)


gene_symbol,division,category,gene_name,synonyms,hgnc_ids,hgnc_ids_links,uniprot_ids,refseq_ids,orthology,notes
<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>
ABI3BP,Core matrisome,ECM Glycoproteins,"ABI family, member 3 (NESH) binding protein",FLJ41743|FLJ41754|NESHBP|TARSH,17265,17265,B4DSV9:D3YTG3:E9PPR9:E9PRB5:H0Y897:H0YCG4:H0YCP4:H0YDN0:H0YDW0:H0YEA0:H0YEL2:H0YF18:H0YF57:H7C4H3:H7C4N5:H7C4S3:H7C4T1:H7C4X4:H7C524:H7C556:H7C5S3:Q5JPC9:Q7Z7G0,NP_056244.2:XP_005247340.1,Mouse:Abi3bp|,
ADIPOQ,Core matrisome,ECM Glycoproteins,"adiponectin, C1Q and collagen domain containing",ACDC|ACRP30|ADIPQTL1|ADPN|APM-1|APM1|GBP28|adipone,13633,13633,Q15848,NP_001171271.1:NP_004788.1,Mouse:Adipoq|,
AEBP1,Core matrisome,ECM Glycoproteins,AE binding protein 1,ACLP; FLJ33612,303,303,C9JLQ8:H7C0W8:H7C1J5:H7C391:H7C3D7:H7C4B5:Q8IUX7,NP_001120.3,Mouse:Aebp1|,
AGRN,Core matrisome,ECM Glycoproteins,agrin,FLJ45064,329,329,H0Y5U1:O00468,NP_940978.2:XP_005244806.1:XP_006710696.1,Mouse:Agrn|,
AMBN,Core matrisome,ECM Glycoproteins,ameloblastin (enamel matrix protein),-,452,452,Q9NP70,NP_057603.1,Mouse:Ambn|,
AMELX,Core matrisome,ECM Glycoproteins,"amelogenin (amelogenesis imperfecta 1, X-linked)",AIH1|ALGN|AMG|AMGL|AMGX,461,461,Q99217,NP_001133.1:NP_872621.1:NP_872622.1,Mouse:Amelx|,


In [17]:
# Annotate every significant DEG (informed design) with its matrisome entry,
# if it has one. Genes without a match keep NA in the matrisome columns.
dge_matrisome_left_join_df <- sig_dge_res_informed_df %>%
    left_join(matrisome_df, by = c("geneID" = "gene_symbol")) %>%
    # division is never NA within matrisome_df (known a priori), so an NA here
    # means the gene found no match in the matrisome list.
    mutate(in_matrisome = !is.na(division)) %>%
    # DESeq2 columns first, then the flag, then the matrisome annotation.
    select(geneID:padj, in_matrisome, everything())

In [22]:
# The joined table must have exactly one row per significant DEG: all DEGs are
# to be retained and none duplicated. Enforce this instead of only printing it,
# so a violated invariant cannot go unnoticed; the flag is still displayed.
join_kept_all_degs <- nrow(sig_dge_res_informed_df) == nrow(dge_matrisome_left_join_df)
if (!join_kept_all_degs) {
    stop(
        "Row count changed after joining DEGs with the matrisome list: ",
        nrow(sig_dge_res_informed_df), " before vs ",
        nrow(dge_matrisome_left_join_df), " after.",
        call. = FALSE
    )
}
join_kept_all_degs
head(dge_matrisome_left_join_df)

geneID,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,in_matrisome,division,category,gene_name,synonyms,hgnc_ids,hgnc_ids_links,uniprot_ids,refseq_ids,orthology,notes
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>
RADIL,104.931376,-3.023204,1.1379214,-2.656777,0.007889156,0.0374763,False,,,,,,,,,,
AP1M2,3874.620691,6.494125,0.6262721,10.369495,3.4132260000000002e-25,6.901612e-23,False,,,,,,,,,,
KLF1,6.54729,4.035969,1.5879516,2.541619,0.01103403,0.04838536,False,,,,,,,,,,
USHBP1,83.240457,-4.113287,0.7125654,-5.772504,7.810209e-09,2.489566e-07,False,,,,,,,,,,
NKPD1,151.738326,4.604885,1.1473048,4.013654,5.978591e-05,0.0006941963,False,,,,,,,,,,
SLC26A8,7.200758,5.513994,1.8815845,2.930506,0.003384109,0.01933554,False,,,,,,,,,,


In [34]:
# Tally significant DEGs per matrisome category. Rows with no category (genes
# that did not match the matrisome list) are excluded from the tally.
matrisome_category_counts_df <- dge_matrisome_left_join_df %>%
    dplyr::filter(!is.na(category)) %>%
    dplyr::count(category, name = "n_genes")
matrisome_category_counts_df
n_sig_matrisome_genes <- sum(matrisome_category_counts_df$n_genes)
paste0("Total matrisome genes which are significant DEGs: ", n_sig_matrisome_genes)

category,n_genes
<chr>,<int>
Collagens,8
ECM Glycoproteins,56
ECM Regulators,74
ECM-affiliated Proteins,54
Proteoglycans,10
Secreted Factors,110
