In [1]:
library(tidyverse)
library(DESeq2)
library(BiocParallel)

# Custom package
library(rutils)

-- Attaching packages ----------------------------------------------------------------------------------------- tidyverse 1.3.0 --

v ggplot2 3.3.3     v purrr   0.3.4
v tibble  3.0.6     v dplyr   1.0.4
v tidyr   1.1.2     v stringr 1.4.0
v readr   1.4.0     v forcats 0.5.1

-- Conflicts -------------------------------------------------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()

Loading required package: S4Vectors

Loading required package: stats4

Loading required package: BiocGenerics

Loading required package: parallel


Attaching package: 'BiocGenerics'


The following objects are masked from 'package:parallel':

    clusterAp

In [2]:
# Project directories, dataset names and input file locations
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")
# Project codes; presumably listed in the same order as `dsets` (verify)
projects <- c("cesc", "ucs", "ucec")
dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)
# One folder per dataset, located directly under the shared data directory
dset_paths <- unlist(lapply(dsets, function(dset_name) {
    paste0(dirs$data_dir, "/", dset_name)
}))
# Path to the matrisome master list (TSV)
matrisome_list <- paste0(dirs$data_dir, "/matrisome/matrisome_hs_masterlist.tsv")

In [3]:
# Index into `dsets` / `dset_paths` selecting the dataset analysed below
# (3 -> "unified_uterine_endometrial_data")
dset_idx <- 3

In [4]:
# Deliberately strict settings for this analysis
save_results <- TRUE    # write the final results table to disk
expr_thresh <- 0        # presumably the count cut-off for "expressed" (verify)
alpha_level <- 0.01     # significance level applied to adjusted p-values
lfc_thresh <- log2(2)   # |log2 fold change| cut-off, i.e. a two-fold change

In [5]:
# Helper functions

# Extract the DESeq2 results for one pairwise contrast as a tibble.
#
# Arguments:
#   dds          Object passed to DESeq2::results(); in this notebook, the
#                output of DESeq().
#   contrast_vec Character vector of length 3 handed to results() as
#                `contrast`. The last character of elements 2 and 3 is used to
#                label the contrast (e.g. "..._2" and "..._1" give "2_vs_1"),
#                so the level names are assumed to end in a single-character
#                identifier.
#
# Returns:
#   A tibble with a `geneID` column (taken from the result row names), the
#   result columns with baseMean / log2FoldChange / pvalue renamed to
#   base_mean / l2fc / pval, and a `contrast` label column.
#
# Note: reads the global `alpha_level` for the `alpha` argument of results().
get_DESeq_results <- function(dds, contrast_vec) {
    stopifnot(is.character(contrast_vec), length(contrast_vec) == 3)

    # str_sub() is vectorised, so both level suffixes are taken in one call
    contrast_levels <- stringr::str_sub(contrast_vec[2:3], -1)
    contrast_suffix <- paste0(contrast_levels[1], "_vs_", contrast_levels[2])

    res <- DESeq2::results(
        dds,
        contrast = contrast_vec,
        pAdjustMethod = "BH",
        alpha = alpha_level,
        parallel = TRUE
    ) %>%
        as_tibble(rownames = "geneID") %>%
        # Exported function: use `::` rather than the internal-access `:::`
        dplyr::mutate(contrast = contrast_suffix) %>%
        dplyr::rename(base_mean = baseMean, l2fc = log2FoldChange, pval = pvalue)

    # Return explicitly; ending on the assignment returned the value invisibly
    res
}

# Load data

In [6]:
# Count table for the selected dataset: one row per gene, one column per sample
counts_df <- read_tsv(file.path(dset_paths[dset_idx], "counts.tsv")) %>%
    # Drop the Entrez ID first so it is not rounded along with the counts
    dplyr::select(-Entrez_Gene_Id) %>%
    dplyr::rename(geneID = Hugo_Symbol) %>%
    # Round every numeric column to whole numbers; across() replaces the
    # superseded mutate_if()
    dplyr::mutate(dplyr::across(where(is.numeric), function(x) round(x, 0)))
coldata_df <- read_tsv(file.path(dset_paths[dset_idx], "coldata.tsv"))
survival_df <- read_tsv(file.path(dset_paths[dset_idx], "survival_data.tsv"))

# The sample sheet must list the samples in the same order as the count
# columns. This is enforced rather than only displayed, and the lengths are
# compared first because `==` silently recycles the shorter vector.
stopifnot(
    length(coldata_df$sample_name) == ncol(counts_df) - 1,
    all(coldata_df$sample_name == colnames(counts_df)[-1])
)

matrisome_genes <- rutils::load_matrisome_df(matrisome_list) %>%
    dplyr::pull(gene_symbol)


-- Column specification ----------------------------------------------------------------------------------------------------------
cols(
  .default = col_double(),
  Hugo_Symbol = col_character()
)
i Use `spec()` for the full column specifications.



-- Column specification ----------------------------------------------------------------------------------------------------------
cols(
  sample_name = col_character(),
  condition = col_character(),
  data_source = col_character()
)



-- Column specification ----------------------------------------------------------------------------------------------------------
cols(
  sample_name = col_character(),
  vital_status = col_character(),
  survival_time = col_double(),
  figo_stage = col_character(),



-- Column specification ----------------------------------------------------------------------------------------------------------
cols(
  Division = col_character(),
  Category = col_character(),
  `Gene Symbol` = col_character(),
  `Gene Name` = col_character(),
  Synonyms = col_character(),
  HGNC_IDs = col_double(),
  `HGNC_IDs Links` = col_double(),
  UniProt_IDs = col_character(),
  Refseq_IDs = col_character(),
  Orthology = col_character(),
  Notes = col_character()
)




# Pre-filter genes

In [7]:
# Pre-filter genes the same as our initial DGE analysis
all_counts_res_df <- rutils::get_unified_thresh_results_for_all(
    counts_df,
    coldata_df,
    c("GTEX", "TCGA_healthy", "TCGA_tumor"),
    # Use the threshold declared with the other analysis constants instead of
    # repeating the literal 0 here
    thresh = expr_thresh
)

# Gene must be expressed in > 1/3 of all samples (including non-cancer samples)
sufficiently_expr_genes_df <- all_counts_res_df %>%
    dplyr::filter(tot_over_thresh_prop > 1/3)

# Number of genes which will be kept
nrow(sufficiently_expr_genes_df)
# Proportion of genes which will be kept
nrow(sufficiently_expr_genes_df) / nrow(counts_df)

# Final data prep

In [8]:
# Attach the survival annotations to the sample sheet, then keep only tumor
# samples that have a FIGO stage reported
figo_coldata_df <- dplyr::inner_join(coldata_df, survival_df, by = "sample_name") %>%
    dplyr::select(sample_name, condition, figo_stage, data_source) %>%
    dplyr::filter(condition == "tumor", !is.na(figo_stage)) %>%
    # Collapse into FIGO stages I-IV
    rutils::decode_figo_stage(to = "c")

In [9]:
filtered_counts_df <- counts_df %>%
    # Keep only the genes that passed the expression pre-filter
    dplyr::filter(geneID %in% sufficiently_expr_genes_df$geneID) %>%
    # Remove samples which have no FIGO stage data. all_of() errors when a
    # requested sample column is absent from the count table, whereas the
    # superseded one_of() only warns and drops it.
    dplyr::select(dplyr::all_of(c("geneID", figo_coldata_df$sample_name)))

In [10]:
# One more double check coldata samples match count samples.
# Enforced instead of only displayed so that a mismatch stops the analysis;
# the lengths are compared first because `==` silently recycles.
stopifnot(
    length(figo_coldata_df$sample_name) == ncol(filtered_counts_df) - 1,
    all(figo_coldata_df$sample_name == colnames(filtered_counts_df)[-1])
)

# DGE analysis

In [11]:
# Build the DESeq2 dataset (gene IDs become count row names, sample names
# become colData row names) with FIGO stage as the only design variable
dds <- DESeqDataSetFromMatrix(
    countData = filtered_counts_df %>% column_to_rownames(var = "geneID"),
    colData = figo_coldata_df %>%
        # Convert the design variable explicitly. Left as character, DESeq2
        # coerces it itself and warns ("some variables in design formula are
        # characters, converting to factors").
        dplyr::mutate(figo_stage = factor(figo_stage)) %>%
        column_to_rownames(var = "sample_name"),
    design = ~ figo_stage
)
# Fit the model using the registered BiocParallel backend
dds_seq <- DESeq(dds, parallel = TRUE)

converting counts to integer mode

"some variables in design formula are characters, converting to factors"
estimating size factors

estimating dispersions

gene-wise dispersion estimates: 14 workers

mean-dispersion relationship

final dispersion estimates, fitting model and testing: 14 workers

-- replacing outliers and refitting for 1545 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

estimating dispersions

fitting model and testing



In [12]:
# All pairwise stage contrasts, written as c(numerator, denominator); each
# pair is expanded to the level names "figo_stage_<n>" for the helper
figo_de_ls <- list(
    "figo_2_vs_1" = c(2, 1),
    "figo_3_vs_1" = c(3, 1),
    "figo_4_vs_1" = c(4, 1),
    "figo_3_vs_2" = c(3, 2),
    "figo_4_vs_2" = c(4, 2),
    "figo_4_vs_3" = c(4, 3)
) %>%
    purrr::map(function(stage_pair) {
        get_DESeq_results(dds_seq, c("figo_stage", paste0("figo_stage_", stage_pair)))
    })

In [13]:
# Per contrast: matrisome genes passing both the adjusted p-value cut-off and
# the absolute log2 fold-change cut-off
figo_sig_genes <- purrr::map(figo_de_ls, function(res_df) {
    hits <- dplyr::filter(
        res_df,
        geneID %in% matrisome_genes,
        padj < alpha_level,
        abs(l2fc) > lfc_thresh
    )
    dplyr::pull(hits, geneID)
})

# DEMGs in each contrast
purrr::map(figo_sig_genes, length)
# DEMGs unioned
length(Reduce(union, figo_sig_genes))

In [14]:
# Stack every contrast into one long table restricted to matrisome genes
figo_mat_de_df <- figo_de_ls %>%
    dplyr::bind_rows() %>%
    dplyr::filter(geneID %in% matrisome_genes) %>%
    dplyr::select(geneID, base_mean, l2fc, pval, padj, contrast)

In [15]:
# Write the matrisome DE table for the selected dataset when saving is enabled
if (save_results) {
    write_tsv(
        figo_mat_de_df,
        paste0(dirs$analysis_dir, "/feature_selection/", dsets[dset_idx], "_figo_pairwise_demg_results.tsv")
    )
}