In [1]:
library(tidyverse)
library(matrixStats)
library(DESeq2)

# Custom package
library(rutils)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

Attaching package: ‘matrixStats’

The following object is masked from ‘package:dplyr’:

    count

Loading required package: S4Vectors
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union

The fol

In [3]:
# dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")
# Hard-coded roots for the unified input data and for this analysis' outputs.
data_dir <- "/mnt/d/unified_TCGA_GTEx_data"
analysis_dir <- "/mnt/d/2020_07_27_unified_TCGA_GTEx_analysis"

# DESeq results table written to the analysis directory.
dge_res_df <- paste0(analysis_dir, "/unified_cervical_data_unfiltered_DESeq_results.tsv") %>%
    read_tsv()

# Counts table: key the genes by symbol (as geneID), drop the Entrez ID column,
# and round every numeric column to whole numbers.
counts_df <- paste0(data_dir, "/unified_cervical_data/counts.tsv") %>%
    read_tsv() %>%
    dplyr::rename(geneID = Hugo_Symbol) %>%
    dplyr::select(-Entrez_Gene_Id) %>%
    mutate_if(is.numeric, round, digits = 0)

# Per-sample metadata table.
coldata_df <- read_tsv(paste0(data_dir, "/unified_cervical_data/coldata.tsv"))

# Only the gene symbol column of the matrisome master list is kept.
matrisome_genes_df <- dplyr::select(
    rutils::load_matrisome_df(paste0(data_dir, "/matrisome/matrisome_hs_masterlist.tsv")),
    gene_symbol
)

Parsed with column specification:
cols(
  geneID = col_character(),
  baseMean = col_double(),
  log2FoldChange = col_double(),
  lfcSE = col_double(),
  stat = col_double(),
  pvalue = col_double(),
  padj = col_double()
)
Parsed with column specification:
cols(
  .default = col_double(),
  Hugo_Symbol = col_character()
)
See spec(...) for full column specifications.
Parsed with column specification:
cols(
  sample_name = col_character(),
  condition = col_character(),
  data_source = col_character()
)
Parsed with column specification:
cols(
  Division = col_character(),
  Category = col_character(),
  `Gene Symbol` = col_character(),
  `Gene Name` = col_character(),
  Synonyms = col_character(),
  HGNC_IDs = col_double(),
  `HGNC_IDs Links` = col_double(),
  UniProt_IDs = col_character(),
  Refseq_IDs = col_character(),
  Orthology = col_character(),
  Notes = col_character()
)


In [4]:
# The figures show a fairly clear cut-off near |LFC| = 15; genes whose absolute
# log2 fold change exceeds it are collected as outliers (gene IDs only).
outlier_genes_df <- dplyr::select(
    dplyr::filter(dge_res_df, abs(log2FoldChange) > 15),
    geneID
)
nrow(outlier_genes_df)

In [5]:
# Restrict a counts table to the sample columns of one sample group.
#
# Args:
#   counts_df:    data frame with a "geneID" column and one column per sample,
#                 named by sample name.
#   coldata_df:   data frame with columns `sample_name`, `condition` and
#                 `data_source`.
#   sample_group: single string, one of "GTEX", "TCGA_healthy", "TCGA_tumor".
#
# Returns:
#   `counts_df` reduced to "geneID" followed by the columns whose names are the
#   `sample_name` values of the requested group.
#
# Errors:
#   Stops with an explicit message when `sample_group` is not one of the
#   supported values. Previously an unknown group left `group_samples`
#   unbound, which produced an unrelated "object not found" error or silently
#   picked up a global variable of that name.
get_group_samples <- function(counts_df, coldata_df, sample_group) {
    valid_groups <- c("GTEX", "TCGA_healthy", "TCGA_tumor")
    is_valid <- is.character(sample_group) &&
        length(sample_group) == 1 &&
        !is.na(sample_group) &&
        sample_group %in% valid_groups
    if (!is_valid) {
        stop(
            "Unknown sample_group '", toString(sample_group),
            "'; expected one of: ", toString(valid_groups),
            call. = FALSE
        )
    }

    # NOTE(review): the tumor branch does not restrict `data_source`, unlike the
    # healthy branch; presumably every tumor sample comes from TCGA. Confirm.
    group_coldata_df <- switch(
        sample_group,
        GTEX = dplyr::filter(coldata_df, data_source == "GTEx"),
        TCGA_healthy = dplyr::filter(
            coldata_df,
            condition == "healthy" & data_source == "TCGA"
        ),
        TCGA_tumor = dplyr::filter(coldata_df, condition == "tumor")
    )
    group_samples <- group_coldata_df$sample_name

    return(counts_df %>% dplyr::select("geneID", all_of(group_samples)))
}


# Count, per gene, how many samples exceed a count threshold.
#
# Args:
#   group_df:   data frame with a "geneID" column; every other column is treated
#               as one sample's counts.
#   thresh:     threshold; a sample is counted for a gene when its value is
#               strictly greater than `thresh`.
#   group_name: prefix used to name the two result columns.
#
# Returns:
#   A data frame of the same class as `group_df` with three columns:
#   "geneID", "<group_name>_over_thresh" (number of samples above `thresh`) and
#   "<group_name>_over_thresh_prop" (that number divided by the sample count).
#
# Errors:
#   Stops when `group_df` has no "geneID" column or no sample columns.
get_thresh_results <- function(group_df, thresh, group_name) {
    over_thresh_str <- paste0(group_name, "_over_thresh")
    over_thresh_prop_str <- paste0(group_name, "_over_thresh_prop")

    if (!("geneID" %in% names(group_df))) {
        stop("group_df must contain a 'geneID' column", call. = FALSE)
    }
    sample_cols <- setdiff(names(group_df), "geneID")
    n_samples <- length(sample_cols)
    if (n_samples == 0) {
        stop("group_df has no sample columns to threshold", call. = FALSE)
    }

    # Comparing a data frame with a scalar yields a logical matrix, so rowSums
    # gives the per-gene number of samples above the threshold.
    over_thresh <- unname(
        rowSums(group_df[, sample_cols, drop = FALSE] > thresh)
    )

    # Columns are addressed by exact name. The previous regex-based selection
    # mis-selected columns when `group_name` held regex metacharacters or when
    # another column name merely contained "geneID".
    res_df <- group_df[, "geneID", drop = FALSE]
    res_df[[over_thresh_str]] <- over_thresh
    res_df[[over_thresh_prop_str]] <- over_thresh / n_samples
    return(res_df)
}


# Compute the over-threshold summary for several sample groups and combine the
# per-group tables into one, inner-joined on "geneID".
#
# Args:
#   counts_df:   counts table passed on to get_group_samples().
#   coldata_df:  sample metadata passed on to get_group_samples().
#   group_names: names of the sample groups to summarise.
#   thresh:      count threshold passed on to get_thresh_results().
#
# Returns:
#   The per-group results of get_thresh_results(), joined by "geneID".
get_thresh_results_for_all <- function(counts_df, coldata_df, group_names, thresh) {
    summarise_group <- function(group_label) {
        get_thresh_results(
            get_group_samples(counts_df, coldata_df, group_label),
            thresh,
            group_label
        )
    }

    # A repeated group name contributes a single table, in first-seen order.
    distinct_groups <- unique(as.character(group_names))
    per_group_results <- lapply(distinct_groups, summarise_group)

    return(purrr::reduce(per_group_results, inner_join, by = "geneID"))
}

In [6]:
# Over-threshold summary (threshold = 1) for every gene in each sample group.
all_counts_res_df <- get_thresh_results_for_all(
    counts_df = counts_df,
    coldata_df = coldata_df,
    group_names = c("GTEX", "TCGA_healthy", "TCGA_tumor"),
    thresh = 1
)

In [7]:
# Same summary, limited to the counts rows of the LFC outlier genes.
outlier_counts_res_df <- get_thresh_results_for_all(
    counts_df = inner_join(outlier_genes_df, counts_df, by = "geneID"),
    coldata_df = coldata_df,
    group_names = c("GTEX", "TCGA_healthy", "TCGA_tumor"),
    thresh = 1
)

In [15]:
# Proportion of a group's samples that must exceed the count threshold.
min_pct <- .3
# Rows kept: genes whose over-threshold proportion is above min_pct in all
# three groups.
# NOTE(review): despite the name, these look like the genes that pass the
# low-count filter rather than the low-count genes themselves. Confirm.
low_count_genes_df <- dplyr::filter(
    all_counts_res_df,
    GTEX_over_thresh_prop > min_pct,
    TCGA_healthy_over_thresh_prop > min_pct,
    TCGA_tumor_over_thresh_prop > min_pct
)
# Number of genes satisfying the condition
nrow(low_count_genes_df)
# The same number as a fraction of all rows in counts_df
nrow(low_count_genes_df) / nrow(counts_df)
# How many of these genes are also LFC outlier genes
sum(low_count_genes_df[["geneID"]] %in% outlier_genes_df[["geneID"]])

In [12]:
# Difference between the row count of counts_df and of low_count_genes_df,
# i.e. how many genes are not in low_count_genes_df (assumes its genes are a
# subset of those in counts_df; verify).
nrow(counts_df) - nrow(low_count_genes_df)

In [None]:
# Outlier genes with at least one tumor sample above the count threshold.
dplyr::filter(outlier_counts_res_df, TCGA_tumor_over_thresh_prop > 0)