In [1]:
library(tidyverse)

# Custom package
library(rutils)

-- Attaching packages ------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --

v ggplot2 3.3.3     v purrr   0.3.4
v tibble  3.0.6     v dplyr   1.0.4
v tidyr   1.1.2     v stringr 1.4.0
v readr   1.4.0     v forcats 0.5.1

-- Conflicts ---------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()



In [2]:
# Project directory lookup, read from the paths file one level above this
# notebook through the custom rutils package. The `data_dir` and
# `analysis_dir` entries are the ones used below.
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")
# TCGA project identifiers.
# NOTE(review): four projects are listed but only three unified datasets, and
# `projects` is not referenced in the visible part of this notebook -- confirm
# whether it is still needed.
projects <- c("TCGA-CESC", "TCGA-UCS", "TCGA-UCEC", "TCGA-OV")
# Names of the unified datasets; `dset_idx` indexes into this vector to build
# the input file paths.
unified_dsets <- c("unified_cervical_data", "unified_uterine_data", "unified_uterine_endometrial_data")
# Path of the matrisome master list (TSV) under the data directory.
matrisome_list <- paste0(dirs$data_dir, "/matrisome/matrisome_hs_masterlist.tsv")

In [3]:
# Position in `unified_dsets` of the dataset analysed in this run
# (3 selects "unified_uterine_endometrial_data").
dset_idx <- 3

In [4]:
# Analysis thresholds. Only lfc_thresh, q_deg_thresh, q_anova_thresh,
# consensus_thresh and consensus_n are referenced in the visible part of this
# notebook; the others may be used elsewhere or be leftovers -- TODO confirm.
p_thresh <- 0.05
# log2(2) == 1, i.e. an absolute log2 fold change above 1 (a two-fold change).
lfc_thresh <- log2(2)
# q-value cutoff passed to deg_meta() for the DESeq results.
q_deg_thresh <- 0.05
# q-value cutoff passed to simple_test_meta() for the Welch ANOVA results.
q_anova_thresh <- 0.05
coxph_coeff_thresh <- 0.0
mi_thresh <- 0.0
# A "mean" column counts towards consensus when its value is above this ...
consensus_thresh <- 0.0
# ... and a gene is a consensus gene when at least this many columns count.
consensus_n <- 5
hub_con_thresh <- 0

# Functions

In [5]:
# Load the matrisome master list and keep each gene symbol together with its
# division and category annotations.
matrisome_df <- dplyr::select(
    rutils::load_matrisome_df(matrisome_list),
    gene_symbol, division, category
)


# Append a logical `consensus` column to `df`.
#
# Args:
#   df: data frame with a `geneID` column and one or more columns whose names
#       contain "mean".
#   n: minimum number of "mean" columns that must exceed `thresh`.
#   thresh: value a "mean" column has to be strictly above to count.
#
# Returns:
#   `df` with an extra `consensus` column that is TRUE for rows where at least
#   `n` of the "mean" columns are greater than `thresh`.
get_consensus_col <- function(df, n, thresh = 0) {
    # geneID is selected first so that dropping column 1 leaves only the
    # "mean" columns.
    score_tbl <- dplyr::select(df, geneID, dplyr::contains("mean"))
    n_above <- rowSums(score_tbl[-1] > thresh)
    dplyr::mutate(df, consensus = n_above >= n)
}

# Summarise the differentially expressed genes in a results table.
#
# Args:
#   df: data frame with `geneID`, `lfc` and `qval` columns.
#   lfc_thresh: a row is kept when abs(lfc) is strictly above this value.
#   q_thresh: a row is kept when qval is strictly below this value.
#   n: denominator used for the proportion of kept rows.
#
# Returns:
#   list with `n_deg` (kept rows), `deg_prop` (n_deg / n), `n_up` and `n_down`
#   (kept rows with positive / negative lfc) and `genes` (their geneID values).
deg_meta <- function(df, lfc_thresh, q_thresh, n) {
    hits <- dplyr::filter(df, abs(lfc) > lfc_thresh, qval < q_thresh)
    n_hits <- nrow(hits)
    # Rows with a missing lfc never pass the filter above, so counting the
    # sign of lfc directly is safe.
    list(
        n_deg = n_hits,
        deg_prop = n_hits / n,
        n_up = sum(hits$lfc > 0),
        n_down = sum(hits$lfc < 0),
        genes = hits$geneID
    )
}

# Summarise the rows of a test-results table that pass a q-value cutoff.
#
# Args:
#   df: data frame with `geneID` and `qval` columns.
#   q_thresh: a row is kept when qval is strictly below this value.
#
# Returns:
#   list with `n_sig` (number of kept rows) and `genes` (their geneID values).
simple_test_meta <- function(df, q_thresh) {
    sig_rows <- dplyr::filter(df, qval < q_thresh)
    list(
        n_sig = nrow(sig_rows),
        genes = sig_rows$geneID
    )
}

# Select the genes whose mutual-information estimate is a large enough
# fraction of the largest estimate in `df`.
#
# Args:
#   df: data frame with a `geneID` column and a numeric `mi_est` column.
#   pct_max_thresh: cutoff in percent of the largest `mi_est`; only rows
#       strictly above it are kept.
#
# Returns:
#   list with `n_mi` (number of kept rows) and `genes` (their geneID values,
#   ordered by decreasing `mi_est`).
mi_meta <- function(df, pct_max_thresh) {
    # Work on the `df` argument rather than the notebook-level results table,
    # so the summary always reflects what the caller passed in.
    ord_df <- df %>%
        dplyr::arrange(dplyr::desc(mi_est)) %>%
        # After the descending sort the first row holds the largest estimate.
        dplyr::mutate(pct_max = mi_est / dplyr::first(mi_est) * 100) %>%
        dplyr::filter(pct_max > pct_max_thresh)
    list(
        n_mi = nrow(ord_df),
        genes = ord_df$geneID
    )
}

# Summarise an L1 logistic-regression run against two baseline scores.
#
# Args:
#   score_df: data frame with a numeric `ref_score` column.
#   res_df: data frame with `geneID` and `consensus` columns.
#   baseline_df: list-like object with numeric `naive` and `mc` entries.
#
# Returns:
#   list with `avg_score` (mean of ref_score), `naive_pct_imp` and
#   `mc_pct_imp` (percent change of avg_score relative to each baseline),
#   `n_genes` and `genes` (geneID values of rows where consensus is TRUE).
lr_l1_meta <- function(score_df, res_df, baseline_df) {
    avg_score <- mean(score_df$ref_score)
    # Percent change of the average score relative to one baseline value.
    pct_over <- function(base) (avg_score - base) / base * 100

    kept_rows <- dplyr::filter(res_df, consensus == TRUE)
    consensus_genes <- dplyr::pull(kept_rows, geneID)

    list(
        avg_score = avg_score,
        naive_pct_imp = pct_over(baseline_df$naive),
        mc_pct_imp = pct_over(baseline_df$mc),
        n_genes = length(consensus_genes),
        genes = consensus_genes
    )
}


[36m--[39m [1m[1mColumn specification[1m[22m [36m------------------------------------------------------------------------------------------------------------------------[39m
cols(
  Division = [31mcol_character()[39m,
  Category = [31mcol_character()[39m,
  `Gene Symbol` = [31mcol_character()[39m,
  `Gene Name` = [31mcol_character()[39m,
  Synonyms = [31mcol_character()[39m,
  HGNC_IDs = [32mcol_double()[39m,
  `HGNC_IDs Links` = [32mcol_double()[39m,
  UniProt_IDs = [31mcol_character()[39m,
  Refseq_IDs = [31mcol_character()[39m,
  Orthology = [31mcol_character()[39m,
  Notes = [31mcol_character()[39m
)




# Data

In [6]:
# Load every input table for the dataset selected by `dset_idx`.
# The dataset name and the shared feature-selection path prefix are computed
# once; the resulting path strings are the same as when built inline.
dset_name <- unified_dsets[dset_idx]
fs_prefix <- paste0(dirs$analysis_dir, "/feature_selection/", dset_name)

# Normalised counts for all genes, and the subset restricted to matrisome genes.
norm_counts_df <- read_tsv(paste0(dirs$data_dir, "/", dset_name, "/", "norm_counts.tsv"))
m_norm_counts_df <- norm_counts_df %>%
    dplyr::filter(geneID %in% matrisome_df$gene_symbol)
# DESeq results; columns renamed to the snake_case names that deg_meta() reads.
# rename() is namespace-qualified like the other dplyr verbs in this notebook
# so it cannot be masked by another attached package.
deseq_results_df <- read_tsv(paste0(dirs$analysis_dir, "/deg/", dset_name, "_DESeq_results.tsv")) %>%
    dplyr::rename(base_mean = baseMean, lfc = log2FoldChange)
welch_anova_results_df <- read_tsv(paste0(fs_prefix, "_welch_anova_results.tsv"))
# The median MI estimate is exposed as `mi_est`, the column mi_meta() reads.
mi_figo_results_df <- read_tsv(paste0(fs_prefix, "_MI_figo_results.tsv")) %>%
    dplyr::rename(mi_est = MI_est_median)
cls_baselines_df <- read_tsv(paste0(dirs$analysis_dir, "/meta/", "cls_baselines.tsv"))
f1_l1_lr_scores_df <- read_tsv(paste0(fs_prefix, "_l1_lr_ref_scores.tsv"))
# Flag consensus genes as soon as the L1 logistic-regression results are read.
f1_l1_lr_results_df <- read_tsv(paste0(fs_prefix, "_l1_lr_results.tsv")) %>%
    get_consensus_col(n = consensus_n, thresh = consensus_thresh)


[36m--[39m [1m[1mColumn specification[1m[22m [36m------------------------------------------------------------------------------------------------------------------------[39m
cols(
  .default = col_double(),
  geneID = [31mcol_character()[39m
)
[36mi[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m for the full column specifications.



[36m--[39m [1m[1mColumn specification[1m[22m [36m------------------------------------------------------------------------------------------------------------------------[39m
cols(
  geneID = [31mcol_character()[39m,
  baseMean = [32mcol_double()[39m,
  log2FoldChange = [32mcol_double()[39m,
  lfcSE = [32mcol_double()[39m,
  stat = [32mcol_double()[39m,
  pvalue = [32mcol_double()[39m,
  padj = [32mcol_double()[39m,
  qval = [32mcol_double()[39m
)



[36m--[39m [1m[1mColumn specification[1m[22m [36m-----------------------------------------------------------------------------------------------------------

# DGE analysis

## DEG

In [7]:
# DEG summary over all genes; the proportion is taken relative to the number
# of rows in the normalised counts table.
deg_meta_ls <- deg_meta(
    df = deseq_results_df,
    lfc_thresh = lfc_thresh,
    q_thresh = q_deg_thresh,
    n = nrow(norm_counts_df)
)
# Display the first four entries only (the fifth holds the gene vector).
deg_meta_ls[1:4]

## DEMG

In [8]:
# Restrict the DESeq results to matrisome genes, then summarise them with the
# same thresholds; the proportion is relative to the matrisome counts table.
m_deseq_results_df <- dplyr::filter(
    deseq_results_df,
    geneID %in% matrisome_df$gene_symbol
)
demg_meta_ls <- deg_meta(
    df = m_deseq_results_df,
    lfc_thresh = lfc_thresh,
    q_thresh = q_deg_thresh,
    n = nrow(m_norm_counts_df)
)
# Display the first four entries only (the fifth holds the gene vector).
demg_meta_ls[1:4]

# Uni/multivariate analysis

## Welch ANOVA

In [9]:
# Genes whose Welch ANOVA q-value is below the ANOVA cutoff.
anova_meta_ls <- simple_test_meta(
    df = welch_anova_results_df,
    q_thresh = q_anova_thresh
)
anova_meta_ls

## MI

In [10]:
# Genes whose MI estimate is above 50 percent of the largest estimate.
mi_meta_ls <- mi_meta(
    df = mi_figo_results_df,
    pct_max_thresh = 50
)
mi_meta_ls

# ML baselines

In [11]:
# Take this dataset's row of classifier baselines as a named list.
# NOTE(review): the row is picked by position, which presumes the rows of
# cls_baselines.tsv follow the same order as `unified_dsets` -- confirm.
cls_baselines <- setNames(
    as.list(cls_baselines_df[dset_idx, ]),
    c("dataset", "naive", "mc", "n_obs")
)
# Convenience copies of the two baseline scores.
f1_macro_mc_baseline <- cls_baselines$mc
f1_macro_majority_baseline <- cls_baselines$naive
cls_baselines

# LR (L1)

In [12]:
# Summarise the L1 logistic-regression run against this dataset's baselines.
lr_l1_meta_ls <- lr_l1_meta(
    score_df = f1_l1_lr_scores_df,
    res_df = f1_l1_lr_results_df,
    baseline_df = cls_baselines
)
# Display the first four entries only (the fifth holds the gene vector).
lr_l1_meta_ls[1:4]

In [13]:
# Gene list stored on disk for this dataset.
umsmg_gold <- read_lines(paste0(dirs$analysis_dir, "/gene_lists/", unified_dsets[dset_idx], "_umsmg_list.txt"))
# Union of the genes selected by L1 logistic regression, MI and Welch ANOVA,
# folded left to right so the element order matches nested union() calls.
umsmg_new <- Reduce(
    union,
    list(lr_l1_meta_ls$genes, mi_meta_ls$genes, anova_meta_ls$genes)
)
# Sizes of the stored list, the new list, and their overlap.
length(umsmg_gold)
length(umsmg_new)
length(intersect(umsmg_new, umsmg_gold))