In [217]:
library(tidyverse)
library(clusterProfiler)
library(WGCNA)
library(parallel)

# Custom package
library(rutils)

In [218]:
# Project directories (used below as `dirs$data_dir` and `dirs$analysis_dir`),
# obtained from rutils using the dev-paths file.
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")

# Names of the unified datasets; `dset_idx` selects one of them by position.
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)

# Path of the matrisome master list under the data directory.
matrisome_path <- paste0(dirs$data_dir, "/matrisome/matrisome_hs_masterlist.tsv")

# Maps the labels "Alive"/"Dead" to 0/1.
event_code <- list(Alive = 0, Dead = 1)

In [219]:
# Position in `unified_dsets` of the dataset to process (R is 1-indexed, so 3
# selects "unified_uterine_endometrial_data").
dset_idx <- 3

In [220]:
# Restore previously saved objects for the selected dataset into the current
# environment. `load()` returns the names of the objects it restored, so after
# both calls `lnames` holds the names from the second (network) file only.
# NOTE(review): presumably these files supply `data_expr`, `module_colors` and
# `soft_power`, which are used later for hub-gene detection -- confirm.
lnames <- load(
    file = paste0(
        dirs$data_dir, "/saved_network_objects/",
        unified_dsets[dset_idx], "_tumor_data.RData"
    )
)
lnames <- load(
    file = paste0(
        dirs$data_dir, "/saved_network_objects/",
        unified_dsets[dset_idx], "_tumor_network.RData"
    )
)

In [221]:
# Minimum absolute log2 fold change for the DGE filter (log2(2) == 1).
lfc_thresh <- log2(2)
# Significance cutoff compared against q-values / p-values in the filters below.
pval_thresh <- 0.05
# A `mean*` column of the L1-LR results counts as a vote when it exceeds this.
perm_thresh <- 0
# Minimum number of votes a gene needs to pass the L1-LR filter.
vote_thresh <- 5
# Passed to get_most_conn_genes() as `conn_vs_hub_thresh`; also embedded in the
# file names of the hub-dependent gene lists written at the end.
# Alternative value, left commented out:
# hub_con_thresh <- 0.5
hub_con_thresh <- 0.25

In [222]:
# Gene symbol -> Entrez ID lookup for the selected dataset (columns
# `hugo_symbol` and `entrez_gene_id`); used to annotate every filtered table.
gene2id_df <- read_tsv(paste0(dirs$data_dir, "/", unified_dsets[dset_idx], "/hugo2entrez.tsv"))
# Matrisome master list. Reuse `matrisome_path` (defined alongside `dirs`)
# rather than rebuilding the identical path string a second time, so the
# location is maintained in exactly one place.
matrisome_df <- load_matrisome_df(matrisome_path)


[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  hugo_symbol = [31mcol_character()[39m,
  entrez_gene_id = [32mcol_double()[39m
)



[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  Division = [31mcol_character()[39m,
  Category = [31mcol_character()[39m,
  `Gene Symbol` = [31mcol_character()[39m,
  `Gene Name` = [31mcol_character()[39m,
  Synonyms = [31mcol_character()[39m,
  HGNC_IDs = [32mcol_double()[39m,
  `HGNC_IDs Links` = [32mcol_double()[39m,
  UniProt_IDs = [31mcol_character()[39m,
  Refseq_IDs = [31mcol_character()[39m,
  Orthology = [31mcol_character()[39m,
  Notes = [31mcol_character()

# Load results data

In [223]:
# Per-analysis result tables for the selected dataset, all read from
# sub-directories of `dirs$analysis_dir`.

# Differential expression (DESeq) results.
dge_df <- read_tsv(
    paste0(dirs$analysis_dir, "/deg/", unified_dsets[dset_idx], "_DESeq_results.tsv")
)

# Feature-selection results: L1 logistic regression and Welch ANOVA.
lr_df <- read_tsv(
    paste0(dirs$analysis_dir, "/feature_selection/", unified_dsets[dset_idx], "_l1_lr_results.tsv")
)
anova_df <- read_tsv(
    paste0(dirs$analysis_dir, "/feature_selection/", unified_dsets[dset_idx], "_welch_anova_results.tsv")
)

# Network results: per-gene module membership and per-module eigengene traits.
network_mm_gs_df <- read_tsv(
    paste0(dirs$analysis_dir, "/network/", unified_dsets[dset_idx], "_gene_mm_gs.tsv")
)
network_me_sig_df <- read_tsv(
    paste0(dirs$analysis_dir, "/network/", unified_dsets[dset_idx], "_eigengene_traits.tsv")
)

# Condense the eigengene table, then prefix every column whose name does not
# start with "module" with "me_" so the columns stay distinguishable after
# joining onto gene-level tables.
condensed_me_df <- network_me_sig_df %>%
    condense_figo(include_pvals = TRUE) %>%
    dplyr::rename_if(
        !startsWith(colnames(.), "module"),
        ~ gsub("^", "me_", .)
    )

# Remaining feature-selection results: Cox PH, correlation, mutual information.
coxph_df <- read_tsv(
    paste0(dirs$analysis_dir, "/feature_selection/", unified_dsets[dset_idx], "_coxph_results.tsv")
)
cor_df <- read_tsv(
    paste0(dirs$analysis_dir, "/feature_selection/", unified_dsets[dset_idx], "_cor_results.tsv")
)
mi_survival_df <- read_tsv(
    paste0(dirs$analysis_dir, "/feature_selection/", unified_dsets[dset_idx], "_MI_survival_results.tsv")
)
mi_figo_df <- read_tsv(
    paste0(dirs$analysis_dir, "/feature_selection/", unified_dsets[dset_idx], "_MI_figo_results.tsv")
)


[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  geneID = [31mcol_character()[39m,
  baseMean = [32mcol_double()[39m,
  log2FoldChange = [32mcol_double()[39m,
  lfcSE = [32mcol_double()[39m,
  stat = [32mcol_double()[39m,
  pvalue = [32mcol_double()[39m,
  padj = [32mcol_double()[39m,
  qval = [32mcol_double()[39m
)



[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  geneID = [31mcol_character()[39m,
  mean_imp_0 = [32mcol_double()[39m,
  score_pct_improvement_0 = [32mcol_double()[39m,
  mean_imp_1 = [32mcol_double()[39m,
  score_pct_improvement_1 = [32mcol_double()[39m,
  mean_imp_2 = [32mcol_dou

# WGCNA: Identify "hub" genes

In [224]:
# Collect the most connected ("hub") genes into one table.
# bind_rows() stacks the elements returned by get_most_conn_genes() and stores
# each element's name in a `module` column; `geneID` is then moved to the front.
# NOTE(review): presumably the result is a list named by module -- confirm.
hub_df <- dplyr::select(
    bind_rows(
        get_most_conn_genes(
            data_expr,
            module_colors,
            soft_power,
            conn_vs_hub_thresh = hub_con_thresh
        ),
        .id = "module"
    ),
    geneID, everything()
)

# DGE

In [225]:
# Differentially expressed genes: significant q-value and an absolute log2 fold
# change above `lfc_thresh`, annotated with Entrez IDs.
# The q-value cutoff uses `pval_thresh` (previously a hard-coded 0.05) so that
# this filter follows the same threshold as every other filter in the script.
filtered_dge_df <- dge_df %>%
    dplyr::filter(qval < pval_thresh, abs(log2FoldChange) > lfc_thresh) %>%
    dplyr::rename(lfc = log2FoldChange) %>%
    # Inner join drops genes that have no entry in the symbol -> Entrez lookup.
    inner_join(gene2id_df, by = c("geneID" = "hugo_symbol")) %>%
    dplyr::select(geneID, entrez_gene_id, lfc, padj, qval)

In [226]:
# Restrict the DGE hits to genes listed in the matrisome table.
filtered_dge_m_df <- dplyr::filter(
    filtered_dge_df,
    geneID %in% matrisome_df$gene_symbol
)

In [227]:
# Gene symbol vectors for all DGE hits and for the matrisome-only subset.
dge_geneIDs <- filtered_dge_df[["geneID"]]
dge_m_geneIDs <- filtered_dge_m_df[["geneID"]]

# FIGO

In [228]:
# L1 logistic regression: keep `geneID` plus the columns whose names start with
# "mean" (case-sensitive), count per gene how many of those columns exceed
# `perm_thresh`, and keep genes with at least `vote_thresh` such votes.
filtered_lr_df <- lr_df %>%
    dplyr::select(geneID, starts_with("mean", ignore.case = FALSE)) %>%
    # `.[, -1]` drops the leading geneID column so only the "mean" columns vote.
    dplyr::mutate(votes = rowSums(.[, -1] > perm_thresh)) %>%
    dplyr::filter(votes >= vote_thresh) %>%
    inner_join(gene2id_df, by = c(geneID = "hugo_symbol")) %>%
    dplyr::select(geneID, entrez_gene_id, everything())

In [229]:
# Welch ANOVA: genes with q-value below `pval_thresh`, annotated with Entrez
# IDs and with the two ID columns moved to the front.
filtered_anova_df <- dplyr::select(
    inner_join(
        dplyr::filter(anova_df, qval < pval_thresh),
        gene2id_df,
        by = c(geneID = "hugo_symbol")
    ),
    geneID, entrez_gene_id, everything()
)

In [230]:
# Mutual information (FIGO): rank genes by median MI estimate and express each
# gene's estimate as a percent change relative to the top-ranked gene. Genes
# whose estimate dropped by less than 50% are kept (i.e. more than half of the
# top value, assuming the top value is positive).
filtered_mi_figo_df <- mi_figo_df %>%
    dplyr::arrange(desc(MI_est_median)) %>%
    dplyr::mutate(
        pct_delta_max = (MI_est_median - first(MI_est_median)) / first(MI_est_median) * 100
    ) %>%
    dplyr::filter(pct_delta_max > -50) %>%
    inner_join(gene2id_df, by = c(geneID = "hugo_symbol")) %>%
    dplyr::select(geneID, entrez_gene_id, everything())

In [231]:
# Network-derived FIGO genes: attach the condensed module-level statistics and
# Entrez IDs to each gene's module membership, then keep genes that satisfy all
# three conditions below.
filtered_figo_network_df <- network_mm_gs_df %>%
    dplyr::select(geneID, module, mm_pval, mm_cor) %>%
    inner_join(condensed_me_df, by = "module") %>%
    inner_join(gene2id_df, by = c(geneID = "hugo_symbol")) %>%
    dplyr::select(geneID, entrez_gene_id, everything()) %>%
    dplyr::filter(
        # The gene's module passes the FIGO q-value cutoff
        me_figo_min_qval < pval_thresh,
        # The gene is a significant member of the module
        mm_pval < pval_thresh,
        # The gene is highly connected within the module
        geneID %in% hub_df$geneID
    )

In [232]:
# Union of the FIGO-associated genes found by any of the four approaches
# (L1-LR, Welch ANOVA, network, mutual information), folded left to right,
# once by Entrez ID and once by gene symbol.
figo_entrezIDs <- Reduce(
    union,
    list(
        filtered_lr_df$entrez_gene_id,
        filtered_anova_df$entrez_gene_id,
        filtered_figo_network_df$entrez_gene_id,
        filtered_mi_figo_df$entrez_gene_id
    )
)

figo_geneIDs <- Reduce(
    union,
    list(
        filtered_lr_df$geneID,
        filtered_anova_df$geneID,
        filtered_figo_network_df$geneID,
        filtered_mi_figo_df$geneID
    )
)

In [233]:
# Sanity check: the symbol union and the Entrez ID union should be the same
# size. Then report how many FIGO genes were collected.
length(figo_geneIDs) == length(figo_entrezIDs)
length(figo_geneIDs)

In [234]:
# FIGO genes from the non-network approaches only (L1-LR, Welch ANOVA, mutual
# information), folded left to right.
univar_figo_list <- Reduce(
    union,
    list(
        filtered_lr_df$geneID,
        filtered_anova_df$geneID,
        filtered_mi_figo_df$geneID
    )
)

# Survival

In [235]:
# Cox PH: annotate with Entrez IDs, then keep genes whose `gene_qval` is below
# `pval_thresh`.
filtered_coxph_df <- dplyr::filter(
    inner_join(coxph_df, gene2id_df, by = c(geneID = "hugo_symbol")),
    gene_qval < pval_thresh
)

In [236]:
# Correlation results: annotate with Entrez IDs, then keep genes whose q-value
# is below `pval_thresh`.
filtered_cor_df <- dplyr::filter(
    inner_join(cor_df, gene2id_df, by = c(geneID = "hugo_symbol")),
    qval < pval_thresh
)

In [237]:
# Network-derived survival genes: attach the condensed module-level statistics
# and Entrez IDs to each gene's module membership, then keep genes that satisfy
# all three conditions below.
filtered_coxph_network_df <- network_mm_gs_df %>%
    dplyr::select(geneID, module, mm_pval, mm_cor) %>%
    inner_join(condensed_me_df, by = "module") %>%
    inner_join(gene2id_df, by = c(geneID = "hugo_symbol")) %>%
    dplyr::select(geneID, entrez_gene_id, everything()) %>%
    dplyr::filter(
        # The gene's module passes the vital-status q-value cutoff
        me_vital_qval < pval_thresh,
        # The gene is a significant member of the module
        mm_pval < pval_thresh,
        # The gene is highly connected within the module
        geneID %in% hub_df$geneID
    )

In [238]:
# Mutual information (survival): rank genes by median MI estimate and express
# each gene's estimate as a percent change relative to the top-ranked gene.
# Genes whose estimate dropped by less than 50% are kept (i.e. more than half
# of the top value, assuming the top value is positive).
filtered_mi_survival_df <- mi_survival_df %>%
    dplyr::arrange(desc(MI_est_median)) %>%
    dplyr::mutate(
        pct_delta_max = (MI_est_median - first(MI_est_median)) / first(MI_est_median) * 100
    ) %>%
    dplyr::filter(pct_delta_max > -50) %>%
    inner_join(gene2id_df, by = c(geneID = "hugo_symbol")) %>%
    dplyr::select(geneID, entrez_gene_id, everything())

In [239]:
# Union of the survival-associated genes found by any of the four approaches
# (Cox PH, correlation, network, mutual information), folded left to right,
# once by Entrez ID and once by gene symbol.
survival_entrezIDs <- Reduce(
    union,
    list(
        filtered_coxph_df$entrez_gene_id,
        filtered_cor_df$entrez_gene_id,
        filtered_coxph_network_df$entrez_gene_id,
        filtered_mi_survival_df$entrez_gene_id
    )
)

survival_geneIDs <- Reduce(
    union,
    list(
        filtered_coxph_df$geneID,
        filtered_cor_df$geneID,
        filtered_coxph_network_df$geneID,
        filtered_mi_survival_df$geneID
    )
)

In [240]:
# Sanity check: the Entrez ID union and the symbol union should be the same
# size. The check previously compared `survival_geneIDs` with itself, which is
# always TRUE and could never flag a mismatch.
length(survival_entrezIDs) == length(survival_geneIDs)
# Number of survival-associated genes collected.
length(survival_geneIDs)

In [241]:
# Report the size of each gene list before saving: all DGE genes, the
# matrisome-only DGE subset, survival-associated genes and FIGO-associated
# genes. Each top-level expression is printed separately by the notebook.
length(dge_geneIDs)
length(dge_m_geneIDs)
length(survival_geneIDs)
length(figo_geneIDs)

# Save results

In [242]:
# Write one gene symbol per line into <analysis_dir>/gene_lists/. The FIGO and
# survival lists depend on the hub threshold, so `hub_con_thresh` is part of
# their file names.
write_lines(
    dge_geneIDs,
    paste0(dirs$analysis_dir, "/gene_lists/", unified_dsets[dset_idx], "_dge_gene_list.txt")
)
write_lines(
    dge_m_geneIDs,
    paste0(dirs$analysis_dir, "/gene_lists/", unified_dsets[dset_idx], "_dge_m_gene_list.txt")
)
write_lines(
    figo_geneIDs,
    paste0(dirs$analysis_dir, "/gene_lists/", unified_dsets[dset_idx], "_figo_hub_thresh_", hub_con_thresh, "_gene_list.txt")
)
write_lines(
    survival_geneIDs,
    paste0(dirs$analysis_dir, "/gene_lists/", unified_dsets[dset_idx], "_survival_hub_thresh_", hub_con_thresh, "_gene_list.txt")
)

In [243]:
# Also save the two components of the FIGO list separately: the network-derived
# genes (hub-threshold dependent) and the genes from the non-network approaches.
write_lines(
    filtered_figo_network_df$geneID,
    paste0(dirs$analysis_dir, "/gene_lists/", unified_dsets[dset_idx], "_figo_network_hub_thresh_", hub_con_thresh, "_gene_list.txt")
)
write_lines(
    univar_figo_list,
    paste0(dirs$analysis_dir, "/gene_lists/", unified_dsets[dset_idx], "_figo_univar_gene_list.txt")
)