In [1]:
library(tidyverse)
library(clusterProfiler)
library(WGCNA)
library(parallel)

# Custom package
library(rutils)

-- Attaching packages ------------------------------------------------------------------------------- tidyverse 1.3.0 --

v ggplot2 3.3.3     v purrr   0.3.4
v tibble  3.0.6     v dplyr   1.0.4
v tidyr   1.1.2     v stringr 1.4.0
v readr   1.4.0     v forcats 0.5.1

-- Conflicts ---------------------------------------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()



clusterProfiler v3.18.0  For help: https://guangchuangyu.github.io/software/clusterProfiler

If you use clusterProfiler in published research, please cite:
Guangchuang Yu, Li-Gen Wang, Yanyan Han, Qing-Yu He. clusterProfiler: an R package for comparing biological themes amo

In [2]:
# Project directory layout, read from the shared dev-paths file
# (`dirs$data_dir` and `dirs$analysis_dir` are used throughout this notebook).
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")

# Names of the unified datasets; `dset_idx` selects the one to process.
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)

# Location of the matrisome master list inside the data directory.
matrisome_path <- paste0(
    dirs$data_dir,
    "/matrisome/matrisome_hs_masterlist.tsv"
)

In [3]:
# Index into `unified_dsets` selecting the dataset processed by this notebook.
dset_idx <- 3
# Fail fast on a bad index: an out-of-range value would turn
# `unified_dsets[dset_idx]` into NA and silently build paths such as
# ".../NA_tumor_data.RData".
stopifnot(
    length(dset_idx) == 1,
    dset_idx %in% seq_along(unified_dsets)
)

In [4]:
# Restore the saved tumor data objects for the selected dataset. load() puts
# the stored objects into the current environment and returns their names.
# NOTE(review): `data_expr`, `module_colors` and `soft_power`, used further
# down, presumably come from these two files -- confirm against their contents.
lnames <- load(
    file = paste0(
        dirs$data_dir, "/saved_network_objects/",
        unified_dsets[dset_idx], "_tumor_data.RData"
    )
)
# Restore the saved network objects; `lnames` is overwritten, so afterwards it
# only lists the objects that came from this second file.
lnames <- load(
    file = paste0(
        dirs$data_dir, "/saved_network_objects/",
        unified_dsets[dset_idx], "_tumor_network.RData"
    )
)

In [5]:
# Significance cutoff shared by the q-value / p-value filters below.
pval_thresh <- 0.05

# Differential expression: minimum absolute log2 fold change
# (log2(2) = 1, i.e. at least a two-fold change).
lfc_thresh <- log2(2)

# L1 logistic regression voting: a `mean_imp_*` column casts a vote for a gene
# when its value exceeds `perm_thresh`; a gene needs `vote_thresh` votes.
perm_thresh <- 0
vote_thresh <- 5

# Passed to get_most_conn_genes() as `conn_vs_hub_thresh`.
# Values tried previously: 0.75, 0.5, 0.25.
hub_con_thresh <- 0

In [6]:
# Gene symbol -> Entrez ID lookup for the selected dataset
# (columns: hugo_symbol, entrez_gene_id).
gene2id_df <- read_tsv(paste0(dirs$data_dir, "/", unified_dsets[dset_idx], "/hugo2entrez.tsv"))
# Matrisome master list. Reuses `matrisome_path` (defined with the other paths)
# instead of rebuilding the same string, so the location lives in one place.
matrisome_df <- load_matrisome_df(matrisome_path)


[36m--[39m [1m[1mColumn specification[1m[22m [36m------------------------------------------------------------------------------------------------[39m
cols(
  hugo_symbol = [31mcol_character()[39m,
  entrez_gene_id = [32mcol_double()[39m
)



[36m--[39m [1m[1mColumn specification[1m[22m [36m------------------------------------------------------------------------------------------------[39m
cols(
  Division = [31mcol_character()[39m,
  Category = [31mcol_character()[39m,
  `Gene Symbol` = [31mcol_character()[39m,
  `Gene Name` = [31mcol_character()[39m,
  Synonyms = [31mcol_character()[39m,
  HGNC_IDs = [32mcol_double()[39m,
  `HGNC_IDs Links` = [32mcol_double()[39m,
  UniProt_IDs = [31mcol_character()[39m,
  Refseq_IDs = [31mcol_character()[39m,
  Orthology = [31mcol_character()[39m,
  Notes = [31mcol_character()[39m
)




# Load results data

In [7]:
# Per-dataset path prefixes of the upstream result files; every file name is
# "<analysis_dir>/<step>/<dataset>_<suffix>.tsv".
deg_prefix <- paste0(dirs$analysis_dir, "/deg/", unified_dsets[dset_idx])
fs_prefix <- paste0(dirs$analysis_dir, "/feature_selection/", unified_dsets[dset_idx])
network_prefix <- paste0(dirs$analysis_dir, "/network/", unified_dsets[dset_idx])

# Differential expression and FIGO feature-selection results.
dge_df <- read_tsv(paste0(deg_prefix, "_DESeq_results.tsv"))
lr_df <- read_tsv(paste0(fs_prefix, "_l1_lr_results.tsv"))
anova_df <- read_tsv(paste0(fs_prefix, "_welch_anova_results.tsv"))

# Network results: per-gene module membership and per-module eigengene/trait table.
network_mm_gs_df <- read_tsv(paste0(network_prefix, "_gene_mm_gs.tsv"))
network_me_sig_df <- read_tsv(paste0(network_prefix, "_eigengene_traits.tsv"))
# Condense the eigengene/trait table and prefix every column except the
# `module*` ones with "me_", so they stay distinguishable after joining onto
# per-gene tables. rename_with() replaces the superseded rename_if(); the
# case-sensitive match mirrors the original startsWith() test.
# NOTE(review): condense_figo() is a project helper; the `me_figo_min_qval` and
# `me_vital_qval` columns used later presumably originate there -- confirm.
condensed_me_df <- network_me_sig_df %>%
    condense_figo(include_pvals = TRUE) %>%
    dplyr::rename_with(
        ~ paste0("me_", .x),
        .cols = !starts_with("module", ignore.case = FALSE)
    )

# Survival feature-selection results and the FIGO mutual-information results.
coxph_df <- read_tsv(paste0(fs_prefix, "_coxph_results.tsv"))
cor_df <- read_tsv(paste0(fs_prefix, "_cor_results.tsv"))
mi_survival_df <- read_tsv(paste0(fs_prefix, "_MI_survival_results.tsv"))
mi_figo_df <- read_tsv(paste0(fs_prefix, "_MI_figo_results.tsv"))


[36m--[39m [1m[1mColumn specification[1m[22m [36m------------------------------------------------------------------------------------------------[39m
cols(
  geneID = [31mcol_character()[39m,
  baseMean = [32mcol_double()[39m,
  log2FoldChange = [32mcol_double()[39m,
  lfcSE = [32mcol_double()[39m,
  stat = [32mcol_double()[39m,
  pvalue = [32mcol_double()[39m,
  padj = [32mcol_double()[39m,
  qval = [32mcol_double()[39m
)



[36m--[39m [1m[1mColumn specification[1m[22m [36m------------------------------------------------------------------------------------------------[39m
cols(
  geneID = [31mcol_character()[39m,
  mean_imp_0 = [32mcol_double()[39m,
  score_pct_improvement_0 = [32mcol_double()[39m,
  mean_imp_1 = [32mcol_double()[39m,
  score_pct_improvement_1 = [32mcol_double()[39m,
  mean_imp_2 = [32mcol_double()[39m,
  score_pct_improvement_2 = [32mcol_double()[39m,
  mean_imp_3 = [32mcol_double()[39m,
  score_pct_improvement_3 = [3

# WGCNA: Identify "hub" genes

In [8]:
# Collect the hub genes into one table. bind_rows(.id = "module") stacks the
# elements returned by get_most_conn_genes() and stores each element's name in
# a `module` column; `geneID` is then moved to the front.
# NOTE(review): get_most_conn_genes() is a project helper, presumably returning
# one data frame per module -- confirm.
hub_df <- bind_rows(
    get_most_conn_genes(
        data_expr,
        module_colors,
        soft_power,
        conn_vs_hub_thresh = hub_con_thresh
    ),
    .id = "module"
) %>%
    dplyr::select(geneID, everything())
# Number of hub genes retained at the current threshold.
nrow(hub_df)

# DGE

In [9]:
# Differentially expressed genes: q-value below the shared significance cutoff
# and absolute log2 fold change above `lfc_thresh`. The cutoff now comes from
# `pval_thresh` (previously a hard-coded 0.05) so it cannot drift away from the
# other filters in this notebook.
filtered_dge_df <- dge_df %>%
    dplyr::filter(
        qval < pval_thresh,
        abs(log2FoldChange) > lfc_thresh
    ) %>%
    dplyr::rename(lfc = log2FoldChange) %>%
    # Attach Entrez IDs; genes without a mapping are dropped by the inner join.
    inner_join(gene2id_df, by = c("geneID" = "hugo_symbol")) %>%
    dplyr::select(geneID, entrez_gene_id, lfc, padj, qval)

In [10]:
# Restrict the differentially expressed genes to those whose symbol appears
# in the matrisome table.
filtered_dge_m_df <- dplyr::filter(
    filtered_dge_df,
    geneID %in% matrisome_df$gene_symbol
)

In [11]:
# Gene symbol lists: all differentially expressed genes, and the matrisome subset.
# unique() guards against repeated symbols: the join onto `gene2id_df` would
# duplicate a gene if its symbol mapped to more than one Entrez ID, which would
# inflate the reported counts. The lists built with union()/intersect() further
# down are de-duplicated already, so this keeps all lists consistent.
deg_list <- unique(filtered_dge_df$geneID)
demg_list <- unique(filtered_dge_m_df$geneID)

# FIGO

In [12]:
# L1 logistic regression: keep `geneID` plus the `mean*` columns, count per gene
# how many of those columns exceed `perm_thresh`, and keep genes with at least
# `vote_thresh` such votes.
# Columns are selected by name: starts_with() replaces the superseded one_of(),
# with a case-sensitive match mirroring the original startsWith() test, and the
# vote count no longer depends on `geneID` being the first column.
filtered_lr_df <- lr_df %>%
    dplyr::select(geneID, starts_with("mean", ignore.case = FALSE)) %>%
    dplyr::mutate(votes = rowSums(across(-geneID) > perm_thresh)) %>%
    dplyr::filter(votes >= vote_thresh) %>%
    # Attach Entrez IDs; genes without a mapping are dropped by the inner join.
    inner_join(gene2id_df, by = c("geneID" = "hugo_symbol")) %>%
    dplyr::select(geneID, entrez_gene_id, everything())

In [13]:
# Welch ANOVA: genes whose q-value passes the significance cutoff, annotated
# with Entrez IDs (genes without a mapping are dropped by the inner join).
filtered_anova_df <- dplyr::filter(anova_df, qval < pval_thresh) %>%
    inner_join(gene2id_df, by = c("geneID" = "hugo_symbol")) %>%
    dplyr::select(geneID, entrez_gene_id, everything())

In [14]:
# Mutual information with FIGO: rank genes by median MI estimate, express each
# gene's value as a percent change relative to the top-ranked gene, and keep
# those whose value is no more than 50% below the top.
filtered_mi_figo_df <- dplyr::arrange(mi_figo_df, desc(MI_est_median)) %>%
    dplyr::mutate(
        pct_delta_max = (MI_est_median - first(MI_est_median)) / first(MI_est_median) * 100
    ) %>%
    dplyr::filter(pct_delta_max > -50) %>%
    # Attach Entrez IDs; genes without a mapping are dropped by the inner join.
    inner_join(gene2id_df, by = c("geneID" = "hugo_symbol")) %>%
    dplyr::select(geneID, entrez_gene_id, everything())

In [15]:
# Network-derived FIGO genes: annotate each gene's module membership with the
# condensed eigengene statistics of its module and with its Entrez ID, then
# apply three conditions at once.
filtered_network_figo_df <- dplyr::select(network_mm_gs_df, geneID, module, mm_pval, mm_cor) %>%
    inner_join(condensed_me_df, by = "module") %>%
    inner_join(gene2id_df, by = c("geneID" = "hugo_symbol")) %>%
    dplyr::select(geneID, entrez_gene_id, everything()) %>%
    dplyr::filter(
        # The module's eigengene FIGO q-value passes the cutoff
        me_figo_min_qval < pval_thresh,
        # The gene is a significant member of the module
        mm_pval < pval_thresh,
        # The gene is among the hub genes (highly connected within the module)
        geneID %in% hub_df$geneID
    )

In [16]:
# Genes selected by any of the per-gene FIGO methods
# (L1 logistic regression, Welch ANOVA, mutual information).
umsmg_list <- Reduce(
    union,
    list(
        filtered_lr_df$geneID,
        filtered_anova_df$geneID,
        filtered_mi_figo_df$geneID
    )
)
# Genes selected through the network analysis. unique() keeps this list
# duplicate-free like the union-built list above; the join onto `gene2id_df`
# would otherwise repeat a symbol that mapped to more than one Entrez ID.
nsmg_list <- unique(filtered_network_figo_df$geneID)

# Survival

In [17]:
# Cox proportional hazards results: annotate with Entrez IDs (genes without a
# mapping are dropped) and keep genes whose `gene_qval` passes the cutoff.
filtered_coxph_df <- inner_join(
    coxph_df,
    gene2id_df,
    by = c("geneID" = "hugo_symbol")
) %>%
    dplyr::filter(gene_qval < pval_thresh)

In [18]:
# Correlation results: annotate with Entrez IDs (genes without a mapping are
# dropped) and keep genes whose q-value passes the cutoff.
filtered_cor_df <- inner_join(
    cor_df,
    gene2id_df,
    by = c("geneID" = "hugo_symbol")
) %>%
    dplyr::filter(qval < pval_thresh)

In [19]:
# Network-derived survival genes: annotate each gene's module membership with
# the condensed eigengene statistics of its module and with its Entrez ID,
# then apply three conditions at once.
filtered_coxph_network_df <- dplyr::select(network_mm_gs_df, geneID, module, mm_pval, mm_cor) %>%
    inner_join(condensed_me_df, by = "module") %>%
    inner_join(gene2id_df, by = c("geneID" = "hugo_symbol")) %>%
    dplyr::select(geneID, entrez_gene_id, everything()) %>%
    dplyr::filter(
        # The module's eigengene `vital` q-value passes the cutoff
        me_vital_qval < pval_thresh,
        # The gene is a significant member of the module
        mm_pval < pval_thresh,
        # The gene is among the hub genes (highly connected within the module)
        geneID %in% hub_df$geneID
    )

In [20]:
# Mutual information with survival: rank genes by median MI estimate, express
# each gene's value as a percent change relative to the top-ranked gene, and
# keep those whose value is no more than 50% below the top.
filtered_mi_survival_df <- dplyr::arrange(mi_survival_df, desc(MI_est_median)) %>%
    dplyr::mutate(
        pct_delta_max = (MI_est_median - first(MI_est_median)) / first(MI_est_median) * 100
    ) %>%
    dplyr::filter(pct_delta_max > -50) %>%
    # Attach Entrez IDs; genes without a mapping are dropped by the inner join.
    inner_join(gene2id_df, by = c("geneID" = "hugo_symbol")) %>%
    dplyr::select(geneID, entrez_gene_id, everything())

In [21]:
# Genes flagged by any survival analysis: Cox regression, correlation,
# network (eigengene) filter, or mutual information. The sets are folded
# left to right with union(), so the result contains each gene once.
survival_list <- Reduce(
    union,
    list(
        filtered_coxph_df$geneID,
        filtered_cor_df$geneID,
        filtered_coxph_network_df$geneID,
        filtered_mi_survival_df$geneID
    )
)

# Meta

## Intersect UMSMG & NSMG with DEMG

In [22]:
# Keep only the FIGO-selected genes that are also differentially expressed
# matrisome genes, separately for the per-gene and the network-based lists.
umsmg_demg_list <- umsmg_list %>% intersect(demg_list)
nsmg_demg_list <- nsmg_list %>% intersect(demg_list)

## Full Intersection

In [23]:
# Genes present in both DEMG-restricted lists, i.e. selected by the per-gene
# methods and by the network analysis.
umsmg_nsmg_demg_list <- umsmg_demg_list %>% intersect(nsmg_demg_list)

In [24]:
# Report the size of every gene list. Each call is a separate top-level
# expression, so the notebook shows one value per list, in this order.

# Differentially expressed genes, then the matrisome subset
length(deg_list)
length(demg_list)
# FIGO-selected genes: per-gene methods, then network analysis
length(umsmg_list)
length(nsmg_list)
# FIGO-selected genes restricted to the differentially expressed matrisome genes
length(umsmg_demg_list)
length(nsmg_demg_list)
# Genes common to both restricted lists
length(umsmg_nsmg_demg_list)
# Genes flagged by any survival analysis
length(survival_list)

# Save results

In [25]:
# Write every gene list to "<analysis_dir>/gene_lists/<dataset>_<name>.txt",
# one gene per line.
gene_list_dir <- paste0(dirs$analysis_dir, "/gene_lists")
# Create the output directory if it is missing: write_lines() does not create
# directories, so a first run would otherwise fail on the first write.
dir.create(gene_list_dir, showWarnings = FALSE, recursive = TRUE)

# Common "<dir>/<dataset>" prefix of all output files.
gene_list_prefix <- paste0(gene_list_dir, "/", unified_dsets[dset_idx])
# Network-derived lists carry the hub threshold in their file name so results
# for different thresholds do not overwrite each other.
hub_tag <- paste0("_hub_thresh_", hub_con_thresh)

write_lines(deg_list, paste0(gene_list_prefix, "_deg_list.txt"))
write_lines(demg_list, paste0(gene_list_prefix, "_demg_list.txt"))
write_lines(umsmg_list, paste0(gene_list_prefix, "_umsmg_list.txt"))
write_lines(nsmg_list, paste0(gene_list_prefix, "_nsmg", hub_tag, "_list.txt"))
write_lines(umsmg_demg_list, paste0(gene_list_prefix, "_umsmg_demg_list.txt"))
write_lines(nsmg_demg_list, paste0(gene_list_prefix, "_nsmg", hub_tag, "_demg_list.txt"))
write_lines(umsmg_nsmg_demg_list, paste0(gene_list_prefix, "_umsmg_nsmg", hub_tag, "_demg_list.txt"))
write_lines(survival_list, paste0(gene_list_prefix, "_survival_list.txt"))