In [1]:
library(tidyverse)

# Custom package
library(rutils)

-- [1mAttaching packages[22m ------------------------------------------------------------------------------- tidyverse 1.3.0 --

[32mv[39m [34mggplot2[39m 3.3.3     [32mv[39m [34mpurrr  [39m 0.3.4
[32mv[39m [34mtibble [39m 3.0.6     [32mv[39m [34mdplyr  [39m 1.0.4
[32mv[39m [34mtidyr  [39m 1.1.2     [32mv[39m [34mstringr[39m 1.4.0
[32mv[39m [34mreadr  [39m 1.4.0     [32mv[39m [34mforcats[39m 0.5.1

-- [1mConflicts[22m ---------------------------------------------------------------------------------- tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



In [2]:
# Directory lookup is read from the project-local dev paths file.
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")

# TCGA project IDs, and the unified dataset names indexed later by dset_idx.
projects <- c(
    "TCGA-CESC",
    "TCGA-UCS",
    "TCGA-UCEC",
    "TCGA-OV"
)
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)

# Matrisome master list, located under the data directory.
matrisome_list <- paste0(
    dirs$data_dir,
    "/matrisome/matrisome_hs_masterlist.tsv"
)

In [3]:
# When TRUE, the final cell writes the gene lists and meta counts to disk.
save_lists <- FALSE
# Position in unified_dsets of the dataset summarized by this run.
dset_idx <- 1

In [4]:
# Differential expression cutoffs
lfc_thresh <- log2(2)  # compared against abs(lfc); log2(2) evaluates to 1
q_deg_thresh <- 0.05
padj_pairwise_figo_thresh <- 0.01

# FIGO association cutoffs
q_anova_thresh <- 0.05
q_pbc_thresh <- 0.05

# Survival cutoffs
cph_coeff_thresh <- 0.0
q_cts_thresh <- 0.05
q_univ_surv_thresh <- 0.05

# Network (module eigengene / module membership) cutoffs
# hub_con_thresh <- 0
q_me_thresh <- 0.05
p_mm_thresh <- 0.05

# Generic p-value cutoff
p_thresh <- 0.05

# Data

In [5]:
# Matrisome annotation, reduced to the three columns used in this notebook.
matrisome_df <- dplyr::select(
    rutils::load_matrisome_df(matrisome_list),
    gene_symbol, division, category
)

# Normalized counts for the selected dataset, plus the matrisome-only subset.
norm_counts_df <- read_tsv(
    paste0(dirs$data_dir, "/", unified_dsets[dset_idx], "/", "norm_counts.tsv")
)
m_norm_counts_df <- dplyr::filter(
    norm_counts_df,
    geneID %in% matrisome_df$gene_symbol
)

# DESeq results, with the two camelCase columns renamed to snake_case.
deseq_results_df <- rename(
    read_tsv(
        paste0(dirs$analysis_dir, "/deg/", unified_dsets[dset_idx], "_DESeq_results.tsv")
    ),
    base_mean = baseMean,
    lfc = log2FoldChange
)

# Survival feature-selection results.
en_cph_df <- read_tsv(
    paste0(dirs$analysis_dir, "/feature_selection/", unified_dsets[dset_idx], "_en_cph_results.tsv")
)
en_cph_coef_df <- read_tsv(
    paste0(dirs$analysis_dir, "/feature_selection/", unified_dsets[dset_idx], "_en_cph_full_best_kept_coef.tsv")
)
cts_df <- read_tsv(
    paste0(dirs$analysis_dir, "/feature_selection/", unified_dsets[dset_idx], "_survival_cts_results.tsv")
)
univ_survival_df <- read_tsv(
    paste0(dirs$analysis_dir, "/feature_selection/", unified_dsets[dset_idx], "_univ_survival_results.tsv")
)

# FIGO feature-selection results.
en_multinom_df <- read_tsv(
    paste0(dirs$analysis_dir, "/feature_selection/", unified_dsets[dset_idx], "_en_multinom_results.tsv")
)
en_multinom_coef_df <- read_tsv(
    paste0(dirs$analysis_dir, "/feature_selection/", unified_dsets[dset_idx], "_en_multinom_full_best_kept_coef.tsv")
)
pbc_df <- read_tsv(
    paste0(dirs$analysis_dir, "/feature_selection/", unified_dsets[dset_idx], "_pbc_results.tsv")
)
# anova_df <- read_tsv(paste0(dirs$analysis_dir, "/feature_selection/", unified_dsets[dset_idx], "_welch_anova_results.tsv"))
figo_pairwise_demg_results_df <- read_tsv(
    paste0(dirs$analysis_dir, "/feature_selection/", unified_dsets[dset_idx], "_figo_pairwise_demg_results.tsv")
)

# Network result tables.
network_mm_gs_df <- read_tsv(
    paste0(dirs$analysis_dir, "/network/", unified_dsets[dset_idx], "_gene_mm_gs.tsv")
)
network_me_sig_df <- read_tsv(
    paste0(dirs$analysis_dir, "/network/", unified_dsets[dset_idx], "_eigengene_traits.tsv")
)

# Restore the saved network objects into the workspace. The second load()
# overwrites `lnames` from the first.
# NOTE(review): `data_expr` and `tom`, used further down, presumably come from
# these .RData files - confirm.
lnames <- load(
    file = paste0(dirs$data_dir, "/saved_network_objects/", unified_dsets[dset_idx], "_tumor_data.RData")
)
lnames <- load(
    file = paste0(dirs$data_dir, "/saved_network_objects/", unified_dsets[dset_idx], "_tumor_network.RData")
)


[36m--[39m [1m[1mColumn specification[1m[22m [36m------------------------------------------------------------------------------------------------[39m
cols(
  Division = [31mcol_character()[39m,
  Category = [31mcol_character()[39m,
  `Gene Symbol` = [31mcol_character()[39m,
  `Gene Name` = [31mcol_character()[39m,
  Synonyms = [31mcol_character()[39m,
  HGNC_IDs = [32mcol_double()[39m,
  `HGNC_IDs Links` = [32mcol_double()[39m,
  UniProt_IDs = [31mcol_character()[39m,
  Refseq_IDs = [31mcol_character()[39m,
  Orthology = [31mcol_character()[39m,
  Notes = [31mcol_character()[39m
)



[36m--[39m [1m[1mColumn specification[1m[22m [36m------------------------------------------------------------------------------------------------[39m
cols(
  .default = col_double(),
  geneID = [31mcol_character()[39m
)
[36mi[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m for the full column specifications.



[36m--[39m [1m[1mColumn specification[1

# DGE analysis

## DEG

In [6]:
# Summarize the DESeq results over all genes, passing the total gene count.
# NOTE(review): deg_meta() is not defined in this notebook; presumably it is
# provided by rutils - confirm.
deg_meta_ls <- deg_meta(
    deseq_results_df,
    lfc_thresh,
    q_deg_thresh,
    nrow(norm_counts_df)
)
# Show the first four entries of the summary list.
deg_meta_ls[seq_len(4)]

## DEMG

In [7]:
# Restrict the DESeq results to matrisome genes, then summarize them the same
# way as the full gene set.
m_deseq_results_df <- dplyr::filter(
    deseq_results_df,
    geneID %in% matrisome_df$gene_symbol
)
demg_meta_ls <- deg_meta(
    m_deseq_results_df,
    lfc_thresh,
    q_deg_thresh,
    nrow(m_norm_counts_df)
)
# Show the first four entries of the summary list.
demg_meta_ls[seq_len(4)]

In [8]:
# For each matrisome category: how many genes have counts (n_cat) and what
# percentage of them pass both the fold-change and q-value cutoffs (pct_de).
# Genes without a DESeq result get de = NA and are not counted as DE.
matrisome_df %>%
    filter(gene_symbol %in% norm_counts_df$geneID) %>%
    rename(geneID = gene_symbol) %>%
    left_join(deseq_results_df, by = "geneID") %>%
    mutate(de = (abs(lfc) > lfc_thresh) & (qval < q_deg_thresh)) %>%
    group_by(category) %>%
    summarise(
        n_cat = n(),
        pct_de = round(sum(de, na.rm = TRUE) / n_cat * 100)
    )
# deseq_results_df %>%
#     inner_join(matrisome_df, by = c("geneID" = "gene_symbol")) %>%
#     select(geneID, category, lfc, qval) %>%
#     mutate(de = )

Unnamed: 0_level_0,category,n_cat,pct_de
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,Collagens,43,70
2,ECM Glycoproteins,192,61
3,ECM Regulators,233,56
4,ECM-affiliated Proteins,168,57
5,Proteoglycans,35,71
6,Secreted Factors,337,58


# Uni/multivariate analysis

## FIGO

### LASSO multinomial regression

In [9]:
# Genes listed in the multinomial model results, and the subset that is also
# among the differentially expressed matrisome genes (DEMGs).
en_multinom_meta_ls <- list(
    n_var = nrow(en_multinom_df),
    genes = en_multinom_df$geneID
)
en_multinom_meta_ls$demg_genes <- intersect(en_multinom_meta_ls$genes, demg_meta_ls$genes)
en_multinom_meta_ls$n_var_demg <- length(en_multinom_meta_ls$demg_genes)

# Show the two counts (list entries 1 and 4).
en_multinom_meta_ls[1]
en_multinom_meta_ls[4]

In [10]:
# How did the best model perform vs. the used (kept) model?
# The DEMG count with a non-zero `best` coefficient is stored first and then
# displayed, followed by the count for the `kept` coefficients.
en_multinom_meta_ls[["n_best_lambda_var_demg"]] <- nrow(
    filter(en_multinom_coef_df, best != 0, geneID %in% demg_meta_ls$genes)
)
en_multinom_meta_ls[["n_best_lambda_var_demg"]]
nrow(filter(en_multinom_coef_df, kept != 0, geneID %in% demg_meta_ls$genes))

### Welch ANOVA

In [11]:
# # anova_meta_ls <- simple_test_meta(anova_df, q_anova_thresh)
# anova_meta_df <- anova_df %>%
#     filter(qval < q_anova_thresh)
# anova_meta_ls <- list(n_sig = nrow(anova_meta_df), genes = anova_meta_df$geneID)
# anova_meta_ls[1]

### FIGO pairwise DGE analysis

In [12]:
# Genes passing the adjusted p-value and fold-change cutoffs in at least one
# pairwise comparison; grouping by geneID yields each gene once.
figo_pairwise_demg_meta_ls <- list(
    genes = figo_pairwise_demg_results_df %>%
        filter(padj < padj_pairwise_figo_thresh, abs(l2fc) > lfc_thresh) %>%
        group_by(geneID) %>%
        summarise(n = n()) %>%
        pull(geneID)
)
figo_pairwise_demg_meta_ls$n_sig <- length(figo_pairwise_demg_meta_ls$genes)

# Subset that is also in the DEMG list.
figo_pairwise_demg_meta_ls$demg_genes <- intersect(
    figo_pairwise_demg_meta_ls$genes,
    demg_meta_ls$genes
)
figo_pairwise_demg_meta_ls$n_sig_demg <- length(figo_pairwise_demg_meta_ls$demg_genes)

# Show the two counts (list entries 2 and 4).
figo_pairwise_demg_meta_ls[2]
figo_pairwise_demg_meta_ls[4]

### Point-biserial correlation with FIGO

In [13]:
# Keep genes whose smallest FIGO q-value is below the cutoff.
# NOTE(review): condense_figo() is not defined in this notebook; presumably it
# is provided by rutils and produces the figo_min_qval column - confirm.
pbc_meta_df <- dplyr::filter(
    condense_figo(pbc_df, include_pvals = TRUE),
    figo_min_qval < q_pbc_thresh
)
pbc_meta_ls <- list(
    n_sig = nrow(pbc_meta_df),
    genes = pbc_meta_df$geneID
)
# Subset that is also in the DEMG list.
pbc_meta_ls$demg_genes <- intersect(pbc_meta_ls$genes, demg_meta_ls$genes)
pbc_meta_ls$n_sig_demg <- length(pbc_meta_ls$demg_genes)

# Show the two counts (list entries 1 and 4).
pbc_meta_ls[1]
pbc_meta_ls[4]

In [14]:
# Pairwise overlap sizes between the three FIGO DEMG gene sets.
en_multinom_meta_ls$demg_genes %>%
    intersect(figo_pairwise_demg_meta_ls$demg_genes) %>%
    length()
en_multinom_meta_ls$demg_genes %>%
    intersect(pbc_meta_ls$demg_genes) %>%
    length()
figo_pairwise_demg_meta_ls$demg_genes %>%
    intersect(pbc_meta_ls$demg_genes) %>%
    length()

In [15]:
# Number of distinct DEMGs flagged by at least one of the three FIGO methods.
length(
    union(
        union(en_multinom_meta_ls$demg_genes, figo_pairwise_demg_meta_ls$demg_genes),
        pbc_meta_ls$demg_genes
    )
)

## Survival

### LASSO Cox PH

In [16]:
# Genes listed in the Cox PH model results, and the subset that is also in the
# DEMG list.
en_cph_meta_ls <- list(
    n_var = nrow(en_cph_df),
    genes = en_cph_df$geneID
)
en_cph_meta_ls$demg_genes <- intersect(en_cph_meta_ls$genes, demg_meta_ls$genes)
en_cph_meta_ls$n_var_demg <- length(en_cph_meta_ls$demg_genes)

# Show the two counts (list entries 1 and 4).
en_cph_meta_ls[1]
en_cph_meta_ls[4]

In [17]:
# How did the best model perform vs. the used (kept) model?
# The DEMG count with a non-zero `best` coefficient is stored first and then
# displayed, followed by the count for the `kept` coefficients.
en_cph_meta_ls[["n_best_lambda_var_demg"]] <- nrow(
    filter(en_cph_coef_df, best != 0, geneID %in% demg_meta_ls$genes)
)
en_cph_meta_ls[["n_best_lambda_var_demg"]]
nrow(filter(en_cph_coef_df, kept != 0, geneID %in% demg_meta_ls$genes))

In [18]:
# Display the DEMG count stored for the `best` coefficients.
en_cph_meta_ls[["n_best_lambda_var_demg"]]

### Univariate KM/Cox PH

In [19]:
# Keep genes that are significant in either univariate survival test: the
# smaller of the KM and Cox PH q-values must fall below the cutoff.
# pmin() is the vectorized elementwise minimum, so no rowwise() pass (and no
# as_tibble() afterwards to drop the rowwise grouping) is needed. Like min(),
# it yields NA when either q-value is NA, and filter() drops those rows.
univ_survival_meta_df <- univ_survival_df %>%
    mutate(min_qval = pmin(km_qval, cph_qval)) %>%
    filter(min_qval < q_univ_surv_thresh)

# Number of significant genes, their IDs, and the subset also in the DEMG list.
univ_survival_meta_ls <- list(n_sig = nrow(univ_survival_meta_df), genes = univ_survival_meta_df$geneID)
univ_survival_meta_ls[["demg_genes"]] <- univ_survival_meta_ls$genes %>% intersect(demg_meta_ls$genes)
univ_survival_meta_ls[["n_sig_demg"]] <- univ_survival_meta_ls$demg_genes %>% length()

# Show the two counts (list entries 1 and 4).
univ_survival_meta_ls[1]
univ_survival_meta_ls[4]

In [20]:
# Display the DEMGs that are significant in the univariate survival tests.
univ_survival_meta_ls[["demg_genes"]]

### Censored time screen

In [21]:
# Keep genes whose vital-status q-value is below the cutoff.
cts_meta_df <- dplyr::filter(cts_df, vital_qval < q_cts_thresh)
cts_meta_ls <- list(
    n_sig = nrow(cts_meta_df),
    genes = cts_meta_df$geneID
)
# Subset that is also in the DEMG list.
cts_meta_ls$demg_genes <- intersect(cts_meta_ls$genes, demg_meta_ls$genes)
cts_meta_ls$n_sig_demg <- length(cts_meta_ls$demg_genes)

# Show the two counts (list entries 1 and 4).
cts_meta_ls[1]
cts_meta_ls[4]

In [22]:
# Display the DEMGs that pass the censored time screen.
cts_meta_ls[["demg_genes"]]

In [23]:
# Pairwise overlap sizes between the three survival DEMG gene sets.
en_cph_meta_ls$demg_genes %>%
    intersect(cts_meta_ls$demg_genes) %>%
    length()
en_cph_meta_ls$demg_genes %>%
    intersect(univ_survival_meta_ls$demg_genes) %>%
    length()
univ_survival_meta_ls$demg_genes %>%
    intersect(cts_meta_ls$demg_genes) %>%
    length()

In [24]:
# Number of distinct DEMGs flagged by at least one of the three survival methods.
length(
    union(
        union(en_cph_meta_ls$demg_genes, univ_survival_meta_ls$demg_genes),
        cts_meta_ls$demg_genes
    )
)

# WGCNA

In [25]:
# wgcna_meta_ls <- wgcna_meta(network_me_sig_df, network_mm_gs_df, q_me_thresh, p_mm_thresh, hub_df$geneID)
# Summarize the network results, passing the column names of data_expr as the
# last argument.
# NOTE(review): wgcna_meta() is not defined in this notebook; presumably it is
# provided by rutils - confirm.
wgcna_meta_ls <- wgcna_meta(
    network_me_sig_df,
    network_mm_gs_df,
    q_me_thresh,
    p_mm_thresh,
    colnames(data_expr)
)
# Show the first two entries of the summary list.
wgcna_meta_ls[seq_len(2)]

In [26]:
# Display the `modules` entry of the WGCNA summary list.
wgcna_meta_ls$modules

In [27]:
# Condense the eigengene/trait table and prefix every column except those
# starting with "module" with "me_", so the columns stay distinguishable after
# joining with gene-level tables.
# rename_with() replaces the superseded rename_if(); ignore.case = FALSE keeps
# the match case-sensitive, exactly like the previous startsWith() test.
condensed_network_me_sig_df <- network_me_sig_df %>%
    condense_figo(include_pvals = TRUE) %>%
    dplyr::rename_with(
        ~ gsub("^", "me_", .x),
        .cols = !dplyr::starts_with("module", ignore.case = FALSE)
    )

# Show the modules whose smallest FIGO q-value is below the cutoff.
condensed_network_me_sig_df %>%
    filter(me_figo_min_qval < q_me_thresh)

module,me_vital_hr,me_vital_dev_cor,me_vital_pval,me_vital_qval,me_figo_min_pval,me_figo_min_qval,me_figo_max_cor
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
blue,2.156241,0.01356141,0.71165627,0.3365102,0.004447037,0.03193255,0.1775888
pink,3.928428,0.04560698,0.44909764,0.24097788,0.002389876,0.01835782,0.1893874
red,62.777203,0.10725144,0.04867081,0.06709864,0.006805758,0.0261392,0.1690778


In [28]:
# Per module passing the FIGO q-value cutoff: number of genes whose module
# membership p-value is below the cutoff.
inner_join(
    select(network_mm_gs_df, geneID, module, mm_cor, mm_pval),
    condensed_network_me_sig_df,
    by = "module"
) %>%
    filter(mm_pval < p_mm_thresh, me_figo_min_qval < q_me_thresh) %>%
    group_by(module) %>%
    summarise(n = n())
    

Unnamed: 0_level_0,module,n
Unnamed: 0_level_1,<chr>,<int>
1,blue,174
2,pink,33
3,red,47


## TOM meta

In [29]:
# Label both dimensions of `tom` with the column names of data_expr.
dimnames(tom) <- list(colnames(data_expr), colnames(data_expr))
# Quantiles of the per-row sums of `tom`, rounded to one decimal.
round(quantile(rowSums(tom)), digits = 1)

# Create Lists

In [30]:
# Differentially expressed genes: all genes, and matrisome genes only.
deg_list <- deg_meta_ls$genes
demg_list <- demg_meta_ls$genes

# Genes flagged by any FIGO screen (multinomial model, point-biserial
# correlation, pairwise DGE).
figo_umsmg_list <- union(
    union(en_multinom_meta_ls$genes, pbc_meta_ls$genes),
#     union(anova_meta_ls$genes)
    figo_pairwise_demg_meta_ls$genes
)

# Genes flagged by any survival screen (Cox model, censored time screen,
# univariate KM/Cox).
survival_umsmg_list <- union(
    union(en_cph_meta_ls$genes, cts_meta_ls$genes),
    univ_survival_meta_ls$genes
)
# all_umsmg_list <- figo_umsmg_list %>%
#     intersect(survival_umsmg_list)

# Genes reported by the network summary.
figo_nsmg_list <- wgcna_meta_ls$genes
# figo_umsmg_demg_list <- figo_umsmg_list %>%
#     intersect(demg_list)
# survival_umsmg_demg_list <- survival_umsmg_list %>%
#     intersect(demg_list)
# figo_umsmg_nsmg_demg_list <- figo_umsmg_list %>%
#     intersect(figo_nsmg_list) %>%
#     intersect(demg_list)

# Final lists: restrict to DEMGs, then take the FIGO/survival overlap.
figo_list <- intersect(union(figo_umsmg_list, figo_nsmg_list), demg_list)
survival_list <- intersect(survival_umsmg_list, demg_list)
full_overlap_list <- intersect(figo_list, survival_list)

In [31]:
# Collect every summary count in a one-row tibble and transpose it, giving a
# one-column matrix whose row names are the metric names.
# NOTE(review): the *_pct_de entries are filled from `deg_prop` - confirm
# whether they hold proportions or percentages.
meta_counts <- t(
    tibble(
        # Differential expression, all genes
        n_deg = length(deg_list),
        deg_pct_de = deg_meta_ls$deg_prop,
        deg_up = deg_meta_ls$n_up,
        deg_down = deg_meta_ls$n_down,
        # Differential expression, matrisome genes
        n_demg = length(demg_list),
        demg_pct_de = demg_meta_ls$deg_prop,
        demg_up = demg_meta_ls$n_up,
        demg_down = demg_meta_ls$n_down,
        # FIGO screens
        n_lasso_multinom = en_multinom_meta_ls$n_var,
        n_lasso_multinom_demg = en_multinom_meta_ls$n_var_demg,
        n_lasso_multinom_best_lambda_demg = en_multinom_meta_ls$n_best_lambda_var_demg,
#         n_anova = anova_meta_ls$n_sig,
        n_figo_pairwise = figo_pairwise_demg_meta_ls$n_sig,
        n_figo_pairwise_demg = figo_pairwise_demg_meta_ls$n_sig_demg,
        n_pbc = pbc_meta_ls$n_sig,
        n_pbc_demg = pbc_meta_ls$n_sig_demg,
        # Survival screens
        n_lasso_cox = en_cph_meta_ls$n_var,
        n_lasso_cox_demg = en_cph_meta_ls$n_var_demg,
        n_lasso_cox_best_lambda_demg = en_cph_meta_ls$n_best_lambda_var_demg,
        n_uv_cph_km = univ_survival_meta_ls$n_sig,
        n_uv_cph_km_demg = univ_survival_meta_ls$n_sig_demg,
        n_cts = cts_meta_ls$n_sig,
        n_cts_demg = cts_meta_ls$n_sig_demg,
        # Network
        n_modules = wgcna_meta_ls$n_sig_modules,
        n_module_genes = wgcna_meta_ls$n_sig_genes,
        # Combined lists
        n_figo_umsmg = length(figo_umsmg_list),
        n_figo_nsmg = length(figo_nsmg_list),
        n_surv_umsmg = length(survival_umsmg_list),
        n_figo_umsmg_and_demg = length(intersect(figo_umsmg_list, demg_list)),
        n_figo_nsmg_and_demg = length(intersect(figo_nsmg_list, demg_list)),
        n_figo_umsmg_or_nsmg = length(union(figo_umsmg_list, figo_nsmg_list)),
        n_figo_tot = length(figo_list),
        n_surv_tot = length(survival_list),
        n_full_overlap = length(full_overlap_list)
    )
)
colnames(meta_counts) <- c("val")

# Long-format table: one row per metric, values rounded to two decimals.
meta_counts_df <- mutate(
    as_tibble(meta_counts, rownames = "metric"),
    val = round(val, 2)
)
meta_counts_df

# length(deg_list)
# length(demg_list)
# length(figo_umsmg_list)
# length(figo_nsmg_list)
# length(survival_umsmg_list)
# length(all_umsmg_list)
# length(figo_umsmg_demg_list)
# length(survival_umsmg_demg_list)
# length(figo_umsmg_nsmg_demg_list)

metric,val
<chr>,<dbl>
n_deg,7652.0
deg_pct_de,0.38
deg_up,3966.0
deg_down,3686.0
n_demg,593.0
demg_pct_de,0.59
demg_up,262.0
demg_down,331.0
n_lasso_multinom,220.0
n_lasso_multinom_demg,105.0


# Save lists

In [32]:
# Write the gene lists (one gene per line) and the meta counts table, but only
# when saving was switched on at the top of the notebook.
if (save_lists) {
    write_lines(
        deg_list,
        paste0(dirs$analysis_dir, "/gene_lists/", unified_dsets[dset_idx], "_deg_list.txt")
    )
    write_lines(
        demg_list,
        paste0(dirs$analysis_dir, "/gene_lists/", unified_dsets[dset_idx], "_demg_list.txt")
    )
    write_lines(
        figo_umsmg_list,
        paste0(dirs$analysis_dir, "/gene_lists/", unified_dsets[dset_idx], "_figo_umsmg_list.txt")
    )
    write_lines(
        survival_umsmg_list,
        paste0(dirs$analysis_dir, "/gene_lists/", unified_dsets[dset_idx], "_survival_umsmg_list.txt")
    )
    write_lines(
        figo_nsmg_list,
        paste0(dirs$analysis_dir, "/gene_lists/", unified_dsets[dset_idx], "_figo_nsmg_list.txt")
    )
    write_tsv(
        meta_counts_df,
        paste0(dirs$analysis_dir, "/meta/", unified_dsets[dset_idx], "_meta_counts.tsv")
    )
}