In [27]:
library(tidyverse)

# Custom package
library(rutils)

In [28]:
# Directory listing obtained from the dev paths file
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")

# TCGA project identifiers
projects <- c(
    "TCGA-CESC",
    "TCGA-UCS",
    "TCGA-UCEC",
    "TCGA-OV"
)

# Dataset names; used below to build per-dataset file paths
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)

# Path of the matrisome master list under the data directory
matrisome_list <- paste0(
    dirs$data_dir,
    "/matrisome/matrisome_hs_masterlist.tsv"
)

In [29]:
# Index into `unified_dsets` selecting the dataset summarised by the cells below
dset_idx <- 3

In [30]:
# Thresholds shared by the analysis cells below.
# Cutoff compared against the q-value columns of the result tables
p_thresh <- 0.05
# Minimum absolute log2 fold change (log2(2) == 1)
lfc_thresh <- log2(2)
coxph_coeff_thresh <- 0.0
# MI_est_median must be strictly greater than this to be kept
mi_thresh <- 0.0
# Passed to get_consensus_col() as `thresh` and `n` respectively
consensus_thresh <- 0.0
consensus_n <- 5

# Helpers

In [31]:
# Load the matrisome master list and keep only the gene symbol together with
# its division and category columns
matrisome_df <- dplyr::select(
    rutils::load_matrisome_df(matrisome_list),
    gene_symbol, division, category
)


# Add a logical `consensus` column to `df`.
#
# df:     data frame with a `geneID` column and columns whose names
#         contain "mean".
# n:      a row is flagged only when exactly `n` of its "mean" columns
#         exceed `thresh`.
# thresh: strict lower bound applied to each "mean" column (default 0).
#
# Returns `df` with the extra `consensus` column appended.
get_consensus_col <- function(df, n, thresh = 0) {
    mean_cols_df <- dplyr::select(df, geneID, dplyr::contains("mean"))
    # Column 1 is geneID; count, per row, how many remaining values pass
    n_passing <- rowSums(mean_cols_df[-1] > thresh)
    consensus_flags <- n_passing == n
    dplyr::mutate(df, consensus = consensus_flags)
}


[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  Division = [31mcol_character()[39m,
  Category = [31mcol_character()[39m,
  `Gene Symbol` = [31mcol_character()[39m,
  `Gene Name` = [31mcol_character()[39m,
  Synonyms = [31mcol_character()[39m,
  HGNC_IDs = [32mcol_double()[39m,
  `HGNC_IDs Links` = [32mcol_double()[39m,
  UniProt_IDs = [31mcol_character()[39m,
  Refseq_IDs = [31mcol_character()[39m,
  Orthology = [31mcol_character()[39m,
  Notes = [31mcol_character()[39m
)




In [32]:
# Read norm_counts.tsv from the selected dataset's data directory
norm_counts_df <- paste0(
    dirs$data_dir, "/", unified_dsets[dset_idx], "/", "norm_counts.tsv"
) %>%
    read_tsv()


[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  .default = col_double(),
  geneID = [31mcol_character()[39m
)
[36mi[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m for the full column specifications.




# DGE analysis

In [33]:
# DESeq results table for the selected dataset
DESeq_results_df <- paste0(
    dirs$analysis_dir, "/deg/", unified_dsets[dset_idx], "_DESeq_results.tsv"
) %>%
    read_tsv()

# Keep genes passing both the fold-change and the q-value cutoff
# (the `padj < p_thresh` variant of this filter is intentionally not used)
filtered_DESeq_results_df <- dplyr::filter(
    DESeq_results_df,
    abs(log2FoldChange) > lfc_thresh,
    qval < p_thresh
)

# Restrict the filtered genes to those listed in the matrisome table
filtered_matrisome_DESeq_results_df <- dplyr::filter(
    filtered_DESeq_results_df,
    geneID %in% matrisome_df$gene_symbol
)

# Summary over all filtered genes
n_degs <- nrow(filtered_DESeq_results_df)
deg_prop <- n_degs / nrow(norm_counts_df)
n_degs_up <- sum(filtered_DESeq_results_df$log2FoldChange > 0, na.rm = TRUE)
n_degs_down <- sum(filtered_DESeq_results_df$log2FoldChange < 0, na.rm = TRUE)

# Summary over the matrisome subset
n_matrisome_degs <- nrow(filtered_matrisome_DESeq_results_df)
matrisome_deg_prop <- n_matrisome_degs / nrow(matrisome_df)
n_matrisome_degs_up <- sum(
    filtered_matrisome_DESeq_results_df$log2FoldChange > 0,
    na.rm = TRUE
)
n_matrisome_degs_down <- sum(
    filtered_matrisome_DESeq_results_df$log2FoldChange < 0,
    na.rm = TRUE
)

paste0("DEGs: ", n_degs)
paste0("DEG prop: ", deg_prop)
paste0("Up: ", n_degs_up)
paste0("Down: ", n_degs_down)
paste0("Matrisome DEGs: ", n_matrisome_degs)
paste0("Matrisome DEG prop: ", matrisome_deg_prop)
paste0("Matrisome up: ", n_matrisome_degs_up)
paste0("Matrisome down: ", n_matrisome_degs_down)


[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  geneID = [31mcol_character()[39m,
  baseMean = [32mcol_double()[39m,
  log2FoldChange = [32mcol_double()[39m,
  lfcSE = [32mcol_double()[39m,
  stat = [32mcol_double()[39m,
  pvalue = [32mcol_double()[39m,
  padj = [32mcol_double()[39m,
  qval = [32mcol_double()[39m
)




# Survival analysis

In [34]:
# Null-score table shared by all datasets
coxph_null_scores_df <- paste0(
    dirs$analysis_dir, "/meta/", "coxph_null_scores.tsv"
) %>%
    read_tsv()

# Per-gene Cox PH results for the selected dataset
coxph_results_df <- paste0(
    dirs$analysis_dir, "/feature_selection/",
    unified_dsets[dset_idx], "_coxph_results.tsv"
) %>%
    read_tsv()

# Keep genes whose q-value is below the cutoff
# (the `gene_pval < p_thresh` variant of this filter is intentionally not used)
filtered_coxph_results_df <- dplyr::filter(
    coxph_results_df,
    gene_qval < p_thresh
)


[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  dataset = [31mcol_character()[39m,
  lr_test_pval = [32mcol_double()[39m,
  wald_test_pval = [32mcol_double()[39m,
  score_test_pval = [32mcol_double()[39m
)



[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  geneID = [31mcol_character()[39m,
  gene_pval = [32mcol_double()[39m,
  gene_coeff = [32mcol_double()[39m,
  gene_qval = [32mcol_double()[39m,
  gene_padj = [32mcol_double()[39m
)




In [35]:
# Display the table read from coxph_null_scores.tsv (one row per dataset)
coxph_null_scores_df

dataset,lr_test_pval,wald_test_pval,score_test_pval
<chr>,<dbl>,<dbl>,<dbl>
unified_cervical_data,0.0004814013,4.9053e-05,1.224173e-06
unified_uterine_data,0.3392807906,0.989893132,0.3053979
unified_uterine_endometrial_data,0.000773901,0.0,1.040876e-06


In [36]:
# Is the selected dataset's likelihood-ratio test p-value below the cutoff?
coxph_null_sig <- (
    coxph_null_scores_df %>%
        dplyr::filter(dataset == unified_dsets[dset_idx]) %>%
        dplyr::pull(lr_test_pval)
) < p_thresh

# Number of filtered genes, and that number relative to the matrisome table
n_coxph_sig <- nrow(filtered_coxph_results_df)
prop_coxph_sig <- n_coxph_sig / nrow(matrisome_df)

# Negative coefficients are counted as protective, positive ones as harmful
n_coxph_sig_protective <- sum(
    filtered_coxph_results_df$gene_coeff < 0,
    na.rm = TRUE
)
n_coxph_sig_harmful <- sum(
    filtered_coxph_results_df$gene_coeff > 0,
    na.rm = TRUE
)

paste0("Sig. null Cox PH: ", coxph_null_sig)
paste0("Sig. Cox PH genes: ", n_coxph_sig)
paste0("Prop. sig. Cox PH genes: ", prop_coxph_sig)
paste0("Sig. Cox PH protective: ", n_coxph_sig_protective)
paste0("Sig. Cox PH harmful: ", n_coxph_sig_harmful)

# Regression

## Correlation analysis

In [37]:
# Correlation results for the selected dataset
cor_results_df <- paste0(
    dirs$analysis_dir, "/feature_selection/",
    unified_dsets[dset_idx], "_cor_results.tsv"
) %>%
    read_tsv()

# Filter on the q-value column
# (the `pval` and `padj` variants of this filter are intentionally not used)
filtered_cor_results_df <- dplyr::filter(cor_results_df, qval < p_thresh)

# Total, negatively and positively correlated gene counts
n_cor <- nrow(filtered_cor_results_df)
n_cor_down <- sum(filtered_cor_results_df$cor < 0, na.rm = TRUE)
n_cor_up <- sum(filtered_cor_results_df$cor > 0, na.rm = TRUE)

paste0("Cor. genes: ", n_cor)
paste0("Neg. cor. genes: ", n_cor_down)
paste0("Pos. cor. genes: ", n_cor_up)


[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  geneID = [31mcol_character()[39m,
  cor = [32mcol_double()[39m,
  pval = [32mcol_double()[39m,
  padj = [32mcol_double()[39m,
  qval = [32mcol_double()[39m,
  n = [32mcol_double()[39m
)




## MI analysis

In [38]:
# MI (survival) results for the selected dataset
mi_survival_results_df <- paste0(
    dirs$analysis_dir, "/feature_selection/",
    unified_dsets[dset_idx], "_MI_survival_results.tsv"
) %>%
    read_tsv()

# Keep genes whose median MI estimate is strictly above the threshold
filtered_mi_survival_results_df <- dplyr::filter(
    mi_survival_results_df,
    MI_est_median > mi_thresh
)

n_mi <- nrow(filtered_mi_survival_results_df)

paste0("MI: ", n_mi)


[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  geneID = [31mcol_character()[39m,
  MI_est_median = [32mcol_double()[39m
)




In [39]:
# Sort by descending MI, freeze that gene order as factor levels, and only
# then drop the non-positive rows (the levels are set before the filter)
mi_survival_sorted_all <- dplyr::arrange(
    mi_survival_results_df,
    desc(MI_est_median)
)
mi_survival_ordered_df <- dplyr::filter(
    dplyr::mutate(
        mi_survival_sorted_all,
        geneID_f = factor(geneID, levels = geneID)
    ),
    MI_est_median > 0.0
)

# Number of genes whose MI is within 50% of the first (largest) value
mi_survival_ordered_df %>%
    dplyr::mutate(
        pct_delta_max =
            (MI_est_median - first(MI_est_median)) / first(MI_est_median) * 100
    ) %>%
    dplyr::filter(pct_delta_max > -50) %>%
    nrow()

# ggplot(data = mi_survival_ordered_df) +
#     geom_bar(aes(x = geneID_f, y = MI_est_median), stat = "identity") +
#     theme_classic() +
#     theme(
#         axis.text.x = element_blank(),
#         axis.ticks.x = element_blank()
#     ) +
#     labs(x = "Matrisome gene", y = "Mutual information score")


## Baselines

In [40]:
# Regression baselines for every dataset
reg_baselines_df <- read_tsv(paste0(dirs$analysis_dir, "/meta/", "reg_baselines.tsv"))

# Baselines for the selected dataset: MSE comes from column L2, MAE from
# column L1, and explained variance from its own column
mse_baseline <- reg_baselines_df %>%
    dplyr::filter(dataset == unified_dsets[dset_idx]) %>%
    dplyr::pull(L2)
mae_baseline <- reg_baselines_df %>%
    dplyr::filter(dataset == unified_dsets[dset_idx]) %>%
    dplyr::pull(L1)
ev_baseline <- reg_baselines_df %>%
    dplyr::filter(dataset == unified_dsets[dset_idx]) %>%
    dplyr::pull(explained_variance)

paste0("MSE baseline: ", mse_baseline)
paste0("MAE baseline: ", mae_baseline)
paste0("EV baseline: ", ev_baseline)


[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  dataset = [31mcol_character()[39m,
  L2 = [32mcol_double()[39m,
  L1 = [32mcol_double()[39m,
  R2 = [32mcol_double()[39m,
  explained_variance = [32mcol_double()[39m,
  n = [32mcol_double()[39m
)




## GBR (MSE)

In [41]:
# Reference scores and per-gene results of the GBR (MSE) run
mse_gbr_scores_df <- paste0(
    dirs$analysis_dir, "/feature_selection/",
    unified_dsets[dset_idx], "_mse_gbr_ref_scores.tsv"
) %>%
    read_tsv()
mse_gbr_results_df <- paste0(
    dirs$analysis_dir, "/feature_selection/",
    unified_dsets[dset_idx], "_mse_gbr_results.tsv"
) %>%
    read_tsv() %>%
    get_consensus_col(n = consensus_n, thresh = consensus_thresh)
mse_gbr_scores_df

# ref_score stores negative MSE; flip the sign to obtain MSE
mse_gbr_avg <- mean(-mse_gbr_scores_df$ref_score)
mse_gbr_imp <- mse_gbr_avg < mse_baseline
# Sign flipped because an "improvement" is a reduction in MSE
mse_gbr_baseline_pct_imp <- - (mse_gbr_avg - mse_baseline) / mse_baseline * 100
# Rows flagged TRUE by get_consensus_col()
n_mse_gbr_consensus_genes <- sum(mse_gbr_results_df$consensus, na.rm = TRUE)

paste0("GBR MSE avg.: ", mse_gbr_avg)
paste0("GBR MSE impr. over baseline: ", mse_gbr_imp)
paste0("GBR MSE pct. impr. over baseline: ", mse_gbr_baseline_pct_imp)
paste0("GBR MSE consensus genes: ", n_mse_gbr_consensus_genes)


[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  model = [32mcol_double()[39m,
  ref_score = [32mcol_double()[39m
)



[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  geneID = [31mcol_character()[39m,
  mean_imp_0 = [32mcol_double()[39m,
  score_pct_improvement_0 = [32mcol_double()[39m,
  mean_imp_1 = [32mcol_double()[39m,
  score_pct_improvement_1 = [32mcol_double()[39m,
  mean_imp_2 = [32mcol_double()[39m,
  score_pct_improvement_2 = [32mcol_double()[39m,
  mean_imp_3 = [32mcol_double()[39m,
  score_pct_improvement_3 = [32mcol_double()[39m,
  mean_imp_4 = [32mcol_double()[39m,
  score_pct_improvem

model,ref_score
<dbl>,<dbl>
0,-424103.6
1,-410631.6
2,-439290.4
3,-403978.5
4,-400675.8


## RFR (MSE)

In [42]:
# Reference scores and per-gene results of the RFR (MSE) run
mse_rfr_scores_df <- paste0(
    dirs$analysis_dir, "/feature_selection/",
    unified_dsets[dset_idx], "_mse_rfr_ref_scores.tsv"
) %>%
    read_tsv()
mse_rfr_results_df <- paste0(
    dirs$analysis_dir, "/feature_selection/",
    unified_dsets[dset_idx], "_mse_rfr_results.tsv"
) %>%
    read_tsv() %>%
    get_consensus_col(n = consensus_n, thresh = consensus_thresh)
mse_rfr_scores_df

# ref_score stores negative MSE; flip the sign to obtain MSE
mse_rfr_avg <- mean(-mse_rfr_scores_df$ref_score)
mse_rfr_imp <- mse_rfr_avg < mse_baseline
# Sign flipped because an "improvement" is a reduction in MSE
mse_rfr_baseline_pct_imp <- - (mse_rfr_avg - mse_baseline) / mse_baseline * 100
# Rows flagged TRUE by get_consensus_col()
n_mse_rfr_consensus_genes <- sum(mse_rfr_results_df$consensus, na.rm = TRUE)

paste0("RFR MSE avg.: ", mse_rfr_avg)
paste0("RFR MSE impr. over baseline: ", mse_rfr_imp)
paste0("RFR MSE pct. impr. over baseline: ", mse_rfr_baseline_pct_imp)
paste0("RFR MSE consensus genes: ", n_mse_rfr_consensus_genes)


[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  model = [32mcol_double()[39m,
  ref_score = [32mcol_double()[39m
)



[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  geneID = [31mcol_character()[39m,
  mean_imp_0 = [32mcol_double()[39m,
  score_pct_improvement_0 = [32mcol_double()[39m,
  mean_imp_1 = [32mcol_double()[39m,
  score_pct_improvement_1 = [32mcol_double()[39m,
  mean_imp_2 = [32mcol_double()[39m,
  score_pct_improvement_2 = [32mcol_double()[39m,
  mean_imp_3 = [32mcol_double()[39m,
  score_pct_improvement_3 = [32mcol_double()[39m,
  mean_imp_4 = [32mcol_double()[39m,
  score_pct_improvem

model,ref_score
<dbl>,<dbl>
0,-417613.7
1,-412636.6
2,-413562.6
3,-419644.2
4,-410025.4


# Classification

## ANOVA analysis

In [43]:
# Welch ANOVA results for the selected dataset
welch_anova_results_df <- paste0(
    dirs$analysis_dir, "/feature_selection/",
    unified_dsets[dset_idx], "_welch_anova_results.tsv"
) %>%
    read_tsv()

# Filter on the q-value column
# (the `padj < p_thresh` variant of this filter is intentionally not used)
filtered_welch_anova_results_df <- dplyr::filter(
    welch_anova_results_df,
    qval < p_thresh
)

n_sig <- nrow(filtered_welch_anova_results_df)

paste0("Sig. ANOVA: ", n_sig)


[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  geneID = [31mcol_character()[39m,
  pval = [32mcol_double()[39m,
  padj = [32mcol_double()[39m,
  qval = [32mcol_double()[39m
)




## MI analysis

In [44]:
# MI (FIGO) results for the selected dataset
mi_figo_results_df <- paste0(
    dirs$analysis_dir, "/feature_selection/",
    unified_dsets[dset_idx], "_MI_figo_results.tsv"
) %>%
    read_tsv()

# Keep genes whose median MI estimate is strictly above the threshold
filtered_mi_figo_results_df <- dplyr::filter(
    mi_figo_results_df,
    MI_est_median > mi_thresh
)

n_mi <- nrow(filtered_mi_figo_results_df)

paste0("MI: ", n_mi)


[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  geneID = [31mcol_character()[39m,
  MI_est_median = [32mcol_double()[39m
)




In [45]:
# Sort by descending MI, freeze that gene order as factor levels, and only
# then drop the non-positive rows (the levels are set before the filter)
mi_figo_sorted_all <- dplyr::arrange(
    mi_figo_results_df,
    desc(MI_est_median)
)
mi_figo_ordered_df <- dplyr::filter(
    dplyr::mutate(
        mi_figo_sorted_all,
        geneID_f = factor(geneID, levels = geneID)
    ),
    MI_est_median > 0.0
)

# Number of genes whose MI is within 50% of the first (largest) value
mi_figo_ordered_df %>%
    dplyr::mutate(
        pct_delta_max =
            (MI_est_median - first(MI_est_median)) / first(MI_est_median) * 100
    ) %>%
    dplyr::filter(pct_delta_max > -50) %>%
    nrow()

# ggplot(data = mi_figo_ordered_df) +
#     geom_bar(aes(x = geneID_f, y = MI_est_median), stat = "identity") +
#     theme_classic() +
#     theme(
#         axis.text.x = element_blank(),
#         axis.ticks.x = element_blank()
#     ) +
#     labs(x = "Matrisome gene", y = "Mutual information score")

## Baselines

In [46]:
# Classification baselines for every dataset
cls_baselines_df <- paste0(
    dirs$analysis_dir, "/meta/", "cls_baselines.tsv"
) %>%
    read_tsv()
cls_baselines_df

# Baseline F1 (macro) values for the selected dataset
f1_macro_majority_baseline <- cls_baselines_df %>%
    filter(dataset == unified_dsets[dset_idx]) %>%
    pull(f1_macro_majority)
f1_macro_MC_baseline <- cls_baselines_df %>%
    filter(dataset == unified_dsets[dset_idx]) %>%
    pull(f1_macro_MC)

paste0("F1 macro (majority guess) baseline: ", f1_macro_majority_baseline)
paste0("F1 macro (Monte Carlo) baseline: ", f1_macro_MC_baseline)


[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  dataset = [31mcol_character()[39m,
  f1_macro_majority = [32mcol_double()[39m,
  f1_macro_MC = [32mcol_double()[39m,
  n = [32mcol_double()[39m
)




dataset,f1_macro_majority,f1_macro_MC,n
<chr>,<dbl>,<dbl>,<dbl>
unified_cervical_data,0.1730769,0.2487013,255
unified_uterine_data,0.140625,0.2443586,46
unified_uterine_endometrial_data,0.1738095,0.248081,137


## GBC

In [47]:
# Reference scores and per-gene results of the GBC run
f1_gbc_scores_df <- paste0(
    dirs$analysis_dir, "/feature_selection/",
    unified_dsets[dset_idx], "_gbc_ref_scores.tsv"
) %>%
    read_tsv()
f1_gbc_results_df <- paste0(
    dirs$analysis_dir, "/feature_selection/",
    unified_dsets[dset_idx], "_gbc_results.tsv"
) %>%
    read_tsv() %>%
    get_consensus_col(n = consensus_n, thresh = consensus_thresh)
f1_gbc_scores_df

# Mean reference score compared against the Monte Carlo F1 baseline
f1_gbc_avg <- mean(f1_gbc_scores_df$ref_score)
f1_gbc_imp <- f1_gbc_avg > f1_macro_MC_baseline
f1_gbc_baseline_pct_imp <- (f1_gbc_avg - f1_macro_MC_baseline) / f1_macro_MC_baseline * 100
# Rows flagged TRUE by get_consensus_col()
n_f1_gbc_consensus_genes <- sum(f1_gbc_results_df$consensus, na.rm = TRUE)

paste0("GBC F1 avg.: ", f1_gbc_avg)
paste0("GBC F1 impr. over baseline: ", f1_gbc_imp)
paste0("GBC F1 pct. impr. over baseline: ", f1_gbc_baseline_pct_imp)
paste0("GBC F1 consensus genes: ", n_f1_gbc_consensus_genes)


[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  model = [32mcol_double()[39m,
  ref_score = [32mcol_double()[39m
)



[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  geneID = [31mcol_character()[39m,
  mean_imp_0 = [32mcol_double()[39m,
  score_pct_improvement_0 = [32mcol_double()[39m,
  mean_imp_1 = [32mcol_double()[39m,
  score_pct_improvement_1 = [32mcol_double()[39m,
  mean_imp_2 = [32mcol_double()[39m,
  score_pct_improvement_2 = [32mcol_double()[39m,
  mean_imp_3 = [32mcol_double()[39m,
  score_pct_improvement_3 = [32mcol_double()[39m,
  mean_imp_4 = [32mcol_double()[39m,
  score_pct_improvem

model,ref_score
<dbl>,<dbl>
0,0.3055761
1,0.3002717
2,0.2761016
3,0.2815376
4,0.2840617


## RFC

In [48]:
# Reference scores and per-gene results of the RFC run
f1_rfc_scores_df <- paste0(
    dirs$analysis_dir, "/feature_selection/",
    unified_dsets[dset_idx], "_rfc_ref_scores.tsv"
) %>%
    read_tsv()
f1_rfc_results_df <- paste0(
    dirs$analysis_dir, "/feature_selection/",
    unified_dsets[dset_idx], "_rfc_results.tsv"
) %>%
    read_tsv() %>%
    get_consensus_col(n = consensus_n, thresh = consensus_thresh)
f1_rfc_scores_df

# Mean reference score compared against the Monte Carlo F1 baseline
f1_rfc_avg <- mean(f1_rfc_scores_df$ref_score)
f1_rfc_imp <- f1_rfc_avg > f1_macro_MC_baseline
f1_rfc_baseline_pct_imp <- (f1_rfc_avg - f1_macro_MC_baseline) / f1_macro_MC_baseline * 100
# Rows flagged TRUE by get_consensus_col()
n_f1_rfc_consensus_genes <- sum(f1_rfc_results_df$consensus, na.rm = TRUE)

paste0("rfc F1 avg.: ", f1_rfc_avg)
paste0("rfc F1 impr. over baseline: ", f1_rfc_imp)
paste0("rfc F1 pct. impr. over baseline: ", f1_rfc_baseline_pct_imp)
paste0("rfc F1 consensus genes: ", n_f1_rfc_consensus_genes)


[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  model = [32mcol_double()[39m,
  ref_score = [32mcol_double()[39m
)



[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  geneID = [31mcol_character()[39m,
  mean_imp_0 = [32mcol_double()[39m,
  score_pct_improvement_0 = [32mcol_double()[39m,
  mean_imp_1 = [32mcol_double()[39m,
  score_pct_improvement_1 = [32mcol_double()[39m,
  mean_imp_2 = [32mcol_double()[39m,
  score_pct_improvement_2 = [32mcol_double()[39m,
  mean_imp_3 = [32mcol_double()[39m,
  score_pct_improvement_3 = [32mcol_double()[39m,
  mean_imp_4 = [32mcol_double()[39m,
  score_pct_improvem

model,ref_score
<dbl>,<dbl>
0,0.2798849
1,0.254495
2,0.2862669
3,0.2587466
4,0.2717753


## LR (L1)

In [49]:
# Reference scores and per-gene results of the L1 LR run
f1_l1_lr_scores_df <- paste0(
    dirs$analysis_dir, "/feature_selection/",
    unified_dsets[dset_idx], "_l1_lr_ref_scores.tsv"
) %>%
    read_tsv()
f1_l1_lr_results_df <- paste0(
    dirs$analysis_dir, "/feature_selection/",
    unified_dsets[dset_idx], "_l1_lr_results.tsv"
) %>%
    read_tsv() %>%
    get_consensus_col(n = consensus_n, thresh = consensus_thresh)
f1_l1_lr_scores_df

# Mean reference score compared against the Monte Carlo F1 baseline
f1_l1_lr_avg <- mean(f1_l1_lr_scores_df$ref_score)
f1_l1_lr_imp <- f1_l1_lr_avg > f1_macro_MC_baseline
f1_l1_lr_pct_imp <- (f1_l1_lr_avg - f1_macro_MC_baseline) / f1_macro_MC_baseline * 100
# Rows flagged TRUE by get_consensus_col()
n_f1_l1_lr_consensus_genes <- sum(f1_l1_lr_results_df$consensus, na.rm = TRUE)

paste0("LR L1 F1 avg.: ", f1_l1_lr_avg)
paste0("LR L1 F1 impr. over baseline: ", f1_l1_lr_imp)
paste0("LR L1 F1 pct. impr. over baseline: ", f1_l1_lr_pct_imp)
paste0("LR L1 F1 consensus genes: ", n_f1_l1_lr_consensus_genes)


[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  model = [32mcol_double()[39m,
  ref_score = [32mcol_double()[39m
)



[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  geneID = [31mcol_character()[39m,
  mean_imp_0 = [32mcol_double()[39m,
  score_pct_improvement_0 = [32mcol_double()[39m,
  mean_imp_1 = [32mcol_double()[39m,
  score_pct_improvement_1 = [32mcol_double()[39m,
  mean_imp_2 = [32mcol_double()[39m,
  score_pct_improvement_2 = [32mcol_double()[39m,
  mean_imp_3 = [32mcol_double()[39m,
  score_pct_improvement_3 = [32mcol_double()[39m,
  mean_imp_4 = [32mcol_double()[39m,
  score_pct_improvem

model,ref_score
<dbl>,<dbl>
0,0.336661
1,0.3789767
2,0.3565851
3,0.3464781
4,0.3514332


# Network Analysis

In [50]:
# Gene-level and eigengene-level network tables for the selected dataset
network_mm_gs_df <- paste0(
    dirs$analysis_dir, "/network/", unified_dsets[dset_idx], "_gene_mm_gs.tsv"
) %>%
    read_tsv()
network_me_sig_df <- paste0(
    dirs$analysis_dir, "/network/", unified_dsets[dset_idx], "_eigengene_traits.tsv"
) %>%
    read_tsv()

# Restore the saved tumor data and network objects into the workspace;
# `lnames` ends up holding the object names returned by the second load()
lnames <- load(
    file = paste0(dirs$data_dir, "/saved_network_objects/", unified_dsets[dset_idx], "_tumor_data.RData")
)
lnames <- load(
    file = paste0(dirs$data_dir, "/saved_network_objects/", unified_dsets[dset_idx], "_tumor_network.RData")
)
pval_thresh <- 0.05

# NOTE(review): data_expr, module_colors and soft_power are presumably among
# the objects restored by the load() calls above -- confirm.
# Stack the result into one table (source element recorded in `module`) and
# move geneID to the first column
hub_df <- dplyr::select(
    bind_rows(
        get_most_conn_genes(data_expr, module_colors, soft_power, conn_vs_hub_thresh = 0.5),
        .id = "module"
    ),
    geneID, everything()
)



[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  .default = col_double(),
  geneID = [31mcol_character()[39m,
  module = [31mcol_character()[39m
)
[36mi[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m for the full column specifications.



[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  module = [31mcol_character()[39m,
  figo_stage_1_cor = [32mcol_double()[39m,
  figo_stage_2_cor = [32mcol_double()[39m,
  figo_stage_3_cor = [32mcol_double()[39m,
  figo_stage_4_cor = [32mcol_double()[39m,
  figo_stage_1_pval = [32mcol_double()[39m,
  figo_stage_2_pval = [32mcol_double()[39m,
  figo_stage_3_pval

In [51]:
# Condense the eigengene/trait table and prefix every column whose name does
# not start with "module" with "me_"
condensed_me_df <- network_me_sig_df %>%
    condense_figo(include_pvals = TRUE) %>%
    dplyr::rename_if(!startsWith(colnames(.), "module"), ~ paste0("me_", .))

# Genes whose module passes the FIGO q-value cutoff, that pass the module
# membership p-value cutoff, and that appear in hub_df
filtered_figo_network_df <- network_mm_gs_df %>%
    dplyr::select(geneID, module, mm_pval, mm_cor) %>%
    inner_join(condensed_me_df, by = "module") %>%
    dplyr::filter(
        me_figo_min_qval < pval_thresh,
        mm_pval < pval_thresh,
        geneID %in% hub_df$geneID
    )

# Same selection, but with the module-level cutoff applied to me_vital_qval
filtered_coxph_network_df <- network_mm_gs_df %>%
    dplyr::select(geneID, module, mm_pval, mm_cor) %>%
    inner_join(condensed_me_df, by = "module") %>%
    dplyr::filter(
        me_vital_qval < pval_thresh,
        mm_pval < pval_thresh,
        geneID %in% hub_df$geneID
    )

In [52]:
# Gene counts of the two filtered network tables
filtered_figo_network_df %>% nrow()
filtered_coxph_network_df %>% nrow()
# Number of distinct modules represented in each table
dplyr::n_distinct(filtered_figo_network_df$module)
dplyr::n_distinct(filtered_coxph_network_df$module)