In [157]:
library(tidyverse)

# Custom package
library(rutils)

In [158]:
# Directory listing obtained from rutils using the shared dev-paths file;
# its `data_dir` entry is used to build the input paths.
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")

# TCGA project identifiers.
projects <- c(
    "TCGA-CESC",
    "TCGA-UCS",
    "TCGA-UCEC",
    "TCGA-OV"
)

# Unified dataset names; each is used as a sub-directory of `dirs$data_dir`.
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)

# Location of the matrisome master list.
matrisome_path <- paste0(
    dirs$data_dir,
    "/matrisome/matrisome_hs_masterlist.tsv"
)

In [159]:
# Index into `unified_dsets` selecting the dataset analyzed in this run.
dset_idx <- 1

# Load and filter survival data

In [160]:
# Numeric codes for the two vital-status labels (passed to the survival loader).
event_code <- list(Alive = 0, Dead = 1)

# Covariate column names, without and with FIGO stage.
covariate_cols_no_figo <- c(
    "age_at_diagnosis",
    "race",
    "ethnicity"
)
covariate_cols <- append("figo_stage", covariate_cols_no_figo)

# Outcome column names.
dep_cols <- c(
    "vital_status",
    "survival_time"
)

In [161]:
# Path to the survival table of the selected dataset.
survival_path <- paste0(
    dirs$data_dir, "/", unified_dsets[dset_idx], "/survival_data.tsv"
)

# Load the table, handing the loader the vital-status codes in `event_code`.
survival_df <- survival_path %>%
    load_survival_df(event_code)

Parsed with column specification:
cols(
  sample_name = col_character(),
  vital_status = col_character(),
  survival_time = col_double(),
  figo_stage = col_character(),
  days_to_last_follow_up = col_double(),
  days_to_death = col_double(),
  age_at_diagnosis = col_double(),
  age_at_index = col_double(),
  height = col_double(),
  weight = col_double(),
  bmi = col_double(),
  race = col_character(),
  ethnicity = col_character()
)


In [162]:
# Step 1: decode FIGO stage, keep it as a character column, and retain only
# samples with no missing value in stage/race/ethnicity/age, so that the same
# samples are used as in the classification models. Only the sample name and
# stage are carried forward.
filtered_survival_df <- survival_df %>%
    decode_figo_stage(to = "n") %>%
    dplyr::mutate(figo_stage = as.character(figo_stage)) %>%
    dplyr::select(sample_name, figo_stage, race, ethnicity, age_at_diagnosis) %>%
    dplyr::filter(complete.cases(.)) %>%
    dplyr::select(sample_name, figo_stage)

# Step 2: append the one-hot encoding of `figo_stage` as extra columns.
# The original `figo_stage` column is deliberately kept
# (a trailing `dplyr::select(-figo_stage)` is disabled).
filtered_survival_df <- bind_cols(
    filtered_survival_df,
    to_one_hot(filtered_survival_df, "figo_stage")
)

# Load normalized matrisome count data

In [163]:
# Path to the normalized matrisome counts of the selected dataset.
norm_matrisome_counts_path <- paste0(
    dirs$data_dir, "/", unified_dsets[dset_idx], "/norm_matrisome_counts.tsv"
)

# Read the counts, keep the gene ID plus the columns of the retained samples,
# then transpose so that `sample_name` becomes a column.
norm_survival_counts_t_df <- norm_matrisome_counts_path %>%
    read_tsv() %>%
    dplyr::select(dplyr::all_of(c("geneID", filtered_survival_df$sample_name))) %>%
    transpose_df("geneID", "sample_name")

Parsed with column specification:
cols(
  .default = col_double(),
  geneID = col_character()
)
See spec(...) for full column specifications.


In [164]:
# Attach the per-sample gene counts to the stage table, matching on sample name.
joined_df <- inner_join(
    filtered_survival_df,
    norm_survival_counts_t_df,
    by = "sample_name"
)
# Disabled alternative: collapse the stages into early/late groups.
#     dplyr::mutate(figo_stage = case_when(
#         figo_stage %in% c("figo_stage_1", "figo_stage_2") ~ "early",
#         figo_stage %in% c("figo_stage_3", "figo_stage_4") ~ "late"
#     ))

In [165]:
# Number of samples per FIGO stage. `count()` yields the same ungrouped
# `figo_stage` / `n` table as `group_by() %>% summarize(n = n())`, without the
# "ungrouping output" message.
joined_df %>%
    dplyr::count(figo_stage)

`summarise()` ungrouping output (override with `.groups` argument)


figo_stage,n
<chr>,<int>
1,135
2,61
3,40
4,19


In [166]:
# A '-' inside a gene symbol would be parsed as subtraction once the name is
# placed in a formula, so every '-' in the column names is replaced by '_'.
names(joined_df) <- gsub("-", "_", names(joined_df), fixed = TRUE)

In [167]:
# Grouping variables: every column of `filtered_survival_df` after the first
# two (`sample_name`, `figo_stage`), i.e. the one-hot stage indicator columns.
indicator_cols <- colnames(filtered_survival_df)[-c(1:2)]

# `joined_df` starts with the columns of `filtered_survival_df`; everything
# after them is a gene. Deriving the offset from the data (instead of a
# hard-coded 6) keeps this correct when the number of stage indicators differs.
gene_names <- colnames(joined_df)[-seq_len(ncol(filtered_survival_df))]

# One row per gene, one p-value column per indicator. Cells start as NA so an
# unfilled entry cannot be mistaken for a p-value.
pval_mat <- matrix(
    NA_real_,
    nrow = length(gene_names),
    ncol = length(indicator_cols)
)
colnames(pval_mat) <- paste0(indicator_cols, "_pval")

In [168]:
# Grouping variables tested against every gene; computed once, outside the loop.
group_cols <- colnames(filtered_survival_df)[-c(1:2)]

# For each gene, run one `oneway.test()` per grouping column (default
# `var.equal = FALSE`, i.e. Welch's test) and store the p-values in the gene's
# row of `pval_mat`. `seq_along()` makes the loop a no-op when there are no
# genes, where `1:length(gene_names)` would iterate over c(1, 0).
for (i in seq_along(gene_names)) {
    formula_strs <- paste0(gene_names[i], " ~ ", group_cols)
    pval_mat[i, ] <- vapply(
        formula_strs,
        function(f) oneway.test(as.formula(f), data = joined_df)$p.value,
        numeric(1),
        USE.NAMES = FALSE
    )
}


In [169]:
# Benjamini-Hochberg adjustment, applied separately to each p-value column.
# Working column by column on the matrix keeps the result a matrix even when
# there is a single gene (where `apply()` would collapse it to a vector).
waov_padj <- pval_mat
for (j in seq_len(ncol(pval_mat))) {
    waov_padj[, j] <- p.adjust(pval_mat[, j], method = "BH")
}
colnames(waov_padj) <- gsub("pval", "padj", colnames(pval_mat))

# Result table: gene name, raw p-values, then adjusted p-values.
waov_df <- tibble(gene_names) %>%
    bind_cols(as_tibble(pval_mat), as_tibble(waov_padj))

In [170]:
# Save the result table as a TSV in the analysis directory, with the dataset
# name as file-name prefix.
waov_df %>%
    write_tsv(
        paste0(
            dirs$analysis_dir, "/", unified_dsets[dset_idx],
            "_welch_anova_one_vs_all_results.tsv"
        )
    )