In [18]:
library(tidyverse)
library(matrixStats)
library(WGCNA)

# Custom package
library(rutils)

Loading required package: dynamicTreeCut
Loading required package: fastcluster

Attaching package: ‘fastcluster’

The following object is masked from ‘package:stats’:

    hclust



Attaching package: ‘WGCNA’

The following object is masked from ‘package:stats’:

    cor



In [2]:
# Directory paths returned by rutils for the given dev-paths file
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")

# Names of the unified dataset directories (one per cohort)
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)

# Vital-status labels mapped to numeric event codes
event_code <- list(Alive = 0, Dead = 1)

# Outcome (dependent) columns
dep_cols <- c("vital_status", "survival_time")

# Covariate columns, without and with FIGO stage (FIGO stage goes first)
covariate_cols_no_figo <- c("age_at_diagnosis", "race", "ethnicity")
covariate_cols <- append(covariate_cols_no_figo, "figo_stage", after = 0)

In [34]:
# Position in `unified_dsets` of the dataset to analyze
# (R indexing is 1-based, so 2 selects "unified_uterine_data")
dset_idx <- 2

In [35]:
# Load survival outcomes and keep only samples for which both the vital
# status and the survival time are present
survival_path <- file.path(dirs$data_dir, unified_dsets[dset_idx], "survival_data.tsv")
survival_df <- load_survival_df(survival_path, event_code)
filtered_survival_df <- survival_df %>%
    dplyr::select(sample_name, vital_status, survival_time) %>%
    dplyr::filter(rowSums(is.na(.)) == 0)

# Load normalized matrisome count data (genes in rows, samples in columns),
# restricted to the samples that have complete survival data
norm_matrisome_counts_path <- file.path(dirs$data_dir, unified_dsets[dset_idx], "norm_matrisome_counts.tsv")
norm_survival_counts_df <- read_tsv(norm_matrisome_counts_path) %>%
    dplyr::select(c("geneID", filtered_survival_df$sample_name)) %>%
    # Want to drop genes which have variance near zero
    dplyr::mutate(var = rowVars(as.matrix(.[-1]))) %>%
    dplyr::select(geneID, var, everything()) %>%
    dplyr::filter(var > 1e-10)

# Transpose to a samples x genes table. The helper `var` column is removed
# first; otherwise it is transposed into a bogus row named "var" alongside
# the real samples.
norm_survival_counts_t_df <- norm_survival_counts_df %>%
    dplyr::select(-var) %>%
    column_to_rownames(var = "geneID") %>%
    t() %>%
    as_tibble(rownames = "sample_name")

# Attach outcomes to expression values; vital_status becomes a two-level
# factor with '1' as the first level
joined_df <- norm_survival_counts_t_df %>%
    inner_join(filtered_survival_df, by = "sample_name") %>%
    dplyr::select(sample_name, vital_status, everything()) %>%
    dplyr::mutate(vital_status = factor(vital_status, levels = c("1", "0")))

# Some genes contain the '-' symbol, which affects formulae
colnames(joined_df) <- gsub("-", "_", colnames(joined_df), fixed = TRUE)

# Gene columns to test one by one. Exclude the identifier and outcome
# columns by name: `survival_time` sits at the end of joined_df, so dropping
# only the first two columns would treat it as a gene and add a spurious
# p-value to the multiple-testing correction.
non_gene_cols <- c("sample_name", "vital_status", "survival_time")
gene_names <- setdiff(colnames(joined_df), non_gene_cols)

Parsed with column specification:
cols(
  sample_name = col_character(),
  vital_status = col_character(),
  survival_time = col_double(),
  figo_stage = col_character(),
  days_to_last_follow_up = col_double(),
  days_to_death = col_double(),
  age_at_diagnosis = col_double(),
  age_at_index = col_double(),
  height = col_double(),
  weight = col_double(),
  bmi = col_double(),
  race = col_character(),
  ethnicity = col_character()
)
Parsed with column specification:
cols(
  .default = col_double(),
  geneID = col_character()
)
See spec(...) for full column specifications.


In [36]:
# Per-gene test of expression against vital status. Rows containing any NA
# are dropped, then q-values are estimated from the remaining p-values.
welch_t_df <- dplyr::mutate(
    na.omit(colwise_t_test(joined_df, "vital_status", gene_names, "geneID")),
    qval = WGCNA::qvalue(pval)$qvalues
)

In [37]:
# Show genes whose q-value is below 0.05
dplyr::filter(welch_t_df, qval < 0.05)

geneID,pval,padj,qval
<chr>,<dbl>,<dbl>,<dbl>
