In [1]:
library(tidyverse)

# Custom package
library(rutils)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [2]:
# Look up project directories from the shared developer paths file
# (the returned object is used below via `dirs$data_dir`).
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")

# TCGA project identifiers and the names of the unified dataset folders
projects <- c(
    "TCGA-CESC",
    "TCGA-UCS",
    "TCGA-UCEC",
    "TCGA-OV"
)
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)
matrisome_path <- paste0(dirs$data_dir, "/matrisome/matrisome_hs_masterlist.tsv")

# Numeric coding for vital status, handed to load_survival_df()
event_code <- list(Alive = 0, Dead = 1)

# Covariate and outcome columns used when filtering for complete samples
covariate_cols <- c("figo_stage", "age_at_diagnosis", "race", "ethnicity")
dep_cols <- c("vital_status", "survival_time")

In [3]:
# Index into `unified_dsets` choosing the dataset analysed in this notebook
# (1 selects "unified_cervical_data").
dset_idx <- 1

# Load and filter data

In [4]:
# Load the survival table for the selected dataset. `event_code` is passed to
# load_survival_df(); presumably it recodes vital_status to 0/1 (the tally
# further down shows values "0" and "1") -- confirm in rutils.
survival_path <- file.path(dirs$data_dir, unified_dsets[dset_idx], "survival_data.tsv")
survival_df <- load_survival_df(survival_path, event_code)

# Keep only samples with complete outcome and covariate information, then
# reduce to the sample ID and vital status (stored as character so it is
# treated as a grouping label rather than a number).
filtered_survival_df <- survival_df %>%
    decode_figo_stage(to = "c") %>%
    # all_of() errors if an expected column is missing; the superseded
    # one_of() would only warn and silently weaken the completeness filter.
    dplyr::select(dplyr::all_of(c("sample_name", dep_cols, covariate_cols))) %>%
    # Drop every row that has an NA in any of the selected columns
    tidyr::drop_na() %>%
    dplyr::select(sample_name, vital_status) %>%
    dplyr::mutate(vital_status = as.character(vital_status))

Parsed with column specification:
cols(
  sample_name = col_character(),
  vital_status = col_character(),
  survival_time = col_double(),
  figo_stage = col_character(),
  days_to_last_follow_up = col_double(),
  days_to_death = col_double(),
  age_at_diagnosis = col_double(),
  age_at_index = col_double(),
  height = col_double(),
  weight = col_double(),
  bmi = col_double(),
  race = col_character(),
  ethnicity = col_character()
)


# Load count data

In [5]:
# Load normalized matrisome count data
norm_matrisome_counts_df <- read_tsv(paste0(dirs$data_dir, "/", unified_dsets[dset_idx], "/norm_matrisome_counts.tsv"))
norm_matrisome_counts_t_df <- norm_matrisome_counts_df %>%
    dplyr::select(c("geneID", filtered_survival_df$sample_name)) %>%
    transpose_df("geneID", "sample_name")

Parsed with column specification:
cols(
  .default = col_double(),
  geneID = col_character()
)
See spec(...) for full column specifications.


# Join survival and count data

In [7]:
# Merge vital status with the normalized counts; only samples present in
# both tables are kept.
joined_df <- dplyr::inner_join(
    filtered_survival_df,
    norm_matrisome_counts_t_df,
    by = "sample_name"
)

# A '-' in a gene symbol would be read as subtraction inside a formula,
# so replace it with '_' in every column name.
names(joined_df) <- gsub("-", "_", names(joined_df))

In [8]:
# Number of samples per vital status. count() returns the same ungrouped
# two-column tibble (vital_status, n) as group_by() + summarize(n = n()),
# without emitting the "ungrouping output" message.
joined_df %>%
    dplyr::count(vital_status)

`summarise()` ungrouping output (override with `.groups` argument)


vital_status,n
<chr>,<int>
0,189
1,66


In [11]:
# Gene columns are all columns except the sample identifier and the outcome.
# Excluding them by name (rather than dropping the first two positions) keeps
# this correct even if the column order of joined_df changes.
gene_names <- names(joined_df)[!(names(joined_df) %in% c("sample_name", "vital_status"))]

In [14]:
# Run the rutils column-wise ANOVA helper over every gene column with
# vital_status as the grouping variable. The result has geneID, pval and
# padj columns; "BH" is presumably Benjamini-Hochberg adjustment as in
# stats::p.adjust() -- confirm in rutils.
waov_df <- colwise_anova(
    joined_df,
    "vital_status",
    gene_names,
    "geneID",
    adjust_method = "BH"
)

In [17]:
# Genes whose unadjusted p-value is below 0.05. Note that the filter is on
# `pval`, not on the multiple-testing-adjusted `padj` column.
dplyr::filter(waov_df, pval < 0.05)

geneID,pval,padj
<chr>,<dbl>,<dbl>
CCL22,0.033791571,0.4754617
ANGPT2,0.012419709,0.3265076
MUC8,0.001608024,0.1606416
SERPINA5,0.031020217,0.4557235
P4HA3,0.014453688,0.3437913
SLIT2,0.035060538,0.4798011
WNT3A,0.005007938,0.2312716
IFNG,0.030886168,0.4557235
S100A10,0.010346011,0.3012758
ESM1,0.005556075,0.2312716
