In [1]:
library(tidyverse)
library(survival)
library(survminer)

# Custom package
library(rutils)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Loading required package: ggpubr


In [2]:
# Directory paths returned by rutils; `data_dir` and `analysis_dir` are the
# entries used further down in this notebook.
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")

# TCGA project identifiers.
projects <- c(
    "TCGA-CESC",
    "TCGA-UCS",
    "TCGA-UCEC",
    "TCGA-OV"
)

# Names of the per-data-set subdirectories under `dirs$data_dir`.
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)

# Location of the matrisome master list.
matrisome_list <- paste0(
    dirs$data_dir,
    "/matrisome/matrisome_hs_masterlist.tsv"
)

In [3]:
# Index into `unified_dsets` selecting which data set the cells below load,
# analyse and write results for.
i <- 1

# Load and filter survival data

In [4]:
# event codes defined according to survival::Surv() docs
event_code <- list("Alive" = 0, "Dead" = 1)
# Covariates of the null Cox model and the outcome columns passed to Surv().
covariate_cols <- c("age_at_diagnosis", "bmi", "race", "ethnicity")
dep_cols <- c("vital_status", "survival_time")

# NOTE(review): load_survival_df() is not defined in this notebook; presumably
# it recodes `vital_status` using `event_code` -- confirm against its source.
survival_df <- load_survival_df(
    file.path(dirs$data_dir, unified_dsets[i], "survival_data.tsv"),
    event_code
)

# Keep only the sample ID, outcome and covariate columns, then drop every
# sample with a missing value in any of them.
# all_of() raises an error if a required column is absent; the superseded
# one_of() only warned, letting the problem surface later inside coxph().
filtered_survival_df <- survival_df %>%
    dplyr::select(dplyr::all_of(c("sample_name", dep_cols, covariate_cols))) %>%
    tidyr::drop_na()

Parsed with column specification:
cols(
  sample_name = col_character(),
  vital_status = col_character(),
  survival_time = col_double(),
  figo_stage = col_character(),
  days_to_last_follow_up = col_double(),
  days_to_death = col_double(),
  age_at_diagnosis = col_double(),
  age_at_index = col_double(),
  height = col_double(),
  weight = col_double(),
  bmi = col_double(),
  race = col_character(),
  ethnicity = col_character()
)


In [5]:
# Number of samples left after dropping rows with missing values
n_retained_samples <- nrow(filtered_survival_df)
n_retained_samples
# Fraction of all loaded samples that made it into the final data set
n_retained_samples / nrow(survival_df)

# Load normalized matrisome count data

In [6]:
# Read the normalized matrisome counts as a numeric matrix with genes as rows
# (row names taken from the `geneID` column) and samples as columns.
norm_counts_path <- file.path(
    dirs$data_dir, unified_dsets[i], "norm_matrisome_counts.tsv"
)
norm_matrisome_counts <- read_tsv(norm_counts_path) %>%
    column_to_rownames(var = "geneID") %>%
    as.matrix()

# Every retained survival sample must have a counts column; fail with an
# explicit message rather than a bare "subscript out of bounds".
missing_samples <- setdiff(
    filtered_survival_df$sample_name,
    colnames(norm_matrisome_counts)
)
if (length(missing_samples) > 0) {
    stop(
        "Samples missing from normalized counts: ",
        paste(missing_samples, collapse = ", "),
        call. = FALSE
    )
}

# Match up columns of counts with rows of survival data & only include samples present in survival data
# drop = FALSE keeps the result a matrix even if a single sample is selected.
norm_matrisome_survival_counts <- norm_matrisome_counts[
    , filtered_survival_df$sample_name,
    drop = FALSE
]

Parsed with column specification:
cols(
  .default = col_double(),
  geneID = col_character()
)
See spec(...) for full column specifications.


In [7]:
# Sanity check: the count matrix columns must be in the same order as the
# survival rows (the column names of the matrix are the row names of its
# transpose).
all(colnames(norm_matrisome_survival_counts) == filtered_survival_df$sample_name)

# Combine filtered survival data and normalized count data

In [8]:
# Transpose the counts so each sample is a row, carrying the sample ID along
# as a regular column to join on.
sample_counts_tbl <- as_tibble(
    t(norm_matrisome_survival_counts),
    rownames = "sample_name"
)

# One row per sample: outcome + covariates followed by one column per gene.
joined_survival_counts_df <- inner_join(
    filtered_survival_df,
    sample_counts_tbl,
    by = "sample_name"
)

# '-' inside a gene name would be parsed as subtraction in a model formula,
# so swap it for '_' in every column name.
sanitized_colnames <- gsub("-", "_", colnames(joined_survival_counts_df))
colnames(joined_survival_counts_df) <- sanitized_colnames

## Cox PH model

In [9]:
# Gene columns = every column of the joined table that did not come from the
# survival data.
genes_of_interest <- joined_survival_counts_df %>%
    dplyr::select(-colnames(filtered_survival_df)) %>%
    colnames()

# Null model: survival outcome explained by the clinical covariates only.
covariate_terms <- paste(covariate_cols, collapse = " + ")
null_model_formula_chr <- paste0(
    "Surv(survival_time, vital_status) ~ ",
    covariate_terms
)
survival_fit_null <- coxph(
    as.formula(null_model_formula_chr),
    singular.ok = TRUE,
    data = joined_survival_counts_df
)

# Empty accumulators for the per-gene results.
gene_pvals <- NULL
gene_coeffs <- NULL

In [10]:
# For each gene, fit a Cox PH model with the null-model covariates plus that
# gene's expression, compare it to the covariate-only null model, and record
# the comparison p-value and the gene's coefficient.
#
# The result vectors are allocated here (one slot per gene) and filled by
# index. Re-running this cell on its own therefore overwrites previous results
# instead of appending a second copy, which would break the length match with
# `genes_of_interest`.
gene_pvals <- rep(NA_real_, length(genes_of_interest))
gene_coeffs <- rep(NA_real_, length(genes_of_interest))

for (gene_idx in seq_along(genes_of_interest)) {
    g <- genes_of_interest[[gene_idx]]
    gene_model_formula_chr <- paste0(null_model_formula_chr, " + ", g)
    survival_fit_gene <- coxph(
        as.formula(gene_model_formula_chr),
        data = joined_survival_counts_df,
        singular.ok = TRUE
    )
    # Row 2 of the anova table belongs to the gene model (second argument).
    anova_res <- anova(survival_fit_null, survival_fit_gene, test = "LRT")
    gene_pvals[[gene_idx]] <- anova_res[["P(>|Chi|)"]][2]
    gene_coeffs[[gene_idx]] <- survival_fit_gene$coefficients[[g]]
}

“Loglik converged before variable  10 ; coefficient may be infinite. ”

In [12]:
# Formula fitting is done, so put '-' back into the gene IDs.
# NOTE(review): this turns every '_' into '-', so it only restores the
# original IDs if none of them contained '_' to begin with -- confirm.
restored_gene_ids <- gsub("_", "-", genes_of_interest)
cox_regression_df <- tibble(
    "geneID" = restored_gene_ids,
    "gene_pval" = gene_pvals,
    "gene_coeff" = gene_coeffs
)

# Genes whose (unadjusted) p-value is below 0.05, and how many there are.
sig_cox_regression_df <- dplyr::filter(cox_regression_df, gene_pval < 0.05)
nrow(sig_cox_regression_df)

# Persist the full (unfiltered) results table for the selected data set.
coxph_results_path <- paste0(dirs$analysis_dir, "/", unified_dsets[i], "_coxph_results.tsv")
write_tsv(cox_regression_df, coxph_results_path)