In [1]:
library(tidyverse)
library(survival)
library(survminer)
library(DESeq2)

# Custom package
library(rutils)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Loading required package: ggpubr
Loading required package: S4Vectors
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, 

In [2]:
# Resolve the project's directory locations from the shared dev-paths file
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")

# Project identifiers
# NOTE(review): four projects are listed here but only three unified data sets
# below -- confirm whether the mismatch is intentional.
projects <- c(
    "TCGA-CESC",
    "TCGA-UCS",
    "TCGA-UCEC",
    "TCGA-OV"
)

# Names of the unified data sets; one of them is selected by index further down
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)

# Location of the matrisome master list inside the data directory
matrisome_list <- paste(
    dirs$data_dir,
    "matrisome",
    "matrisome_hs_masterlist.tsv",
    sep = "/"
)

In [3]:
# Index into `unified_dsets` choosing which data set the cells below load and analyse
i <- 1

In [4]:
# Significant DESeq results (cross-referenced with the matrisome) for the selected data set
sig_deg_df <- read_tsv(
    sprintf(
        "%s/%s_sig_DESeq_results_xref_matrisome.tsv",
        dirs$analysis_dir,
        unified_dsets[i]
    )
)

# Keep only the rows flagged as belonging to the matrisome
matrisome_sig_deg_df <- dplyr::filter(sig_deg_df, in_matrisome == TRUE)

# Per-sample column data for the selected data set
coldata_df <- read_tsv(
    sprintf("%s/%s/coldata.tsv", dirs$data_dir, unified_dsets[i])
)

# Matrisome master list
matrisome_df <- rutils::load_matrisome_df(matrisome_list)

Parsed with column specification:
cols(
  geneID = col_character(),
  baseMean = col_double(),
  log2FoldChange = col_double(),
  lfcSE = col_double(),
  stat = col_double(),
  pvalue = col_double(),
  padj = col_double(),
  in_matrisome = col_logical(),
  division = col_character(),
  category = col_character(),
  gene_name = col_character(),
  synonyms = col_character(),
  hgnc_ids = col_double(),
  hgnc_ids_links = col_double(),
  uniprot_ids = col_character(),
  refseq_ids = col_character(),
  orthology = col_character(),
  notes = col_character()
)
Parsed with column specification:
cols(
  sample_name = col_character(),
  condition = col_character(),
  data_source = col_character()
)
Parsed with column specification:
cols(
  Division = col_character(),
  Category = col_character(),
  `Gene Symbol` = col_character(),
  `Gene Name` = col_character(),
  Synonyms = col_character(),
  HGNC_IDs = col_double(),
  `HGNC_IDs Links` = col_double(),
  UniProt_IDs = col_character(),
  Refseq_

# Load and filter survival data

In [5]:
# Event codes defined according to the survival::Surv() docs
event_code <- list("Alive" = 0, "Dead" = 1)

# Covariates included in every Cox model, and the outcome columns passed to Surv()
covariate_cols <- c("age_at_diagnosis", "bmi", "race", "ethnicity")
dep_cols <- c("vital_status", "survival_time")

survival_df <- load_survival_df(
    paste0(dirs$data_dir, "/", unified_dsets[i], "/survival_data.tsv"),
    event_code
)

# all_of() errors straight away if a required column is absent (the superseded
# one_of() only warned, deferring the failure to the model fit), and
# drop_na() keeps only the samples with no missing value in any selected column.
filtered_survival_df <- survival_df %>%
    dplyr::select(dplyr::all_of(c("sample_name", dep_cols, covariate_cols))) %>%
    tidyr::drop_na()

Parsed with column specification:
cols(
  sample_name = col_character(),
  vital_status = col_character(),
  survival_time = col_double(),
  figo_stage = col_character(),
  days_to_last_follow_up = col_double(),
  days_to_death = col_double(),
  age_at_diagnosis = col_double(),
  age_at_index = col_double(),
  height = col_double(),
  weight = col_double(),
  bmi = col_double(),
  race = col_character(),
  ethnicity = col_character()
)


In [6]:
# Number of samples left after dropping rows with missing values
NROW(filtered_survival_df)
# Share of the loaded survival records that made it into the final data set
NROW(filtered_survival_df) / NROW(survival_df)

# Load, filter, and normalize count data

In [7]:
# Read the raw counts, drop the Entrez ID column, keep matrisome genes only and
# round every numeric column to whole counts
counts_df <- paste0(dirs$data_dir, "/", unified_dsets[i], "/counts.tsv") %>%
    read_tsv() %>%
    dplyr::select(-Entrez_Gene_Id) %>%
    dplyr::rename(geneID = Hugo_Symbol) %>%
    dplyr::filter(geneID %in% matrisome_df$gene_symbol) %>%
    dplyr::mutate_if(is.numeric, ~ round(.x, digits = 0))


# Pick the count columns in the same order as the survival rows (samples absent
# from the survival table are left out), then discard genes whose counts sum to
# zero across the retained samples
filtered_survival_counts_df <- counts_df %>%
    dplyr::select(dplyr::all_of(c("geneID", filtered_survival_df$sample_name))) %>%
    dplyr::filter(rowSums(.[, -1]) > 0)

# Variance-stabilize the genes x samples count matrix and label its rows by gene
norm_filtered_survival_counts <- filtered_survival_counts_df %>%
    dplyr::select(-geneID) %>%
    as.matrix() %>%
    varianceStabilizingTransformation()
rownames(norm_filtered_survival_counts) <- filtered_survival_counts_df$geneID

Parsed with column specification:
cols(
  .default = col_double(),
  Hugo_Symbol = col_character()
)
See spec(...) for full column specifications.
converting counts to integer mode


In [8]:
# Sample columns of the normalized counts must line up one-to-one, in order, with
# the survival rows. identical() also reports FALSE on a length mismatch, which an
# element-wise `==` can mask through recycling, and reading colnames() directly
# avoids transposing the whole matrix just to inspect its names.
identical(
    colnames(norm_filtered_survival_counts),
    as.character(filtered_survival_df$sample_name)
)

# Combine filtered survival data and normalized count data

In [9]:
# Attach the normalized counts (transposed to one row per sample) to the
# filtered survival table, matching on sample name
joined_survival_counts_df <- dplyr::inner_join(
    filtered_survival_df,
    tibble::as_tibble(t(norm_filtered_survival_counts), rownames = "sample_name"),
    by = "sample_name"
)

# Hyphens in gene names would be read as subtraction inside a model formula,
# so swap each one for an underscore
colnames(joined_survival_counts_df) <- chartr(
    "-", "_", colnames(joined_survival_counts_df)
)

# Test significance of including a gene

## Cox PH model

In [11]:
# Every joined column that did not come from the survival table is a gene
genes_of_interest <- base::setdiff(
    colnames(joined_survival_counts_df),
    colnames(filtered_survival_df)
)

# Covariate-only (null) model formula, kept as text so gene terms can be appended
null_model_formula_chr <- paste(
    "Surv(survival_time, vital_status) ~",
    paste(covariate_cols, collapse = " + ")
)
survival_fit_null <- survival::coxph(
    formula = stats::as.formula(null_model_formula_chr),
    data = joined_survival_counts_df,
    singular.ok = TRUE
)

# Empty accumulators for the per-gene results
gene_pvals <- NULL
gene_coeffs <- NULL

In [12]:
# Fit one Cox PH model per gene (covariates + gene) and compare it with the
# covariate-only model through anova(), recording the p-value of the comparison
# and the gene's coefficient.
#
# The result vectors are (re)allocated here at full length, so re-running this
# cell on its own cannot append duplicate entries, and nothing is grown with
# c() inside the loop.
gene_pvals <- rep(NA_real_, length(genes_of_interest))
gene_coeffs <- rep(NA_real_, length(genes_of_interest))

for (gene_idx in seq_along(genes_of_interest)) {
    g <- genes_of_interest[[gene_idx]]
    gene_model_formula_chr <- paste0(null_model_formula_chr, " + ", g)
    survival_fit_gene <- coxph(
        as.formula(gene_model_formula_chr),
        data = joined_survival_counts_df,
        singular.ok = TRUE
    )
    anova_res <- anova(survival_fit_null, survival_fit_gene, test = "LRT")

    # Locate the p-value column instead of hard-coding its label: `[[` on a
    # missing column returns NULL, which would silently drop this gene's result.
    # NOTE(review): the exact label may differ between survival releases -- confirm.
    pval_col <- grep("(>|Chi|)", colnames(anova_res), fixed = TRUE, value = TRUE)
    stopifnot(length(pval_col) == 1)

    # Row 2 of the table is the gene model compared against the null model
    gene_pvals[[gene_idx]] <- anova_res[[pval_col]][2]
    gene_coeffs[[gene_idx]] <- survival_fit_gene$coefficients[[g]]
}

“Loglik converged before variable  10 ; coefficient may be infinite. ”

In [13]:
# Recover the original gene IDs now that formula-safe names are no longer needed.
# A blanket gsub("_", "-", ...) is not a true inverse: it would also rewrite
# underscores that were part of the original symbol. Instead, map each sanitized
# name back through a lookup built from the count-matrix row names, which still
# hold the untouched IDs.
original_gene_ids <- rownames(norm_filtered_survival_counts)
gene_id_lookup <- setNames(original_gene_ids, gsub("-", "_", original_gene_ids))
restored_gene_ids <- unname(gene_id_lookup[genes_of_interest])
stopifnot(!anyNA(restored_gene_ids))

cox_regression_df <- tibble(
    "geneID" = restored_gene_ids,
    "gene_pval" = gene_pvals,
    "gene_coeff" = gene_coeffs
)

# NOTE(review): the 0.05 cut-off is applied to unadjusted p-values although one
# test is run per gene -- confirm whether a multiple-testing correction is wanted.
sig_cox_regression_df <- cox_regression_df %>%
    dplyr::filter(gene_pval < 0.05)
write_tsv(
    sig_cox_regression_df,
    paste0(dirs$analysis_dir, "/", unified_dsets[i], "_sig_coxph_results.tsv")
)