In [1]:
library(tidyverse)
library(survival)
library(survminer)
library(DESeq2)

# Custom package
library(rutils)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Loading required package: ggpubr
Loading required package: S4Vectors
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, 

In [2]:
# Directory paths for this project, read from the shared dev-paths file.
# The `data_dir` and `analysis_dir` fields are used by the cells below.
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")

# TCGA project identifiers
projects <- c(
    "TCGA-CESC",
    "TCGA-UCS",
    "TCGA-UCEC",
    "TCGA-OV"
)

# Unified data set names; each is used both as a sub-directory of `data_dir`
# and as a file-name prefix in `analysis_dir`
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)

# Location of the matrisome master list
matrisome_list <- paste(
    dirs$data_dir,
    "matrisome",
    "matrisome_hs_masterlist.tsv",
    sep = "/"
)

In [3]:
# Index into `unified_dsets` selecting the data set analysed by the cells below
# (1 = "unified_cervical_data")
i <- 1

In [4]:
# Significant DESeq results (cross-referenced with the matrisome) for data set `i`
sig_deg_path <- paste0(
    dirs$analysis_dir, "/", unified_dsets[i], "_sig_DESeq_results_xref_matrisome.tsv"
)
sig_deg_df <- read_tsv(sig_deg_path)

# Keep only the significant genes flagged as belonging to the matrisome
matrisome_sig_deg_df <- dplyr::filter(sig_deg_df, in_matrisome)

# Sample annotations for the same data set
coldata_path <- paste0(dirs$data_dir, "/", unified_dsets[i], "/coldata.tsv")
coldata_df <- read_tsv(coldata_path)

# Matrisome master list
matrisome_df <- rutils::load_matrisome_df(matrisome_list)

Parsed with column specification:
cols(
  geneID = col_character(),
  baseMean = col_double(),
  log2FoldChange = col_double(),
  lfcSE = col_double(),
  stat = col_double(),
  pvalue = col_double(),
  padj = col_double(),
  in_matrisome = col_logical(),
  division = col_character(),
  category = col_character(),
  gene_name = col_character(),
  synonyms = col_character(),
  hgnc_ids = col_double(),
  hgnc_ids_links = col_double(),
  uniprot_ids = col_character(),
  refseq_ids = col_character(),
  orthology = col_character(),
  notes = col_character()
)
Parsed with column specification:
cols(
  sample_name = col_character(),
  condition = col_character(),
  data_source = col_character()
)
Parsed with column specification:
cols(
  Division = col_character(),
  Category = col_character(),
  `Gene Symbol` = col_character(),
  `Gene Name` = col_character(),
  Synonyms = col_character(),
  HGNC_IDs = col_double(),
  `HGNC_IDs Links` = col_double(),
  UniProt_IDs = col_character(),
  Refseq_

# Load and filter survival data

In [5]:
# Event codes defined according to the survival::Surv() docs
event_code <- list("Alive" = 0, "Dead" = 1)
# Covariates included in every Cox model, and the outcome columns
covariate_cols <- c("age_at_diagnosis", "bmi", "race", "ethnicity")
dep_cols <- c("vital_status", "survival_time")

# Read the survival table and recode `vital_status` in place to its numeric
# event code. Any status other than "Dead"/"Alive" becomes NA and is removed by
# the complete-case filter below. `sample_name` and `vital_status` are moved to
# the front; the remaining columns keep their original order.
survival_df <- read_tsv(paste0(dirs$data_dir, "/", unified_dsets[i], "/survival_data.tsv")) %>%
    mutate(vital_status = case_when(
        vital_status == "Dead" ~ event_code[["Dead"]],
        vital_status == "Alive" ~ event_code[["Alive"]]
    )) %>%
    dplyr::select(sample_name, vital_status, everything())
survival_cols <- colnames(survival_df)

# Keep only the columns needed for modelling and drop samples with any missing
# value. all_of() stops with an error if a required column is absent; the
# superseded one_of() only warned, deferring the failure to model fitting.
filtered_survival_df <- survival_df %>%
    dplyr::select(all_of(c("sample_name", dep_cols, covariate_cols))) %>%
    tidyr::drop_na()

Parsed with column specification:
cols(
  sample_name = col_character(),
  vital_status = col_character(),
  survival_time = col_double(),
  figo_stage = col_character(),
  days_to_last_follow_up = col_double(),
  days_to_death = col_double(),
  age_at_diagnosis = col_double(),
  age_at_index = col_double(),
  height = col_double(),
  weight = col_double(),
  bmi = col_double(),
  race = col_character(),
  ethnicity = col_character()
)


In [6]:
# Number of samples with complete outcome and covariate data
n_complete_samples <- nrow(filtered_survival_df)
n_complete_samples
# Fraction of all samples that made it into the final data set
n_complete_samples / nrow(survival_df)

# Load, filter, and normalize count data

In [8]:
# Read raw counts, keep only matrisome genes, then round to whole counts
# (DESeq2 converts the matrix to integer mode below). Filtering before rounding
# means only the retained rows are rounded; the result is unchanged.
counts_df <- read_tsv(paste0(dirs$data_dir, "/", unified_dsets[i], "/counts.tsv")) %>%
    dplyr::rename(geneID = Hugo_Symbol) %>%
    dplyr::select(-Entrez_Gene_Id) %>%
    dplyr::filter(geneID %in% matrisome_df$gene_symbol) %>%    # Only care about matrisome genes
    dplyr::mutate(across(where(is.numeric), ~ round(.x, 0)))


# Match up columns of counts with rows of survival data & only include samples
# present in survival data. Columns come out in the order of
# `filtered_survival_df$sample_name`; a sample missing from the counts is an error.
filtered_survival_counts_df <- counts_df %>%
    dplyr::select(all_of(c("geneID", filtered_survival_df$sample_name))) %>%
    dplyr::filter(rowSums(dplyr::select(., -geneID)) > 0)    # Ignore genes which are unexpressed in this group

# Variance-stabilise the genes x samples count matrix, then label its rows
# with the gene IDs (the matrix is built without the geneID column)
norm_filtered_survival_counts <- varianceStabilizingTransformation(
    as.matrix(dplyr::select(filtered_survival_counts_df, -geneID))
)
rownames(norm_filtered_survival_counts) <- filtered_survival_counts_df$geneID

Parsed with column specification:
cols(
  .default = col_double(),
  Hugo_Symbol = col_character()
)
See spec(...) for full column specifications.
converting counts to integer mode


In [9]:
# The samples (columns) of the normalized count matrix must appear in exactly
# the same order as the rows of the filtered survival data.
# identical() also returns FALSE when the two vectors differ in length, whereas
# an element-wise `==` would recycle the shorter one.
identical(colnames(norm_filtered_survival_counts), filtered_survival_df$sample_name)

# Combine filtered survival data and normalized count data

In [10]:
# Transpose the normalized counts to one row per sample, one column per gene,
# carrying the sample names in a `sample_name` column
norm_counts_by_sample <- as_tibble(
    t(norm_filtered_survival_counts),
    rownames = "sample_name"
)

# Attach the per-sample gene values to the survival data
joined_survival_counts_df <- inner_join(
    filtered_survival_df,
    norm_counts_by_sample,
    by = "sample_name"
)

# Some gene names contain '-', which would be parsed as subtraction in a
# formula, so replace it with '_' in every column name
names(joined_survival_counts_df) <- gsub(
    "-", "_", names(joined_survival_counts_df),
    fixed = TRUE
)

# Test significance of including a gene

In [19]:
# Every joined column that does not come from the survival table is a gene
joined_cols <- colnames(joined_survival_counts_df)
genes_of_interest <- joined_cols[!(joined_cols %in% colnames(filtered_survival_df))]

# Covariate-only ("null") Cox model that each per-gene model is compared against
null_model_formula_chr <- paste(
    "Surv(survival_time, vital_status) ~",
    paste(covariate_cols, collapse = " + ")
)
survival_fit_null <- coxph(
    as.formula(null_model_formula_chr),
    data = joined_survival_counts_df,
    singular.ok = TRUE
)

# Accumulators for the per-gene results
gene_pvals <- c()
gene_coeffs <- c()

In [20]:
# For each gene, fit a Cox model with the null covariates plus that gene and
# compare it with the covariate-only model via a likelihood-ratio test.
# The result vectors are (re)created here at their final size, so re-running
# this cell on its own starts clean instead of appending to earlier results,
# and no vector is regrown on every iteration.
gene_pvals <- rep(NA_real_, length(genes_of_interest))
gene_coeffs <- rep(NA_real_, length(genes_of_interest))

for (gene_idx in seq_along(genes_of_interest)) {
    g <- genes_of_interest[[gene_idx]]
    gene_model_formula_chr <- paste0(null_model_formula_chr, " + ", g)
    survival_fit_gene <- coxph(
        as.formula(gene_model_formula_chr),
        data = joined_survival_counts_df,
        singular.ok = TRUE
    )
    anova_res <- anova(survival_fit_null, survival_fit_gene, test = "LRT")

    # Locate the p-value column by its "(>|Chi|)" label and insist on exactly
    # one match, so an unexpected table layout fails here rather than silently
    # producing a shorter result vector.
    pval_col <- grep("(>|Chi|)", colnames(anova_res), fixed = TRUE, value = TRUE)
    stopifnot(length(pval_col) == 1)

    # Row 2 of the table is the gene model compared against the null model
    gene_pvals[gene_idx] <- anova_res[[pval_col]][2]
    gene_coeffs[gene_idx] <- survival_fit_gene$coefficients[[g]]
}

“Loglik converged before variable  10 ; coefficient may be infinite. ”

In [21]:
# Recover the original gene IDs now that formula-safe names are no longer needed.
# Blindly turning every '_' back into '-' would corrupt any symbol that really
# contains an underscore, so look each name up among the untouched row names of
# the normalized count matrix instead.
original_gene_ids <- rownames(norm_filtered_survival_counts)
formula_safe_gene_ids <- gsub("-", "_", original_gene_ids)
restored_gene_ids <- original_gene_ids[match(genes_of_interest, formula_safe_gene_ids)]
stopifnot(!anyNA(restored_gene_ids))

cox_regression_df <- tibble(
    "geneID" = restored_gene_ids,
    "gene_pval" = gene_pvals,
    "gene_coeff" = gene_coeffs
)

# NOTE(review): the threshold is applied to raw likelihood-ratio p-values with
# no multiple-testing adjustment across genes; confirm this is intended.
sig_cox_regression_df <- cox_regression_df %>%
    dplyr::filter(gene_pval < 0.05)

In [23]:
# Matrisome annotations for genes that are both significantly differentially
# expressed and significant in the Cox regression
dplyr::filter(
    matrisome_df,
    gene_symbol %in% matrisome_sig_deg_df$geneID,
    gene_symbol %in% sig_cox_regression_df$geneID
)

gene_symbol,division,category,gene_name,synonyms,hgnc_ids,hgnc_ids_links,uniprot_ids,refseq_ids,orthology,notes
<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>
COLQ,Core matrisome,ECM Glycoproteins,collagen-like tail subunit (single strand of homotrimer) of asymmetric acetylcholinesterase,EAD|FLJ55041,2226,2226,C9JBB2:F8WA67:Q9Y215:S4R408,NP_005668.2:NP_536799.1:NP_536800.2,Mouse:Colq|,Basement Membrane
EMILIN3,Core matrisome,ECM Glycoproteins,elastin microfibril interfacer 3,C20orf130|DKFZp434A2410|EMILIN5|dJ620E11.4,16123,16123,Q9NT22,NP_443078.1,Mouse:Emilin3|,
LAMC2,Core matrisome,ECM Glycoproteins,"laminin, gamma 2",B2T|BM600|CSF|EBR2|EBR2A|LAMB2T|LAMNB2|MGC138491|M,6493,6493,Q13753,NP_005553.2:NP_061486.2,Mouse:Lamc2|,"Laminin, Basement Membrane"
MATN4,Core matrisome,ECM Glycoproteins,matrilin 4,FLJ14417|HE6WCR54,6910,6910,A6NNA4:O95460,NP_085080.1:NP_085095.1:XP_005260654.1:NP_085095.1,Mouse:Matn4|,
MMRN2,Core matrisome,ECM Glycoproteins,multimerin 2,EMILIN3|EndoGlyx-1|FLJ13465,19888,19888,Q9H8L6:R4GMY6:V9GY37:V9GY43:V9GYS9,NP_079032.2,Mouse:Mmrn2|,
NID1,Core matrisome,ECM Glycoproteins,nidogen 1,NID,7821,7821,P14543,NP_002499.2,Mouse:Nid1|,Basement Membrane
NTN5,Core matrisome,ECM Glycoproteins,netrin 5,-,25208,25208,M0QXZ9:Q8WTR8,NP_665806.1:XP_006723074.1,Mouse:Ntn5|,
NTNG2,Core matrisome,ECM Glycoproteins,netrin G2,KIAA0625|KIAA1857|LHLL9381|Lmnt2|MGC21884|NTNG1|bA,14288,14288,A6NMX7:Q5JUJ3:Q96CW9,NP_115925.2,Mouse:Ntng2|,
RELN,Core matrisome,ECM Glycoproteins,reelin,PRO1598|RL,9957,9957,H7C2B0:J3KQ66:P78509,NP_005036.2:NP_774959.1,Mouse:Reln|,
SPP1,Core matrisome,ECM Glycoproteins,secreted phosphoprotein 1,BNSP|BSPI|ETA-1|MGC110940|OPN,11255,11255,D6R9C5:P10451,NP_000573.1:NP_001035147.1:NP_001035149.1:NP_001238758.1:NP_001238759.1,Mouse:Spp1|,
