In [15]:
library(tidyverse)
library(survival)
library(survminer)
library(ggplot2)

# Custom package
library(rutils)

In [16]:
# Directory handles supplied by rutils (dirs$data_dir and dirs$analysis_dir are used below)
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")

# TCGA project identifiers
projects <- c(
    "TCGA-CESC",
    "TCGA-UCS",
    "TCGA-UCEC",
    "TCGA-OV"
)

# Per-dataset sub-directory names under the data directory
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)

# Matrisome master list location
matrisome_path <- paste0(dirs$data_dir, "/matrisome/matrisome_hs_masterlist.tsv")

In [17]:
# Position in `unified_dsets` of the dataset analyzed by the cells below
dset_idx <- 1

# Load and filter survival data

In [18]:
# Numeric event indicator for each vital-status label
event_code <- list(Alive = 0, Dead = 1)

# Outcome columns, and the clinical covariates modelled alongside them
dep_cols <- c("vital_status", "survival_time")
covariate_cols <- c("figo_stage", "age_at_diagnosis", "race", "ethnicity")

# Lookup table: FIGO roman numeral -> stage code stored as a character
figo_map_df <- tibble(
    roman_num = c("I", "II", "III", "IV"),
    figo_code = as.character(1:4)
)

In [19]:
# Survival table for the selected dataset. `event_code` is handed to the loader
# (presumably to recode vital status -- confirm in load_survival_df).
survival_path <- paste0(dirs$data_dir, "/", unified_dsets[dset_idx], "/survival_data.tsv")
survival_df <- load_survival_df(survival_path, event_code)

filtered_survival_df <- survival_df %>%
    # Keep the sample id, the outcome columns and the covariates
    dplyr::select(one_of(c("sample_name", dep_cols, covariate_cols))) %>%
    # Discard every row with a missing value in any retained column
    tidyr::drop_na() %>%
    # Pull the roman numeral out of the stage label; longer numerals come first in the pattern
    dplyr::mutate(figo_rn = stringr::str_extract(figo_stage, "IV|III|II|I")) %>%
    # Numeral -> code; rows whose numeral is not in the lookup are dropped by the inner join
    dplyr::inner_join(figo_map_df, by = c("figo_rn" = "roman_num")) %>%
    # Swap the original stage label for the code
    dplyr::select(-figo_rn, -figo_stage) %>%
    dplyr::rename(figo_stage = figo_code)

Parsed with column specification:
cols(
  sample_name = col_character(),
  vital_status = col_character(),
  survival_time = col_double(),
  figo_stage = col_character(),
  days_to_last_follow_up = col_double(),
  days_to_death = col_double(),
  age_at_diagnosis = col_double(),
  age_at_index = col_double(),
  height = col_double(),
  weight = col_double(),
  bmi = col_double(),
  race = col_character(),
  ethnicity = col_character()
)


# Load normalized matrisome count data

In [20]:
# Normalized matrisome counts as a numeric matrix: one row per geneID, one column per sample
norm_matrisome_counts <- read_tsv(paste0(dirs$data_dir, "/", unified_dsets[dset_idx], "/norm_matrisome_counts.tsv")) %>%
    column_to_rownames(var = "geneID") %>%
    as.matrix()

# Match up columns of counts with rows of survival data & only include samples present in survival data.
# Indexing by a sample name that has no count column aborts with "subscript out of bounds",
# so keep only the names that exist in the matrix (survival row order is preserved) and report any drop.
has_counts <- filtered_survival_df$sample_name %in% colnames(norm_matrisome_counts)
if (!all(has_counts)) {
    warning(
        sum(!has_counts), " sample(s) in the survival data have no count column and were skipped",
        call. = FALSE
    )
}
# drop = FALSE keeps the result a matrix even when only one sample remains
norm_matrisome_survival_counts <- norm_matrisome_counts[, filtered_survival_df$sample_name[has_counts], drop = FALSE]

Parsed with column specification:
cols(
  .default = col_double(),
  geneID = col_character()
)
See spec(...) for full column specifications.


In [21]:
# Sanity check: count columns appear in the same order as the survival rows
all(colnames(norm_matrisome_survival_counts) == filtered_survival_df$sample_name)

# Combine filtered survival data and normalized count data

In [22]:
# Transpose the counts so samples are rows, then attach them to the survival table by sample name
joined_survival_counts_df <- norm_matrisome_survival_counts %>%
    t() %>%
    as_tibble(rownames = "sample_name") %>%
    dplyr::inner_join(x = filtered_survival_df, y = ., by = "sample_name")
nrow(joined_survival_counts_df)
head(joined_survival_counts_df)

# A '-' inside a gene name would be read as subtraction in a formula, so swap it for '_'
names(joined_survival_counts_df) <- gsub("-", "_", names(joined_survival_counts_df), fixed = TRUE)

sample_name,vital_status,survival_time,age_at_diagnosis,race,ethnicity,figo_stage,PGF,TIMP4,C1QTNF6,⋯,PIK3IP1,C1QTNF2,PCSK5,ANXA1,HGF,VWA2,FGF3,POSTN,NTF3,S100A6
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
TCGA-C5-A1BF-01B-11R-A13Y-07,1,570,16975,white,not reported,1,10.800637,6.228003,11.669331,⋯,9.013453,8.190325,9.503647,14.07799,6.569726,7.315604,4.602649,12.0623,5.649441,16.55841
TCGA-MU-A51Y-01A-11R-A26T-07,0,854,10199,white,not reported,2,8.451001,5.851272,10.079584,⋯,12.20841,8.172899,10.515388,16.886,7.38414,7.366949,4.890169,11.011959,5.406776,14.76108
TCGA-EK-A2RM-01A-21R-A18M-07,0,50,14842,white,not hispanic or latino,1,9.674879,7.277164,10.712783,⋯,10.854224,6.581217,8.437154,15.81626,7.644559,6.406766,4.998296,11.731128,6.028879,17.11959
TCGA-Q1-A73P-01A-11R-A32P-07,0,483,16450,white,not hispanic or latino,1,8.036801,5.247645,9.894159,⋯,10.854487,5.629541,9.602922,14.17475,6.987468,6.731154,4.602649,9.293089,4.893018,16.64949
TCGA-C5-A8YT-01A-11R-A37O-07,1,633,13253,white,not hispanic or latino,1,7.830611,5.733875,12.445548,⋯,9.453187,6.398956,12.288955,13.39633,10.228758,8.542025,4.602649,11.765396,5.318924,13.55632
TCGA-C5-A2LZ-01A-11R-A213-07,1,3046,24059,white,not hispanic or latino,3,10.13555,4.865349,8.566221,⋯,11.492135,7.426145,9.480199,15.92715,6.010764,7.405245,4.602649,7.506731,5.423416,14.73802


In [23]:
# Covariate-only Cox model; its formula text is reused as the prefix of each per-gene model
null_model_formula_chr <- sprintf(
    "Surv(survival_time, vital_status) ~ %s",
    paste(covariate_cols, collapse = " + ")
)
cox_fit_null <- coxph(
    formula = as.formula(null_model_formula_chr),
    data = joined_survival_counts_df,
    singular.ok = TRUE
)
summary(cox_fit_null)

Call:
coxph(formula = as.formula(null_model_formula_chr), data = joined_survival_counts_df, 
    singular.ok = TRUE)

  n= 255, number of events= 66 

                                                    coef  exp(coef)   se(coef)
figo_stage2                                   -6.296e-01  5.328e-01  4.097e-01
figo_stage3                                    3.084e-01  1.361e+00  3.768e-01
figo_stage4                                    1.447e+00  4.249e+00  3.601e-01
age_at_diagnosis                               3.332e-05  1.000e+00  2.727e-05
raceasian                                     -2.546e+00  7.839e-02  1.281e+00
raceblack or african american                 -2.570e+00  7.654e-02  1.111e+00
racenative hawaiian or other pacific islander -1.978e-01  8.205e-01  1.456e+00
racenot reported                              -2.542e+00  7.874e-02  1.103e+00
racewhite                                     -2.282e+00  1.020e-01  1.052e+00
ethnicitynot hispanic or latino                1.083e+00  2

In [24]:
# Gene columns are everything in the joined table that did not come from the survival table
genes_of_interest <- setdiff(colnames(joined_survival_counts_df), colnames(filtered_survival_df))

# One slot per gene, allocated up front instead of growing the vectors with c() on every pass
gene_pvals <- rep(NA_real_, length(genes_of_interest))
gene_coeffs <- rep(NA_real_, length(genes_of_interest))

# Final log-likelihood and number of estimated coefficients of the covariate-only model
null_loglik <- cox_fit_null$loglik[length(cox_fit_null$loglik)]
null_df <- sum(!is.na(coef(cox_fit_null)))

for (i in seq_along(genes_of_interest)) {
    g <- genes_of_interest[i]
    gene_model_formula_chr <- paste0(null_model_formula_chr, " + ", g)
    cox_fit_gene <- withCallingHandlers(
        coxph(
            as.formula(gene_model_formula_chr),
            data = joined_survival_counts_df,
            singular.ok = TRUE
        ),
        # Name the gene behind a fitting warning; the warning itself still propagates
        warning = function(w) message("coxph warning for gene ", g, ": ", conditionMessage(w))
    )

    # Likelihood-ratio test of the gene model against the covariate-only model, computed
    # directly from the fitted log-likelihoods. Reading the p-value out of the anova()
    # table by its column label is fragile: a label that does not match returns NULL,
    # which silently leaves the result vector shorter than the gene list.
    lrt_stat <- 2 * (cox_fit_gene$loglik[length(cox_fit_gene$loglik)] - null_loglik)
    lrt_df <- sum(!is.na(coef(cox_fit_gene))) - null_df
    # With no extra estimable coefficient (e.g. a singular gene term) the test is undefined
    if (lrt_df > 0) {
        gene_pvals[i] <- pchisq(lrt_stat, df = lrt_df, lower.tail = FALSE)
    }
    gene_coeffs[i] <- cox_fit_gene$coefficients[[g]]
}

“Loglik converged before variable  12 ; coefficient may be infinite. ”

In [25]:
# Restore the original gene IDs now that formula-safe names are no longer needed.
# Turning every '_' back into '-' is not a true inverse: an ID that contained '_' to begin
# with would be corrupted. Look each sanitized name up among the count-matrix row names
# instead, and fall back to the plain substitution only where no match is found.
original_gene_ids <- rownames(norm_matrisome_survival_counts)
restored_gene_ids <- gsub("_", "-", genes_of_interest)
id_lookup <- match(genes_of_interest, gsub("-", "_", original_gene_ids))
restored_gene_ids[!is.na(id_lookup)] <- original_gene_ids[id_lookup[!is.na(id_lookup)]]

cox_regression_df <- tibble("geneID" = restored_gene_ids, "gene_pval" = gene_pvals, "gene_coeff" = gene_coeffs)
# NOTE(review): the 0.05 cut-off is applied to unadjusted per-gene p-values -- confirm that is intended
sig_cox_regression_df <- cox_regression_df %>%
    dplyr::filter(gene_pval < 0.05)
# The full (unfiltered) table is what gets written out
write_tsv(cox_regression_df, paste0(dirs$analysis_dir, "/", unified_dsets[dset_idx], "_coxph_results.tsv"))

In [26]:
# Number of genes passing the significance filter
sig_cox_regression_df %>% nrow()
# Share of all matrisome genes (rows of the count matrix) that pass the filter
nrow(sig_cox_regression_df) / dim(norm_matrisome_survival_counts)[1]

# Interpretation of coefficient sign

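Source: https://www.statsdirect.com/help/survival_analysis/cox_regression.htm#:~:text=In%20the%20context%20of%20an,Cox%20regression%20for%20survival%20analysis.&text=The%20coefficients%20in%20a%20Cox,with%20which%20it%20is%20associated.

A positive coefficient means the hazard rises as the gene's expression increases (worse prognosis); a negative coefficient means the hazard falls (better prognosis).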

In [27]:
# Positive coefficient: genes associated with negative prognosis
sum(sig_cox_regression_df$gene_coeff > 0, na.rm = TRUE)

# Negative coefficient: genes associated with positive prognosis
sum(sig_cox_regression_df$gene_coeff < 0, na.rm = TRUE)