In [1]:
library(tidyverse)
library(survival)
library(survminer)
library(WGCNA)

# Custom package
library(rutils)

# Constants shared by the cells below ----
# Development directories read from the paths file; `data_dir` and
# `analysis_dir` are the entries used in this notebook.
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")
projects <- c(
    "TCGA-CESC",
    "TCGA-UCS",
    "TCGA-UCEC",
    "TCGA-OV"
)
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)
matrisome_path <- file.path(dirs$data_dir, "matrisome", "matrisome_hs_masterlist.tsv")

# Numeric coding of vital status handed to load_survival_df()
event_code <- list(Alive = 0, Dead = 1)
# covariate_cols <- c("figo_stage", "age_at_diagnosis", "race", "ethnicity")
# Outcome columns required for every survival model
dep_cols <- c("vital_status", "survival_time")

-- Attaching packages ------------------------------------------------------------------------------- tidyverse 1.3.0 --

v ggplot2 3.3.3     v purrr   0.3.4
v tibble  3.0.6     v dplyr   1.0.4
v tidyr   1.1.2     v stringr 1.4.0
v readr   1.4.0     v forcats 0.5.1

-- Conflicts ---------------------------------------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()

Loading required package: ggpubr

Loading required package: dynamicTreeCut

Loading required package: fastcluster


Attaching package: 'fastcluster'


The following object is masked from 'package:stats':

    hclust





Attaching package: 'WGCNA'


The following object is ma

In [2]:
# Index into `unified_dsets` selecting the dataset analysed by the cells below
dset_idx <- 1

In [3]:
# Load survival data, the gene list to test, and the per-gene expression cutoffs
survival_path <- file.path(dirs$data_dir, unified_dsets[dset_idx], "survival_data.tsv")
survival_df <- load_survival_df(survival_path, event_code)
umsmg_demg_list <- read_lines(
    file.path(dirs$analysis_dir, "gene_lists", paste0(unified_dsets[dset_idx], "_umsmg_demg_list.txt"))
)
cutoff_df <- read_tsv(
    file.path(dirs$analysis_dir, "survival", paste0(unified_dsets[dset_idx], "_expression_cutoffs.tsv"))
)

# Keep the sample ID plus outcome columns and drop samples with any missing value.
# all_of() fails immediately if a required column is absent.
# Covariate handling is currently disabled:
#     decode_figo_stage(to = "c") %>%
#     dplyr::select(one_of(c("sample_name", dep_cols, covariate_cols))) %>%
filtered_survival_df <- survival_df %>%
    dplyr::select(all_of(c("sample_name", dep_cols))) %>%
    drop_na()

# Load normalized matrisome count data
norm_matrisome_counts_df <- read_tsv(
    file.path(dirs$data_dir, unified_dsets[dset_idx], "norm_matrisome_counts.tsv")
)
norm_matrisome_counts_t_df <- norm_matrisome_counts_df %>%
    # any_of(): a sample with survival data but no count column is skipped
    # (the inner join below keeps only shared samples) instead of aborting here
    dplyr::select(all_of("geneID"), any_of(filtered_survival_df$sample_name)) %>%
    transpose_df("geneID", "sample_name")

# Report listed genes that have no count data; they are left out of the joined table
if (!all(umsmg_demg_list %in% colnames(norm_matrisome_counts_t_df))) {
    warning(
        "Genes in umsmg_demg_list without count data: ",
        paste(setdiff(umsmg_demg_list, colnames(norm_matrisome_counts_t_df)), collapse = ", "),
        call. = FALSE
    )
}

# Combine survival data and normalized count data
filtered_joined_df <- filtered_survival_df %>%
    inner_join(norm_matrisome_counts_t_df, by = "sample_name") %>%
    dplyr::select(
        all_of(c("sample_name", "vital_status", "survival_time")),
        any_of(umsmg_demg_list)
    )


[36m--[39m [1m[1mColumn specification[1m[22m [36m------------------------------------------------------------------------------------------------[39m
cols(
  sample_name = col_character(),
  vital_status = col_character(),
  survival_time = col_double(),
  figo_stage = col_character(),
  days_to_last_follow_up = col_double(),
  days_to_death = col_double(),
  age_at_diagnosis = col_double(),
  age_at_index = col_double(),
  height = col_double(),
  weight = col_double(),
  bmi = col_double(),
  race = col_character(),
  ethnicity = col_character()
)



[36m--[39m [1m[1mColumn specification[1m[22m [36m------------------------------------------------------------------------------------------------[39m
cols(
  geneID = col_character(),
  cutoff = col_double()
)



[36m--[39m [1m[1mColumn specification[1m[22m [36m-----

In [4]:
# Label each row "high" or "low" by comparing one column against a cutoff.
#
# Args:
#   df:      data frame containing the column named by `col_str`.
#   col_str: name (single string) of the column to threshold.
#   cutoff:  threshold; a single value, or one value per row of `df`.
#
# Returns:
#   `df` with a character column `high_low` added: "high" where the column is
#   strictly greater than `cutoff`, "low" otherwise, NA where the comparison
#   is NA. No other column of `df` is added, changed or removed.
get_high_low <- function(df, col_str, cutoff) {
    stopifnot(
        is.character(col_str),
        length(col_str) == 1,
        col_str %in% colnames(df),
        length(cutoff) %in% c(1L, nrow(df))
    )
    df %>%
        # .data[[...]] looks the column up by its string name; .env$cutoff forces
        # the function argument even if `df` itself has a column called `cutoff`.
        # Computing the label in one step avoids a temporary `high` column that
        # would overwrite (and then delete) an existing column of that name.
        mutate(high_low = ifelse(.data[[col_str]] > .env$cutoff, "high", "low"))
}

# Log-rank test of high- vs low-expression groups for every gene.
#
# For each gene, samples are split with get_high_low() at that gene's cutoff
# and the two groups are compared with survival::survdiff().
#
# Args:
#   count_df:   data frame with columns `sample_name`, `vital_status`,
#               `survival_time` and one column per gene in `gene_names`.
#   cutoff_df:  data frame with columns `geneID` and `cutoff`; must hold exactly
#               one row for every gene in `gene_names`.
#   gene_names: character vector of gene columns to test.
#
# Returns:
#   A tibble with one row per gene: `geneID`, `km_pval` (chi-square p-value of
#   the survdiff statistic) and `km_qval` (WGCNA::qvalue of the p-values).
#   An empty `gene_names` yields a zero-row tibble with the same columns.
test_all_genes_km <- function(count_df, cutoff_df, gene_names) {
    n_genes <- length(gene_names)
    if (n_genes == 0) {
        # Nothing to test; skip the q-value estimation on an empty vector
        return(tibble(geneID = character(0), km_pval = numeric(0), km_qval = numeric(0)))
    }
    pvals <- rep(1, n_genes)
    for (i in seq_len(n_genes)) {
        gene_i <- gene_names[i]
        gene_cutoff <- cutoff_df %>%
            filter(geneID == .env$gene_i) %>%
            pull(cutoff)
        if (length(gene_cutoff) != 1) {
            stop(
                "Expected exactly one cutoff for gene '", gene_i,
                "', found ", length(gene_cutoff),
                call. = FALSE
            )
        }
        simp_df <- count_df %>%
            select(all_of(c("sample_name", "vital_status", "survival_time", gene_i))) %>%
            get_high_low(gene_i, gene_cutoff)
        km_diff <- survdiff(Surv(survival_time, vital_status) ~ high_low, data = simp_df)
        # Degrees of freedom = number of groups - 1
        pvals[i] <- pchisq(km_diff$chisq, length(km_diff$n) - 1, lower.tail = FALSE)
    }
    tibble(geneID = gene_names, km_pval = pvals, km_qval = WGCNA::qvalue(km_pval)$qvalues)
}

# Univariate Cox proportional-hazards test for every gene.
#
# Fits `Surv(survival_time, vital_status) ~ <gene>` once per gene and records
# the likelihood-ratio test p-value from the model summary.
#
# Args:
#   count_df:   data frame with columns `vital_status`, `survival_time` and one
#               column per gene in `gene_names`.
#   gene_names: character vector of gene columns to test.
#
# Returns:
#   A tibble with one row per gene: `geneID`, `cph_pval` and `cph_qval`
#   (WGCNA::qvalue of the p-values). An empty `gene_names` yields a zero-row
#   tibble with the same columns.
test_all_genes_cph <- function(count_df, gene_names) {
    if (length(gene_names) == 0) {
        # Nothing to test; skip the q-value estimation on an empty vector
        return(tibble(geneID = character(0), cph_pval = numeric(0), cph_qval = numeric(0)))
    }
    pvals <- vapply(
        gene_names,
        function(gene_i) {
            # Backticks keep non-syntactic gene symbols (e.g. containing "-")
            # from being parsed as arithmetic inside the formula
            cph_formula <- as.formula(
                paste0("Surv(survival_time, vital_status) ~ `", gene_i, "`")
            )
            cph_fit <- coxph(cph_formula, data = count_df)
            summary(cph_fit)$logtest[["pvalue"]]
        },
        numeric(1),
        USE.NAMES = FALSE
    )
    tibble(geneID = gene_names, cph_pval = pvals, cph_qval = WGCNA::qvalue(cph_pval)$qvalues)
}

In [5]:
# Gene columns are everything except the sample ID and outcome columns.
# Excluding by name (not by position) stays correct even if those columns
# are reordered or one of them is absent.
gene_names <- setdiff(colnames(filtered_joined_df), c("sample_name", dep_cols))

# Run both univariate tests over the same genes and merge the results per gene
km_df <- test_all_genes_km(filtered_joined_df, cutoff_df, gene_names)
cph_df <- test_all_genes_cph(filtered_joined_df, gene_names)
joined_surv_df <- km_df %>%
    inner_join(cph_df, by = "geneID")

"Loglik converged before variable  1 ; coefficient may be infinite. "


In [6]:
# Show genes whose unadjusted p-value is below 0.05 in either test.
# Alternative thresholds tried previously:
#     cph_pval < 0.1
#     km_pval < 0.1
dplyr::filter(joined_surv_df, cph_pval < 0.05 | km_pval < 0.05)

geneID,km_pval,km_qval,cph_pval,cph_qval
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
PRL,0.030190785,0.3282728,0.093182843,0.256766
CCL7,0.029842608,0.3282728,0.084488439,0.256766
REG3A,0.048818537,0.3317609,0.007443063,0.1431721
PCOLCE2,0.087378566,0.4568378,0.04094553,0.2028634
COL6A5,0.004378179,0.2380257,0.021296912,0.1564901
TNFSF11,0.171469936,0.4568378,0.041411854,0.2028634
ANGPTL2,0.016428826,0.2977253,0.003721132,0.1431721
SERPIND1,0.135949522,0.4568378,0.01298963,0.1431721
SERPINA3,0.040719767,0.3317609,0.46693579,0.472898
LGI2,0.010475725,0.2847636,0.018170458,0.1564901


In [7]:
# Persist the per-gene univariate survival results for the selected dataset
write_tsv(
    joined_surv_df,
    file.path(
        dirs$analysis_dir,
        "survival",
        paste0(unified_dsets[dset_idx], "_univ_survival_results.tsv")
    )
)

In [8]:
# g = "COL6A5"
# simp_survival_df <- filtered_joined_df %>%
#     select(sample_name, vital_status, survival_time, !!as.name(g))
# center <- cutoff_df %>%
#     filter(geneID == g) %>%
#     pull(cutoff)
# simp_survival_df <- get_high_low(simp_survival_df, g, center)

# km_fit <- survfit(Surv(survival_time, vital_status) ~ high_low, type = "kaplan-meier", data = simp_survival_df)
# km_diff <- survdiff(Surv(survival_time, vital_status) ~ high_low, data = simp_survival_df)
# km_diff
# ggplot(data = simp_survival_df) +
#     geom_histogram(aes_string(x = g), binwidth = 0.2, fill = "steelblue") +
#     geom_vline(xintercept = center, color = "firebrick", linetype = 2) +
#     theme_bw()

# ggsurvplot(survfit(Surv(survival_time, vital_status) ~ high_low, data = simp_survival_df), data = simp_survival_df)