In [1]:
library(tidyverse)
library(survival)
library(survminer)
library(DESeq2)

# Custom package
library(rutils)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Loading required package: ggpubr
Loading required package: S4Vectors
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, 

In [None]:
# Directory locations (data_dir, analysis_dir and figures_dir are used below)
# are supplied by the project's dev-paths file.
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")

# TCGA project identifiers.
projects <- c(
    "TCGA-CESC",
    "TCGA-UCS",
    "TCGA-UCEC",
    "TCGA-OV"
)

# Names of the unified dataset folders; `unified_dsets[i]` selects the one
# analysed in this notebook.
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)

# Path to the matrisome master list.
matrisome_list <- file.path(dirs$data_dir, "matrisome", "matrisome_hs_masterlist.tsv")

In [None]:
# Index into `unified_dsets` selecting which dataset every cell below reads
# from and writes results for.
i <- 1

In [None]:
# Significant DESeq results (already cross-referenced with the matrisome)
# for the selected dataset.
sig_deg_df <- read_tsv(
    paste0(dirs$analysis_dir, "/", unified_dsets[i], "_sig_DESeq_results_xref_matrisome.tsv")
)

# Keep only the significant genes flagged as belonging to the matrisome.
matrisome_sig_deg_df <- dplyr::filter(sig_deg_df, in_matrisome == TRUE)

# Column-data table for the selected dataset.
coldata_df <- read_tsv(
    paste0(dirs$data_dir, "/", unified_dsets[i], "/coldata.tsv")
)

In [None]:
matrisome_df <- rutils::load_matrisome_df(matrisome_list)

# Load the survival table and recode vital_status as a numeric event
# indicator: "Dead" -> 1, "Alive" -> 0. Any other value becomes NA and is
# removed by the filter, as are rows without a survival time.
survival_df <- read_tsv(
    paste0(dirs$data_dir, "/", unified_dsets[i], "/survival_data.tsv")
) %>%
    dplyr::mutate(
        vital_status = dplyr::case_when(
            vital_status == "Dead" ~ 1,
            vital_status == "Alive" ~ 0
        )
    ) %>%
    # sample_name first, then the event indicator, then everything else
    dplyr::select(sample_name, vital_status, everything()) %>%
    dplyr::filter(!(is.na(survival_time) | is.na(vital_status)))
survival_cols <- colnames(survival_df)
head(survival_df)
nrow(survival_df)

In [None]:
# Covariates for the Cox models: age, BMI and every race_* / ethnicity_*
# column present in the survival table.
race_cols <- grep("^race_", survival_cols, value = TRUE)
ethnicity_cols <- grep("^ethnicity_", survival_cols, value = TRUE)
covariate_cols <- c("age_at_diagnosis", "bmi", race_cols, ethnicity_cols)
# Response columns (event indicator and follow-up time).
dep_cols <- c("vital_status", "survival_time")
filtered_survival_df <- survival_df %>%
    # all_of() fails immediately if a required column is absent, instead of
    # warning and letting the model formula fail later on a missing variable.
    dplyr::select(dplyr::all_of(c("sample_name", dep_cols, covariate_cols))) %>%
    # For now, drop rows with NA -- rethink this later as expand number of features
    dplyr::filter(rowSums(is.na(.)) == 0)

In [None]:
# Count table restricted to matrisome genes, keyed by gene symbol, with every
# numeric column rounded to whole numbers.
counts_df <- read_tsv(
    paste0(dirs$data_dir, "/", unified_dsets[i], "/counts.tsv")
) %>%
    dplyr::select(-Entrez_Gene_Id) %>%
    dplyr::rename(geneID = Hugo_Symbol) %>%
    dplyr::filter(geneID %in% matrisome_df$gene_symbol) %>%
    dplyr::mutate(dplyr::across(where(is.numeric), ~ round(.x, 0)))

# Order the count columns exactly like the rows of filtered_survival_df and
# keep only samples that have survival data.
survival_counts_df <- counts_df[, c("geneID", filtered_survival_df$sample_name)] %>%
    # Genes with no counts at all in this sample set are dropped
    dplyr::filter(rowSums(.[, -1]) > 0)

# Gene-by-sample matrix of the (rounded) counts.
survival_counts <- survival_counts_df %>%
    column_to_rownames(var = "geneID") %>%
    as.matrix()

# Variance-stabilised version of the same matrix, with gene symbols as row names.
norm_survival_counts <- survival_counts_df %>%
    dplyr::select(-geneID) %>%
    as.matrix() %>%
    varianceStabilizingTransformation()
rownames(norm_survival_counts) <- survival_counts_df$geneID

# Every gene that survived the filtering is tested below.
genes_of_interest <- rownames(norm_survival_counts)

In [None]:
# Samples remaining after dropping incomplete rows, as a count and as a
# fraction of all samples with usable survival data.
nrow(filtered_survival_df)
nrow(filtered_survival_df) / nrow(survival_df)
# Sample order must agree so that one row of the count matrix can be attached
# to filtered_survival_df as a column.
all(filtered_survival_df$sample_name == colnames(survival_counts))

# Test significance of including a gene

In [None]:
# For each gene, compare a covariates-only Cox model against the same model
# plus that gene's variance-stabilised expression, using a likelihood-ratio
# test. Results are stored in `gene_pvals` / `gene_coeffs`, in the order of
# `genes_of_interest`.

# singular.ok = TRUE because race & ethnicity are one-hot encodings, so
# some columns are certain to be linear combinations of others
null_model_formula_chr <- paste0("Surv(survival_time, vital_status) ~ ", paste(covariate_cols, collapse = " + "))
survival_fit_null <- coxph(as.formula(null_model_formula_chr), data = filtered_survival_df, singular.ok = TRUE)
gene_model_formula_chr <- paste0(null_model_formula_chr, " + gene")
# The gene model formula is the same for every gene; build it once.
gene_model_formula <- as.formula(gene_model_formula_chr)

# Preallocate instead of growing the vectors with c() on every iteration.
gene_pvals <- rep(NA_real_, length(genes_of_interest))
gene_coeffs <- rep(NA_real_, length(genes_of_interest))

for (gene_idx in seq_along(genes_of_interest)) {
    g <- genes_of_interest[gene_idx]
    gene_survival_df <- filtered_survival_df %>%
        dplyr::mutate(gene = norm_survival_counts[g, ])
    survival_fit_gene <- coxph(gene_model_formula, data = gene_survival_df, singular.ok = TRUE)
    anova_res <- anova(survival_fit_null, survival_fit_gene, test = "LRT")

    # Locate the p-value column by its "(>|Chi|)" suffix rather than an exact
    # label. NOTE(review): the label appears to differ between survival
    # releases ("P(>|Chi|)" vs "Pr(>|Chi|)") -- confirm against the installed
    # version. A missing column would otherwise yield NULL and silently
    # record nothing for the gene.
    pval_col <- grep("(>|Chi|)", colnames(anova_res), fixed = TRUE, value = TRUE)
    stopifnot(length(pval_col) == 1)

    # Row 2 of the table is the gene model compared with the null model.
    gene_pvals[gene_idx] <- anova_res[[pval_col]][2]
    gene_coeffs[gene_idx] <- survival_fit_gene$coefficients[["gene"]]
}

In [None]:
# One row per tested gene: likelihood-ratio p-value and fitted coefficient.
cox_regression_df <- tibble(
    geneID = genes_of_interest,
    gene_pval = gene_pvals,
    gene_coeff = gene_coeffs
)
# Genes whose (unadjusted) p-value falls below 0.05.
sig_cox_regression_df <- dplyr::filter(cox_regression_df, gene_pval < 0.05)

In [None]:
# Distribution of the per-gene p-values: density-scaled histogram with a
# kernel density overlay, saved to the figures directory.
# x is mapped to the `gene_pval` column so the plot depends only on
# `cox_regression_df`, not on a free-standing vector in the workspace.
ggplot() +
    geom_histogram(
        aes(x = gene_pval, y = after_stat(density)),
        data = cox_regression_df,
        binwidth = 0.05,
        boundary = 0,
        fill = "steelblue"
    ) +
    geom_density(aes(x = gene_pval), data = cox_regression_df) +
    scale_x_continuous(breaks = seq(from = 0, to = 1, by = 0.1))
ggsave(paste0(dirs$figures_dir, "/", unified_dsets[i], "_predictive_pval_density.png"), last_plot())

In [None]:
# Number of significant matrisome DEG rows
nrow(matrisome_sig_deg_df)
# Number of genes with a significant Cox likelihood-ratio test
nrow(sig_cox_regression_df)
# Number of distinct genes appearing in either set
length(union(matrisome_sig_deg_df$geneID, sig_cox_regression_df$geneID))

In [None]:
# 0/1 membership indicators for every gene in counts_df:
#   deg  - gene is among the significant matrisome DEGs
#   pred - gene is among the significant Cox regression results
#   both - gene is in both sets
hm_df <- counts_df %>%
    dplyr::transmute(
        geneID,
        deg = as.numeric(geneID %in% matrisome_sig_deg_df$geneID),
        pred = as.numeric(geneID %in% sig_cox_regression_df$geneID),
        both = deg * pred
    )
nrow(hm_df)
head(hm_df)

In [None]:
# Long format: one row per (gene, test) pair, with both the test name and the
# 0/1 value stored as factors with a fixed level order.
hm_pivot_df <- hm_df %>%
    pivot_longer(cols = deg:both, names_to = "test") %>%
    dplyr::mutate(
        test = factor(test, levels = c("deg", "pred", "both")),
        value = factor(value, levels = c("0", "1"))
    )

head(hm_pivot_df)
nrow(hm_pivot_df)

In [None]:
# Membership heatmap: one tile per (gene, test) pair, black where the gene is
# in the set (value "1") and white otherwise. Gene labels and ticks are
# hidden, and the result is saved to the figures directory.
ggplot() +
    geom_tile(aes(x = test, y = geneID, fill = value), data = hm_pivot_df, width = 0.95) +
    scale_fill_manual(values = c("white", "black")) +
    theme(
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        # "none" (lower case) is the documented value for hiding the legend
        legend.position = "none"
    ) +
    coord_equal(ratio = 1/75)
ggsave(paste0(dirs$figures_dir, "/", unified_dsets[i], "_matrisome_DEG_predictive_heatmap.png"), last_plot())

In [None]:
# Write the matrisome annotations of genes that are both differentially
# expressed and significant in the Cox regression.
write_tsv(
    dplyr::filter(
        matrisome_df,
        gene_symbol %in% intersect(matrisome_sig_deg_df$geneID, sig_cox_regression_df$geneID)
    ),
    paste0(dirs$analysis_dir, "/", unified_dsets[i], "_DE_and_predictive.tsv")
)