In [10]:
library(tidyverse)
library(survival)
library(survminer)

# Custom package
library(rutils)

In [11]:
# Look up development directories via rutils; `dirs$data_dir` is used below
# when reading the data files.
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")

# TCGA project identifiers.
projects <- c(
    "TCGA-CESC",
    "TCGA-UCS",
    "TCGA-UCEC",
    "TCGA-OV"
)

# Unified dataset directory names; one of these is picked by index when the
# data files are read.
# NOTE(review): four projects are listed above but only three unified
# datasets here -- confirm whether an entry is missing.
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)

In [12]:
# Position in `unified_dsets` of the dataset to analyze.
i <- 1

In [14]:
# Load the survival table for the selected dataset, recode vital status as a
# numeric event indicator (Dead = 1, Alive = 0; any other value becomes NA),
# put `sample_name` and `vital_status` first, and keep only rows that have
# both a survival time and a known status.
survival_df <- read_tsv(
    file.path(dirs$data_dir, unified_dsets[i], "survival_data.tsv")
) %>%
    mutate(vital_status = case_when(
        vital_status == "Alive" ~ 0,
        vital_status == "Dead" ~ 1
    )) %>%
    dplyr::select(sample_name, vital_status, everything()) %>%
    dplyr::filter(!(is.na(survival_time) | is.na(vital_status)))
head(survival_df)
nrow(survival_df)

Parsed with column specification:
cols(
  sample_name = col_character(),
  vital_status = col_character(),
  days_to_last_follow_up = col_double(),
  days_to_death = col_double(),
  survival_time = col_double(),
  age_at_diagnosis = col_double(),
  age_at_index = col_double(),
  height = col_double(),
  weight = col_double(),
  bmi = col_double(),
  race_american_indian_or_alaska_native = col_double(),
  race_asian = col_double(),
  race_black_or_african_american = col_double(),
  race_native_hawaiian_or_other_pacific_islander = col_double(),
  race_not_reported = col_double(),
  race_white = col_double(),
  ethnicity_hispanic_or_latino = col_double(),
  ethnicity_not_hispanic_or_latino = col_double(),
  ethnicity_not_reported = col_double()
)


sample_name,vital_status,days_to_last_follow_up,days_to_death,survival_time,age_at_diagnosis,age_at_index,height,weight,bmi,race_american_indian_or_alaska_native,race_asian,race_black_or_african_american,race_native_hawaiian_or_other_pacific_islander,race_not_reported,race_white,ethnicity_hispanic_or_latino,ethnicity_not_hispanic_or_latino,ethnicity_not_reported
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
TCGA-C5-A1BF-01B-11R-A13Y-07,1,,570.0,570,16975,46,,76.0,,0,0,0,0,0,1,0,0,1
TCGA-MU-A51Y-01A-11R-A26T-07,0,854.0,,854,10199,27,,31.0,,0,0,0,0,0,1,0,0,1
TCGA-EK-A2RM-01A-21R-A18M-07,0,50.0,,50,14842,40,167.0,105.0,37.64925,0,0,0,0,0,1,0,1,0
TCGA-Q1-A73P-01A-11R-A32P-07,0,483.0,,483,16450,45,173.0,82.0,27.39818,0,0,0,0,0,1,0,1,0
TCGA-C5-A8YT-01A-11R-A37O-07,1,186.0,633.0,633,13253,36,,,,0,0,0,0,0,1,0,1,0
TCGA-C5-A2LZ-01A-11R-A213-07,1,,3046.0,3046,24059,65,163.0,85.0,31.99217,0,0,0,0,0,1,0,1,0


# Test relevance of including a gene

In [42]:
# Restrict the survival table to the columns used by the models and keep
# complete cases only.
# For now rows containing any NA are dropped -- rethink this later as the
# number of features expands.
filtered_survival_df <- survival_df %>%
    dplyr::select(sample_name, vital_status, survival_time, age_at_diagnosis) %>%
    tidyr::drop_na()
    

In [120]:
# Load the counts table for the selected dataset: drop the Entrez ID column,
# expose the gene symbol as `geneID`, and round every numeric column to
# zero decimal places.
counts_df <- read_tsv(
    file.path(dirs$data_dir, unified_dsets[i], "counts.tsv")
) %>%
    dplyr::select(-Entrez_Gene_Id) %>%
    dplyr::rename(geneID = Hugo_Symbol) %>%
    dplyr::mutate_if(is.numeric, round, 0)

# Pick the count columns by sample name so they follow the row order of the
# survival data; only samples present in the survival data are kept.
survival_counts_df <- counts_df[, c("geneID", filtered_survival_df$sample_name)]

# Genes-by-samples matrix with gene symbols as row names.
survival_counts <- survival_counts_df %>%
    column_to_rownames(var = "geneID") %>%
    as.matrix()

# Sanity check: a row of survival_counts should be usable directly as a
# column in filtered_survival_df.
all(filtered_survival_df$sample_name == colnames(survival_counts))

Parsed with column specification:
cols(
  .default = col_double(),
  Hugo_Symbol = col_character()
)
See spec(...) for full column specifications.


In [146]:
# Compare two nested Cox proportional-hazards models -- age at diagnosis only
# vs. age at diagnosis plus the counts of a single gene -- to test whether
# including the gene improves the fit.

# Row of `survival_counts` (i.e. the gene) added as a covariate. Kept as a
# named constant so the gene under test is explicit and easy to change.
gene_idx <- 2008
message("Gene under test: ", rownames(survival_counts)[gene_idx])

# Attach the gene's per-sample counts as column `gene`. This relies on the
# columns of `survival_counts` being in the same order as the rows of
# `filtered_survival_df`.
gene_survival_df <- filtered_survival_df %>%
    dplyr::mutate(gene = survival_counts[gene_idx, ])

# Null model: age at diagnosis only.
null_model_formula <- as.formula(paste0("Surv(survival_time, vital_status) ~ ", "age_at_diagnosis"))
survival_fit_null <- coxph(null_model_formula, data = filtered_survival_df)

# Gene model: null model plus the gene covariate.
gene_model_formula <- as.formula(paste0("Surv(survival_time, vital_status) ~ ", "age_at_diagnosis", " + ", "gene"))
survival_fit_gene <- coxph(gene_model_formula, data = gene_survival_df)

survival_fit_null
survival_fit_gene

# Test of the gene model against the null model.
anova(survival_fit_null, survival_fit_gene, test = "LRT")

Call:
coxph(formula = null_model_formula, data = filtered_survival_df)

                      coef exp(coef)  se(coef)     z      p
age_at_diagnosis 4.734e-05 1.000e+00 2.468e-05 1.918 0.0551

Likelihood ratio test=3.61  on 1 df, p=0.05749
n= 258, number of events= 66 

Call:
coxph(formula = gene_model_formula, data = filtered_survival_df %>% 
    dplyr::mutate(gene = survival_counts[2008, ]))

                       coef  exp(coef)   se(coef)      z      p
age_at_diagnosis  4.439e-05  1.000e+00  2.464e-05  1.802 0.0716
gene             -3.306e-04  9.997e-01  1.615e-04 -2.047 0.0407

Likelihood ratio test=8.58  on 2 df, p=0.0137
n= 258, number of events= 66 

Unnamed: 0_level_0,loglik,Chisq,Df,P(>|Chi|)
Unnamed: 0_level_1,<dbl>,<dbl>,<int>,<dbl>
1,-310.0642,,,
2,-307.5778,4.972736,1.0,0.02574989
