In [2]:
library(tidyverse)
library(survival)

# Custom package
library(rutils)

# Project-wide constants ----
# Directory locations are resolved by rutils from the local dev-paths file.
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")

# TCGA project identifiers and the unified dataset directory names.
projects <- c(
    "TCGA-CESC",
    "TCGA-UCS",
    "TCGA-UCEC",
    "TCGA-OV"
)
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)
matrisome_path <- paste0(dirs$data_dir, "/matrisome/matrisome_hs_masterlist.tsv")

# Vital-status label -> numeric event indicator (handed to load_survival_df()).
event_code <- list(Alive = 0, Dead = 1)

# Column groups: clinical covariates, and the survival outcome pair.
covariate_cols <- c("figo_stage", "age_at_diagnosis", "race", "ethnicity")
dep_cols <- c("vital_status", "survival_time")


In [3]:
# Position in `unified_dsets` of the dataset analysed in this run
# (1 selects "unified_cervical_data").
dset_idx <- 1

In [5]:
# Load and filter survival data
# Guard the dataset index: an out-of-range value would index to NA and silently
# build a bogus ".../NA/..." path.
stopifnot(length(dset_idx) == 1L, dset_idx %in% seq_along(unified_dsets))
dset_dir <- file.path(dirs$data_dir, unified_dsets[dset_idx])

survival_path <- file.path(dset_dir, "survival_data.tsv")
survival_df <- load_survival_df(survival_path, event_code)

# Keep only the sample identifier, outcome and covariate columns, then drop
# every sample with a missing value in any of them. all_of() fails loudly if a
# required column is absent (the superseded one_of() only warned).
filtered_survival_df <- survival_df %>%
    decode_figo_stage(to = "c") %>%
    dplyr::select(all_of(c("sample_name", dep_cols, covariate_cols))) %>%
    tidyr::drop_na()

# Load normalized matrisome count data
norm_matrisome_counts_df <- read_tsv(file.path(dset_dir, "norm_matrisome_counts.tsv"))
# Restrict the count columns to the retained samples, then transpose so that
# each row is a sample (keyed by `sample_name`) and each column a gene.
norm_matrisome_counts_t_df <- norm_matrisome_counts_df %>%
    dplyr::select(all_of(c("geneID", filtered_survival_df$sample_name))) %>%
    transpose_df("geneID", "sample_name")
# Combine survival data and normalized count data
joined_survival_counts_df <- filtered_survival_df %>%
    inner_join(norm_matrisome_counts_t_df, by = "sample_name")



[36m--[39m [1m[1mColumn specification[1m[22m [36m----------------------------------------------------------------------------------------------[39m
cols(
  sample_name = [31mcol_character()[39m,
  vital_status = [31mcol_character()[39m,
  survival_time = [32mcol_double()[39m,
  figo_stage = [31mcol_character()[39m,
  days_to_last_follow_up = [32mcol_double()[39m,
  days_to_death = [32mcol_double()[39m,
  age_at_diagnosis = [32mcol_double()[39m,
  age_at_index = [32mcol_double()[39m,
  height = [32mcol_double()[39m,
  weight = [32mcol_double()[39m,
  bmi = [32mcol_double()[39m,
  race = [31mcol_character()[39m,
  ethnicity = [31mcol_character()[39m
)



[36m--[39m [1m[1mColumn specification[1m[22m [36m----------------------------------------------------------------------------------------------[39m
cols(
  .default = col_double(),
  geneID = [31mcol_character()[39m
)
[36mi[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m for the ful

In [8]:
# Preview the first rows of the joined table: survival and covariate columns
# followed by one column per matrisome gene.
head(joined_survival_counts_df)

sample_name,vital_status,survival_time,figo_stage,age_at_diagnosis,race,ethnicity,PGF,TIMP4,C1QTNF6,⋯,PIK3IP1,C1QTNF2,PCSK5,ANXA1,HGF,VWA2,FGF3,POSTN,NTF3,S100A6
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
TCGA-C5-A1BF-01B-11R-A13Y-07,1,570,figo_stage_1,16975,white,not reported,10.800637,6.228003,11.669331,⋯,9.013453,8.190325,9.503647,14.07799,6.569726,7.315604,4.602649,12.0623,5.649441,16.55841
TCGA-MU-A51Y-01A-11R-A26T-07,0,854,figo_stage_2,10199,white,not reported,8.451001,5.851272,10.079584,⋯,12.20841,8.172899,10.515388,16.886,7.38414,7.366949,4.890169,11.011959,5.406776,14.76108
TCGA-EK-A2RM-01A-21R-A18M-07,0,50,figo_stage_1,14842,white,not hispanic or latino,9.674879,7.277164,10.712783,⋯,10.854224,6.581217,8.437154,15.81626,7.644559,6.406766,4.998296,11.731128,6.028879,17.11959
TCGA-Q1-A73P-01A-11R-A32P-07,0,483,figo_stage_1,16450,white,not hispanic or latino,8.036801,5.247645,9.894159,⋯,10.854487,5.629541,9.602922,14.17475,6.987468,6.731154,4.602649,9.293089,4.893018,16.64949
TCGA-C5-A8YT-01A-11R-A37O-07,1,633,figo_stage_1,13253,white,not hispanic or latino,7.830611,5.733875,12.445548,⋯,9.453187,6.398956,12.288955,13.39633,10.228758,8.542025,4.602649,11.765396,5.318924,13.55632
TCGA-C5-A2LZ-01A-11R-A213-07,1,3046,figo_stage_3,24059,white,not hispanic or latino,10.13555,4.865349,8.566221,⋯,11.492135,7.426145,9.480199,15.92715,6.010764,7.405245,4.602649,7.506731,5.423416,14.73802


In [87]:
#' Label each row as "high" or "low" relative to a column's centre
#'
#' Adds a character column `high_low` that is "high" where the value in
#' `col_str` exceeds `cutoff_ratio * center(<column>)` and "low" otherwise.
#' Rows where the comparison is `NA` get `NA`.
#'
#' No temporary column is created, so an existing column named `high` in `df`
#' is left untouched (the previous implementation overwrote and then dropped
#' it).
#'
#' @param df A data frame / tibble containing the column `col_str`.
#' @param col_str Name of the numeric column to threshold (single string).
#' @param center Function reducing the column to a single central value,
#'   e.g. `mean` or `median`.
#' @param cutoff_ratio Multiplier applied to the centre to obtain the
#'   threshold. Defaults to 1.1, the value previously hard-coded.
#' @return `df` with the `high_low` column added (or replaced).
get_high_low <- function(df, col_str, center, cutoff_ratio = 1.1) {
    stopifnot(
        is.character(col_str),
        length(col_str) == 1L,
        col_str %in% names(df),
        is.function(center),
        is.numeric(cutoff_ratio),
        length(cutoff_ratio) == 1L
    )
    # `.data[[col_str]]` looks the column up by its string name inside the
    # data mask, replacing the as.name() + `!!` construction.
    dplyr::mutate(
        df,
        high_low = ifelse(
            .data[[col_str]] > cutoff_ratio * center(.data[[col_str]]),
            "high",
            "low"
        )
    )
}

In [92]:
# Gene whose expression is used to stratify the samples.
col_i <- "HRNR"

# Reduce the joined table to the identifier, the survival outcome and the
# chosen gene, then tag each sample as "high"/"low" relative to the gene's mean.
simp_survival_df <- joined_survival_counts_df %>%
    select(all_of(c("sample_name", "vital_status", "survival_time", col_i))) %>%
    get_high_low(col_i, mean)
# simp_survival_df

In [93]:
# Survival model formula: right-hand side is the high/low expression group.
km_formula_str <- sprintf("Surv(survival_time, vital_status) ~ %s", "high_low")
km_formula <- stats::as.formula(km_formula_str)

In [94]:
# Test for a difference in survival between the "high" and "low" groups of
# `high_low`. survdiff() is called with its default settings, which per the
# survival package docs is the log-rank test (rho = 0).
survdiff(km_formula, data = simp_survival_df)

Call:
survdiff(formula = km_formula, data = simp_survival_df)

                N Observed Expected (O-E)^2/E (O-E)^2/V
high_low=high  54       13     16.4     0.697     0.939
high_low=low  201       53     49.6     0.230     0.939

 Chisq= 0.9  on 1 degrees of freedom, p= 0.3 