In [1]:
library(tidyverse)

# Custom package
library(rutils)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [2]:
# Directory lookup read from the dev-paths file; `data_dir` and `analysis_dir`
# are the entries used by the cells below.
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")

# TCGA project identifiers and the names of the unified dataset directories.
projects <- c(
    "TCGA-CESC",
    "TCGA-UCS",
    "TCGA-UCEC",
    "TCGA-OV"
)
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)

# Matrisome master list, located under the data directory.
matrisome_path <- file.path(dirs$data_dir, "matrisome", "matrisome_hs_masterlist.tsv")

In [3]:
# Index into `unified_dsets` selecting the dataset analyzed by the cells below
# (1 selects "unified_cervical_data").
i <- 1

In [4]:
# Read the matrisome master list from `matrisome_path`.
# NOTE(review): `matrisome_df` is not referenced again in the visible cells —
# confirm whether it is still needed.
matrisome_df <- load_matrisome_df(matrisome_path)

Parsed with column specification:
cols(
  Division = col_character(),
  Category = col_character(),
  `Gene Symbol` = col_character(),
  `Gene Name` = col_character(),
  Synonyms = col_character(),
  HGNC_IDs = col_double(),
  `HGNC_IDs Links` = col_double(),
  UniProt_IDs = col_character(),
  Refseq_IDs = col_character(),
  Orthology = col_character(),
  Notes = col_character()
)


# Load and filter survival data

In [5]:
# Numeric codes for vital status; passed to `load_survival_df()` below.
event_code <- list(Alive = 0, Dead = 1)

# Outcome columns, followed by clinical covariates with and without FIGO stage.
dep_cols <- c("vital_status", "survival_time")
covariate_cols_no_figo <- c("age_at_diagnosis", "race", "ethnicity")
covariate_cols <- c("figo_stage", covariate_cols_no_figo)

In [6]:
# Survival table for the dataset chosen by `i`; `event_code` is handed to the
# loader along with the path.
survival_path <- file.path(dirs$data_dir, unified_dsets[i], "survival_data.tsv")
survival_df <- load_survival_df(survival_path, event_code)

Parsed with column specification:
cols(
  sample_name = col_character(),
  vital_status = col_character(),
  survival_time = col_double(),
  figo_stage = col_character(),
  days_to_last_follow_up = col_double(),
  days_to_death = col_double(),
  age_at_diagnosis = col_double(),
  age_at_index = col_double(),
  height = col_double(),
  weight = col_double(),
  bmi = col_double(),
  race = col_character(),
  ethnicity = col_character()
)


In [26]:
# Keep only the outcome columns for samples coded as dead, then discard any
# row with a missing value in the retained columns.
filtered_survival_df <- survival_df %>%
    dplyr::select(sample_name, vital_status, survival_time) %>%
    dplyr::filter(vital_status == event_code$Dead) %>%
    tidyr::drop_na()

# Report how many samples remain and preview them.
nrow(filtered_survival_df)
head(filtered_survival_df)

sample_name,vital_status,survival_time
<chr>,<dbl>,<dbl>
TCGA-C5-A1BF-01B-11R-A13Y-07,1,570
TCGA-C5-A8YT-01A-11R-A37O-07,1,633
TCGA-C5-A2LZ-01A-11R-A213-07,1,3046
TCGA-VS-A9V1-01A-11R-A42T-07,1,157
TCGA-C5-A1BE-01B-11R-A13Y-07,1,2094
TCGA-C5-A8XH-01A-11R-A37O-07,1,1394


# Load normalized matrisome count data

In [27]:
# Load the normalized matrisome counts for the selected dataset and keep only
# the gene identifier column plus the samples retained in
# `filtered_survival_df`.
norm_matrisome_counts_path <- file.path(dirs$data_dir, unified_dsets[i], "norm_matrisome_counts.tsv")
norm_survival_counts_df <- read_tsv(norm_matrisome_counts_path) %>%
    # all_of() marks the names as an external character vector rather than
    # column expressions, and errors if a requested sample column is absent.
    dplyr::select(dplyr::all_of(c("geneID", filtered_survival_df$sample_name)))

Parsed with column specification:
cols(
  .default = col_double(),
  geneID = col_character()
)
See spec(...) for full column specifications.


In [28]:
# Transpose the gene-by-sample counts into one row per sample, attach each
# sample's survival time, and put the identifier and outcome columns first.
norm_survival_counts_t_df <- norm_survival_counts_df %>%
    tibble::column_to_rownames("geneID") %>%
    t() %>%
    tibble::as_tibble(rownames = "sample_name") %>%
    dplyr::inner_join(filtered_survival_df, by = "sample_name") %>%
    # vital_status is constant after filtering, so it is dropped here.
    dplyr::select(-vital_status) %>%
    dplyr::relocate(sample_name, survival_time)

In [29]:
# Show the number of samples and preview the first rows of the joined table.
nrow(norm_survival_counts_t_df)
head(norm_survival_counts_t_df)

sample_name,survival_time,PGF,TIMP4,C1QTNF6,TNC,PRL,OGN,C1QL3,FGB,⋯,PIK3IP1,C1QTNF2,PCSK5,ANXA1,HGF,VWA2,FGF3,POSTN,NTF3,S100A6
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
TCGA-C5-A1BF-01B-11R-A13Y-07,570,10.800637,6.228003,11.669331,13.00293,5.063964,4.869744,5.063964,8.834522,⋯,9.013453,8.190325,9.503647,14.07799,6.569726,7.315604,4.602649,12.0623,5.649441,16.55841
TCGA-C5-A8YT-01A-11R-A37O-07,633,7.830611,5.733875,12.445548,13.76547,5.455125,13.049104,5.146455,5.074289,⋯,9.453187,6.398956,12.288955,13.39633,10.228758,8.542025,4.602649,11.765396,5.318924,13.55632
TCGA-C5-A2LZ-01A-11R-A213-07,3046,10.13555,4.865349,8.566221,10.84231,4.865349,5.241781,6.669045,4.602649,⋯,11.492135,7.426145,9.480199,15.92715,6.010764,7.405245,4.602649,7.506731,5.423416,14.73802
TCGA-VS-A9V1-01A-11R-A42T-07,157,6.8931,6.334201,10.871571,12.21604,4.602649,5.153289,5.153289,6.74941,⋯,9.31918,7.44438,8.707254,12.23439,6.011689,10.391491,4.602649,7.767144,5.376786,16.71254
TCGA-C5-A1BE-01B-11R-A13Y-07,2094,10.642039,5.348449,8.94522,13.41922,4.602649,5.867905,5.646251,4.602649,⋯,11.696884,6.38498,9.774029,15.38194,6.205261,7.163053,4.957257,10.113788,5.213815,15.56497
TCGA-C5-A8XH-01A-11R-A37O-07,1394,9.633752,5.908552,11.672191,13.86377,4.602649,5.339887,5.702581,5.3896,⋯,11.148165,7.52614,8.561116,14.40442,6.462928,6.10257,4.602649,9.104547,5.088257,15.19735


In [31]:
# Test every gene column against survival time. The first two columns
# (sample_name, survival_time) are not genes, so they are skipped.
# NOTE(review): this call emits "the standard deviation is zero", presumably
# from genes with constant expression across the retained samples — confirm
# how colwise_cor_test() reports those genes.
cor_test_df <- colwise_cor_test(
    norm_survival_counts_t_df,
    tail(colnames(norm_survival_counts_t_df), -2),
    "survival_time",
    v = "geneID"
)

“the standard deviation is zero”

In [32]:
# Display the per-gene correlation results.
cor_test_df

geneID,cor,pval,n
<chr>,<dbl>,<dbl>,<int>
PGF,-0.032679924,0.794485949,66
TIMP4,-0.115274808,0.356694904,66
C1QTNF6,-0.321807438,0.008415805,66
TNC,0.034756367,0.781741120,66
PRL,-0.102726232,0.411771043,66
OGN,0.162835258,0.191436111,66
C1QL3,-0.014145547,0.910245522,66
FGB,-0.153981575,0.217043333,66
NDNF,-0.055158147,0.660023859,66
CCL22,0.264248155,0.032031795,66


In [33]:
# Save the correlation table in the analysis directory, with the file name
# prefixed by the selected dataset's name.
write_tsv(
    cor_test_df,
    file.path(dirs$analysis_dir, paste0(unified_dsets[i], "_cor_results.tsv"))
)