In [1]:
library(tidyverse)

# Custom package
library(rutils)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [2]:
# Directory locations used by this notebook, read from the shared dev-paths
# file. Later cells use `dirs$data_dir` and `dirs$analysis_dir`.
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")

# TCGA project identifiers (not referenced by the cells visible here).
projects <- c("TCGA-CESC", "TCGA-UCS", "TCGA-UCEC", "TCGA-OV")

# Per-dataset sub-directory names under `dirs$data_dir`; one is selected
# through `dset_idx`.
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)

# file.path() supplies the separators, and returns an empty vector (instead of
# a bogus "/matrisome/..." path) if `dirs$data_dir` is missing.
matrisome_path <- file.path(
    dirs$data_dir, "matrisome", "matrisome_hs_masterlist.tsv"
)

In [3]:
# Position in `unified_dsets` of the dataset analysed by this notebook.
dset_idx <- 3

# Fail fast: an out-of-range index evaluates to NA and would silently produce
# ".../NA/..." file paths in the cells that build paths from it.
stopifnot(
    length(dset_idx) == 1,
    dset_idx %in% seq_along(unified_dsets)
)

In [4]:
# Read the matrisome master list from the TSV at `matrisome_path`.
# NOTE(review): load_matrisome_df() is presumably exported by the custom
# `rutils` package loaded at the top of the notebook — confirm.
matrisome_df <- load_matrisome_df(matrisome_path)

Parsed with column specification:
cols(
  Division = col_character(),
  Category = col_character(),
  `Gene Symbol` = col_character(),
  `Gene Name` = col_character(),
  Synonyms = col_character(),
  HGNC_IDs = col_double(),
  `HGNC_IDs Links` = col_double(),
  UniProt_IDs = col_character(),
  Refseq_IDs = col_character(),
  Orthology = col_character(),
  Notes = col_character()
)


# Load and filter survival data

In [5]:
# Outcome columns of the survival table.
dep_cols <- c("vital_status", "survival_time")

# Numeric codes for the two vital-status labels; handed to load_survival_df()
# and used to pick out deceased patients.
event_code <- list(Alive = 0, Dead = 1)

# Clinical covariates, without and with FIGO stage.
covariate_cols_no_figo <- c("age_at_diagnosis", "race", "ethnicity")
covariate_cols <- c("figo_stage", covariate_cols_no_figo)

In [6]:
# Survival/clinical table of the selected dataset. file.path() builds the path
# portably instead of pasting "/" separators by hand.
survival_path <- file.path(
    dirs$data_dir, unified_dsets[dset_idx], "survival_data.tsv"
)

# `event_code` is passed to the loader; the resulting `vital_status` column is
# numeric. NOTE(review): presumably the loader recodes Alive/Dead with these
# values — confirm in rutils.
survival_df <- load_survival_df(survival_path, event_code)

Parsed with column specification:
cols(
  sample_name = col_character(),
  vital_status = col_character(),
  survival_time = col_double(),
  figo_stage = col_character(),
  days_to_last_follow_up = col_double(),
  days_to_death = col_double(),
  age_at_diagnosis = col_double(),
  age_at_index = col_double(),
  height = col_double(),
  weight = col_double(),
  bmi = col_double(),
  race = col_character(),
  ethnicity = col_character()
)


In [7]:
# Restrict to deceased patients with complete outcome data.
# drop_na() removes rows with a missing value in any of the three selected
# columns; it replaces `rowSums(is.na(.)) == 0`, which relied on the magrittr
# `.` pronoun inside filter().
filtered_survival_df <- survival_df %>%
    dplyr::select(sample_name, vital_status, survival_time) %>%
    dplyr::filter(vital_status == event_code$Dead) %>%
    tidyr::drop_na()

# Number of retained samples and a preview of the table.
nrow(filtered_survival_df)
head(filtered_survival_df)

sample_name,vital_status,survival_time
<chr>,<dbl>,<dbl>
TCGA-A5-A2K4-01A-11R-A18M-07,1,871
TCGA-AJ-A23N-01A-11R-A22K-07,1,439
TCGA-EY-A3QX-01A-11R-A22K-07,1,989
TCGA-AJ-A3I9-01A-11R-A22K-07,1,519
TCGA-EY-A2ON-01A-21R-A18M-07,1,610
TCGA-AJ-A3EJ-01A-11R-A19W-07,1,50


# Load normalized matrisome count data

In [8]:
# Normalized matrisome counts for the selected dataset (one row per gene, one
# column per sample, plus `geneID`).
norm_matrisome_counts_path <- file.path(
    dirs$data_dir, unified_dsets[dset_idx], "norm_matrisome_counts.tsv"
)

# Keep `geneID` and only the samples that survived the survival filter.
# all_of() marks the names as an external character vector and errors clearly
# if any requested sample column is absent from the count table.
norm_survival_counts_df <- read_tsv(norm_matrisome_counts_path) %>%
    dplyr::select(
        dplyr::all_of(c("geneID", filtered_survival_df$sample_name))
    )

Parsed with column specification:
cols(
  .default = col_double(),
  geneID = col_character()
)
See spec(...) for full column specifications.


In [9]:
# Flip the gene-by-sample table into sample-by-gene form (gene IDs become the
# column names), attach each sample's survival time, and put the identifying
# columns first. `vital_status` is dropped.
norm_survival_counts_t_df <-
    t(column_to_rownames(norm_survival_counts_df, var = "geneID")) %>%
    as_tibble(rownames = "sample_name") %>%
    inner_join(filtered_survival_df, by = "sample_name") %>%
    dplyr::select(-vital_status) %>%
    dplyr::select(sample_name, survival_time, everything())

In [10]:
# Sanity check: number of samples (rows) in the transposed table, then a
# preview of its first rows.
nrow(norm_survival_counts_t_df)
head(norm_survival_counts_t_df)

sample_name,survival_time,PGF,TIMP4,C1QTNF6,TNC,PRL,OGN,C1QL3,FGB,⋯,PIK3IP1,C1QTNF2,PCSK5,ANXA1,HGF,VWA2,FGF3,POSTN,NTF3,S100A6
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
TCGA-A5-A2K4-01A-11R-A18M-07,871,7.60723,4.169268,11.25537,12.235577,4.856636,4.405101,4.914654,7.523761,⋯,10.468971,5.479441,8.614214,12.57315,7.562306,7.088642,3.586865,9.456699,7.277034,15.67178
TCGA-AJ-A23N-01A-11R-A22K-07,439,7.523942,4.161205,10.830059,11.411592,4.393914,4.83987,4.83987,3.586865,⋯,10.035907,5.741274,10.296439,13.10422,7.351969,7.391397,4.248627,13.159221,6.602252,14.81338
TCGA-EY-A3QX-01A-11R-A22K-07,989,7.816775,4.066913,9.444013,13.437827,8.365137,4.865358,4.781944,3.586865,⋯,9.832998,5.048594,7.964041,13.32012,5.31761,8.156598,3.92709,9.277586,4.340787,16.14524
TCGA-AJ-A3I9-01A-11R-A22K-07,519,7.321404,5.142235,7.355847,11.544473,3.586865,7.249919,5.142235,3.586865,⋯,9.577459,4.656818,6.480327,11.28885,6.305301,9.278474,3.586865,10.459636,3.586865,11.84801
TCGA-EY-A2ON-01A-21R-A18M-07,610,8.274716,3.586865,10.434548,9.930904,4.951923,3.586865,5.57426,6.485578,⋯,10.581401,5.536235,7.092757,12.67511,5.013691,6.466455,3.586865,8.756968,5.07215,14.35194
TCGA-AJ-A3EJ-01A-11R-A19W-07,50,8.972874,5.187273,10.302243,12.990765,4.178013,4.933524,3.586865,3.586865,⋯,11.219672,6.172985,9.314071,13.14225,6.490858,7.914754,3.586865,11.75054,6.869177,13.5515


In [11]:
# Correlate every gene column with `survival_time`.
# The gene columns are selected by name (everything except the two identifying
# columns) rather than by position, so the call does not depend on the column
# order produced upstream.
# NOTE(review): colwise_cor_test() is presumably from `rutils`; `v` appears to
# name the output column holding the tested variable (the result has a
# `geneID` column) — confirm.
# NOTE(review): the run emits "the standard deviation is zero", which suggests
# at least one gene is constant across these samples and has an undefined
# correlation — confirm how the helper reports those genes.
cor_test_df <- colwise_cor_test(
    norm_survival_counts_t_df,
    setdiff(
        colnames(norm_survival_counts_t_df),
        c("sample_name", "survival_time")
    ),
    "survival_time",
    v = "geneID"
)

“the standard deviation is zero”

In [12]:
# Inspect the correlation results: dimensions, then the first rows
# (columns shown in the output: geneID, cor, pval, n).
dim(cor_test_df)
head(cor_test_df)

geneID,cor,pval,n
<chr>,<dbl>,<dbl>,<int>
PGF,-0.07598693,0.72416586,24
TIMP4,0.05152558,0.81102341,24
C1QTNF6,0.12352882,0.56524457,24
TNC,-0.11081136,0.6062213,24
PRL,0.3476218,0.09602153,24
OGN,0.01062888,0.9606865,24


In [13]:
# Save the per-gene correlation results as
# "<analysis_dir>/<dataset>_cor_results.tsv". file.path() joins the directory
# and file name instead of pasting a "/" by hand.
write_tsv(
    cor_test_df,
    file.path(
        dirs$analysis_dir,
        paste0(unified_dsets[dset_idx], "_cor_results.tsv")
    )
)