In [1]:
library(tidyverse)

# Custom package
library(rutils)


-- [1mAttaching packages[22m -------------------------------------------------------------------- tidyverse 1.3.0 --

[32mv[39m [34mggplot2[39m 3.3.3     [32mv[39m [34mpurrr  [39m 0.3.4
[32mv[39m [34mtibble [39m 3.0.6     [32mv[39m [34mdplyr  [39m 1.0.4
[32mv[39m [34mtidyr  [39m 1.1.2     [32mv[39m [34mstringr[39m 1.4.0
[32mv[39m [34mreadr  [39m 1.4.0     [32mv[39m [34mforcats[39m 0.5.1

-- [1mConflicts[22m ----------------------------------------------------------------------- tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



In [2]:
# Shared constants: development directories, dataset identifiers, file paths
# and the coding used for survival events.
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")
projects <- c(
    "TCGA-CESC",
    "TCGA-UCS",
    "TCGA-UCEC",
    "TCGA-OV"
)
# NOTE(review): four projects are listed above but only three unified datasets
# here -- confirm this is intentional.
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)
# Path to the matrisome master list TSV under the data directory
matrisome_path <- file.path(dirs$data_dir, "matrisome", "matrisome_hs_masterlist.tsv")

# Numeric code for each vital status
event_code <- list(Alive = 0, Dead = 1)
# covariate_cols_no_figo <- c("age_at_diagnosis", "race", "ethnicity")
# covariate_cols <- c("figo_stage", covariate_cols_no_figo)
# Names of the outcome columns
dep_cols <- c("vital_status", "survival_time")

In [3]:
# Position within unified_dsets of the dataset analysed in the cells below
dset_idx <- 3

# Overall

In [4]:
# Read the full count table for the selected dataset, renaming its Hugo_Symbol
# column to geneID, then read the matrisome-only count table.
counts_df <- file.path(dirs$data_dir, unified_dsets[dset_idx], "counts.tsv") %>%
    read_tsv() %>%
    rename(geneID = Hugo_Symbol)
m_counts_df <- file.path(dirs$data_dir, unified_dsets[dset_idx], "matrisome_counts.tsv") %>%
    read_tsv()


[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------[39m
cols(
  .default = col_double(),
  Hugo_Symbol = [31mcol_character()[39m
)
[36mi[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m for the full column specifications.



[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------[39m
cols(
  .default = col_double(),
  geneID = [31mcol_character()[39m
)
[36mi[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m for the full column specifications.




In [5]:
# Number of rows in the full count table (equals the length of its geneID column)
nrow(counts_df)

In [6]:
# Number of rows in the matrisome count table (equals the length of its geneID column)
nrow(m_counts_df)

# Tumor & normal

In [7]:
# Read the per-sample metadata table for the selected dataset
coldata_df <- file.path(dirs$data_dir, unified_dsets[dset_idx], "coldata.tsv") %>%
    read_tsv()


[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------[39m
cols(
  sample_name = [31mcol_character()[39m,
  condition = [31mcol_character()[39m,
  data_source = [31mcol_character()[39m
)




In [8]:
# Number of samples per condition. count() does the grouping itself, so the
# result is an ordinary ungrouped tibble (group_by() %>% count() would leave
# it grouped by condition).
coldata_df %>% count(condition)
# Total number of samples across all conditions
nrow(coldata_df)

condition,n
<chr>,<int>
healthy,105
tumor,141


# Survival

In [9]:
# Load survival data, then keep only samples that have no missing values in
# the three selected columns and whose vital_status equals the "Dead" code.
survival_path <- paste0(dirs$data_dir, "/", unified_dsets[dset_idx], "/survival_data.tsv")
survival_df <- load_survival_df(survival_path, event_code)
filtered_survival_df <- survival_df %>%
    dplyr::select(sample_name, vital_status, survival_time) %>%
    tidyr::drop_na() %>%
    dplyr::filter(vital_status == event_code$Dead)

# Load normalized matrisome count data. The file is read once and the parsed
# table is reused for both derived tables below.
norm_matrisome_counts_path <- paste0(dirs$data_dir, "/", unified_dsets[dset_idx], "/norm_matrisome_counts.tsv")
norm_matrisome_counts_df <- read_tsv(norm_matrisome_counts_path)

# Restrict the count columns to the samples kept in filtered_survival_df.
# all_of() makes it explicit that the names come from an external vector and
# raises an error if any of them is missing from the count table.
norm_survival_counts_df <- norm_matrisome_counts_df %>%
    dplyr::select(all_of(c("geneID", filtered_survival_df$sample_name)))

# Transposed copy of the full table: one row per sample, one column per geneID
norm_matrisome_counts_t_df <- norm_matrisome_counts_df %>%
    column_to_rownames(var = "geneID") %>%
    t() %>%
    as_tibble(rownames = "sample_name")

# Transposed copy of the restricted table, joined with survival_time;
# vital_status is dropped after the join.
norm_survival_counts_t_df <- norm_survival_counts_df %>%
    column_to_rownames(var = "geneID") %>%
    t() %>%
    as_tibble(rownames = "sample_name") %>%
    inner_join(filtered_survival_df, by = "sample_name") %>%
    dplyr::select(sample_name, survival_time, everything(), -vital_status)



[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------[39m
cols(
  sample_name = [31mcol_character()[39m,
  vital_status = [31mcol_character()[39m,
  survival_time = [32mcol_double()[39m,
  figo_stage = [31mcol_character()[39m,
  days_to_last_follow_up = [32mcol_double()[39m,
  days_to_death = [32mcol_double()[39m,
  age_at_diagnosis = [32mcol_double()[39m,
  age_at_index = [32mcol_double()[39m,
  height = [32mcol_double()[39m,
  weight = [32mcol_double()[39m,
  bmi = [32mcol_double()[39m,
  race = [31mcol_character()[39m,
  ethnicity = [31mcol_character()[39m
)



[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------[39m
cols(
  .default = col_double(),
  geneID = [31mcol_character()[39m
)
[36mi[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m for the full column specifica

In [10]:
# Row counts of the survival-filtered table and of the full transposed table
dec_cohort <- nrow(norm_survival_counts_t_df)
full_cohort <- nrow(norm_matrisome_counts_t_df)

# Display both sizes, then the survival-filtered share of the full table
full_cohort
dec_cohort
dec_cohort / full_cohort