In [29]:
library(tidyverse)
library(HDF5Array)
library(SummarizedExperiment)
library(TCGAbiolinks)

# Custom package
library(rutils)

In [30]:
# Project-wide directory locations, read from the developer paths file.
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")

# TCGA projects whose saved RSE objects are loaded, and the unified datasets
# that receive a survival table. The export loop pairs `data_ls[[i]]` with
# `unified_dsets[i]` by position, so the order of these two vectors matters;
# "TCGA-OV" has no counterpart in `unified_dsets`.
# NOTE(review): presumably the loader returns one object per project, in the
# order given here -- confirm.
projects <- c(
    "TCGA-CESC",
    "TCGA-UCS",
    "TCGA-UCEC",
    "TCGA-OV"
)
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)

# Sub-directories of `dirs$data_dir` and their full paths.
biolinks_dir <- "tcga_biolinks_downloads"
RSE_objects_dir <- "saved_RSE_objects"
TCGA_dest_dir <- paste0(dirs$data_dir, "/", biolinks_dir)
RSE_objects_dest_dir <- paste0(dirs$data_dir, "/", RSE_objects_dir)

In [31]:
# Load the saved RSE objects for the listed projects from
# `RSE_objects_dest_dir`; the third argument supplies one "<project>_RNA_"
# string per project.
# NOTE(review): presumably used as a file-name prefix by the loader -- confirm.
data_ls <- load_RSE_objects(
    RSE_objects_dest_dir,
    projects,
    paste0(projects, "_RNA_")
)

# Collect survival/metadata

Survival data is only collected for tumor samples (survival time is the larger of `days_to_last_follow_up` and
`days_to_death`, ignoring missing values), so this dataset will be smaller (in terms of samples) than the full expression counts.

In [34]:
# Read the sample sheet of one unified dataset and keep only its tumor rows.
#
# dset: name of the dataset sub-directory under `dirs$data_dir` (`dirs` is
#       looked up in the enclosing environment, not passed in).
#
# Returns the rows of `<data_dir>/<dset>/coldata.tsv` whose `condition`
# column equals "tumor".
get_unified_tumor_coldata <- function(dset) {
    coldata_path <- paste0(dirs$data_dir, "/", dset, "/coldata.tsv")
    all_samples_df <- read_tsv(coldata_path)
    dplyr::filter(all_samples_df, condition == "tumor")
}


# Build the clinical table for one RSE object: its colData plus one-hot
# indicator columns for `race` and `ethnicity`.
#
# RSE_obj: object whose `colData()` holds the per-sample clinical fields and
#          which exposes `$race` and `$ethnicity`.
#
# Returns a tibble with every colData column followed by `race_<level>` and
# `ethnicity_<level>` indicator columns (spaces in a level become "_").
get_clinical_df <- function(RSE_obj) {
    # One-hot encode a single categorical vector. Columns are named
    # "<prefix>_<level>"; the underscore is inserted only after the leading
    # variable name, so a level label that itself contains the prefix text is
    # left untouched (the previous gsub() on "ethnicity" would have rewritten
    # every occurrence).
    encode_one_hot <- function(values, prefix) {
        value_df <- as_tibble(setNames(list(values), prefix))
        # na.pass keeps samples with a missing value in the model frame so the
        # indicator matrix has one row per sample and lines up with colData.
        # Dropping the intercept gives one column per level (no reference
        # level is absorbed).
        one_hot <- model.matrix(
            reformulate(prefix, intercept = FALSE),
            model.frame(~ ., value_df, na.action = na.pass)
        )
        # model.matrix names columns "<prefix><level>"; split off the level
        # part by position rather than by pattern matching.
        level_labels <- substring(colnames(one_hot), nchar(prefix) + 1L)
        colnames(one_hot) <- paste0(prefix, "_", gsub(" ", "_", level_labels))
        one_hot
    }

    race_one_hot <- encode_one_hot(RSE_obj$race, "race")
    ethnicity_one_hot <- encode_one_hot(RSE_obj$ethnicity, "ethnicity")

    as_tibble(colData(RSE_obj)) %>%
        bind_cols(as_tibble(race_one_hot)) %>%
        bind_cols(as_tibble(ethnicity_one_hot))
}


# Assemble the survival table for a set of tumor samples.
#
# unified_tumor_samples_df: data frame with a `sample_name` column.
# clinical_df: clinical table with a `barcode` column plus
#              `days_to_last_follow_up` and `days_to_death`.
# keeper_cols: character vector of column names to keep in the result.
#
# Returns one row per tumor sample (left join, so samples without a clinical
# match are kept), restricted to `keeper_cols`.
get_survival_df <- function(unified_tumor_samples_df, clinical_df, keeper_cols) {
    # After the join the key column keeps the left-hand name `sample_name`.
    annotated_df <- left_join(
        unified_tumor_samples_df,
        clinical_df,
        by = c("sample_name" = "barcode")
    )
    # Survival time is whichever of the two day counts is larger; with
    # na.rm = TRUE a missing value in one of them is ignored.
    timed_df <- mutate(
        annotated_df,
        survival_time = pmax(days_to_last_follow_up, days_to_death, na.rm = TRUE)
    )
    dplyr::select(timed_df, one_of(keeper_cols))
}

In [35]:
# For every unified dataset, join its tumor samples with the clinical data of
# the RSE object at the same position in `data_ls`, then write the result to
# `<data_dir>/<dataset>/survival_data.tsv`.
for (i in seq_along(unified_dsets)) {
    # Only the sample names are needed from the unified sample sheet.
    unified_tumor_samples_df <- dplyr::select(
        get_unified_tumor_coldata(unified_dsets[i]),
        sample_name
    )
    clinical_df <- get_clinical_df(data_ls[[i]])

    # The one-hot columns differ per dataset, so collect them by prefix.
    race_cols <- grep("^race_", colnames(clinical_df), value = TRUE)
    ethnicity_cols <- grep("^ethnicity_", colnames(clinical_df), value = TRUE)
    keeper_cols <- c(
        "sample_name",
        "vital_status",
        "days_to_last_follow_up",
        "days_to_death",
        "survival_time",
        "age_at_diagnosis",
        "age_at_index",
        "height",
        "weight",
        "bmi",
        race_cols,
        ethnicity_cols
    )
    survival_df <- get_survival_df(unified_tumor_samples_df, clinical_df, keeper_cols)

    write_tsv(
        survival_df,
        paste0(dirs$data_dir, "/", unified_dsets[i], "/survival_data.tsv")
    )
}

Parsed with column specification:
cols(
  sample_name = col_character(),
  condition = col_character(),
  data_source = col_character()
)
Parsed with column specification:
cols(
  sample_name = col_character(),
  condition = col_character(),
  data_source = col_character()
)
Parsed with column specification:
cols(
  sample_name = col_character(),
  condition = col_character(),
  data_source = col_character()
)
