In [1]:
library(tidyverse)
library(HDF5Array)
library(SummarizedExperiment)

# Custom package
library(rutils)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Loading required package: DelayedArray
Loading required package: stats4
Loading required package: matrixStats

Attaching package: ‘matrixStats’

The following object is masked from ‘package:dplyr’:

    count

Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:dplyr’:

    co

In [2]:
# Resolve the machine-specific directories listed in the dev paths file.
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")

# TCGA project identifiers and the names of the unified dataset directories.
# NOTE(review): unified_dsets has three entries while projects has four; the
# export loop further down pairs unified_dsets[i] with data_ls[[i]] by
# position — confirm that this ordering is intended.
projects <- c("TCGA-CESC", "TCGA-UCS", "TCGA-UCEC", "TCGA-OV")
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)

# Sub-directory names and their full locations under the data directory.
RSE_objects_dir <- "saved_RSE_objects"
biolinks_dir <- "tcga_biolinks_downloads"
TCGA_dest_dir <- file.path(dirs$data_dir, biolinks_dir)
RSE_objects_dest_dir <- file.path(dirs$data_dir, RSE_objects_dir)

In [3]:
# Load the saved RSE objects; the third argument supplies one
# "<project>_RNA_" string per entry of `projects`.
# NOTE(review): load_RSE_objects is not defined in this notebook (presumably
# it comes from rutils) — confirm that it returns one element per project,
# in the same order as `projects`.
data_ls <- load_RSE_objects(
    RSE_objects_dest_dir,
    projects,
    paste0(projects, "_RNA_")
)

In [4]:
#' Read the column data of a unified dataset and keep only tumor samples.
#'
#' @param dset Name of the dataset directory that contains `coldata.tsv`.
#' @param data_dir Directory holding the dataset directories. Defaults to
#'   `dirs$data_dir`; the default is evaluated lazily, so existing
#'   one-argument calls keep reading the global `dirs` exactly as before.
#' @return The rows of `<data_dir>/<dset>/coldata.tsv` whose `condition`
#'   column equals "tumor".
get_unified_tumor_coldata <- function(dset, data_dir = dirs$data_dir) {
    coldata_path <- file.path(data_dir, dset, "coldata.tsv")
    unified_coldata_df <- read_tsv(coldata_path)
    dplyr::filter(unified_coldata_df, condition == "tumor")
}


#' Convert the column data of an RSE object into a tibble.
#'
#' @param RSE_obj Object passed to `colData()`.
#' @return The result of `as_tibble()` applied to `colData(RSE_obj)`.
get_clinical_df <- function(RSE_obj) {
    RSE_obj %>%
        colData() %>%
        as_tibble()
}


#' Build a survival table by attaching clinical data to sample metadata.
#'
#' @param unified_coldata_df Data frame with a `sample_name` column.
#' @param clinical_df Data frame with a `barcode` column plus
#'   `days_to_last_follow_up` and `days_to_death`.
#' @param keeper_cols Character vector of column names to keep.
#' @return The joined data with an added `survival_time` column, restricted
#'   to the columns selected by `one_of(keeper_cols)`.
get_survival_df <- function(unified_coldata_df, clinical_df, keeper_cols) {
    # Left join: every row of unified_coldata_df is kept, matched on
    # sample_name == barcode.
    joined_df <- left_join(
        unified_coldata_df,
        clinical_df,
        by = c("sample_name" = "barcode")
    )

    # Row-wise maximum of the two day counts; with na.rm = TRUE a missing
    # value in one column falls back to the value in the other.
    timed_df <- mutate(
        joined_df,
        "survival_time" = pmax(days_to_last_follow_up, days_to_death, na.rm = TRUE)
    )

    dplyr::select(timed_df, one_of(keeper_cols))
}

In [5]:
# Columns retained in every exported survival table.
keeper_cols <- c(
    "sample_name", "vital_status", "survival_time", "figo_stage",
    "days_to_last_follow_up", "days_to_death", "age_at_diagnosis",
    "age_at_index", "height", "weight", "bmi", "race", "ethnicity"
)

# For each unified dataset: read its tumor sample metadata, attach the
# clinical data of the RSE object at the same position in data_ls, and write
# the result to survival_data.tsv inside the dataset's directory.
for (i in seq_along(unified_dsets)) {
    unified_tumor_coldata_df <- get_unified_tumor_coldata(unified_dsets[i])
    clinical_df <- get_clinical_df(data_ls[[i]])
    survival_df <- get_survival_df(unified_tumor_coldata_df, clinical_df, keeper_cols)

    write_tsv(
        survival_df,
        file.path(dirs$data_dir, unified_dsets[i], "survival_data.tsv")
    )
}


Parsed with column specification:
cols(
  sample_name = col_character(),
  condition = col_character(),
  data_source = col_character()
)
Parsed with column specification:
cols(
  sample_name = col_character(),
  condition = col_character(),
  data_source = col_character()
)
Parsed with column specification:
cols(
  sample_name = col_character(),
  condition = col_character(),
  data_source = col_character()
)
