In [1]:
library(tidyverse)
library(HDF5Array)
library(SummarizedExperiment)
library(TCGAbiolinks)

# Custom package
library(rutils)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Loading required package: DelayedArray
Loading required package: stats4
Loading required package: matrixStats

Attaching package: ‘matrixStats’

The following object is masked from ‘package:dplyr’:

    count

Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:dplyr’:

    co

In [2]:
# Directory locations returned by rutils::get_dev_directories() for the given
# dev-paths file; only `data_dir` is used in this section.
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")

# TCGA project identifiers and the unified datasets processed below.
# NOTE(review): the processing loop pairs unified_dsets[i] with projects[i] by
# position, so the two vectors must be kept in matching order.
projects <- c("TCGA-CESC", "TCGA-OV", "TCGA-UCS", "TCGA-UCEC")
unified_dsets <- c("unified_cervical_data")

# Sub-directory names located under `dirs$data_dir`.
RSE_objects_dir <- "saved_RSE_objects"
biolinks_dir <- "tcga_biolinks_downloads"

# file.path() joins path components with "/" instead of hand-pasting separators.
TCGA_dest_dir <- file.path(dirs$data_dir, biolinks_dir)
RSE_objects_dest_dir <- file.path(dirs$data_dir, RSE_objects_dir)

In [3]:
# Load the saved RSE objects for every project. The third argument supplies one
# "<project>_RNA_" string per project (presumably a file-name prefix; confirm
# against load_RSE_objects()).
data_ls <- load_RSE_objects(
    RSE_objects_dest_dir,
    projects,
    paste0(projects, "_RNA_")
)

# Collect survival/metadata

Survival time (`days_to_death`) is only recorded for tumor patients who are no longer alive, so this dataset will
contain strictly fewer samples than the full expression-count data.

In [4]:
# Read a unified dataset's column data and keep only its tumor samples.
#
# dset: name of the unified dataset directory; the file
#       "<dirs$data_dir>/<dset>/coldata.tsv" is read.
# Returns the rows whose `condition` column equals "tumor".
# Depends on the global `dirs` list for the data directory.
get_unified_tumor_coldata <- function(dset) {
    coldata_path <- paste0(dirs$data_dir, "/", dset, "/coldata.tsv")
    all_coldata_df <- read_tsv(coldata_path)
    dplyr::filter(all_coldata_df, condition == "tumor")
}


# Build a per-sample survival/metadata table from an RSE object.
#
# RSE_obj: object whose `$` accessor exposes `barcode`, `days_to_death`,
#          `vital_status`, `definition`, `age_at_diagnosis`, `bmi` and `race`.
# Returns a tibble holding those fields (except `race`) followed by one
# indicator column per race level, named "race_<level>" with spaces replaced
# by underscores.
get_survival_meta <- function(RSE_obj) {
    race_df <- tibble(race = RSE_obj$race)

    # model.matrix() on raw data builds its model frame with the session's
    # default na.action (na.omit unless changed), which drops samples with a
    # missing race; the matrix would then have fewer rows than the metadata
    # below and bind_cols() would fail. Building the frame with na.pass keeps
    # one row per sample (indicators are NA where race is missing).
    race_frame <- stats::model.frame(~race - 1, data = race_df, na.action = stats::na.pass)
    race_one_hot <- stats::model.matrix(~race - 1, data = race_frame)

    # "raceblack or african american" -> "race_black_or_african_american"
    colnames(race_one_hot) <- gsub(" ", "_", colnames(race_one_hot))
    colnames(race_one_hot) <- sub("race", "race_", colnames(race_one_hot))

    survival_meta_df <- tibble(
        barcode = RSE_obj$barcode,
        days_to_death = RSE_obj$days_to_death,
        vital_status = RSE_obj$vital_status,
        definition = RSE_obj$definition,
        age_at_diagnosis = RSE_obj$age_at_diagnosis,
        bmi = RSE_obj$bmi
    ) %>% bind_cols(as_tibble(race_one_hot))

    # Return the table explicitly; ending on an assignment returned it invisibly.
    survival_meta_df
}


# Attach harmonized survival metadata to the unified tumor samples.
#
# unified_tumor_df: unified coldata rows, keyed by `sample_name`.
# survival_df: survival metadata keyed by `barcode`, with a `definition` column.
# Returns, invisibly, the unified rows that matched a barcode and whose
# `definition` is "Primary solid Tumor". `barcode` and `definition` are
# dropped; `present_in_harmonized` is retained.
get_unified_survival_meta <- function(unified_tumor_df, survival_df) {
    # keep = TRUE retains `barcode` next to `sample_name`, so an NA barcode
    # marks a unified sample with no harmonized match.
    joined_df <- left_join(unified_tumor_df, survival_df, by = c("sample_name" = "barcode"), keep = TRUE)
    flagged_df <- mutate(joined_df, present_in_harmonized = !is.na(barcode))
    primary_tumor_df <- dplyr::filter(flagged_df, present_in_harmonized, definition == "Primary solid Tumor")

    # `barcode` is superseded by `present_in_harmonized`, and `definition` is
    # now redundant with `condition`.
    invisible(dplyr::select(primary_tumor_df, -barcode, -definition))
}

In [5]:
# For each unified dataset, join its tumor coldata with the survival metadata
# of the corresponding project and write the result to
# "<data_dir>/<dataset>/survival_data.tsv".

# unified_dsets[i] is matched to projects[i] by position, so every unified
# dataset needs a project at the same index.
stopifnot(length(unified_dsets) <= length(projects))

for (i in seq_along(unified_dsets)) {
    unified_tumor_coldata_df <- get_unified_tumor_coldata(unified_dsets[i])
    survival_meta_df <- get_survival_meta(data_ls[[projects[i]]])
    unified_survival_meta_df <- get_unified_survival_meta(unified_tumor_coldata_df, survival_meta_df)

    # Keep only samples with a recorded days_to_death that are present in the
    # harmonized data, then drop the helper flag column.
    filtered_unified_survival_meta_df <- unified_survival_meta_df %>%
        dplyr::filter(!is.na(days_to_death), present_in_harmonized) %>%
        dplyr::select(-present_in_harmonized)

    write_tsv(
        filtered_unified_survival_meta_df,
        file.path(dirs$data_dir, unified_dsets[i], "survival_data.tsv")
    )
}

Parsed with column specification:
cols(
  sample_name = col_character(),
  condition = col_character(),
  data_source = col_character()
)
