In [1]:
library(tidyverse)
library(HDF5Array)
library(SummarizedExperiment)

# Custom package
library(rutils)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Loading required package: DelayedArray
Loading required package: stats4
Loading required package: matrixStats

Attaching package: ‘matrixStats’

The following object is masked from ‘package:dplyr’:

    count

Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:dplyr’:

    co

In [2]:
# TCGA project identifiers handled by this notebook.
projects <- c("TCGA-CESC", "TCGA-OV", "TCGA-UCS", "TCGA-UCEC")

# Data root, given relative to the current working directory.
data_root <- "../../../../../mnt/d"

# Location of the matrisome master list TSV under the data root.
matrisome_file <- file.path(
    data_root, "unified_TCGA_GTEx_data", "matrisome", "matrisome_hs_masterlist.tsv"
)

# Named character vector (names = project IDs) holding, for each project,
# the path <data_root>/TCGA/<project>-TCGAbiolinks.
project_dirs <- vapply(
    projects,
    function(proj) file.path(data_root, "TCGA", paste0(proj, "-TCGAbiolinks")),
    character(1)
)

In [3]:
# Load one saved HDF5-backed experiment per project.
#
# dir:   directory passed through to loadHDF5SummarizedExperiment().
# projs: project identifiers; the first "-" in each is replaced by "_" and the
#        result is used both as the file prefix and as the list name.
#
# Returns a list of loaded objects keyed by the underscore-style name.
load_RSE_objects <- function(dir, projs) {
    # sub() is vectorized, so all keys can be derived up front.
    keys <- sub("-", "_", projs)
    loaded <- list()
    for (key in keys) {
        loaded[[key]] <- loadHDF5SummarizedExperiment(dir = dir, prefix = key)
    }
    loaded
}


# Keep only the columns of each dataset whose `field` value is in `keepers`.
#
# rses:    named list of datasets; each must support `[[field]]` and
#          two-dimensional `[, mask]` indexing.
# field:   name of the per-sample annotation to test.
# keepers: values of `field` to retain.
#
# Returns a list with one column-subsetted dataset per name in `rses`.
filter_sample_types <- function(rses, field, keepers) {
    kept <- list()
    for (dset in names(rses)) {
        rse <- rses[[dset]]
        # %in% never yields NA, so the mask is always a clean logical vector.
        is_kept <- rse[[field]] %in% keepers
        kept[[dset]] <- rse[, is_kept]
    }
    kept
}


# Export, for every dataset in `rses`, a count table and a sample table as TSV.
#
# rses:        named list of experiment objects; the list names are used as
#              file-name prefixes and as the `project` column value.
# label_field: name of the per-sample annotation stored in the `condition`
#              column of the sample table.
# dest_dir:    base output directory.
# dest_subdir: sub-path below `dest_dir`; created if it does not exist.
#
# For each name `n`, two files are written to <dest_dir>/<dest_subdir>:
#   <n>_counts.tsv  - "HTSeq - Counts" assay joined to the gene annotation,
#                     with ensembl_gene_id and external_gene_name first.
#   <n>_coldata.tsv - columns sample_name, condition, project.
#
# Called for its side effects; returns NULL invisibly. An empty `rses` is a
# no-op.
prep_and_save_count_data <- function(rses, label_field, dest_dir, dest_subdir) {
    # Nothing to export; this also avoids indexing rses[[1]] on an empty list.
    if (length(rses) == 0) {
        return(invisible(NULL))
    }

    # Ensure the target directory exists so the writes below do not fail.
    out_dir <- file.path(dest_dir, dest_subdir)
    if (!dir.exists(out_dir)) {
        dir.create(out_dir, recursive = TRUE)
    }

    # Gene annotation is taken from the first dataset and reused for all.
    # NOTE(review): assumes every dataset shares the same gene annotation and
    # that it has exactly the columns ensembl_gene_id, external_gene_name and
    # original_ensembl_gene_id - confirm.
    id_symbol_map <- as_tibble(rowData(rses[[1]]))

    for (n in names(rses)) {
        rse <- rses[[n]]

        counts_df <- assays(rse)[["HTSeq - Counts"]] %>%
            as_tibble(rownames = "ensembl_gene_id") %>%
            inner_join(id_symbol_map, by = "ensembl_gene_id") %>%
            dplyr::select(ensembl_gene_id, external_gene_name, everything()) %>%
            dplyr::select(-original_ensembl_gene_id)

        # Everything after the two leading gene-identifier columns is treated
        # as a sample column.
        coldata_df <- tibble(
            sample_name = colnames(counts_df)[-c(1:2)],
            condition = rse[[label_field]],
            project = n
        )

        # The destination is passed positionally: the second argument of
        # write_tsv() is named `path` in readr < 1.4.0 and `file` afterwards.
        write_tsv(counts_df, file.path(out_dir, paste0(n, "_counts.tsv")))
        write_tsv(coldata_df, file.path(out_dir, paste0(n, "_coldata.tsv")))
    }

    invisible(NULL)
}

In [4]:
# Load the saved experiment object for every project.
data_ls <- load_RSE_objects(
    dir = file.path(data_root, "TCGA", "saved_RSE_objects"),
    projs = projects
)

In [5]:
# Restrict every dataset to samples whose `definition` is "Primary solid Tumor".
tumor_data_ls <- filter_sample_types(
    data_ls,
    field = "definition",
    keepers = "Primary solid Tumor"
)

In [8]:
# Write the per-project count and sample tables under <data_root>/TCGA/matrix_count_data.
prep_and_save_count_data(tumor_data_ls, "definition", data_root, "TCGA/matrix_count_data")