# In [118]:
library(tidyverse)
library(HDF5Array)
library(BiocParallel)
library(TCGAbiolinks)
library(SummarizedExperiment)
library(DESeq2)
library(biomaRt)

# In [1]:
# TCGA project identifiers processed by this script.
projects <- c("TCGA-CESC", "TCGA-OV", "TCGA-UCS", "TCGA-UCEC")
# Root directory of the TCGA data (relative to the working directory).
data_root <- "../../../../../mnt/d/TCGA"

# Per-project directory "<data_root>/<project>-TCGAbiolinks", named by project.
# vapply() is used instead of sapply() so the result is always a character
# vector; sapply() would return an empty list if `projects` were empty.
project_dirs <- vapply(
    projects,
    function(p) file.path(data_root, paste0(p, "-TCGAbiolinks")),
    character(1)
)

# Select the human gene dataset on the "ensembl" BioMart.
mart <- useDataset(
    dataset = "hsapiens_gene_ensembl",
    mart = useMart("ensembl")
)

# Query the Ensembl gene id -> HGNC symbol mapping for the ids in `genes`.
# NOTE(review): `genes` is not defined in the visible part of this script;
# confirm it exists in the session before this statement runs.
hgnc_df <- getBM(
    attributes = c("ensembl_gene_id", "hgnc_symbol"),
    filters = "ensembl_gene_id",
    values = genes,
    mart = mart
)

## Functions

# In [262]:
# Load one saved HDF5-backed SummarizedExperiment per project.
#
# Args:
#   dir:   directory passed to loadHDF5SummarizedExperiment().
#   projs: character vector of project names. The first "-" of each name is
#          replaced by "_" to form the file prefix.
#
# Returns: a list of the loaded objects, named by the derived prefixes.
load_RSE_objects <- function(dir, projs) {
    loaded <- list()
    for (proj in projs) {
        prefix <- sub("-", "_", proj)
        loaded[[prefix]] <- loadHDF5SummarizedExperiment(
            dir = dir,
            prefix = prefix
        )
    }
    loaded
}


# Keep only the columns (samples) whose `field` value is listed in `keepers`.
#
# Args:
#   rses:    named list of objects that support `x[[field]]` and `x[, mask]`.
#   field:   name of the per-sample field to test.
#   keepers: values of `field` to retain.
#
# Returns: a list with the same names as `rses`, each element column-subset.
filter_sample_types <- function(rses, field, keepers) {
    kept <- list()
    for (dset in names(rses)) {
        rse <- rses[[dset]]
        kept[[dset]] <- rse[, rse[[field]] %in% keepers]
    }
    kept
}


# Write per-dataset gene-symbol count tables and sample annotation tables.
#
# For every named element of `rses`, the "HTSeq - Counts" assay is joined to
# `hgnc_df` on `ensembl_gene_id`, rows with an empty `hgnc_symbol` are dropped,
# and the counts are summed per symbol. Two TSV files are written to
# "<dest_dir>/tumor_only_matrix_data/":
#   "<name>_counts.tsv"  - one row per HGNC symbol, one column per sample
#   "<name>_coldata.tsv" - one row per sample: sample_name, condition, project
#
# Args:
#   rses:     named list of objects whose assays() contain "HTSeq - Counts".
#   dest_dir: directory under which "tumor_only_matrix_data" is written.
#   hgnc_df:  data frame with columns `ensembl_gene_id` and `hgnc_symbol`.
#
# Returns: NULL, invisibly; the function is called for its file output.
combine_data <- function(rses, dest_dir, hgnc_df) {
    out_dir <- file.path(dest_dir, "tumor_only_matrix_data")
    # Make sure the output directory exists so the first write cannot fail on
    # a fresh `dest_dir`; an already existing directory is left untouched.
    dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)

    for (n in names(rses)) {
        counts_df <- as_tibble(
            assays(rses[[n]])[["HTSeq - Counts"]],
            rownames = "ensembl_gene_id"
        ) %>%
            dplyr::inner_join(hgnc_df, by = "ensembl_gene_id") %>%
            dplyr::select(hgnc_symbol, everything(), -ensembl_gene_id) %>%
            dplyr::filter(hgnc_symbol != "") %>%
            dplyr::group_by(hgnc_symbol) %>%
            summarize_all(sum) %>%
            ungroup()

        # Every column after `hgnc_symbol` is a sample.
        coldata_df <- tibble(
            sample_name = colnames(counts_df)[-1],
            condition = "tumor",
            project = n
        )

        # The output path is passed positionally: readr 1.4.0 renamed the
        # second argument of write_tsv() from `path` to `file`, and the
        # positional form works with both.
        write_tsv(counts_df, file.path(out_dir, paste0(n, "_counts.tsv")))
        write_tsv(coldata_df, file.path(out_dir, paste0(n, "_coldata.tsv")))
    }
    invisible(NULL)
}

# In [41]:
# Load the saved object of every project from "<data_root>/saved_RSE_objects".
data_ls <- load_RSE_objects(
    dir = file.path(data_root, "saved_RSE_objects"),
    projs = projects
)

# In [63]:
# Keep only the samples whose "definition" field is "Primary solid Tumor".
tumor_data_ls <- filter_sample_types(
    rses = data_ls,
    field = "definition",
    keepers = "Primary solid Tumor"
)

# In [263]:
# Write the count and sample tables of the tumor-only data under `data_root`.
combine_data(
    rses = tumor_data_ls,
    dest_dir = data_root,
    hgnc_df = hgnc_df
)