In [1]:
library(tidyverse)
library(HDF5Array)
library(SummarizedExperiment)
library(EnsDb.Hsapiens.v86)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Loading required package: DelayedArray
Loading required package: stats4
Loading required package: matrixStats

Attaching package: ‘matrixStats’

The following object is masked from ‘package:dplyr’:

    count

Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:dplyr’:

    co

In [2]:
# TCGA project identifiers handled by this notebook.
projects <- c("TCGA-CESC", "TCGA-OV", "TCGA-UCS", "TCGA-UCEC")
# Root directory of the TCGA data, relative to the working directory.
data_root <- "../../../../../mnt/d/TCGA"

# One "<data_root>/<project>-TCGAbiolinks" path per project, named by project.
# paste0() is vectorised over `projects`, so no sapply() is needed and the
# result is always a character vector (sapply() would return list() if
# `projects` were empty).
project_dirs <- setNames(
    paste0(data_root, "/", projects, "-", "TCGAbiolinks"),
    projects
)

In [3]:
# Load one saved HDF5-backed experiment per project.
#
# Args:
#   dir:   directory passed to loadHDF5SummarizedExperiment().
#   projs: project identifiers; the first "-" of each is replaced by "_" to
#          form both the file prefix and the name of the list element.
#
# Returns: a list of the loaded objects, keyed by the converted project name.
load_RSE_objects <- function(dir, projs) {
    loaded <- list()
    for (idx in seq_along(projs)) {
        # sub() only touches the first hyphen, e.g. "TCGA-OV" -> "TCGA_OV".
        prefix <- sub("-", "_", projs[idx])
        loaded[[prefix]] <- loadHDF5SummarizedExperiment(
            dir = dir,
            prefix = prefix
        )
    }
    loaded
}


# Subset the columns of every element of a named list by a per-column label.
#
# Args:
#   rses:    named list; each element must support `x[[field]]` and
#            `x[, logical_mask]` (presumably SummarizedExperiment objects,
#            where `[[field]]` reads a sample annotation -- confirm with caller).
#   field:   name of the element holding the label to test.
#   keepers: label values to retain.
#
# Returns: a list with the same names as `rses`, each element restricted to
#          the columns whose label is in `keepers`. Elements are visited by
#          name, so an unnamed input yields an empty list.
filter_sample_types <- function(rses, field, keepers) {
    kept <- list()
    for (dataset in names(rses)) {
        rse <- rses[[dataset]]
        keep_cols <- rse[[field]] %in% keepers
        kept[[dataset]] <- rse[, keep_cols]
    }
    kept
}


# Write a per-project count matrix and sample annotation table as TSV files.
#
# For every named element of `rses`, the "HTSeq - Counts" assay is converted
# to a tibble, joined to `hgnc_df` on `ensembl_gene_id`, rows with an empty
# `hgnc_symbol` are dropped, and the counts are summed per symbol. Two files
# are written to "<dest_dir>/tumor_only_matrix_data":
#   <name>_counts.tsv  - one row per HGNC symbol, one column per sample
#   <name>_coldata.tsv - sample_name, condition (always "tumor"), project
#
# Args:
#   rses:     named list of objects accepted by assays().
#   dest_dir: directory under which "tumor_only_matrix_data" is created.
#   hgnc_df:  data frame with columns `ensembl_gene_id` and `hgnc_symbol`.
#
# Returns: NULL, invisibly; called for its side effect of writing files.
combine_data <- function(rses, dest_dir, hgnc_df) {
    out_dir <- paste0(dest_dir, "/", "tumor_only_matrix_data")
    # Create the output directory up front so the first write cannot fail on
    # a missing path; an existing directory is left untouched.
    dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

    for (n in names(rses)) {
        counts_df <- as_tibble(
            assays(rses[[n]])[["HTSeq - Counts"]],
            rownames = "ensembl_gene_id"
        ) %>%
            dplyr::inner_join(hgnc_df, by = "ensembl_gene_id") %>%
            # Symbol first, Ensembl ID dropped: only sample columns remain to
            # be summed.
            dplyr::select(hgnc_symbol, everything(), -ensembl_gene_id) %>%
            dplyr::filter(hgnc_symbol != "") %>%
            dplyr::group_by(hgnc_symbol) %>%
            # Several Ensembl IDs can share one symbol; collapse them by sum.
            dplyr::summarize_all(sum) %>%
            dplyr::ungroup()

        # Every column after `hgnc_symbol` is a sample.
        coldata_df <- tibble(
            sample_name = colnames(counts_df)[-1],
            condition = "tumor",
            project = n
        )

        # The output file is passed positionally: readr renamed this argument
        # from `path` to `file` in 1.4.0, and position works with both.
        write_tsv(counts_df, paste0(out_dir, "/", n, "_counts.tsv"))
        write_tsv(coldata_df, paste0(out_dir, "/", n, "_coldata.tsv"))
    }
    invisible(NULL)
}

In [4]:
# Load the saved experiment objects for every project.
data_ls <- load_RSE_objects(
    dir = file.path(data_root, "saved_RSE_objects"),
    projs = projects
)
# Ensembl gene IDs taken from the CESC row annotations, as a one-column
# tibble (column `value`).
tcga_ensembl_ids <- as_tibble(rowData(data_ls$TCGA_CESC)$ensembl_gene_id)

# Gene match using `EnsDb.Hsapiens.v86`

In [5]:
# Look up the gene symbol for each Ensembl gene ID in EnsDb.Hsapiens.v86 and
# expose the result under the column names expected by combine_data().
edb_hgnc_df <- ensembldb::select(
    EnsDb.Hsapiens.v86,
    keys = tcga_ensembl_ids$value,
    keytype = "GENEID",
    columns = c("SYMBOL", "GENEID")
) %>%
    # select() renames and reorders in one step.
    dplyr::select(ensembl_gene_id = GENEID, hgnc_symbol = SYMBOL)

In [6]:
# Keep only the samples whose `definition` is "Primary solid Tumor".
tumor_data_ls <- filter_sample_types(
    rses = data_ls,
    field = "definition",
    keepers = "Primary solid Tumor"
)

In [8]:
# Write the per-project count and coldata TSV files under `data_root`.
combine_data(
    rses = tumor_data_ls,
    dest_dir = data_root,
    hgnc_df = edb_hgnc_df
)