In [2]:
library(tidyverse)
library(HDF5Array)
library(SummarizedExperiment)
library(EnsDb.Hsapiens.v86)
library(biomaRt)

In [3]:
# TCGA project identifiers to process and the root of the local TCGA data.
projects <- c("TCGA-CESC", "TCGA-OV", "TCGA-UCS", "TCGA-UCEC")
data_root <- "../../../../../mnt/d/TCGA"

# One "<data_root>/<project>-TCGAbiolinks" path per project, named by project.
# paste0() is vectorised over `projects`, so no explicit apply call is needed.
project_dirs <- setNames(
    paste0(data_root, "/", projects, "-", "TCGAbiolinks"),
    projects
)

In [4]:
#' Load one saved HDF5-backed SummarizedExperiment per project.
#'
#' @param dir Directory passed to `loadHDF5SummarizedExperiment()` as `dir`.
#' @param projs Character vector of project names. The first "-" in each name
#'   is replaced by "_" to form both the file prefix and the list name.
#' @return A list of loaded objects keyed by the converted project names.
load_RSE_objects <- function(dir, projs) {
    loaded <- list()
    for (proj in projs) {
        # sub() (not gsub()) is used, so only the first hyphen is converted.
        key <- sub("-", "_", proj)
        loaded[[key]] <- loadHDF5SummarizedExperiment(dir = dir, prefix = key)
    }
    loaded
}


#' Keep only the columns whose `field` value is one of `keepers`.
#'
#' @param rses Named list of objects that support `[[field]]` lookup and
#'   two-dimensional `[, mask]` subsetting.
#' @param field Name of the per-column annotation to test.
#' @param keepers Values of `field` to retain.
#' @return A list with the same names as `rses`, each element subset to the
#'   matching columns. An unnamed `rses` yields an empty list.
filter_sample_types <- function(rses, field, keepers) {
    kept <- list()
    for (dataset in names(rses)) {
        rse <- rses[[dataset]]
        is_keeper <- rse[[field]] %in% keepers
        kept[[dataset]] <- rse[, is_keeper]
    }
    kept
}


#' Write per-project gene-symbol count matrices and sample sheets to disk.
#'
#' For each element of `rses`, the "HTSeq - Counts" assay is joined to
#' `hgnc_df` on `ensembl_gene_id`, rows with an empty symbol are dropped, and
#' counts of Ensembl IDs sharing a symbol are summed. Two TSV files are written
#' to `<dest_dir>/tumor_only_matrix_data/`: `<name>_counts.tsv` (one row per
#' symbol, one column per sample) and `<name>_coldata.tsv` (one row per sample
#' with `condition = "tumor"` and `project = <name>`).
#'
#' @param rses Named list of objects whose `assays()` contain "HTSeq - Counts".
#' @param dest_dir Directory under which `tumor_only_matrix_data/` is written;
#'   the sub-directory is created if it does not exist.
#' @param hgnc_df Data frame with `ensembl_gene_id` and `hgnc_symbol` columns.
#' @return `NULL`, invisibly. Called for its file-writing side effect.
combine_data <- function(rses, dest_dir, hgnc_df) {
    out_dir <- file.path(dest_dir, "tumor_only_matrix_data")
    # write_tsv() does not create missing directories, so make sure it exists.
    dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)

    # Exact duplicate mapping rows would fan out the join and double-count.
    gene_map <- dplyr::distinct(hgnc_df)

    for (n in names(rses)) {
        counts_df <- as_tibble(assays(rses[[n]])[["HTSeq - Counts"]], rownames = "ensembl_gene_id") %>%
            dplyr::inner_join(gene_map, by = "ensembl_gene_id") %>%
            dplyr::select(hgnc_symbol, everything()) %>%
            dplyr::select(-ensembl_gene_id) %>%
            dplyr::filter(hgnc_symbol != "") %>%
            dplyr::group_by(hgnc_symbol) %>%
            summarize_all(sum) %>%
            ungroup()
        # The first column is hgnc_symbol; every remaining column is a sample.
        coldata_df <- tibble(
            sample_name = colnames(counts_df)[-1],
            condition = "tumor",
            project = n
        )
        # The output file is passed positionally: readr renamed this argument
        # from `path` to `file`, and the position works with both.
        write_tsv(counts_df, file.path(out_dir, paste0(n, "_counts.tsv")))
        write_tsv(coldata_df, file.path(out_dir, paste0(n, "_coldata.tsv")))
    }
    invisible(NULL)
}

In [5]:
# Load every project's saved object from "<data_root>/saved_RSE_objects".
data_ls <- load_RSE_objects(
    dir = paste0(data_root, "/", "saved_RSE_objects"),
    projs = projects
)
# Ensembl gene IDs of the CESC object as a one-column tibble; as_tibble() on a
# bare vector names that column `value`.
tcga_ensembl_ids <- as_tibble(rowData(data_ls[["TCGA_CESC"]])[["ensembl_gene_id"]])

In [90]:
# Peek at the raw CESC "HTSeq - Counts" assay (Ensembl gene rows x sample columns).
assays(data_ls$TCGA_CESC)[["HTSeq - Counts"]]

<56493 x 309> matrix of class DelayedMatrix and type "double":
                TCGA-JX-A3Q8-01A-11R-A21T-07 ... TCGA-JW-A5VK-01A-11R-A28H-07
ENSG00000000003                         9424   .                         3515
ENSG00000000005                            4   .                            1
ENSG00000000419                         3103   .                         3715
ENSG00000000457                          842   .                          606
ENSG00000000460                          772   .                          685
            ...                            .   .                            .
ENSG00000281904                            2   .                            0
ENSG00000281909                            0   .                            1
ENSG00000281910                            0   .                            0
ENSG00000281912                          164   .                          122
ENSG00000281920                            3   .                           10

# Gene match using `EnsDb.Hsapiens.v86`

In [6]:
# Look up the gene symbol for each TCGA Ensembl gene ID in EnsDb.Hsapiens.v86,
# then rename and reorder the columns to (ensembl_gene_id, hgnc_symbol) so the
# table can be joined against the count matrices.
edb_hgnc_df <- ensembldb::select(
    EnsDb.Hsapiens.v86,
    keys = tcga_ensembl_ids$value,
    keytype = "GENEID",
    columns = c("SYMBOL", "GENEID")
) %>%
    dplyr::select(ensembl_gene_id = GENEID, hgnc_symbol = SYMBOL)

In [13]:
# Number of distinct gene symbols returned by the EnsDb lookup.
length(unique(edb_hgnc_df$hgnc_symbol))

In [10]:
# Display the Ensembl ID -> gene symbol mapping table.
edb_hgnc_df

ensembl_gene_id,hgnc_symbol
<chr>,<chr>
ENSG00000000003,TSPAN6
ENSG00000000005,TNMD
ENSG00000000419,DPM1
ENSG00000000457,SCYL3
ENSG00000000460,C1orf112
ENSG00000000938,FGR
ENSG00000000971,CFH
ENSG00000001036,FUCA2
ENSG00000001084,GCLC
ENSG00000001167,NFYA


In [6]:
# Keep only samples whose `definition` annotation is "Primary solid Tumor".
tumor_data_ls <- filter_sample_types(
    rses = data_ls,
    field = "definition",
    keepers = "Primary solid Tumor"
)

In [8]:
# Write the tumour-only count matrices and sample sheets under `data_root`,
# using the EnsDb mapping to collapse Ensembl IDs to gene symbols.
combine_data(
    rses = tumor_data_ls,
    dest_dir = data_root,
    hgnc_df = edb_hgnc_df
)

In [50]:
# Put the TCGA-provided gene annotation next to the EnsDb lookup result.
# The EnsDb columns are attached by position (not joined), so the rows of
# `edb_hgnc_df` must line up with the rows of the CESC rowData.
res <- rowData(data_ls$TCGA_CESC) %>%
    as_tibble() %>%
    dplyr::transmute(
        ensembl_gene_id,
        external_gene_name,
        edbres_gene_id = edb_hgnc_df$ensembl_gene_id,
        edbres_gene_name = edb_hgnc_df$hgnc_symbol
    )

In [52]:
# First rows of the side-by-side TCGA / EnsDb annotation table.
head(res)

ensembl_gene_id,external_gene_name,edbres_gene_id,edbres_gene_name
<chr>,<chr>,<chr>,<chr>
ENSG00000000003,TSPAN6,ENSG00000000003,TSPAN6
ENSG00000000005,TNMD,ENSG00000000005,TNMD
ENSG00000000419,DPM1,ENSG00000000419,DPM1
ENSG00000000457,SCYL3,ENSG00000000457,SCYL3
ENSG00000000460,C1orf112,ENSG00000000460,C1orf112
ENSG00000000938,FGR,ENSG00000000938,FGR


In [54]:
# Sanity check: every row's TCGA Ensembl ID equals the EnsDb ID placed beside
# it, i.e. the positional column binding used to build `res` is aligned.
all(res$ensembl_gene_id == res$edbres_gene_id)

In [56]:
# Number of genes whose TCGA-provided name differs from the EnsDb symbol.
# Note: the result is NA if either column contains a missing value.
sum(res$external_gene_name != res$edbres_gene_name)

In [60]:
# Total number of genes (rows) in the comparison table.
nrow(res)

In [58]:
# Number of distinct TCGA-provided gene names; compare with nrow(res) to see
# how many names are shared by several Ensembl IDs.
length(unique(res$external_gene_name))

In [59]:
# Number of distinct EnsDb gene symbols, for the same comparison.
length(unique(res$edbres_gene_name))

# Check against matrisome genes

In [65]:
# Read the human matrisome master list. quote = "" turns off quote handling,
# so quote characters in the file are read as literal text.
matrisome_df <- read_tsv(
    file.path(
        data_root, "..", "unified_TCGA_GTEx_data", "matrisome",
        "matrisome_hs_masterlist.tsv"
    ),
    quote = ""
)

Parsed with column specification:
cols(
  Division = col_character(),
  Category = col_character(),
  `Gene Symbol` = col_character(),
  `Gene Name` = col_character(),
  Synonyms = col_character(),
  HGNC_IDs = col_double(),
  `HGNC_IDs Links` = col_double(),
  UniProt_IDs = col_character(),
  Refseq_IDs = col_character(),
  Orthology = col_character(),
  Notes = col_character()
)


In [75]:
# Display the matrisome master list.
matrisome_df

division,category,gene_symbol,gene_name,synonyms,hgnc_ids,hgnc_ids_links,uniprot_ids,refseq_ids,orthology,notes
<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>
Core matrisome,ECM Glycoproteins,ABI3BP,"ABI family, member 3 (NESH) binding protein",FLJ41743|FLJ41754|NESHBP|TARSH,17265,17265,B4DSV9:D3YTG3:E9PPR9:E9PRB5:H0Y897:H0YCG4:H0YCP4:H0YDN0:H0YDW0:H0YEA0:H0YEL2:H0YF18:H0YF57:H7C4H3:H7C4N5:H7C4S3:H7C4T1:H7C4X4:H7C524:H7C556:H7C5S3:Q5JPC9:Q7Z7G0,NP_056244.2:XP_005247340.1,Mouse:Abi3bp|,
Core matrisome,ECM Glycoproteins,ADIPOQ,"adiponectin, C1Q and collagen domain containing",ACDC|ACRP30|ADIPQTL1|ADPN|APM-1|APM1|GBP28|adipone,13633,13633,Q15848,NP_001171271.1:NP_004788.1,Mouse:Adipoq|,
Core matrisome,ECM Glycoproteins,AEBP1,AE binding protein 1,ACLP; FLJ33612,303,303,C9JLQ8:H7C0W8:H7C1J5:H7C391:H7C3D7:H7C4B5:Q8IUX7,NP_001120.3,Mouse:Aebp1|,
Core matrisome,ECM Glycoproteins,AGRN,agrin,FLJ45064,329,329,H0Y5U1:O00468,NP_940978.2:XP_005244806.1:XP_006710696.1,Mouse:Agrn|,
Core matrisome,ECM Glycoproteins,AMBN,ameloblastin (enamel matrix protein),-,452,452,Q9NP70,NP_057603.1,Mouse:Ambn|,
Core matrisome,ECM Glycoproteins,AMELX,"amelogenin (amelogenesis imperfecta 1, X-linked)",AIH1|ALGN|AMG|AMGL|AMGX,461,461,Q99217,NP_001133.1:NP_872621.1:NP_872622.1,Mouse:Amelx|,
Core matrisome,ECM Glycoproteins,AMELY,"amelogenin, Y-linked",AMGL|AMGY,462,462,J3KPK5:Q99218,NP_001134.1,,
Core matrisome,ECM Glycoproteins,BGLAP,bone gamma-carboxyglutamate (gla) protein,BGP|OC|PMF1,1043,1043,P02818,NP_954642.1,Mouse:Bglap2|,
Core matrisome,ECM Glycoproteins,BMPER,BMP binding endothelial regulator,CRIM3|CV-2|CV2,24154,24154,C9JY72:F8WDG9:G5E9K4:Q8N8U9,NP_597725.1,Mouse:Bmper|,Growth Factor-binding
Core matrisome,ECM Glycoproteins,BSPH1,binder of sperm protein homolog 1,BSP1|ELSPBP2,33906,33906,Q075Z2,NP_001121798.1,Mouse:Bsph1|,


In [78]:
# Normalise column names to lower snake_case (e.g. "Gene Symbol" ->
# "gene_symbol"). tolower() and gsub() are vectorised and return a character
# vector, which is what `colnames<-` expects; gsub() also handles names that
# contain more than one space.
colnames(matrisome_df) <- tolower(gsub(" ", "_", colnames(matrisome_df), fixed = TRUE))
# Drop entries whose division is "Retired" (filter() also drops NA divisions).
matrisome_df <- matrisome_df %>%
    dplyr::filter(division != "Retired")
dim(matrisome_df)

In [79]:
# How many matrisome gene symbols appear among the TCGA-provided gene names.
sum(matrisome_df$gene_symbol %in% res$external_gene_name)

In [80]:
# How many matrisome gene symbols appear among the EnsDb-derived symbols.
sum(matrisome_df$gene_symbol %in% res$edbres_gene_name)

In [81]:
# NOTE(review): presumably the number of matrisome genes (1027) minus those
# matched via external_gene_name (1012), hard-coded from earlier cell outputs
# that are not shown here. TODO confirm and compute from the data instead.
1027 - 1012

In [82]:
# NOTE(review): presumably the number of matrisome genes (1027) minus those
# matched via the EnsDb symbols (1004), hard-coded from earlier cell outputs
# that are not shown here. TODO confirm and compute from the data instead.
1027 - 1004

# Are any of the duplicated gene symbols matrisome genes?

In [97]:
# For each repeated TCGA-provided gene name (second and later occurrences),
# report whether it is a matrisome gene symbol. This prints one TRUE/FALSE per
# duplicate rather than the names themselves.
res$external_gene_name[duplicated(res$external_gene_name)] %in% matrisome_df$gene_symbol

In [103]:
# Repeated EnsDb symbols (second and later occurrences only).
edb_query_dups <- res %>%
    dplyr::filter(duplicated(edbres_gene_name)) %>%
    dplyr::pull(edbres_gene_name)
# Flag the repeated symbols that are matrisome genes, then list them.
msk <- is.element(edb_query_dups, matrisome_df$gene_symbol)
edb_query_dups[msk]