In [6]:
library(tidyverse)
library(org.Hs.eg.db)


# Custom package
library(rutils)

In [7]:
# Resolve the project's development directories from the shared paths file;
# `dirs$data_dir` is used below as the root for every input and output file.
dirs <- rutils::get_dev_directories(
    dev_paths_file = "../dev_paths.txt"
)
# Names of the per-dataset sub-directories (under `dirs$data_dir`) to process.
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)

In [8]:
# Load the human matrisome master list from the data directory.
matrisome_df <- paste0(dirs$data_dir, "/matrisome/matrisome_hs_masterlist.tsv") %>%
    load_matrisome_df()

Parsed with column specification:
cols(
  Division = col_character(),
  Category = col_character(),
  `Gene Symbol` = col_character(),
  `Gene Name` = col_character(),
  Synonyms = col_character(),
  HGNC_IDs = col_double(),
  `HGNC_IDs Links` = col_double(),
  UniProt_IDs = col_character(),
  Refseq_IDs = col_character(),
  Orthology = col_character(),
  Notes = col_character()
)


In [9]:
# For every unified dataset, build a Hugo symbol -> Entrez ID lookup table from
# the dataset's counts file and write it next to the counts as `hugo2entrez.tsv`.
#
# Entrez IDs reported by org.Hs.eg.db take precedence; the ID shipped in the
# counts file is only kept when the database has no mapping for the symbol.
# An Entrez ID of 0 in the result marks a gene that is still unmapped.
for (dset_idx in seq_along(unified_dsets)) {
    dset_dir <- file.path(dirs$data_dir, unified_dsets[dset_idx])

    gene_map_df <- read_tsv(file.path(dset_dir, "counts.tsv")) %>%
        dplyr::select(Hugo_Symbol, Entrez_Gene_Id) %>%
        dplyr::rename_all(tolower)

    # Look up each distinct symbol once. A symbol can map to several Entrez
    # IDs (the "1:many mapping" message); `multiVals = "first"` keeps the first.
    db_gene_map <- mapIds(
        x = org.Hs.eg.db,
        keys = unique(gene_map_df$hugo_symbol),
        column = "ENTREZID",
        keytype = "SYMBOL",
        multiVals = "first"
    )

    # mapIds() returns a named character vector; store the IDs as integers so
    # they share a type with the IDs parsed from the counts file.
    db_gene_map_df <- tibble::enframe(
        db_gene_map,
        name = "hugo_symbol",
        value = "db_entrez_gene_id"
    ) %>%
        dplyr::mutate(db_entrez_gene_id = as.integer(db_entrez_gene_id))

    # Prefer the org.Hs.eg.db ID; fall back to the ID from the counts file.
    # left_join keeps every gene from the counts file, and because the lookup
    # table has one row per symbol the join cannot duplicate rows.
    gene_map_df <- gene_map_df %>%
        dplyr::left_join(db_gene_map_df, by = "hugo_symbol") %>%
        dplyr::mutate(
            entrez_gene_id = dplyr::coalesce(
                db_entrez_gene_id,
                as.integer(entrez_gene_id)
            )
        ) %>%
        dplyr::select(-db_entrez_gene_id)

    no_entrez_genes <- gene_map_df %>%
        dplyr::filter(entrez_gene_id == 0) %>%
        dplyr::pull(hugo_symbol)
    # A bare expression is not auto-printed inside a loop, so report explicitly.
    message(
        unified_dsets[dset_idx], ": ",
        length(no_entrez_genes), " gene(s) still without an Entrez ID"
    )

    write_tsv(gene_map_df, file.path(dset_dir, "hugo2entrez.tsv"))
}

Parsed with column specification:
cols(
  .default = col_double(),
  Hugo_Symbol = col_character()
)
See spec(...) for full column specifications.
'select()' returned 1:many mapping between keys and columns
Parsed with column specification:
cols(
  .default = col_double(),
  Hugo_Symbol = col_character()
)
See spec(...) for full column specifications.
'select()' returned 1:many mapping between keys and columns
Parsed with column specification:
cols(
  .default = col_double(),
  Hugo_Symbol = col_character()
)
See spec(...) for full column specifications.
'select()' returned 1:many mapping between keys and columns
