In [1]:
library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


## Constants

In [95]:
# Root of the data tree; every other location is expressed relative to it.
data_dir <- "../../../../../mnt/d"
# Directory that holds the analysis inputs and receives the outputs.
analysis_dir <- file.path(data_dir, "unified_TCGA_GTEx/analysis")
# Paths below are relative to `data_dir`.
master_list <- "matrisome/matrisome_hs_masterlist.tsv"
DEG_list <- "unified_TCGA_GTEx/analysis/unified_cervical_data_DGE_padj0.05.tsv"

In [74]:
# Load the differential-expression results.
DEG_df <- read_tsv(paste0(data_dir, "/", DEG_list))
# Some cells have double quotes inside of them, so quote handling is disabled.
matrisome_df <- read_tsv(paste0(data_dir, "/", master_list), quote = "")
# Normalise the headers to lower snake_case. `gsub()` replaces every space
# (`sub()` would only replace the first one in each name), and the vectorised
# `tolower()` yields a character vector instead of the list `map()` returns.
colnames(matrisome_df) <- tolower(
    gsub(" ", "_", colnames(matrisome_df), fixed = TRUE)
)
# Put the join key first and drop entries whose division is "Retired".
matrisome_df <- matrisome_df %>%
    select(gene_symbol, everything()) %>%
    dplyr::filter(division != "Retired")

Parsed with column specification:
cols(
  geneID = col_character(),
  baseMean = col_double(),
  log2FoldChange = col_double(),
  lfcSE = col_double(),
  stat = col_double(),
  pvalue = col_double(),
  padj = col_double()
)
Parsed with column specification:
cols(
  Division = col_character(),
  Category = col_character(),
  `Gene Symbol` = col_character(),
  `Gene Name` = col_character(),
  Synonyms = col_character(),
  HGNC_IDs = col_double(),
  `HGNC_IDs Links` = col_double(),
  UniProt_IDs = col_character(),
  Refseq_IDs = col_character(),
  Orthology = col_character(),
  Notes = col_character()
)


In [75]:
# Important for the next step: this must print FALSE, i.e. `division`
# contains no missing values.
anyNA(matrisome_df$division)

In [84]:
# Keep every DEG and attach its matrisome annotation where one exists.
joined_df <- DEG_df %>%
    left_join(matrisome_df, by = c("geneID" = "gene_symbol")) %>%
    # `division` is never NA in the matrisome table (checked beforehand), so
    # an NA here means the gene had no match.
    mutate(in_matrisome = !is.na(division)) %>%
    # Show the membership flag right after the DEG statistics.
    select(geneID:padj, in_matrisome, everything())

In [86]:
# All DEGs have to survive the join, so the row counts must match (TRUE).
identical(nrow(joined_df), nrow(DEG_df))
# Preview the first rows of the annotated table.
head(joined_df, n = 6L)

geneID,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,in_matrisome,division,category,gene_name,synonyms,hgnc_ids,hgnc_ids_links,uniprot_ids,refseq_ids,orthology,notes
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>
RADIL,104.93138,-3.1693618,0.4553155,-6.960803,3.383386e-12,2.413245e-11,False,,,,,,,,,,
AP1M2,3876.50373,2.7277278,0.2544959,10.718161,8.36496e-27,2.526089e-25,False,,,,,,,,,,
TAF1,2646.98788,-0.4107185,0.1451662,-2.829299,0.004665009,0.009166112,False,,,,,,,,,,
KLF1,6.54729,2.9105572,0.5875117,4.954041,7.268772e-07,2.652257e-06,False,,,,,,,,,,
USHBP1,83.24046,-3.7332879,0.2856996,-13.06718,5.071048000000001e-39,3.374224e-37,False,,,,,,,,,,
SGCA,61.36091,-5.3402712,0.4588713,-11.63784,2.646313e-31,1.07102e-29,False,,,,,,,,,,


In [93]:
# Number of DEGs per matrisome category; genes without a category (not in the
# matrisome) are left out.
category_counts_df <- joined_df %>%
    filter(!is.na(category)) %>%
    count(category, name = "n_genes")

category_counts_df

category,n_genes
<chr>,<int>
Collagens,30
ECM Glycoproteins,122
ECM Regulators,150
ECM-affiliated Proteins,104
Proteoglycans,24
Secreted Factors,195


In [96]:
# Persist the annotated DEG table and the per-category summary.
joined_path <- file.path(analysis_dir, "DGE_matrisome_left_join.tsv")
counts_path <- file.path(analysis_dir, "DGE_matrisome_category_counts.tsv")
write_tsv(joined_df, joined_path)
write_tsv(category_counts_df, counts_path)