In [1]:
library(tidyverse)

# Custom package
library(rutils)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


# Constants

In [2]:
# Project directories, read from the dev paths file by the rutils helper
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")
# Dataset names; each one is a sub-directory of dirs$data_dir
dsets <- c("unified_cervical_data")
# file.path() is vectorised over dsets and joins with "/", so no map/unlist is needed
dset_paths <- file.path(dirs$data_dir, dsets)
# Path to the matrisome master list (single assignment; the original repeated the target)
matrisome_list <- file.path(dirs$data_dir, "matrisome", "matrisome_hs_masterlist.tsv")
# Index into dsets of the dataset analysed in this notebook
dset_idx <- 1

In [35]:
# Unfiltered DESeq results table for the selected dataset
dge_res_df <- paste0(dirs$analysis_dir, "/", dsets[dset_idx], "_unfiltered_DESeq_results.tsv") %>%
    read_tsv()

# Count matrix: key genes by `geneID` (was `Hugo_Symbol`) and drop the Entrez ID column
counts_df <- paste0(dirs$data_dir, "/", dsets[dset_idx], "/counts.tsv") %>%
    read_tsv() %>%
    dplyr::rename(geneID = Hugo_Symbol) %>%
    dplyr::select(-Entrez_Gene_Id)

Parsed with column specification:
cols(
  geneID = col_character(),
  baseMean = col_double(),
  log2FoldChange = col_double(),
  lfcSE = col_double(),
  stat = col_double(),
  pvalue = col_double(),
  padj = col_double()
)
Parsed with column specification:
cols(
  .default = col_double(),
  Hugo_Symbol = col_character()
)
See spec(...) for full column specifications.


In [4]:
# Keep genes with adjusted p-value below 0.05 and |log2 fold change| above 1,
# i.e. more than a 2x change in either direction (log2(2) == 1)
sig_dge_res_df <- dplyr::filter(
    dge_res_df,
    padj < 0.05,
    abs(log2FoldChange) > log2(2)
)
nrow(sig_dge_res_df)

# Load and prep matrisome list

In [5]:
# Load the matrisome table from the path in `matrisome_list` via the rutils helper
matrisome_df <- rutils::load_matrisome_df(matrisome_list)
# Preview the first rows and report how many rows the table contains
head(matrisome_df)
nrow(matrisome_df)

Parsed with column specification:
cols(
  Division = col_character(),
  Category = col_character(),
  `Gene Symbol` = col_character(),
  `Gene Name` = col_character(),
  Synonyms = col_character(),
  HGNC_IDs = col_double(),
  `HGNC_IDs Links` = col_double(),
  UniProt_IDs = col_character(),
  Refseq_IDs = col_character(),
  Orthology = col_character(),
  Notes = col_character()
)


gene_symbol,division,category,gene_name,synonyms,hgnc_ids,hgnc_ids_links,uniprot_ids,refseq_ids,orthology,notes
<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>
ABI3BP,Core matrisome,ECM Glycoproteins,"ABI family, member 3 (NESH) binding protein",FLJ41743|FLJ41754|NESHBP|TARSH,17265,17265,B4DSV9:D3YTG3:E9PPR9:E9PRB5:H0Y897:H0YCG4:H0YCP4:H0YDN0:H0YDW0:H0YEA0:H0YEL2:H0YF18:H0YF57:H7C4H3:H7C4N5:H7C4S3:H7C4T1:H7C4X4:H7C524:H7C556:H7C5S3:Q5JPC9:Q7Z7G0,NP_056244.2:XP_005247340.1,Mouse:Abi3bp|,
ADIPOQ,Core matrisome,ECM Glycoproteins,"adiponectin, C1Q and collagen domain containing",ACDC|ACRP30|ADIPQTL1|ADPN|APM-1|APM1|GBP28|adipone,13633,13633,Q15848,NP_001171271.1:NP_004788.1,Mouse:Adipoq|,
AEBP1,Core matrisome,ECM Glycoproteins,AE binding protein 1,ACLP; FLJ33612,303,303,C9JLQ8:H7C0W8:H7C1J5:H7C391:H7C3D7:H7C4B5:Q8IUX7,NP_001120.3,Mouse:Aebp1|,
AGRN,Core matrisome,ECM Glycoproteins,agrin,FLJ45064,329,329,H0Y5U1:O00468,NP_940978.2:XP_005244806.1:XP_006710696.1,Mouse:Agrn|,
AMBN,Core matrisome,ECM Glycoproteins,ameloblastin (enamel matrix protein),-,452,452,Q9NP70,NP_057603.1,Mouse:Ambn|,
AMELX,Core matrisome,ECM Glycoproteins,"amelogenin (amelogenesis imperfecta 1, X-linked)",AIH1|ALGN|AMG|AMGL|AMGX,461,461,Q99217,NP_001133.1:NP_872621.1:NP_872622.1,Mouse:Amelx|,


# Left join significant DEGs with matrisome list

In [12]:
sig_dge_matrisome_lj_df <- sig_dge_res_df %>%
    # keep = TRUE retains the matrisome key column so unmatched genes can be detected
    dplyr::left_join(matrisome_df, by = c(geneID = "gene_symbol"), keep = TRUE) %>%
    # An NA gene_symbol after the join means the gene has no entry in the matrisome list
    dplyr::mutate(in_matrisome = !is.na(gene_symbol)) %>%
    # DGE columns first, then the flag, then the matrisome annotations;
    # gene_symbol is dropped because in_matrisome now carries that information
    dplyr::select(geneID:padj, in_matrisome, everything(), -gene_symbol)

In [15]:
# Sanity check: prints TRUE when the left join added no rows to the significant DEG table
# NOTE(review): a FALSE here would presumably mean duplicated gene symbols in matrisome_df — verify
nrow(sig_dge_matrisome_lj_df) == nrow(sig_dge_res_df)
# Preview the joined table
head(sig_dge_matrisome_lj_df)

geneID,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,in_matrisome,division,category,gene_name,synonyms,hgnc_ids,hgnc_ids_links,uniprot_ids,refseq_ids,orthology,notes
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>
RADIL,104.93138,-3.169362,0.4553518,-6.960249,3.396713e-12,2.224148e-11,False,,,,,,,,,,
AP1M2,3876.50373,2.727729,0.2544612,10.719624,8.233752000000001e-27,2.278681e-25,False,,,,,,,,,,
KLF1,6.54729,2.910308,0.5879601,4.94984,7.42746e-07,2.490286e-06,False,,,,,,,,,,
USHBP1,83.24046,-3.733287,0.2858682,-13.059467,5.6119640000000006e-39,3.405286e-37,False,,,,,,,,,,
SGCA,61.36091,-5.340272,0.4590674,-11.63287,2.805043e-31,1.034525e-29,False,,,,,,,,,,
IFI35,2560.83804,1.251394,0.271408,4.610749,4.01221e-06,1.207698e-05,False,,,,,,,,,,


## Counts of significant DEGs in matrisome list
- Get significant DEG counts for each category of matrisome list

In [28]:
# Number of significant DEGs per matrisome category; rows with an NA category
# (genes without a matrisome entry) are left out of the tally
matrisome_sig_dge_category_counts_df <- sig_dge_matrisome_lj_df %>%
    dplyr::filter(!is.na(category)) %>%
    dplyr::count(category, name = "n_sig_degs")

# Total significant DEGs across all matrisome categories
n_sig_mat_genes <- sum(matrisome_sig_dge_category_counts_df$n_sig_degs)

matrisome_sig_dge_category_counts_df
paste0("Total significant DEGs in matrisome list: ", n_sig_mat_genes)

category,n_sig_degs
<chr>,<int>
Collagens,28
ECM Glycoproteins,113
ECM Regulators,123
ECM-affiliated Proteins,90
Proteoglycans,24
Secreted Factors,181


# Left join matrisome list with all DGE results and track expression/presence in dataset

In [59]:
# Gene symbols present in the counts table for this dataset
all_genes_in_dset <- counts_df$geneID
matrisome_lj_df <- matrisome_df %>%
    # Which matrisome genes were in the dataset?
    # `%in%` already returns TRUE/FALSE (never NA), so no case_when is needed
    mutate(in_unified_dataset = gene_symbol %in% all_genes_in_dset) %>%
    # Which matrisome genes were lowly expressed?
    # keep = TRUE retains geneID, which is NA for genes with no row in the DGE results
    left_join(dge_res_df, by = c("gene_symbol" = "geneID"), keep = TRUE) %>%
    # NOTE(review): genes absent from the dataset presumably also lack a DGE row and so
    # are flagged lowly_expressed too — confirm this is intended
    mutate(lowly_expressed = is.na(geneID)) %>%
    dplyr::select(-geneID) %>%
    # Key and flags first, then matrisome annotations and DGE statistics
    dplyr::select(gene_symbol, in_unified_dataset, lowly_expressed, everything()) %>%
    dplyr::rename(geneID = gene_symbol)

In [60]:
# Preview the matrisome table with its dataset-presence flags and joined DGE statistics
head(matrisome_lj_df)

geneID,in_unified_dataset,lowly_expressed,division,category,gene_name,synonyms,hgnc_ids,hgnc_ids_links,uniprot_ids,refseq_ids,orthology,notes,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
<chr>,<lgl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ABI3BP,True,False,Core matrisome,ECM Glycoproteins,"ABI family, member 3 (NESH) binding protein",FLJ41743|FLJ41754|NESHBP|TARSH,17265,17265,B4DSV9:D3YTG3:E9PPR9:E9PRB5:H0Y897:H0YCG4:H0YCP4:H0YDN0:H0YDW0:H0YEA0:H0YEL2:H0YF18:H0YF57:H7C4H3:H7C4N5:H7C4S3:H7C4T1:H7C4X4:H7C524:H7C556:H7C5S3:Q5JPC9:Q7Z7G0,NP_056244.2:XP_005247340.1,Mouse:Abi3bp|,,681.5641,-3.396137,0.4909099,-6.918045,4.579185e-12,2.956389e-11
ADIPOQ,True,True,Core matrisome,ECM Glycoproteins,"adiponectin, C1Q and collagen domain containing",ACDC|ACRP30|ADIPQTL1|ADPN|APM-1|APM1|GBP28|adipone,13633,13633,Q15848,NP_001171271.1:NP_004788.1,Mouse:Adipoq|,,,,,,,
AEBP1,True,False,Core matrisome,ECM Glycoproteins,AE binding protein 1,ACLP; FLJ33612,303,303,C9JLQ8:H7C0W8:H7C1J5:H7C391:H7C3D7:H7C4B5:Q8IUX7,NP_001120.3,Mouse:Aebp1|,,11665.9238,-2.442388,0.3669784,-6.655399,2.825329e-11,1.658899e-10
AGRN,True,False,Core matrisome,ECM Glycoproteins,agrin,FLJ45064,329,329,H0Y5U1:O00468,NP_940978.2:XP_005244806.1:XP_006710696.1,Mouse:Agrn|,,23197.2582,2.056296,0.2456779,8.369886,5.767408e-17,6.502725e-16
AMBN,True,True,Core matrisome,ECM Glycoproteins,ameloblastin (enamel matrix protein),-,452,452,Q9NP70,NP_057603.1,Mouse:Ambn|,,,,,,,
AMELX,True,True,Core matrisome,ECM Glycoproteins,"amelogenin (amelogenesis imperfecta 1, X-linked)",AIH1|ALGN|AMG|AMGL|AMGX,461,461,Q99217,NP_001133.1:NP_872621.1:NP_872622.1,Mouse:Amelx|,,,,,,,


## Some statistics about the matrisome genes

In [71]:
# Row masks over matrisome_lj_df; the flag columns are already logical,
# so they are used directly instead of being compared with TRUE/FALSE
missing_matrisome_genes_mask <- !matrisome_lj_df$in_unified_dataset
missing_matrisome_genes <- matrisome_lj_df$geneID[missing_matrisome_genes_mask]
le_matrisome_genes_mask <- matrisome_lj_df$lowly_expressed
le_matrisome_genes <- matrisome_lj_df$geneID[le_matrisome_genes_mask]

# Missing matrisome genes
length(missing_matrisome_genes)
missing_matrisome_genes
# Lowly expressed matrisome genes
length(le_matrisome_genes)
le_matrisome_genes

# Proportion of total genes which are DE
nrow(sig_dge_res_df) / nrow(counts_df)
# Proportion of matrisome genes (present in dset) which are DE.
# The denominator counts the mask itself: the mask was built from matrisome_lj_df,
# so using it to index matrisome_df relied on both tables having identical rows
n_sig_mat_genes / sum(!missing_matrisome_genes_mask)

# Save Data

* Significant DEGs left joined with matrisome list
* Matrisome significant DEG counts (by category)
* Matrisome list summary (left joined with DGE results and examined for presence in unified data set)

In [73]:
# Significant DEGs annotated with matrisome membership
write_tsv(
    sig_dge_matrisome_lj_df,
    paste0(dirs$analysis_dir, "/", dsets[dset_idx], "_sig_DESeq_results_xref_matrisome.tsv")
)
# Per-category counts of significant matrisome DEGs
write_tsv(
    matrisome_sig_dge_category_counts_df,
    paste0(dirs$analysis_dir, "/", dsets[dset_idx], "_matrisome_sig_DEG_category_counts.tsv")
)
# Matrisome list with dataset-presence flags and DGE statistics
write_tsv(
    matrisome_lj_df,
    paste0(dirs$analysis_dir, "/", dsets[dset_idx], "_matrisome_summary.tsv")
)