In [1]:
library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


# Constants

In [2]:
# Root directory holding the unified TCGA + GTEx data and the analysis outputs.
data_dir <- "../../../../../mnt/d/unified_TCGA_GTEx"
# Dataset names; they are used both as path components under data_dir and as
# file-name prefixes for the analysis files read and written below.
dsets <- c("unified_cervical_data")
# file.path() is vectorized over its arguments, so one call builds every
# dataset path without a map()/unlist() round trip.
dset_paths <- file.path(data_dir, dsets)
analysis_dir <- file.path(data_dir, "analysis")
matrisome_list <- file.path(data_dir, "matrisome", "matrisome_hs_masterlist.tsv")
# Index into dsets selecting the dataset processed by this notebook.
dset_idx <- 1

## Co-process matrisome and unified dataset genes

## Load DESeq results

In [6]:
# Read the unfiltered DESeq results table for the selected dataset from the
# analysis directory.
dge_res_informed_df <- paste0(analysis_dir, "/", dsets[dset_idx], "_unfiltered_DESeq_results.tsv") %>%
    read_tsv()
# Keep only genes whose adjusted p-value is below 0.05; filter() also drops
# rows where padj is NA.
sig_dge_res_informed_df <- dge_res_informed_df %>%
    dplyr::filter(padj < 0.05)

Parsed with column specification:
cols(
  geneID = col_character(),
  baseMean = col_double(),
  log2FoldChange = col_double(),
  lfcSE = col_double(),
  stat = col_double(),
  pvalue = col_double(),
  padj = col_double()
)


### Load matrisome list and prep

In [7]:
# Read the matrisome master list. quote = "" turns off quote handling, so quote
# characters inside fields are kept as literal text.
matrisome_df <- read_tsv(matrisome_list, quote = "")
# Normalise the headers to lower snake_case. gsub() replaces every space (sub()
# would stop after the first one), and both calls are vectorized, so the names
# stay a plain character vector rather than a list.
colnames(matrisome_df) <- tolower(gsub(" ", "_", colnames(matrisome_df), fixed = TRUE))
# Put the join key first and drop retired entries.
matrisome_df <- matrisome_df %>%
    select(gene_symbol, everything()) %>%
    dplyr::filter(division != "Retired")    # Ignore "Retired" matrisome genes
head(matrisome_df)
nrow(matrisome_df)

Parsed with column specification:
cols(
  Division = col_character(),
  Category = col_character(),
  `Gene Symbol` = col_character(),
  `Gene Name` = col_character(),
  Synonyms = col_character(),
  HGNC_IDs = col_double(),
  `HGNC_IDs Links` = col_double(),
  UniProt_IDs = col_character(),
  Refseq_IDs = col_character(),
  Orthology = col_character(),
  Notes = col_character()
)


gene_symbol,division,category,gene_name,synonyms,hgnc_ids,hgnc_ids_links,uniprot_ids,refseq_ids,orthology,notes
<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>
ABI3BP,Core matrisome,ECM Glycoproteins,"ABI family, member 3 (NESH) binding protein",FLJ41743|FLJ41754|NESHBP|TARSH,17265,17265,B4DSV9:D3YTG3:E9PPR9:E9PRB5:H0Y897:H0YCG4:H0YCP4:H0YDN0:H0YDW0:H0YEA0:H0YEL2:H0YF18:H0YF57:H7C4H3:H7C4N5:H7C4S3:H7C4T1:H7C4X4:H7C524:H7C556:H7C5S3:Q5JPC9:Q7Z7G0,NP_056244.2:XP_005247340.1,Mouse:Abi3bp|,
ADIPOQ,Core matrisome,ECM Glycoproteins,"adiponectin, C1Q and collagen domain containing",ACDC|ACRP30|ADIPQTL1|ADPN|APM-1|APM1|GBP28|adipone,13633,13633,Q15848,NP_001171271.1:NP_004788.1,Mouse:Adipoq|,
AEBP1,Core matrisome,ECM Glycoproteins,AE binding protein 1,ACLP; FLJ33612,303,303,C9JLQ8:H7C0W8:H7C1J5:H7C391:H7C3D7:H7C4B5:Q8IUX7,NP_001120.3,Mouse:Aebp1|,
AGRN,Core matrisome,ECM Glycoproteins,agrin,FLJ45064,329,329,H0Y5U1:O00468,NP_940978.2:XP_005244806.1:XP_006710696.1,Mouse:Agrn|,
AMBN,Core matrisome,ECM Glycoproteins,ameloblastin (enamel matrix protein),-,452,452,Q9NP70,NP_057603.1,Mouse:Ambn|,
AMELX,Core matrisome,ECM Glycoproteins,"amelogenin (amelogenesis imperfecta 1, X-linked)",AIH1|ALGN|AMG|AMGL|AMGX,461,461,Q99217,NP_001133.1:NP_872621.1:NP_872622.1,Mouse:Amelx|,


### Left Join significant DEGs with matrisome list
- Keep all significant DEGs
- Fill matrisome columns with data from matching matrisome genes (fill with NA if no match)

In [8]:
# Annotate every significant DEG with its matrisome record, if it has one.
# keep = TRUE retains gene_symbol from the matrisome side so that an NA there
# marks a DEG with no matrisome match.
sig_dge_matrisome_left_join_df <- sig_dge_res_informed_df %>%
    left_join(matrisome_df, by = c("geneID" = "gene_symbol"), keep = TRUE) %>%
    mutate(in_matrisome = !is.na(gene_symbol)) %>%
    # Place the flag right after the DESeq columns and drop gene_symbol, which
    # in_matrisome now replaces.
    select(geneID:padj, in_matrisome, everything(), -gene_symbol)

In [9]:
# All significant DEGs must be retained exactly once. A left join never drops
# left-hand rows, so a row-count mismatch means a DEG matched more than one
# matrisome row. Stop instead of merely displaying FALSE, so that an inflated
# table is never carried forward to the counts and the saved files.
if (nrow(sig_dge_res_informed_df) != nrow(sig_dge_matrisome_left_join_df)) {
    stop(
        "Left join changed the number of significant DEG rows; ",
        "check matrisome_df for duplicated gene_symbol values",
        call. = FALSE
    )
}
head(sig_dge_matrisome_left_join_df)

geneID,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,in_matrisome,division,category,gene_name,synonyms,hgnc_ids,hgnc_ids_links,uniprot_ids,refseq_ids,orthology,notes
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>
RADIL,104.931376,-3.023204,1.1379214,-2.656777,0.007889156,0.0374763,False,,,,,,,,,,
AP1M2,3874.620691,6.494125,0.6262721,10.369495,3.4132260000000002e-25,6.901612e-23,False,,,,,,,,,,
KLF1,6.54729,4.035969,1.5879516,2.541619,0.01103403,0.04838536,False,,,,,,,,,,
USHBP1,83.240457,-4.113287,0.7125654,-5.772504,7.810209e-09,2.489566e-07,False,,,,,,,,,,
NKPD1,151.738326,4.604885,1.1473048,4.013654,5.978591e-05,0.0006941963,False,,,,,,,,,,
SLC26A8,7.200758,5.513994,1.8815845,2.930506,0.003384109,0.01933554,False,,,,,,,,,,


### Counts of significant DEGs in matrisome list
- Get significant DEG counts for each category of matrisome list

In [10]:
# Count the significant DEGs in each matrisome category. DEGs with no
# matrisome match have an NA category and are excluded from the table.
matrisome_sig_DEG_category_counts_df <- sig_dge_matrisome_left_join_df %>%
    dplyr::filter(!is.na(category)) %>%
    dplyr::count(category, name = "n_sig_DEGs")
matrisome_sig_DEG_category_counts_df
# Summing the per-category counts gives the number of significant DEG rows
# that matched the matrisome list.
paste0("Total matrisome genes which are significant DEGs: ", sum(matrisome_sig_DEG_category_counts_df[["n_sig_DEGs"]]))

category,n_sig_DEGs
<chr>,<int>
Collagens,8
ECM Glycoproteins,56
ECM Regulators,74
ECM-affiliated Proteins,54
Proteoglycans,10
Secreted Factors,110


### Right join DESeq result data with matrisome list
- Want all matrisome genes listed
- NA in DESeq columns if the matrisome gene was not found in the original unified (TCGA + GTEx) dataset

In [11]:
# Attach DESeq results to every matrisome gene. keep = TRUE retains geneID
# from the DESeq side so that an NA there marks a matrisome gene with no row
# in the DESeq results.
all_dge_matrisome_right_join_df <- dge_res_informed_df %>%
    right_join(matrisome_df, by = c("geneID" = "gene_symbol"), keep = TRUE) %>%
    mutate(in_unified_dataset = !is.na(geneID)) %>%
    select(-geneID) %>%    # Replaced by in_unified_dataset
    # gene_symbol is always populated here, so it takes over the geneID name
    # and leads the table, followed by the DESeq columns and the flag.
    select(geneID = gene_symbol, baseMean:padj, in_unified_dataset, everything())

# Length of joined dataframe should match length of matrisome list. A right
# join keeps every matrisome row, so a mismatch means a matrisome gene matched
# more than one DESeq row. Stop instead of merely displaying FALSE, so that a
# fanned-out table is never saved.
if (nrow(all_dge_matrisome_right_join_df) != nrow(matrisome_df)) {
    stop(
        "Right join changed the number of matrisome rows; ",
        "check dge_res_informed_df for duplicated geneID values",
        call. = FALSE
    )
}
head(all_dge_matrisome_right_join_df)

geneID,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,in_unified_dataset,division,category,gene_name,synonyms,hgnc_ids,hgnc_ids_links,uniprot_ids,refseq_ids,orthology,notes
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>
PGF,1439.047628,1.7961389,1.0761235,1.6690825,0.0951010364,0.239493338,True,Matrisome-associated,Secreted Factors,placental growth factor,D12S1900|PGFL|PLGF|PlGF-2|SHGC-10760,8893,8893,G3XA84:P49763,NP_001193941.1:NP_002623.2:XP_005267818.1,Mouse:Pgf|,
TIMP4,49.5796,-2.5000604,1.5535274,-1.6092799,0.1075551535,0.261608635,True,Matrisome-associated,ECM Regulators,TIMP metallopeptidase inhibitor 4,-,11823,11823,Q99727,NP_003247.1,Mouse:Timp4|,
C1QTNF6,1663.611171,0.8317585,0.8347062,0.9964687,0.3190224832,0.544757491,True,Matrisome-associated,ECM-affiliated Proteins,C1q and tumor necrosis factor related protein 6,CTRP6|ZACRP6,14343,14343,F8WC87:Q9BXI9,NP_114116.3:NP_872292.1:XP_005261382.1:XP_006724188.1,Mouse:C1qtnf6|,
TNC,14887.584519,0.1505524,1.1706007,0.1286113,0.8976652543,0.988316476,True,Core matrisome,ECM Glycoproteins,tenascin C,150-225|GMEM|GP|HXB|JI|MGC167029|TN,5318,5318,E9PC84:F5H5D6:F5H7V9:H0YGZ3:J3QSU6:P24821,NP_002151.2:XP_005252029.1:XP_005252031.1:XP_005252032.1:XP_006717161.1:XP_006717163.1,Mouse:Tnc|,
PRL,4.336559,-4.5906659,2.3620699,-1.9434928,0.0519566391,0.155745433,True,Matrisome-associated,Secreted Factors,prolactin,-,9445,9445,P01236,NP_000939.1:NP_001157030.1,Mouse:Prl|,
OGN,586.195062,-7.1447904,1.8936657,-3.7729946,0.0001612998,0.001601637,True,Core matrisome,Proteoglycans,osteoglycin,DKFZp586P2421|OG|OIF|SLRR3A,8126,8126,P20774:Q5TBF5,NP_054776.1:NP_148935.1,Mouse:Ogn|,


### Matrisome genes missing from RNA-Seq (TCGA + GTEx) dataset?

In [12]:
# TRUE for matrisome genes that had no matching row in the DESeq results.
missing_mask <- !all_dge_matrisome_right_join_df[["in_unified_dataset"]]
# Gene symbols of those unmatched matrisome genes.
missing_matrisome_genes <- all_dge_matrisome_right_join_df[["geneID"]][missing_mask]
paste0("Number of matrisome genes missing from RNA-Seq dataset: ", length(missing_matrisome_genes))
missing_matrisome_genes

## Save Data
- Unfiltered `DESeq` results (already on disk — loaded above, not re-written by the cell below)
- Significant DEGs left-joined with matrisome list
- Matrisome significant DEG counts (by category)
- All unified dataset genes right-joined with matrisome list

In [13]:
# Write each result table to the analysis directory as
# <dataset name><suffix>. The list names are the file-name suffixes, and
# iwalk() visits the entries in order, passing (table, suffix).
list(
    "_stat_sig_DESeq_results_xref_matrisome.tsv" = sig_dge_matrisome_left_join_df,
    "_matrisome_stat_sig_DEG_category_counts.tsv" = matrisome_sig_DEG_category_counts_df,
    "_all_matrisome_xref_unified_data.tsv" = all_dge_matrisome_right_join_df
) %>%
    iwalk(function(tbl, suffix) {
        write_tsv(tbl, paste0(analysis_dir, "/", dsets[dset_idx], suffix))
    })