In [54]:
library(tidyverse)
library(matrixStats)

# Custom package
library(rutils)


Attaching package: ‘matrixStats’

The following object is masked from ‘package:dplyr’:

    count



In [75]:
# Resolve the project directories from the shared dev paths file
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")

# DESeq results for the unfiltered unified cervical data
dge_res_df <- paste0(dirs$analysis_dir, "/unified_cervical_data_unfiltered_DESeq_results.tsv") %>%
    read_tsv()

# Count table: drop the Entrez ID column and round every numeric column
# to whole numbers
counts_df <- paste0(dirs$data_dir, "/unified_cervical_data/counts.tsv") %>%
    read_tsv() %>%
    dplyr::select(-Entrez_Gene_Id) %>%
    mutate_if(is.numeric, round, digits = 0)

# Sample metadata (sample_name, condition, data_source)
coldata_df <- paste0(dirs$data_dir, "/unified_cervical_data/coldata.tsv") %>%
    read_tsv()

# Only the gene symbols of the matrisome master list are needed here
matrisome_genes_df <- dplyr::select(
    rutils::load_matrisome_df(paste0(dirs$data_dir, "/matrisome/matrisome_hs_masterlist.tsv")),
    gene_symbol
)

Parsed with column specification:
cols(
  geneID = col_character(),
  baseMean = col_double(),
  log2FoldChange = col_double(),
  lfcSE = col_double(),
  stat = col_double(),
  pvalue = col_double(),
  padj = col_double()
)
Parsed with column specification:
cols(
  .default = col_double(),
  Hugo_Symbol = col_character()
)
See spec(...) for full column specifications.
Parsed with column specification:
cols(
  sample_name = col_character(),
  condition = col_character(),
  data_source = col_character()
)
Parsed with column specification:
cols(
  Division = col_character(),
  Category = col_character(),
  `Gene Symbol` = col_character(),
  `Gene Name` = col_character(),
  Synonyms = col_character(),
  HGNC_IDs = col_double(),
  `HGNC_IDs Links` = col_double(),
  UniProt_IDs = col_character(),
  Refseq_IDs = col_character(),
  Orthology = col_character(),
  Notes = col_character()
)


In [33]:
# The figures show a fairly clear break at |LFC| = 15, so every gene whose
# absolute log2 fold change exceeds 15 is treated as an LFC outlier.
# Only the gene identifier column is kept.
lfc_outlier_genes_df <- dplyr::select(
    dplyr::filter(dge_res_df, abs(log2FoldChange) > 15),
    geneID
)

In [46]:
# Number of rows of the full count table that match an LFC outlier gene
counts_df %>%
    inner_join(lfc_outlier_genes_df, by = c("Hugo_Symbol" = "geneID")) %>%
    nrow()

In [100]:
# NOTE: despite its name, this mask is TRUE for the genes to KEEP, i.e.
# genes with a nonzero count in at least one sample (first column is the
# gene symbol, so it is excluded from the row sums)
unexpressed_mask <- rowSums(counts_df[, -1] > 0) != 0
expressed_counts_df <- counts_df[unexpressed_mask, ]

# n genes removed (rows where the mask is FALSE)
sum(!unexpressed_mask)

In [47]:
# Number of LFC outlier rows still present after dropping all-zero genes
expressed_counts_df %>%
    inner_join(lfc_outlier_genes_df, by = c("Hugo_Symbol" = "geneID")) %>%
    nrow()

In [101]:
# Median taken over every individual count in the expressed-gene table
expr_med <- expressed_counts_df[, -1] %>%
    as.matrix() %>%
    median()

# NOTE: despite its name, this mask is TRUE for the genes to KEEP. A gene is
# dropped when the number of samples with a count at or below the median
# exceeds 95% of the sample count (taken from the rows of coldata_df).
lowly_expressed_mask <- rowSums(expressed_counts_df[, -1] <= expr_med) <= (0.95 * nrow(coldata_df))
prefiltered_counts_df <- expressed_counts_df[lowly_expressed_mask, ]

# n genes removed (rows where the mask is FALSE)
sum(!lowly_expressed_mask)

In [102]:
# Matrisome genes that are LFC outliers and survive both prefiltering steps
prefiltered_counts_df %>%
    dplyr::filter(Hugo_Symbol %in% matrisome_genes_df$gene_symbol) %>%
    inner_join(lfc_outlier_genes_df, by = c("Hugo_Symbol" = "geneID"))

Hugo_Symbol,GTEX-S32W-1626-SM-4AD6G,GTEX-S32W-1526-SM-4AD6Z,GTEX-T5JW-0726-SM-4DM6D,GTEX-TSE9-2826-SM-4DXTF,GTEX-TSE9-2726-SM-4DXSQ,GTEX-TML8-0726-SM-4DXTT,GTEX-S341-1126-SM-4AD6T,GTEX-T6MO-1426-SM-4DM73,GTEX-S4UY-1426-SM-4AD6Y,⋯,TCGA-VS-A9UV-01A-11R-A42T-07,TCGA-MA-AA43-01A-11R-A42T-07,TCGA-VS-A9UJ-01A-11R-A42T-07,TCGA-C5-A7CM-01A-11R-A33Z-07,TCGA-EA-A3QD-01A-32R-A22U-07,TCGA-EA-A3HR-01A-11R-A213-07,TCGA-VS-A8EK-01A-12R-A37O-07,TCGA-VS-A9UM-01A-11R-A42T-07,TCGA-C5-A1MN-01A-11R-A14Y-07,TCGA-Q1-A6DT-01A-11R-A32P-07
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
RPTN,8,6,718,1,1,0,8,0,6,⋯,0,894,18,0,21,210,95,2,677,0
CRNN,54,39,91780,51,46,37,42,32,52,⋯,6,12704,0,2,228,7978,237,571,49,2
EPGN,0,0,272,1,0,1,4,0,2,⋯,72,213,3,0,0,27,4,18,6,2125


In [120]:
# Per-gene variance across all samples, computed on the unfiltered count
# table (first column is the gene symbol and is excluded)
expr_vars <- counts_df[, -1] %>%
    as.matrix() %>%
    rowVars()

# Attach the variances and move them next to the gene symbol
var_df <- dplyr::select(
    mutate(counts_df, expr_vars = expr_vars),
    Hugo_Symbol, expr_vars, everything()
)

In [121]:
# Variances and counts restricted to the LFC outlier genes
inner_join(var_df, lfc_outlier_genes_df, by = c("Hugo_Symbol" = "geneID"))

Hugo_Symbol,expr_vars,GTEX-S32W-1626-SM-4AD6G,GTEX-S32W-1526-SM-4AD6Z,GTEX-T5JW-0726-SM-4DM6D,GTEX-TSE9-2826-SM-4DXTF,GTEX-TSE9-2726-SM-4DXSQ,GTEX-TML8-0726-SM-4DXTT,GTEX-S341-1126-SM-4AD6T,GTEX-T6MO-1426-SM-4DM73,⋯,TCGA-VS-A9UV-01A-11R-A42T-07,TCGA-MA-AA43-01A-11R-A42T-07,TCGA-VS-A9UJ-01A-11R-A42T-07,TCGA-C5-A7CM-01A-11R-A33Z-07,TCGA-EA-A3QD-01A-32R-A22U-07,TCGA-EA-A3HR-01A-11R-A213-07,TCGA-VS-A8EK-01A-12R-A37O-07,TCGA-VS-A9UM-01A-11R-A42T-07,TCGA-C5-A1MN-01A-11R-A14Y-07,TCGA-Q1-A6DT-01A-11R-A32P-07
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
FGB,2482407.399,60,48,3,21,34,18,264,1,⋯,0,6,21,5,0,56,34,0,0,2
CRCT1,6017400.672,3,4,10333,3,4,6,2,5,⋯,133,1529,3,2,60,1537,619,239,404,83
FGG,150281.100,66,64,1,21,27,8,258,1,⋯,0,0,0,0,3,12,14,0,0,1
PLA2G3,242415.807,0,3,728,0,2,0,1,18,⋯,825,179,105,5,229,131,123,66,179,0
AL353354.2,2001.578,0,0,8,3,0,6,0,0,⋯,0,0,0,0,267,0,0,0,0,0
TMPRSS11BNL,5502.471,0,0,287,0,0,0,0,0,⋯,28,7,0,1,0,162,5,20,6,0
PGLYRP4,409251.696,1,0,1115,2,1,1,0,0,⋯,206,198,7,18,43,1232,490,438,231,88
GP2,161295.004,68,65,57,54,62,30,77,101,⋯,0,17,0,1884,0,0,0,9,0,0
CDX2,2581623.260,1,0,1,0,3,0,1,0,⋯,0,0,0,3,1,0,0,1,0,0
RPTN,1101781.016,8,6,718,1,1,0,8,0,⋯,0,894,18,0,21,210,95,2,677,0


In [123]:
# Metadata rows for the samples whose condition is "healthy"
healthy_coldata_df <- dplyr::filter(coldata_df, condition == "healthy")

In [125]:
# Count table reduced to the gene symbol plus the healthy sample columns
healthy_counts_df <- dplyr::select(
    counts_df,
    c("Hugo_Symbol", healthy_coldata_df$sample_name)
)

In [128]:
# Healthy-sample counts restricted to the LFC outlier genes
healthy_lfc_outlier_counts_df <- inner_join(
    healthy_counts_df,
    lfc_outlier_genes_df,
    by = c("Hugo_Symbol" = "geneID")
)

In [132]:
# Preview the first rows, then report the table dimensions
healthy_lfc_outlier_counts_df %>% head()
healthy_lfc_outlier_counts_df %>% dim()

Hugo_Symbol,GTEX-S32W-1626-SM-4AD6G,GTEX-S32W-1526-SM-4AD6Z,GTEX-T5JW-0726-SM-4DM6D,GTEX-TSE9-2826-SM-4DXTF,GTEX-TSE9-2726-SM-4DXSQ,GTEX-TML8-0726-SM-4DXTT,GTEX-S341-1126-SM-4AD6T,GTEX-T6MO-1426-SM-4DM73,GTEX-S4UY-1426-SM-4AD6Y,GTEX-U3ZN-1626-SM-4DXTZ,GTEX-S341-1326-SM-4AD72,TCGA-HM-A3JJ-11A-12R-A21T-07,TCGA-FU-A3EO-11A-13R-A213-07
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
FGB,60,48,3,21,34,18,264,1,85,22,52,0,0
CRCT1,3,4,10333,3,4,6,2,5,17,34165,5,0,0
FGG,66,64,1,21,27,8,258,1,85,26,45,0,0
PLA2G3,0,3,728,0,2,0,1,18,1,1546,8,0,0
AL353354.2,0,0,8,3,0,6,0,0,0,4,0,0,0
TMPRSS11BNL,0,0,287,0,0,0,0,0,0,244,0,0,0


In [135]:
# Per-sample total count over the LFC outlier genes (healthy samples only)
healthy_lfc_outlier_counts_df[, -1] %>%
    as.matrix() %>%
    colSums()

In [149]:
# Of our outlier genes, how many have zero counts in every sample of
# healthy_lfc_outlier_counts_df, i.e. across all healthy samples from
# both sources (GTEx + TCGA)
sum(
    (healthy_lfc_outlier_counts_df[, -1] %>% as.matrix() %>% rowSums()) == 0
)

In [145]:
# Genes with zero counts in every healthy TCGA sample.
# The row sums run over all selected TCGA columns (everything except the
# leading Hugo_Symbol column) instead of the fixed positions 2:3, so the
# result stays correct if the number of healthy TCGA samples changes.
healthy_counts_df %>%
    dplyr::select(Hugo_Symbol, starts_with("TCGA")) %>%
    mutate(rsums = rowSums(.[, -1])) %>%
    dplyr::filter(rsums == 0) %>%
    # Show the gene symbol and its row sum first
    dplyr::select(Hugo_Symbol, rsums, everything())

Hugo_Symbol,rsums,TCGA-HM-A3JJ-11A-12R-A21T-07,TCGA-FU-A3EO-11A-13R-A213-07
<chr>,<dbl>,<dbl>,<dbl>
SLC26A8,0,0,0
OR5H14,0,0,0
KLHL1,0,0,0
OR11G2,0,0,0
OR5A2,0,0,0
NMS,0,0,0
LL22NC03-63E9.3,0,0,0
SPERT,0,0,0
CFHR2,0,0,0
OR51B5,0,0,0
