In [1]:
library(tidyverse)

# Custom package
library(rutils)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


# Constants

In [2]:
# Directory lookup from the custom rutils package; only its `data_dir`
# element is used in this notebook.
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")

# Dataset folder names, resolved against dirs$data_dir.
dsets <- c("unified_cervical_data")
dset_paths <- file.path(dirs$data_dir, dsets)

# Path to the matrisome master list (TSV).
matrisome_list <- file.path(
    dirs$data_dir, "matrisome", "matrisome_hs_masterlist.tsv"
)

# Index into `dsets` / `dset_paths` selecting the dataset to analyse.
dset_idx <- 1

In [28]:
# Load the gene-level counts for the selected dataset: key the table by
# `geneID` (originally `Hugo_Symbol`), drop the Entrez identifier column, and
# round every numeric column to whole numbers.
counts_df <- read_tsv(file.path(dset_paths[dset_idx], "counts.tsv")) %>%
    dplyr::rename(geneID = Hugo_Symbol) %>%
    dplyr::select(-Entrez_Gene_Id) %>%
    mutate(across(where(is.numeric), ~ round(.x, digits = 0)))

# Per-sample annotations for the same dataset (sample_name, condition,
# data_source).
coldata_df <- read_tsv(file.path(dset_paths[dset_idx], "coldata.tsv"))

Parsed with column specification:
cols(
  .default = col_double(),
  Hugo_Symbol = col_character()
)
See spec(...) for full column specifications.
Parsed with column specification:
cols(
  sample_name = col_character(),
  condition = col_character(),
  data_source = col_character()
)


# Two-proportion Z-test

In [29]:
# Names of the healthy samples, collected separately for each data source.
gtex_names <- coldata_df %>%
    dplyr::filter(condition == "healthy", data_source == "GTEx") %>%
    dplyr::pull(sample_name)
healthy_tcga_names <- coldata_df %>%
    dplyr::filter(condition == "healthy", data_source == "TCGA") %>%
    dplyr::pull(sample_name)

In [56]:
# Number of genes (rows) in the counts table.
n <- nrow(counts_df)
# healthy_gtex_df <- counts_df[, c("geneID", gtex_names)]

# Genes whose counts sum to zero across all healthy TCGA samples.
healthy_tcga_df <- counts_df[, c("geneID", healthy_tcga_names)]
healthy_tcga_zeros <- sum(rowSums(healthy_tcga_df[, -1]) == 0)

# Estimate the matching zero-count figure for GTEx by repeatedly drawing a
# small random subset of healthy GTEx samples and counting all-zero genes.
# NOTE(review): the subset size is hard-coded to 2; presumably it is meant to
# mirror the number of healthy TCGA samples -- confirm.
# NOTE(review): no set.seed() is visible in this notebook, so the estimate can
# differ slightly between runs.
n_resamples <- 1e4
# Preallocate instead of growing the vector with c() on every iteration.
healthy_gtex_est_zeros_vec <- integer(n_resamples)
for (i in seq_len(n_resamples)) {
    healthy_gtex_df <- counts_df[
        , c("geneID", sample(gtex_names, size = 2, replace = FALSE))
    ]
    healthy_gtex_est_zeros_vec[i] <- sum(rowSums(healthy_gtex_df[, -1]) == 0)
}

# Use the mean over all resamples, rounded to a whole number of genes.
healthy_gtex_est_zeros <- round(mean(healthy_gtex_est_zeros_vec), 0)
healthy_gtex_est_zeros

In [57]:
# Compare the share of all-zero genes in healthy TCGA samples against the
# resampled GTEx estimate; both proportions are taken out of the same n genes.
prop.test(
    x = c(healthy_tcga_zeros, healthy_gtex_est_zeros),
    n = c(n, n)
)


	2-sample test for equality of proportions with continuity correction

data:  c(healthy_tcga_zeros, healthy_gtex_est_zeros) out of c(n, n)
X-squared = 58.994, df = 1, p-value = 1.582e-14
alternative hypothesis: two.sided
95 percent confidence interval:
 0.01770323 0.02992053
sample estimates:
    prop 1     prop 2 
0.12059085 0.09677897 
