In [3]:
# Load the notebook's helper scripts (functions.R and packages.R).
source("~/software/notebook_assist/functions.R")
source("~/software/notebook_assist/packages.R")

# Every relative path used below is resolved against this directory.
setwd("/projects/CARDIPS/analysis/epigenome_resource/")

# theme_bw() with all axis/strip/legend text at size 10 and the legend on top.
theme_bw2 <- local({
  text_10 <- element_text(size = 10)
  theme_bw() +
    theme(
      axis.text = text_10,
      strip.text = text_10,
      axis.title = text_10,
      legend.text = text_10,
      legend.title = text_10,
      legend.position = "top"
    )
})

# Path to the bedtools executable used for the `closest` call further down.
bedtools <- "/frazer01/software/bedtools-2.25.0/bin/bedtools"

# Module table; presumably one row per QTL element and tissue -- TODO confirm
# against the file's schema.
all_modules <- fread(
  "analyses/tim/ld_modules/modules/all_modules_091124.txt",
  sep = "\t",
  data.table = FALSE
)

# Gene coordinates reduced to a 3 bp window at one end of the gene:
# [start - 3, start] for "+" strand rows, [end, end + 3] for all other rows.
# Columns are renamed V1..V4 so the table stacks with the peak tables below.
gene_info <- fread(
  "eqtls/iPSC/input/phenotype_info.txt",
  sep = "\t",
  data.table = FALSE
) %>%
  mutate(
    tss_start = ifelse(strand == "+", start - 3, end),
    tss_end = ifelse(strand == "+", start, end + 3)
  ) %>%
  select(chrom, tss_start, tss_end, gene_id)
names(gene_info) <- paste0("V", 1:4)

# Read <qtl_dir>/<tissue>/input/phenotype_info.bed for each tissue and stack
# the per-tissue tables into a single data.frame.
read_phenotype_beds <- function(qtl_dir, tissues) {
  per_tissue <- lapply(tissues, function(tissue) {
    fread(file.path(qtl_dir, tissue, "input/phenotype_info.bed"), sep = "\t")
  })
  as.data.frame(rbindlist(per_tissue))
}

atac_info <- read_phenotype_beds("caqtls", c("iPSC", "CVPC", "PPC"))
chip_info <- read_phenotype_beds("haqtls", c("iPSC", "CVPC"))

# Genes and peaks stacked into one coordinate table keyed by Element_ID.
pheno_info <- as.data.frame(rbindlist(list(gene_info, atac_info, chip_info)))
names(pheno_info) <- c("Chromosome", "Start", "End", "Element_ID")

# Gene windows restricted to the elements listed in the CVPC eQTL input.
cvpc_expressed <- readRDS(
  "eqtls/CVPC/step_4/qtl_by_element/qtl_input.rds"
)$phenotype_info
cvpc_expressed2 <- gene_info[gene_info$V4 %in% cvpc_expressed$element_id, ]

# Keep only QTL rows with type == 0, then derive a SNP id with the "VAR_"
# prefix removed and a numeric chromosome from the id's leading digits
# (ids that do not start with digits yield NA).
primary_qtls <- fread(
  "analyses/tim/ld_modules/scripts/filtered_qtls_09042024.txt",
  sep = "\t",
  data.table = FALSE
) %>%
  filter(type == 0) %>%
  mutate(
    snpid = gsub("VAR_", "", id),
    chrom = as.numeric(str_extract(snpid, "^[0-9]+"))
  )

# Minimal per-QTL table, with key columns renamed to match all_modules.
primary_qtls2 <- primary_qtls %>%
  select(Element_ID = element_id, snp_id, pos, Tissue = tissue)

# Attach the lead SNP, then the element coordinates.
# NOTE(review): the second merge joins on whatever column names the two tables
# share -- presumably just Element_ID; confirm against all_modules' columns.
all_modules2 <- all_modules %>%
  merge(primary_qtls2, by = c("Element_ID", "Tissue")) %>%
  merge(pheno_info)

# CVPC-only modules and the coordinates of the elements they contain.
cvpc_modules <- filter(all_modules2, Tissue == "CVPC")
cvpc_phenos <- pheno_info[
  pheno_info$Element_ID %in% cvpc_modules$Element_ID, ,
  drop = FALSE
]

# cvpc_chrom_phenos = cvpc_phenos#[ !grepl("^E",cvpc_phenos$Element_ID),] %>% arrange(Chromosome, Start, End)

# Find, for every CVPC QTL element, the closest CVPC-expressed gene window
# using `bedtools closest -d`.
#
# Both inputs are written to temporary files sorted by chromosome, start and
# end with the same arrange() call, since bedtools closest expects sorted
# input. The temp files are removed in `finally`, so they are cleaned up even
# if writing the inputs or running bedtools fails.
egene_path <- tempfile()
qpeak_path <- tempfile()

result <- tryCatch(
  {
    write.table(
      cvpc_expressed2 %>% arrange(V1, V2, V3),
      egene_path,
      sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE
    )
    write.table(
      cvpc_phenos %>% arrange(Chromosome, Start, End),
      qpeak_path,
      sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE
    )

    # Quote every path so the shell command survives unusual characters.
    cmd <- sprintf(
      "%s closest -a %s -b %s -d",
      shQuote(bedtools), shQuote(qpeak_path), shQuote(egene_path)
    )

    # bedtools emits headerless, tab-separated rows; reading them explicitly
    # as such guarantees the V1..V9 column names that later code relies on
    # (V1-V4 = query element, V5-V8 = closest gene window, V9 = distance).
    fread(cmd = cmd, header = FALSE, sep = "\t")
  },
  finally = unlink(c(egene_path, qpeak_path))
)

# Distance from `position` to the closed interval [lo, hi]: 0 when the
# position lies inside the interval, otherwise the gap to the nearer edge.
dist_to_interval <- function(position, lo, hi) {
  inside <- position >= lo & position <= hi
  ifelse(inside, 0, pmin(abs(position - lo), abs(position - hi)))
}

# Attach the closest gene window (bedtools columns V5-V8) to every CVPC
# module row, then measure how far the QTL SNP position is from that window.
cvpc_modules2 <- merge(
  cvpc_modules,
  result[, 4:8],
  by.x = "Element_ID",
  by.y = "V4"
)
cvpc_modules3 <- mutate(
  cvpc_modules2,
  min_distance = dist_to_interval(pos, V6, V7)
)

# eGene rows only: distance from the SNP to the element's own Start/End.
cvpc_modules_eqtls <- cvpc_modules[cvpc_modules$qElement_Type == "eGene", ]
cvpc_modules_eqtls2 <- mutate(
  cvpc_modules_eqtls,
  min_distance = dist_to_interval(pos, Start, End)
)

# One row per distinct (element type, complexity, closest gene, distance).
cvpc_modules4 <- unique(
  cvpc_modules3[, c("qElement_Type", "Complexity", "V8", "min_distance")]
)

In [7]:
# Keep rows whose SNP-to-gene distance is inside the window for their element
# type: under 1,000,000 bp for eGenes, under 100,000 bp for everything else.
# The condition is evaluated row by row, so no grouping is needed; leaving the
# table ungrouped keeps later operations on cvpc_modules5 from silently
# running per group.
cvpc_modules5 <- cvpc_modules4 %>%
  filter(
    ifelse(
      qElement_Type == "eGene",
      min_distance < 1000000,
      min_distance < 100000
    )
  )

# Sanity check: largest distance that survived the filter.
max(cvpc_modules5$min_distance)

# Export the filtered table as the source data for Figure 4E.
fwrite(
  cvpc_modules5,
  "~/projects/Resource/notebooks/toGitHub/sourcedata/SOURCEDATA.FIGURE4E.txt",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)


In [5]:
library(purrr)

# Two-sided Wilcoxon rank-sum test of min_distance, Complex vs Singleton,
# run separately for each qElement_Type. The full htest object is kept in a
# list column, and its p-value and test statistic are pulled out alongside it.
# NOTE(review): this tests cvpc_modules4 (before the distance filter), while
# the exported figure source data is the filtered cvpc_modules5 -- confirm
# which table the reported statistics should be based on.
results <- cvpc_modules4 %>%
  group_by(qElement_Type) %>%
  summarise(
    test_result = list(
      wilcox.test(
        min_distance[Complexity == "Complex"],
        min_distance[Complexity == "Singleton"],
        alternative = "two.sided"
      )
    ),
    .groups = "drop"
  )

results <- mutate(
  results,
  p_value = map_dbl(test_result, function(ht) ht$p.value),
  statistic = map_dbl(test_result, function(ht) ht$statistic)
)
results


Attaching package: ‘purrr’


The following object is masked from ‘package:data.table’:

    transpose




qElement_Type,test_result,p_value,statistic
<chr>,<list>,<dbl>,<dbl>
caPeak,"9651931 , 2.39687569513366e-44 , 0 , two.sided , Wilcoxon rank sum test with continuity correction , min_distance[Complexity == ""Complex""] and min_distance[Complexity == ""Singleton""]",2.396876e-44,9651931
eGene,"1818676.5 , 3.51156270143745e-27 , 0 , two.sided , Wilcoxon rank sum test with continuity correction , min_distance[Complexity == ""Complex""] and min_distance[Complexity == ""Singleton""]",3.511563e-27,1818676
haPeak,"8149807 , 9.23323159016615e-13 , 0 , two.sided , Wilcoxon rank sum test with continuity correction , min_distance[Complexity == ""Complex""] and min_distance[Complexity == ""Singleton""]",9.233232e-13,8149807
