In [1]:
# All "analyses/..." paths used below are relative to this project root.
setwd("/projects/CARDIPS/analysis/epigenome_resource")
# Load shared helpers; messages and warnings raised while sourcing are silenced.
# NOTE(review): presumably this also attaches data.table/dplyr (fread, %>% and %like% are
# used below without any library() call) and defines run_qsub() - confirm.
suppressMessages(suppressWarnings(source("analyses/jennifer/notebooks/functions.R")))

# Fixed RNG seed.
set.seed(5366)

# Identification of EDev-unique QTLs

    fetal-unique = EDev-unique

## 1. Get the lead fine-mapped variants for each iPSCORE QTL

In [5]:
# Already ran, do not run again!
# For every tissue and QTL type whose fine-mapping directory exists, read each
# "all_vars." file and keep, per element/condition pair, the row(s) with the
# highest SNP.PP. The combined table is written to top_finemapped.txt.
finemapped <- local(
{
    # Lead rows of all files of one tissue/analysis directory (NULL if the directory is absent).
    lead_variants <- function(tiss, analysis)
    {
        message(paste(tiss, analysis), appendLF = FALSE)
        qtl_dir <- paste("analyses/tim/qtl_finemapping", tiss, analysis, sep = "/")
        if (!dir.exists(qtl_dir))
        {
            return(NULL)
        }

        files <- grep("all_vars.", list.files(qtl_dir, full.names = TRUE), value = TRUE)

        per_file <- lapply(files, function(f)
        {
            variants   <- fread(f, data.table = FALSE)
            by_element <- group_by(mutate(variants, element_cond = paste(element, type, sep = "_")), element_cond)
            top_rows   <- filter(by_element, SNP.PP == max(SNP.PP))

            # id = 2nd and 3rd "_"-separated fields of the variant name, chr = 2nd field
            mutate(top_rows,
                   tissue   = tiss,
                   analysis = analysis,
                   id       = unlist(lapply(snp, function(x) { paste0(unlist(strsplit(x, "_"))[2:3], collapse = "_") })),
                   chr      = unlist(lapply(snp, function(x) { unlist(strsplit(x, "_"))[2] })))
        })

        as.data.frame(rbindlist(per_file))
    }

    per_tissue <- lapply(c("CVPC", "iPSC", "PPC"), function(tiss)
    {
        per_analysis <- lapply(c("eqtls", "caqtls", "haqtls"), function(analysis)
        {
            lead_variants(tiss, analysis)
        })
        as.data.frame(rbindlist(per_analysis))
    })

    as.data.frame(rbindlist(per_tissue))
})

fwrite(finemapped, "analyses/jennifer/summary_files/top_finemapped.txt", row.names = FALSE, sep = "\t")

CVPC eqtls
CVPC caqtls
CVPC haqtls
iPSC eqtls
iPSC caqtls
iPSC haqtls
PPC eqtls
PPC caqtls
PPC haqtls


In [6]:
# Reload the saved lead variants. Gene-based elements (ids containing "ENSG") get the
# lower-case tissue name prepended to element_cond; all other ids are kept as they are.
finemapped <- fread("analyses/jennifer/summary_files/top_finemapped.txt", data.table = FALSE)

is_gene_element   <- finemapped$element_cond %like% "ENSG"
tissue_prefixed   <- paste(tolower(finemapped$tissue), finemapped$element_cond, sep = "_")
finemapped$element_cond <- ifelse(is_gene_element, tissue_prefixed, finemapped$element_cond)


In [7]:
# Row counts of the lead-variant table per tissue (rows) and QTL type (columns).
table(finemapped$tissue, finemapped$analysis)

      
       caqtls eqtls haqtls
  CVPC  31326 17832  29529
  iPSC  24917 33806   4944
  PPC   31545 21418      0

## 2.1. Prepare adult caQTLs  (QTLBase2)

In [10]:
# 1. Read metadata: keep caQTL studies that are not from iPSCs and drop the
#    neuronal-differentiation study (its title is listed under both spellings).
#    A single filter is enough; the earlier extra filter repeated a subset of these conditions.
qtlbase_meta <- fread("analyses/jennifer/ipscore_unique_qtls/QTLbase_v2_download_meta_info.txt", data.table = FALSE) %>%
    filter(xQTL == "caQTL" &
           !Tissue %like% "iPSC" &
           Title != "Cell-type specific effects of genetic variation on chromatin accessibility during human neuronal differentiation" &
           Title != "Cell-type-specific effects of genetic variation on chromatin accessibility during human neuronal differentiation")

# 2. Read caQTLs; the column names are taken from the first line of the file.
#    The header is stored under its own name so that utils::head() is not masked.
qtlbase_caqtls <- fread("analyses/jennifer/ipscore_unique_qtls/QTLbase_caqtls.txt", data.table = FALSE)
caqtl_header   <- fread(cmd = "head -1 analyses/jennifer/ipscore_unique_qtls/QTLbase_caqtls.txt", data.table = FALSE)
colnames(qtlbase_caqtls) <- colnames(caqtl_header)

# 3. Keep SNPs with p-value <= 5e-05 that belong to one of the retained studies
# NOTE(review): the haQTL section filters at 1e-05; confirm that the two thresholds are meant to differ.
qtlbase_caqtls <- qtlbase_caqtls %>% filter(Pvalue <= 5e-05) %>% filter(Sourceid %in% qtlbase_meta$Sourceid)
summary(qtlbase_caqtls$Pvalue)

# Variant id = "<chromosome without chr prefix>_<hg38 position>"
qtlbase_caqtls$id <- paste(gsub("chr", "", qtlbase_caqtls$SNP_chr), qtlbase_caqtls$SNP_pos_hg38, sep = "_")
qtlbase_caqtls <- qtlbase_caqtls %>% select(id) %>% mutate(tissue = "QTLbase", analysis = "caqtl")

head(qtlbase_caqtls, 2)

message(paste("# caSNPs:", length(unique(qtlbase_caqtls$id))))

     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
0.000e+00 6.000e-11 2.640e-07 6.579e-06 7.200e-06 5.000e-05 

Unnamed: 0_level_0,id,tissue,analysis
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,1_921203,QTLbase,caqtl
2,1_976215,QTLbase,caqtl


# caSNPs: 39863



## 2.2. Prepare adult haQTLs (QTLBase2)

In [11]:
# 1. Metadata: keep hQTL studies that are not from iPSCs and whose description does not contain "me"
# NOTE(review): presumably the "me" pattern is meant to drop histone-methylation marks - confirm.
qtlbase_meta <- fread("analyses/jennifer/ipscore_unique_qtls/QTLbase_v2_download_meta_info.txt", data.table = FALSE) %>%
    filter(xQTL == "hQTL" & !Describe %like% "me" & !Tissue %like% "iPSC")

# 2. haQTLs; the column names are taken from the first line of the file.
#    The header is stored under its own name so that utils::head() is not masked.
qtlbase_hqtls <- fread("analyses/jennifer/ipscore_unique_qtls/QTLbase_hQTL.txt", data.table = FALSE)
hqtl_header   <- fread(cmd = "head -1 analyses/jennifer/ipscore_unique_qtls/QTLbase_hQTL.txt", data.table = FALSE)
colnames(qtlbase_hqtls) <- colnames(hqtl_header)

# 3. Filter p-value <= 1e-05 and restrict to the retained studies
qtlbase_hqtls <- qtlbase_hqtls %>% filter(Pvalue <= 1e-05 & Sourceid %in% qtlbase_meta$Sourceid)

# Variant id = "<chromosome without chr prefix>_<hg38 position>"
qtlbase_hqtls$id <- paste(gsub("chr", "", qtlbase_hqtls$SNP_chr), qtlbase_hqtls$SNP_pos_hg38, sep = "_")
qtlbase_hqtls <- qtlbase_hqtls %>% select(id) %>% mutate(tissue = "QTLbase", analysis = "haqtl")
head(qtlbase_hqtls, 2)

message(paste("# haSNPs:", length(unique(qtlbase_hqtls$id))))

Unnamed: 0_level_0,id,tissue,analysis
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,1_1419214,QTLbase,haqtl
2,1_1420203,QTLbase,haqtl


# haSNPs: 244563



## 2.3. Prepare adult GTEx eQTLs 

In [12]:
# GTEx v8 eGene files, one per tissue
files <- list.files("/reference/public/GTEX_v8/GTEx_Analysis_v8_eQTL", full.names = TRUE)
files <- files[grepl("egenes.txt.gz", files, fixed = TRUE)]

message(paste("# GTEx tissues:", length(unique(files))))

# Significant eGenes (qval <= 0.05) of every tissue. The tissue label is the part of the
# file name before the first ".", taken with basename() so it does not depend on how deep
# the directory sits in the file system.
gtex_eqtls <- as.data.frame(rbindlist(lapply(files, function(x)
{
    tissue_label <- unlist(strsplit(basename(x), "[.]"))[1]

    fread(x, data.table = FALSE) %>%
        filter(qval <= 0.05) %>%
        mutate(tissue = tissue_label, analysis = "eqtl")

}))) %>% select(chr, variant_pos, ref, alt, tissue, analysis) %>% mutate(id = paste(chr, variant_pos, ref, alt, sep = "_"))

head(gtex_eqtls, 2)

# Counted on the allele-aware id (chr_pos_ref_alt)
message(paste("# GTEx variants:", length(unique(gtex_eqtls$id))))

# Switch to the "<chromosome without chr prefix>_<position>" id used for all other datasets
gtex_eqtls <- gtex_eqtls %>%
    mutate(id = gsub("chr", "", paste(chr, variant_pos, sep = "_"))) %>%
    select(id, tissue, analysis)

head(gtex_eqtls, 2)



# GTEx tissues: 49



Unnamed: 0_level_0,chr,variant_pos,ref,alt,tissue,analysis,id
Unnamed: 0_level_1,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>
1,chr1,64764,C,T,Adipose_Subcutaneous,eqtl,chr1_64764_C_T
2,chr1,108826,G,C,Adipose_Subcutaneous,eqtl,chr1_108826_G_C


# GTEx variants: 276116



Unnamed: 0_level_0,id,tissue,analysis
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,1_64764,Adipose_Subcutaneous,eqtl
2,1_108826,Adipose_Subcutaneous,eqtl


## 2.4. Prepare Vinuela adult islet eQTLs

In [7]:
# Inspect the columns of the islet eQTL table.
# NOTE(review): in document order this runs before islet_eqtls is read by the cells below,
# so it relies on the object already existing in the session.
str(islet_eqtls)

'data.frame':	4638 obs. of  22 variables:
 $ GeneName             : chr  "ACTR8" "ERC2" "CCDC66" "ARHGEF3" ...
 $ Strand               : chr  "-" "-" "+" "-" ...
 $ GencodeLevel         : int  1 2 2 1 2 1 2 1 2 2 ...
 $ GeneType             : chr  "protein_coding" "protein_coding" "protein_coding" "protein_coding" ...
 $ GeneID               : chr  "ENSG00000113812.9" "ENSG00000187672.8" "ENSG00000180376.12" "ENSG00000163947.7" ...
 $ ChrPheno             : int  3 3 3 3 3 3 3 3 3 3 ...
 $ StartPheno           : int  53916229 56502391 56591189 57113357 58223233 58318607 58419584 58613337 62936105 66551687 ...
 $ EndPheno             : int  53916229 56502391 56591189 57113357 58223233 58318607 58419584 58613337 62936105 66551687 ...
 $ NumSNPs              : int  5186 5997 5981 5666 4746 4823 5005 5134 6319 6635 ...
 $ DistanceWithBest     : int  6498 -232239 -134916 319635 -56828 41071 26010 45744 -6302 -204480 ...
 $ SNPid                : chr  "rs893367" "rs9841915" "rs11130540" "rs98

In [8]:
# Load the independent islet eQTLs and summarise them: signals per discovery order,
# total number of rows, and number of distinct lead SNP ids.
islet_eqtls <- fread("/projects/PPC/analysis/ppc_eqtls/input/eqtl/PacreaticIslets_independent_gene_eQTLs.txt", data.table = FALSE)
table(islet_eqtls[["DiscoveryOrder"]])

nrow(islet_eqtls)

length(unique(islet_eqtls[["SNPid"]]))


   1    2    3    4 
4311  318    8    1 

In [13]:
islet_eqtls <- fread("/projects/PPC/analysis/ppc_eqtls/input/eqtl/PacreaticIslets_independent_gene_eQTLs.txt", data.table = FALSE)

# Lift over the lead SNP positions with the hg19 -> hg38 chain.
# Chromosome names get the "chr" prefix; chr23 is dropped before lifting.
bed <- islet_eqtls %>% select(chrSNP, StartSNP, EndSNP) %>% mutate(chrSNP = paste0("chr", chrSNP)) %>% dplyr::rename(chrom = chrSNP, start = StartSNP, end = EndSNP)
bed <- bed %>% filter(chrom != "chr23")

suppressMessages(library(rtracklayer  ))
suppressMessages(library(GenomicRanges))

path <- "/reference/public/ucsc/hg19ToHg38.over.chain"
ch   <- import.chain(path)

# The input ranges are the coordinates fed to the hg19ToHg38 chain; the lifted ranges are
# its output. The variables are named after the build they hold (the names were swapped before).
gr_hg19     <- makeGRangesFromDataFrame(bed, keep.extra.columns = TRUE)
lifted_hg38 <- data.frame(liftOver(gr_hg19, ch))

# Variant id = "<chromosome without chr prefix>_<lifted start>"
islet_eqtls <- lifted_hg38 %>%
    select(seqnames, start) %>%
    mutate(chr = gsub("chr", "", seqnames),
           id = paste(chr, start, sep = "_"),
           tissue = "Vinuela_Adult_Islets",
           analysis = "eqtl") %>%
    select(id, tissue, analysis)

head(islet_eqtls, 2)

message(paste("# SNPs:", length(unique(islet_eqtls$id))))

Unnamed: 0_level_0,id,tissue,analysis
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,3_53875704,Vinuela_Adult_Islets,eqtl
2,3_56700602,Vinuela_Adult_Islets,eqtl


# SNPs: 4172



## 2.5 Prepare Khetan adult islet caQTLs

In [14]:
islet_caqtls <- fread("analyses/jennifer/ipscore_unique_qtls/Khetan_Islet_caQTL.txt", data.table = FALSE)

str(islet_caqtls)

# ~One SNP per Peak
nrow(islet_caqtls)
length(unique(islet_caqtls$Feature_Coordinates))

# liftover: single-base ranges at the SNP position, chr-prefixed for the chain file
bed <- islet_caqtls %>%
    mutate(chrom = paste0("chr", Chromosome),
           start = SNP_Position,
           end = SNP_Position) %>%
    select(chrom, start, end)

head(bed, 2)

suppressMessages(library(rtracklayer  ))
suppressMessages(library(GenomicRanges))

path <- "/reference/public/ucsc/hg19ToHg38.over.chain"
ch   <- import.chain(path)

# The input ranges are the coordinates fed to the hg19ToHg38 chain; the lifted ranges are
# its output. The variables are named after the build they hold (the names were swapped before).
gr_hg19     <- makeGRangesFromDataFrame(bed, keep.extra.columns = TRUE)
lifted_hg38 <- data.frame(liftOver(gr_hg19, ch))

# Variant id = "<chromosome without chr prefix>_<lifted start>"
islet_caqtls <- lifted_hg38 %>%
    select(seqnames, start) %>%
    mutate(chr = gsub("chr", "", seqnames),
           id = paste(chr, start, sep = "_"),
           tissue = "Khetan_Adult_Islets",
           analysis = "caqtl") %>%
    select(id, tissue, analysis)

head(islet_caqtls, 2)

message(paste("# SNPs:", length(unique(islet_caqtls$id))))

'data.frame':	2949 obs. of  36 variables:
 $ rsID                                                  : chr  "rs9274652" "rs6547757" "rs2517198" "rs6838211" ...
 $ Feature_Coordinates                                   : chr  "6:32636138-32636402" "2:88469578-88470466" "8:17465473-17466680" "4:178643909-178644842" ...
 $ Chromosome                                            : int  6 2 8 4 9 6 4 13 10 13 ...
 $ SNP_Position                                          : int  32636235 88469770 17465824 178644629 10208378 29920909 180337189 25592320 38383280 53174463 ...
 $ Ref                                                   : chr  "C" "G" "C" "A" ...
 $ Alt                                                   : chr  "T" "C" "T" "C" ...
 $ AlelleFreq                                            : num  0.289 0.816 0.737 0.342 0.684 ...
 $ HWE_ChiSq                                             : num  0.4362 0.9688 0.1396 1.5558 0.0125 ...
 $ IA                                                    : num  

Unnamed: 0_level_0,chrom,start,end
Unnamed: 0_level_1,<chr>,<int>,<int>
1,chr6,32636235,32636235
2,chr2,88469770,88469770


Unnamed: 0_level_0,id,tissue,analysis
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,6_32668458,Khetan_Adult_Islets,caqtl
2,2_88170251,Khetan_Adult_Islets,caqtl


# SNPs: 2948



## 2.6. Prepare GTEx adult conditional eQTLs

In [15]:
# GTEx v8 independent (conditional) eQTL files, one per tissue
files <- list.files("/reference/public/GTEX_v8/GTEx_Analysis_v8_eQTL_independent", full.names = TRUE)

message(paste("# GTEx tissues:", length(unique(files))))

# One row per conditional eQTL with id = "<chromosome without chr prefix>_<position>".
# variant_id is split once per file (instead of twice per row), and the tissue label is the
# part of the file name before the first ".", taken with basename() so it does not depend
# on the directory depth.
gtex_cond_eqtls <- as.data.frame(rbindlist(lapply(files, function(x)
{
    tissue_label <- unlist(strsplit(basename(x), "[.]"))[1]

    tokens <- strsplit(fread(x, data.table = FALSE)$variant_id, "_", fixed = TRUE)
    chr    <- gsub("chr", "", vapply(tokens, function(tok) tok[1], character(1)))
    pos    <- vapply(tokens, function(tok) tok[2], character(1))

    data.frame(id       = paste(chr, pos, sep = "_"),
               tissue   = rep(tissue_label, length(tokens)),
               analysis = rep("eqtl_cond", length(tokens)),
               stringsAsFactors = FALSE)

})))

head(gtex_cond_eqtls, 2)

message(paste("# GTEx variants:", length(unique(gtex_cond_eqtls$id))))



# GTEx tissues: 49



Unnamed: 0_level_0,id,tissue,analysis
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,1_64764,Adipose_Subcutaneous,eqtl_cond
2,1_108826,Adipose_Subcutaneous,eqtl_cond


# GTEx variants: 408175



Unnamed: 0_level_0,id,tissue,analysis
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,1_64764,Adipose_Subcutaneous,eqtl_cond
2,1_108826,Adipose_Subcutaneous,eqtl_cond


In [9]:
# Scratch arithmetic: sum of the two "# GTEx variants" counts printed above
# (conditional + primary eQTLs). The two counts use different id definitions and the
# sets are not deduplicated against each other.
408175 + 276116

## 2.7 Write all variants (iPSCORE and reference) to a text file for plink

In [16]:
# Aggregate all: stack every reference dataset and the iPSCORE lead variants
# (columns id, tissue, analysis), then derive the chromosome from the id prefix.
snp_tables <- list(gtex_cond_eqtls, islet_eqtls, islet_caqtls, gtex_eqtls, qtlbase_hqtls, qtlbase_caqtls,
                   select(finemapped, id, tissue, analysis))
all_ref_snps <- do.call(rbind, snp_tables)
all_ref_snps$chr <- vapply(strsplit(all_ref_snps$id, "_"), function(tokens) tokens[1], character(1))

unique(all_ref_snps$chr)

# Only autosomal chr
all_ref_snps <- filter(all_ref_snps, chr %in% 1:22)

table(all_ref_snps$chr)
head(all_ref_snps, 2)


     1     10     11     12     13     14     15     16     17     18     19 
161772  77522  89292  90351  36831  54748  64334  78645 110423  27548 106276 
     2     20     21     22      3      4      5      6      7      8      9 
126179  43307  26686  47641  97979  78925  84034 108456 103366  66015  68667 

Unnamed: 0_level_0,id,tissue,analysis,chr
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>
1,1_64764,Adipose_Subcutaneous,eqtl_cond,1
2,1_108826,Adipose_Subcutaneous,eqtl_cond,1


In [17]:
# write to text: one file of unique variant ids per autosome, used as plink --extract input
scratch_dir <- file.path(getwd(), "analyses/jennifer/ipscore_unique_qtls/scratch")

for (chr in 1:22)
{
    chr_label <- paste0("chr", chr)
    snps      <- unique(all_ref_snps$id[all_ref_snps$chr == chr])
    message(paste(chr_label, length(snps)), appendLF = FALSE)

    outfile <- file.path(scratch_dir, paste0(chr_label, ".txt"))
    writeLines(snps, con = outfile)
    message(paste("Saved:", outfile), appendLF = FALSE)
}

chr1 70938
Saved: /frazer01/projects/CARDIPS/analysis/epigenome_resource/analyses/jennifer/ipscore_unique_qtls/scratch/chr1.txt
chr2 61322
Saved: /frazer01/projects/CARDIPS/analysis/epigenome_resource/analyses/jennifer/ipscore_unique_qtls/scratch/chr2.txt
chr3 46773
Saved: /frazer01/projects/CARDIPS/analysis/epigenome_resource/analyses/jennifer/ipscore_unique_qtls/scratch/chr3.txt
chr4 40047
Saved: /frazer01/projects/CARDIPS/analysis/epigenome_resource/analyses/jennifer/ipscore_unique_qtls/scratch/chr4.txt
chr5 39345
Saved: /frazer01/projects/CARDIPS/analysis/epigenome_resource/analyses/jennifer/ipscore_unique_qtls/scratch/chr5.txt
chr6 50488
Saved: /frazer01/projects/CARDIPS/analysis/epigenome_resource/analyses/jennifer/ipscore_unique_qtls/scratch/chr6.txt
chr7 45867
Saved: /frazer01/projects/CARDIPS/analysis/epigenome_resource/analyses/jennifer/ipscore_unique_qtls/scratch/chr7.txt
chr8 32343
Saved: /frazer01/projects/CARDIPS/analysis/epigenome_resource/analyses/jennifer/ipscore_uniqu

## 3. Run plink tags

In [18]:
# Directories used by the plink jobs (absolute, because the jobs run through qsub)
qtl_dir  <- file.path(getwd(), "analyses/jennifer/ipscore_unique_qtls")
tags_dir <- file.path(qtl_dir, "tags")
logs_dir <- file.path(qtl_dir, "logs")

# delete old results (files only; done from R so that no shell glob is involved and an
# already-empty directory is not an error)
unlink(list.files(tags_dir, full.names = TRUE))

# One plink --show-tags job per window size, r2 threshold and autosome.
# Only the first window size (500 kb) is currently used.
# NOTE(review): the echoed commands pipe into qsub, so the jobs presumably run
# asynchronously - make sure they have finished before reading the tags.list files.
for (kb in c(500, 250, 100)[1])
{
    for (r2 in c(0.1, 0.2, 0.7, 0.8))
    {
        for (chr in c(1:22))
        {
            chr_label <- paste0("chr", chr)
            run_label <- paste(chr_label, paste0(kb, "kb_r", r2), sep = ".")

            cmd <- paste("plink",
                         "--memory 15000",
                         "--threads 4",
                         "--tag-kb", kb,
                         "--tag-r2", r2,
                         "--show-tags all",
                         "--vcf"    , file.path(qtl_dir, "reference", paste0(chr_label, ".norm.renamed.vcf.gz")),
                         "--extract", file.path(qtl_dir, "scratch", paste0(chr_label, ".txt")),
                         "--out"    , file.path(tags_dir, run_label))

            log_out <- file.path(logs_dir, paste(run_label, "out", sep = "."))
            log_err <- file.path(logs_dir, paste(run_label, "err", sep = "."))

            run_qsub(cmd = cmd, name = chr_label, log_err = log_err, log_out = log_out, threads = 4, exec = TRUE)
        }
    }

}




echo "plink --memory 15000 --threads 4 --tag-kb 500 --tag-r2 0.2 --show-tags all --vcf /frazer01/projects/CARDIPS/analysis/epigenome_resource/analyses/jennifer/ipscore_unique_qtls/reference/chr1.norm.renamed.vcf.gz --extract /frazer01/projects/CARDIPS/analysis/epigenome_resource/analyses/jennifer/ipscore_unique_qtls/scratch/chr1.txt --out /frazer01/projects/CARDIPS/analysis/epigenome_resource/analyses/jennifer/ipscore_unique_qtls/tags/chr1.500kb_r0.2" | qsub -N chr1 -V -cwd -pe smp 4 -o /frazer01/projects/CARDIPS/analysis/epigenome_resource/analyses/jennifer/ipscore_unique_qtls/logs/chr1.500kb_r0.2.out -e /frazer01/projects/CARDIPS/analysis/epigenome_resource/analyses/jennifer/ipscore_unique_qtls/logs/chr1.500kb_r0.2.err

echo "plink --memory 15000 --threads 4 --tag-kb 500 --tag-r2 0.2 --show-tags all --vcf /frazer01/projects/CARDIPS/analysis/epigenome_resource/analyses/jennifer/ipscore_unique_qtls/reference/chr2.norm.renamed.vcf.gz --extract /frazer01/projects/CARDIPS/analysis/epigeno

## 3.1. Create list object for easier access downstream

In [19]:
# Reference (adult) QTL variants grouped by QTL type, plus an "all" entry that stacks every dataset.
ref_caqtls <- rbind(qtlbase_caqtls, islet_caqtls)
ref_eqtls  <- rbind(gtex_eqtls, gtex_cond_eqtls, islet_eqtls)
ref_all    <- rbind(qtlbase_caqtls, qtlbase_hqtls, gtex_eqtls, gtex_cond_eqtls, islet_eqtls, islet_caqtls)

ref_qtls <- list("caqtls" = ref_caqtls,
                 "haqtls" = qtlbase_hqtls,
                 "eqtls"  = ref_eqtls,
                 "all"    = ref_all)


## 4. Annotate each iPSCORE QTL whether it's in LD with adult

In [23]:
# Containers filled by the annotation loop below, keyed by "<kb> <r2>":
# ld_results holds the annotated iPSCORE variants, tags_list the expanded plink tag tables.
ld_results = list()
tags_list = list()


In [24]:
# Annotate every lead iPSCORE variant with whether it is in LD with a reference (adult) QTL.
# For each (window, r2) pair the annotated table is stored in ld_results[["<kb> <r2>"]] and
# the expanded plink tag table in tags_list under the same key.
for (kb in c(500))
{
    for (r2 in c(0.1, 0.2, 0.7, 0.8))
    {
        # 1. Get LD results (one plink tags.list file per autosome)
        tags <- as.data.frame(rbindlist(lapply(c(1:22), function(x)
        {
            file <- paste("analyses/jennifer/ipscore_unique_qtls/tags", paste(paste0("chr", x), paste0(kb, "kb_r", r2), "tags.list", sep = "."), sep = "/")
            fread(file, data.table = FALSE)
        })))

        # separate tags to rows
        tags <- tags %>% tidyr::separate_rows(TAGS, sep = "[|]")

        # Have SNP be in LD with itself
        tags <- rbind(tags, tags %>% mutate(TAGS = SNP)) %>% dplyr::filter(TAGS != "NONE")

        tags_list[[paste(kb, r2)]] <- tags
        message(paste("Saved:", kb, r2), appendLF = FALSE)

        # 2. Annotate whether finemapped snps were tested (i.e. appear in the plink output)
        this <- finemapped
        this$tested <- this$id %in% tags$SNP

        this <- this %>% select(element_cond, tissue, id, tested) %>% distinct()

        # Only the "all" reference set is currently annotated
        for (dataset in c("eqtls", "caqtls", "haqtls", "all")[4])
        {
            message(paste(kb, r2, dataset), appendLF = FALSE)

            # 3. Get reference SNPs; ids whose position field is not numeric are dropped
            ref_snps <- ref_qtls[[dataset]]
            ref_snps$position <- suppressWarnings(as.numeric(vapply(strsplit(ref_snps$id, "_"), function(tok) tok[2], character(1))))
            ref_snps <- ref_snps %>% dplyr::filter(!is.na(position))

            # 4. Calculate distance
            # Make bed for iPSCORE finemapped SNPs.
            # Positions are written as integers: fwrite may print a double such as 100000000
            # as 1e+08, which is not a valid BED coordinate.
            ipscore_bed <- finemapped %>%
                       select(chr, position) %>%
                       mutate(chr = as.numeric(chr),
                              position = as.integer(position),
                              end = position) %>%
                       arrange(chr, position) %>%
                       mutate(chr = paste0("chr", chr)) %>%
                       distinct()

            # Same layout for the autosomal reference SNPs
            ref_bed <- ref_snps %>%
                       mutate(chr = as.numeric(vapply(strsplit(id, "_"), function(tok) tok[1], character(1))),
                              position = as.integer(position)) %>%
                       dplyr::filter(chr %in% c(1:22)) %>%
                       select(chr, position) %>%
                       mutate(end = position) %>%
                       arrange(chr, position) %>%
                       mutate(chr = paste0("chr", chr)) %>%
                       distinct()

            # bedtools closest will output an error if chromosomes are not present in both
            shared_chr  <- intersect(ipscore_bed$chr, ref_bed$chr)
            ipscore_bed <- ipscore_bed %>% filter(chr %in% shared_chr)
            ref_bed     <- ref_bed     %>% filter(chr %in% shared_chr)

            a_path <- "analyses/jennifer/scratch/a.bed"
            b_path <- "analyses/jennifer/scratch/b.bed"

            fwrite(ipscore_bed, a_path, row.names = FALSE, sep = "\t", col.names = FALSE)
            fwrite(ref_bed,     b_path, row.names = FALSE, sep = "\t", col.names = FALSE)

            message(nrow(ipscore_bed), appendLF = FALSE)
            message(nrow(ref_bed), appendLF = FALSE)

            # Run bedtools closest (-d appends the distance to the closest reference SNP).
            # Ties produce several rows per variant with the same distance, hence distinct().
            cmd <- paste("bedtools closest", "-a", a_path, "-b", b_path, "-d")
            message(cmd, appendLF = FALSE)
            dist <- fread(cmd = cmd, data.table = FALSE) %>%
                mutate(id = gsub("chr", "", paste(V1, V2, sep = "_")) ) %>%
                dplyr::rename(dist = V7) %>%
                select(id, dist) %>%
                distinct()

            # add distance to table (NA for variants that were not in the BED file)
            this2 <- merge(this, dist, by = "id", all.x = TRUE)

            # 5. Annotate whether in LD
            # SNP are ipscore SNPs, TAGS are reference SNPs
            this_tags <- tags %>%
                dplyr::filter(SNP %in% this$id) %>%
                dplyr::filter(TAGS %in% ref_snps$id)

            this2$has_tag <- this2$id %in% this_tags$SNP

            # annotate: tested variants rely on LD tags, untested variants fall back to being
            # within 500 kb of a reference SNP. A missing or negative distance (no reference
            # SNP found) never counts as being in LD.
            within_window <- !is.na(this2$dist) & this2$dist >= 0 & this2$dist < 500e3
            this2$in_ld <- ifelse(this2$tested, this2$has_tag, within_window)

            # in case some snps were not in the 1000G panel, annotate those that are themselves reference snps
            this2$in_ld <- this2$in_ld | this2$id %in% ref_snps$id

            # An element/condition is in LD if any of its lead variants is
            this[, paste("in_ld", dataset, sep = "_")] <- this$element_cond %in% this2$element_cond[this2$in_ld]
        }

        ld_results[[paste(kb, r2)]] <- this
    }
}
    

Saved: 500 0.2
500 0.2 all
“NAs introduced by coercion”
152904
659483
bedtools closest -a analyses/jennifer/scratch/a.bed -b analyses/jennifer/scratch/b.bed -d


## For variants that are not in 1000 Genomes (so LD could not be calculated), test by distance instead: variants within 500 kb of an adult QTL were considered to be in LD with an adult QTL.

In [25]:
# Variant ids present in the reference panel
snps_1kg <- readLines("analyses/jennifer/ipscore_unique_qtls/reference/snps.txt")
head(snps_1kg)

# Reference SNPs with a numeric position that are NOT in the panel, i.e. could not be tested for LD
ref_snps <- ref_qtls[["all"]]
ref_snps$position <- suppressWarnings(as.numeric(vapply(strsplit(ref_snps$id, "_"), function(tok) tok[2], character(1))))
ref_snps <- ref_snps %>% dplyr::filter(!is.na(position))
ref_snps$tested <- ref_snps$id %in% snps_1kg

ref_snps <- ref_snps %>% filter(tested == FALSE) %>% select(id, position, tested) %>% distinct()
ref_snps$chr <- as.numeric(vapply(strsplit(ref_snps$id, "_"), function(tok) tok[1], character(1)))
ref_snps <- ref_snps %>% filter(chr %in% c(1:22))

a_path <- "analyses/jennifer/scratch/a.bed"
b_path <- "analyses/jennifer/scratch/b.bed"

# Positions are written as integers: fwrite may print a double such as 100000000 as 1e+08,
# which is not a valid BED coordinate.
untested_ref_bed <- ref_snps %>%
    select(chr, position) %>%
    mutate(position = as.integer(position), end = position, chr = paste0("chr", chr))
fwrite(untested_ref_bed, a_path, row.names = FALSE, sep = "\t", col.names = FALSE)

ipscore_bed <- finemapped %>%
    select(chr, position, element_cond) %>%
    mutate(position = as.integer(position), end = position, chr = paste0("chr", chr)) %>%
    select(chr, position, end, element_cond)
fwrite(ipscore_bed, b_path, row.names = FALSE, sep = "\t", col.names = FALSE)

# iPSCORE lead variants within 500 kb of an untested reference SNP (V7 = element_cond)
cmd <- paste("bedtools window", "-a", a_path, "-b", b_path, "-w", 500000)
int <- fread(cmd = cmd, data.table = FALSE)
head(int, 2)

ld_results[["distance"]] <- int



“NAs introduced by coercion”


Unnamed: 0_level_0,V1,V2,V3,V4,V5,V6,V7
Unnamed: 0_level_1,<chr>,<int>,<int>,<chr>,<int>,<int>,<chr>
1,chr1,111197723,111197723,chr1,111197723,111197723,cvpc_atac_peak_12953_0
2,chr1,111197723,111197723,chr1,111197723,111197723,ipsc_atac_peak_9924_0


In [None]:
# Persist ld_results; the annotation cells below reload it with load().
save(ld_results, file = "analyses/jennifer/ipscore_unique_qtls/results.robj")

## Annotate which QTLs were fetal-unique (i.e., EDev-unique)

In [3]:
# load all qtls
qtls = fread("analyses/jennifer/summary_files/all.qtls.no_mhc.txt", data.table = F) %>% filter(new_egene == T & type == 0)

In [4]:
# Reload the LD annotations and flag QTLs as fetal-unique when they are neither in LD
# (500 kb window, r2 0.2) nor within the distance window of any reference QTL.
load("analyses/jennifer/ipscore_unique_qtls/results.robj", verbose = TRUE)
names(ld_results)

ld <- filter(ld_results[["500 0.2"]], element_cond %in% qtls$element_cond)

near_untested_ref <- ld_results[["distance"]]$V7
ld$in_ld_dist   <- ld$element_cond %in% near_untested_ref
ld$fetal_unique <- !(ld$in_ld_all == TRUE | ld$in_ld_dist == TRUE)
table(ld$fetal_unique)


Loading objects:
  ld_results



 FALSE   TRUE 
133947  15208 

In [5]:
# QTL modules: prefix gene-based element ids (containing "ENSG") with the lower-case tissue,
# then mark the module members that were flagged fetal-unique in ld.
mods <- fread("analyses/jennifer/summary_files/all.qtl_modules.H4_0.8.txt", data.table = FALSE)

is_gene_element   <- mods$element_cond %like% "ENSG"
tissue_prefixed   <- paste(tolower(mods$tissue), mods$element_cond, sep = "_")
mods$element_cond <- ifelse(is_gene_element, tissue_prefixed, mods$element_cond)

fetal_unique_elements <- ld[ld$fetal_unique == TRUE, ]$element_cond
mods$fetal_unique     <- mods$element_cond %in% fetal_unique_elements

In [6]:
# Attach the module id (cluster_id) to every QTL; all.x = T keeps QTLs without a module row.
ld = merge(ld, mods[,c("element_cond", "cluster_id")], by = "element_cond", all.x = T)

In [7]:
# One row per QTL (element_cond, module, tissue) with its QTL type and two flags:
# fetal_unique     - the QTL itself was flagged fetal-unique
# fetal_unique_mod - no member of its module is flagged as not fetal-unique
out <- distinct(select(ld, element_cond, cluster_id, tissue))

is_atac <- out$element_cond %like% "atac"
is_chip <- out$element_cond %like% "chip"
out$analysis <- ifelse(is_atac, "caqtls", ifelse(is_chip, "haqtls", "eqtls"))

fetal_unique_elements   <- ld[ld$fetal_unique == TRUE, ]$element_cond
shared_module_ids       <- mods[mods$fetal_unique == FALSE, ]$cluster_id
out$fetal_unique        <- out$element_cond %in% fetal_unique_elements
out$fetal_unique_mod    <- !(out$cluster_id %in% shared_module_ids)

table(out$fetal_unique, out$fetal_unique_mod)

       
        FALSE  TRUE
  FALSE 52896     0
  TRUE     82  7532

In [8]:
# Fetal-unique flag (columns) per QTL type (rows).
table(out$analysis, out$fetal_unique)

        
         FALSE  TRUE
  caqtls 25708  4897
  eqtls  18025  1280
  haqtls  9163  1437

In [9]:
# Add element_id and type from the QTL table; only element_cond values present in both tables are kept.
out2 = merge(out, qtls[,c("element_cond", "element_id", "type")], by = "element_cond")

In [13]:
# Final counts of fetal-unique vs. not fetal-unique QTLs.
table(out2$fetal_unique)


FALSE  TRUE 
52896  7614 

In [14]:
# Save the annotated table as a tab-separated file.
fwrite(out2, "analyses/jennifer/summary_files/fetal_unique.txt", row.names = F, sep = "\t")