In [1]:
source("~/software/notebook_assist/functions.R")
source("~/software/notebook_assist/packages.R")
setwd("/projects/CARDIPS/analysis/epigenome_resource/")

In [2]:
cols = c("Tissue","Element ID","Condition","Element Cond","Element Name","Element Chrom","Element Start",
         "Element End","SNP ID","SNP Chrom","SNP Pos","Ref","Alt","RSID","AF","Beta","SE",
         "P-value","FDR","Tests","Q-value","Cluster ID","Membership","EDev-Unique")


In [3]:
cols=c("element_id","type","new_egene","chrom","pos","ref","alt","id","rsid","beta","se","pval","af","fdr","qval")
original_qtls = function(tissue) {
    eqtls  = fread(paste0("eqtls/",tissue,"/step_4/qtl_by_element/qtl.no_mhc.txt"),sep="\t",data.table=F) %>% filter(new_egene == TRUE)
    caqtls = fread(paste0("caqtls/",tissue,"/step_4/qtl_by_element/qtl.no_mhc.txt"),sep="\t",data.table=F) %>% filter(new_egene == TRUE)
    haqtl_file = paste0("haqtls/",tissue,"/step_4/qtl_by_element/qtl.no_mhc.txt")
    if (file.exists(haqtl_file)) {
        haqtls = fread(haqtl_file,sep="\t",data.table=F) %>% filter(new_egene == TRUE)
        out = rbindlist(list(eqtls[,cols],
                        caqtls[,cols],
                        haqtls[,cols]))
        
        } else{
        out = rbindlist(list(eqtls[,cols],
                        caqtls[,cols]))
        
    }
    out$tissue = tissue
    out$tissue_element = paste(out$tissue, out$element_id,sep="_")
    out$qtl_id = paste(out$tissue,out$element_id, out$type, sep="_")
    out$snp_id = gsub("VAR_","",out$id)
    out$conditional = ifelse(out$type == 0 , "Primary","Conditional")
    return(out)
    
}

ipsc_original = original_qtls("iPSC")
cvpc_original = original_qtls("CVPC")
ppc_original  = original_qtls("PPC")

In [7]:
message(paste(nrow(ipsc_original)+nrow(cvpc_original)+nrow(ppc_original),"QTLs before filtering"))

78852 QTLs before filtering



### load plink BIM file for 1000G EUR population

In [7]:
kgenomes = fread("/projects/CARDIPS/analysis/epigenome_resource/analyses/jennifer/gwas_independent/reference/combined.renamed.bim",
                 sep="\t",
                data.table=F)

In [8]:
kgenomes = kgenomes [ nchar(kgenomes$V5) == 1 &  nchar(kgenomes$V6) == 1 , ]

In [9]:
ipsc_original$in1kg = ifelse(ipsc_original$snp_id %in% kgenomes$V2,  "TRUE","FALSE")


cvpc_original$in1kg = ifelse(cvpc_original$snp_id %in% kgenomes$V2 , "TRUE","FALSE")

ppc_original$in1kg = ifelse(ppc_original$snp_id %in% kgenomes$V2  , "TRUE","FALSE")

### Remove conditional signals with lead variants not in 1000G

In [10]:
ipsc_original2 = ipsc_original[ (ipsc_original$type != 0 & ipsc_original$in1kg == "TRUE") | 
                             (ipsc_original$type == 0),]

cvpc_original2 = cvpc_original[ (cvpc_original$type != 0 & cvpc_original$in1kg == "TRUE") | 
                             (cvpc_original$type == 0),]

ppc_original2 = ppc_original[ (ppc_original$type != 0 & ppc_original$in1kg == "TRUE") | 
                             (ppc_original$type == 0),]
nrow(ipsc_original2)+nrow(cvpc_original2)+nrow(ppc_original2)
78852-78598

### Identify qElements without conditional signals

In [11]:
primaries = rbindlist(list(ipsc_original2[ ipsc_original2$element_id %in% names(table(ipsc_original2$element_id ))[ table(ipsc_original2$element_id ) == 1],],
                          cvpc_original2[ cvpc_original2$element_id %in% names(table(cvpc_original2$element_id ))[ table(cvpc_original2$element_id ) == 1],],
                          ppc_original2[ ppc_original2$element_id %in% names(table(ppc_original2$element_id ))[ table(ppc_original2$element_id ) == 1],]))

nrow(primaries)

### Identify qElements with conditional signals

In [12]:

ipsc_conds = ipsc_original2[ ipsc_original2$element_id %in% names(table(ipsc_original2$element_id))[ table(ipsc_original2$element_id) > 1],]
cvpc_conds = cvpc_original2[ cvpc_original2$element_id %in% names(table(cvpc_original2$element_id))[ table(cvpc_original2$element_id) > 1],]
ppc_conds = ppc_original2[ ppc_original2$element_id %in% names(table(ppc_original2$element_id))[ table(ppc_original2$element_id) > 1],]

conditionals = rbindlist(list(ipsc_conds,cvpc_conds,ppc_conds))
conditionals$conditional = ifelse(conditionals$type == 0, "Primary","Conditional")
table(conditionals$conditional,conditionals$in1kg)
table(conditionals$conditional)
nrow(conditionals)

             
              FALSE  TRUE
  Conditional     0 18292
  Primary       255 13787


Conditional     Primary 
      18292       14042 

In [13]:
prim_not_in1kg = conditionals[ conditionals$tissue_element %in% conditionals$tissue_element[ conditionals$in1kg == "FALSE" & conditionals$type == 0 ],]
table(prim_not_in1kg$conditional)



Conditional     Primary 
        343         255 

In [14]:
primary2dprime = conditionals[ !conditionals$tissue_element %in% prim_not_in1kg$tissue_element, ]
table(primary2dprime$conditional)
table(primary2dprime$in1kg)


Conditional     Primary 
      17949       13787 


 TRUE 
31736 

In [15]:
dprime_pairs = function(qtl_df) {

    qtl_df$snp_id = gsub("VAR_","",qtl_df$id)

    pairs = rbindlist(lapply(unique(qtl_df$tissue_element), function(element){
        element_df = qtl_df[ qtl_df$tissue_element == element , ]
        element_combs = as.data.frame(t(combn(element_df$snp_id,2)))

        element_combs$tissue_element = element
        return(element_combs)
    }))
    pairs$chrom = str_extract(pairs$V1, "^[0-9]+")
    pairs$tissue = unique(qtl_df$tissue)
    return(unique(pairs[ pairs$V1 != pairs$V2,]))
}
# tmp = dprime_pairs(primary2dprime)

ipsc_dprime = dprime_pairs(primary2dprime[ primary2dprime$tissue == "iPSC",])
cvpc_dprime = dprime_pairs(primary2dprime[ primary2dprime$tissue == "CVPC",])
ppc_dprime = dprime_pairs(primary2dprime[ primary2dprime$tissue == "PPC",])

### write pairs for 02.0.calculate_qtl_dprime.sh

In [16]:
conditional_pairs = unique(rbind(rbind(ipsc_dprime,cvpc_dprime),ppc_dprime))
# conditional_pairs$chrom = gsub("_.*","",conditional_pairs$V1)

# fwrite(conditional_pairs,"analyses/tim/ld_modules/scripts/conditional_pairs_v3.txt",
#        sep="\t",row.names=F,col.names=F,quote=F)

In [17]:
conditional_pairs$pair_id = paste(conditional_pairs$V1, conditional_pairs$V2, sep="_")
# conditional_pairs

### Noticed all failed plink runs were due to one of the variants being monomorphic in EUR
### Kept because variant must not be monomorphic in other populations

In [18]:
all_vars = unique(c(conditional_pairs$V1,conditional_pairs$V2))

In [19]:
dprime.fs= list.files("analyses/tim/ld_modules/conditional_dprime_v3",pattern=".txt",full.names=T)
length(dprime.fs)
rem.fs = dprime.fs[ grepl("16_5013101_A_G",dprime.fs)  | grepl("13_24838830_A_G",dprime.fs) | grepl("8_28188378_T_C",dprime.fs) |
           grepl("16_52615396_C_T",dprime.fs) | grepl("15_50299493_G_A",dprime.fs) | grepl("10_132194830_A_C",dprime.fs)|
           grepl("10_98227527_G_A",dprime.fs) | grepl("10_32435467_G_A",dprime.fs) | grepl("18_23962925_A_G",dprime.fs) |
           grepl("13_44035060_G_C",dprime.fs) | grepl("11_22014488_T_C",dprime.fs) | grepl("3_194578850_G_C",dprime.fs) |
           grepl("8_144830982_G_A",dprime.fs) | grepl("17_75185523_A_G",dprime.fs) | grepl("19_8411965_C_T",dprime.fs) | 
                 grepl("5_6831130_A_C",dprime.fs) ]
dprime.fs2 = dprime.fs[ !dprime.fs %in% rem.fs]
length(dprime.fs2)

In [20]:
monomorphic = c("16_5013101_A_G","13_24838830_A_G","8_28188378_T_C","16_52615396_C_T","15_50299493_G_A",
               "10_132194830_A_C","10_98227527_G_A","10_32435467_G_A","18_23962925_A_G","5_6831130_A_C",
               "13_44035060_G_C","11_22014488_T_C","3_194578850_G_C","8_144830982_G_A","17_75185523_A_G",
               "19_8411965_C_T")

In [22]:
read_dprime = rbindlist(lapply(dprime.fs2, function(f) {
    dp = fread(f, sep=" ",data.table=F)
    if (nrow(dp ) > 0 ) {
    dp$compid = gsub("_dprime.txt","",gsub("analyses/tim/ld_modules/conditional_dprime_v3/","",f))
    dp$tissue = str_extract(dp$compid,"^[a-zA-Z]+")
    tissue_tissue = paste(unique(dp$tissue), unique(dp$tissue), sep="_")
    dp$varids = str_extract(dp$compid,"[0-9]+_[0-9]+_[A-Z]_[A-Z]_[0-9]+_[0-9]+_[A-Z]_[A-Z]$")
        
    dp$var1id = str_extract(dp$varids, "^[0-9]+_[0-9]+_[A-Z]_[A-Z]")

    dp$var2id = gsub("^_","",str_extract(dp$varids, "_[0-9]+_[0-9]+_[A-Z]_[A-Z]$"))
    
    dp$element_id = gsub(paste0(tissue_tissue,"_"),"",gsub("_$","",gsub("^_","",
                                                       gsub("[0-9]+_[0-9]+_[A-Z]_[A-Z]_[0-9]+_[0-9]+_[A-Z]_[A-Z]","",dp$compid))))

    dp2 = dp [,c("tissue","element_id","var1id","var2id","V3","V6")]
    names(dp2)[5] = "R2"
    names(dp2)[6] = "Dprime"
    
    return(dp2)}
}))


In [24]:
tested_vars = unique(c(read_dprime$var1id,read_dprime$var2id))
diff_vars = setdiff(all_vars,tested_vars)
not_tested = primary2dprime[ primary2dprime$snp_id %in% diff_vars, ]

not_tested$monomorphic = ifelse(not_tested$snp_id %in% monomorphic, "TRUE","FALSE")
not_tested$monomorphic = ifelse(not_tested$snp_id %in% monomorphic, "TRUE","FALSE")



monomorphic_primary = primary2dprime[ primary2dprime$tissue_element %in%  
                                   not_tested$tissue_element [ not_tested$monomorphic == "TRUE" &
                                                              not_tested$conditional == "Primary"],]
table(monomorphic_primary$conditional)
monomorphic_conditional = not_tested[ not_tested$conditional == "Conditional" & 
                                     !not_tested$tissue_element %in% monomorphic_primary$tissue_element,]
table(monomorphic_conditional$conditional)

length(unique(monomorphic_primary$tissue_element))
length(unique(monomorphic_conditional$tissue_element))


Conditional     Primary 
          5           4 


Conditional 
         12 

In [25]:
read_dprime$tissue_element = paste(read_dprime$tissue, read_dprime$element_id, sep= "_")
read_dprime2 = read_dprime[ !read_dprime$tissue_element %in% c(monomorphic_primary$tissue_element,
                                                             monomorphic_conditional$tissue_element,
                                                              prim_not_in1kg$tissue_element),]


### Aggregated maximum D' for each SNP pair

In [26]:

max_dprime = aggregate(Dprime ~ var1id + var2id + tissue + element_id, data = read_dprime2, FUN = max)

max_r2     = aggregate(R2 ~ var1id + var2id+ tissue + element_id, data = read_dprime2, FUN = max)
max_dprime2 = merge(max_dprime,max_r2, by=c("tissue","element_id","var1id","var2id"))

key1 = rbind(rbind(ipsc_original2[ ,c("element_id","type","snp_id","qtl_id")],
                   cvpc_original2[  ,c("element_id","type","snp_id","qtl_id")]),
             ppc_original2[,c("element_id","type","snp_id","qtl_id")])
colnames(key1) = c("element_id","type1","var1id","qtl1")

key2 = rbind(rbind(ipsc_original2[  ,c("element_id","type","snp_id","qtl_id")],
                   cvpc_original2[ ,c("element_id","type","snp_id","qtl_id")]),
             ppc_original2[ ,c("element_id","type","snp_id","qtl_id")])

colnames(key2) = c("element_id","type2","var2id","qtl2")

### Removed conditional QTLs in high D' and/or LD with the corresponding lead variant

tmp = merge(key2,merge(key1,max_dprime2,by=c("element_id","var1id")),by=c("element_id","var2id"))
tmp2 = tmp[ tmp$type1 == 0 | tmp$type2 == 0,]
tmp3 = tmp2[ tmp2$R2 >= 0.8 | tmp2$Dprime >= 0.8,]

dependent = unique(c(tmp3$qtl1,tmp3$qtl2))
dependent = dependent[ !grepl("0$",dependent)]
length(dependent)

In [27]:
cond_qtls_tested = unique(c(tmp$qtl1,tmp$qtl2))[ !grepl("0$",unique(c(tmp$qtl1,tmp$qtl2)))]
length(cond_qtls_tested)

In [28]:
conds_removed = primary2dprime[ !primary2dprime$qtl_id %in% c(dependent,monomorphic_conditional$qtl_id,monomorphic_primary$qtl_id) ,]
filtered_primaries = conds_removed[ conds_removed$tissue_element %in% 
                                   names(table(conds_removed$tissue_element))[ table(conds_removed$tissue_element) == 1],]

table(filtered_primaries$type)


   0 
4963 

In [29]:
### Remaining QTLs had conditional signals, were regressed for GWAS colocalization


Conditional     Primary 
      10140       13783 

In [30]:
toregress = conds_removed[ !conds_removed$tissue_element %in% filtered_primaries$tissue_element, ]
nrow(toregress)
table(toregress$conditional)


Conditional     Primary 
      10140        8820 

In [31]:
all_qtls = rbindlist(list(ipsc_original,cvpc_original,ppc_original))
all_qtls$conditional = ifelse(all_qtls$type == 0, "Primary","Conditional")
all_qtls$qtl_id = paste(all_qtls$tissue,all_qtls$element_id,all_qtls$type,sep="_")

### Checking that No Primaries are lost

In [46]:
prims = primaries$qtl_id[ primaries$conditional == "Primary"]

reg_prims = toregress$qtl_id[ toregress$conditional == "Primary"]
nonkg_prims = all_qtls$qtl_id[ all_qtls$conditional == "Primary" & all_qtls$in1kg == "FALSE" & !all_qtls$qtl_id %in% prims]
filtered_prims = filtered_primaries$qtl_id[ filtered_primaries$conditional == "Primary"]
mono_prims = monomorphic_primary$qtl_id[ monomorphic_primary$conditional == "Primary"]

length(reg_prims) + length(nonkg_prims) + length(filtered_prims)+ length(mono_prims)+ length(prims)

## Tracking filtered conditionals

In [47]:
nokg_conds = all_qtls$qtl_id[ all_qtls$conditional == "Conditional" & all_qtls$in1kg == "FALSE" ]

nonkg_prim_tissele = gsub("_[0-3]+$","",nonkg_prims)
nonkg_prims_conds = all_qtls$qtl_id[ !all_qtls$qtl_id %in% nokg_conds & 
                                    all_qtls$conditional == "Conditional" & all_qtls$tissue_element %in% nonkg_prim_tissele]
length(nonkg_prims_conds)
mono_conds = monomorphic_conditional$qtl_id[ monomorphic_conditional$monomorphic == "TRUE"]
mono_prim_conds = monomorphic_primary$qtl_id[ monomorphic_primary$conditional == "Conditional"]

dprime_conds = dependent[ !dependent %in% c(nokg_conds,mono_conds,mono_prim_conds,nonkg_prims_conds) ]



length(dprime_conds)+length(mono_prim_conds) +length(mono_conds) + length(nonkg_prims_conds) + length(nokg_conds)
intersect(mono_conds, mono_prim_conds)
intersect(mono_conds, nokg_conds)
intersect(mono_conds, dprime_conds)
intersect(mono_conds, nonkg_prims_conds)

intersect(mono_prim_conds, nokg_conds)
intersect(mono_prim_conds, dprime_conds)
intersect(mono_prim_conds, nonkg_prims_conds)

intersect(nokg_conds, dprime_conds)
intersect(nokg_conds, nonkg_prims_conds)

intersect(nonkg_prims_conds, dprime_conds)



In [52]:
filtered_qtls = all_qtls[ !all_qtls$qtl_id %in% c(dprime_conds,mono_prim_conds,mono_conds,nonkg_prims_conds,nokg_conds) , ]

filtered_qtls2 = rbindlist(lapply(unique(filtered_qtls$tissue_element), function(te_id){
    tiss_elem = filtered_qtls[ filtered_qtls$tissue_element == te_id, ]
    if (nrow(tiss_elem) == 1) {
        tiss_elem$new_condition = 0
    } else {
        tiss_elem = tiss_elem %>% arrange(type)
        qtl_seq = seq(0,nrow(tiss_elem)-1)
        tiss_elem$new_condition = qtl_seq
    }
    return(tiss_elem)
}))
