In [1]:
# Load shared notebook helpers and the package set used throughout this notebook.
# NOTE(review): fread/rbindlist/fwrite, %>%/filter/select/group_by/mutate, str_extract
# and theme_bw are used below without explicit library() calls - presumably they
# are attached by packages.R; confirm.
source("~/software/notebook_assist/functions.R")
source("~/software/notebook_assist/packages.R")
# Every relative path used below is resolved against this project directory.
setwd("/projects/CARDIPS/analysis/epigenome_resource/")
# igraph provides graph_from_data_frame(), cluster_louvain(), membership() and degree().
suppressPackageStartupMessages(library(igraph))

In [2]:
# Shared plotting theme: theme_bw() base, every text element at size 10,
# legend placed above the panel.
theme_bw2 <- local({
    label_text <- element_text(size = 10)
    theme_bw() +
        theme(
            axis.text = label_text,
            strip.text = label_text,
            axis.title = label_text,
            legend.text = label_text,
            legend.title = label_text,
            legend.position = "top"
        )
})

In [3]:
# Read the filtered QTL table and keep only rows with type == 0.
primary_qtls <- filter(
    fread(
        "analyses/tim/ld_modules/scripts/filtered_qtls_09042024.txt",
        sep = "\t",
        data.table = FALSE
    ),
    type == 0
)
# Variant id without the "VAR_" tag.
primary_qtls$snpid <- gsub("VAR_", "", primary_qtls$id)
# Chromosome = leading run of digits in snpid; NA when snpid does not start with a digit.
primary_qtls$chrom <- primary_qtls$snpid %>%
    str_extract("^[0-9]+") %>%
    as.numeric()


### Write lead SNPs to files by chromosome
### run 04.0.calculate_lead_ld_for_modules.sh

In [6]:
# Write one file of lead-variant ids per chromosome 1-22 (chr<i>.snps).
# chrom is NA for any snpid that does not begin with digits. Indexing a data
# frame with a logical mask that contains NA returns all-NA rows, which
# writeLines() would emit as literal "NA" lines. which() keeps only the rows
# whose chrom is actually equal to i.
for (i in 1:22) {
    chr_snps = primary_qtls[ which(primary_qtls$chrom == i), ]
    writeLines(chr_snps$snpid, paste0("analyses/tim/ld_modules/lead_variant_ld/chr",i,".snps"))
}

### Find pairs of lead variants that are in LD (R2 >= 0.8) and within 100 kb of each other

In [5]:
# For each chromosome 1-22, read the pairwise LD table and keep pairs with
# R2 >= 0.8 whose positions are at most 100000 apart. Positions are parsed from
# the SNP ids: drop the leading "<digits>_", then keep the text before the next "_".
ld_lead <- as.data.frame(rbindlist(lapply(1:22, function(chrom) {
    ld_path <- paste0("analyses/tim/ld_modules/lead_variant_ld/chr", chrom, "_ld.ld")
    pairs <- fread(ld_path, sep = " ", data.table = FALSE)

    strong <- pairs[pairs$R2 >= 0.8, ]

    pos_a <- as.numeric(gsub("_.*", "", gsub("^[0-9]+_", "", strong$SNP_A)))
    pos_b <- as.numeric(gsub("_.*", "", gsub("^[0-9]+_", "", strong$SNP_B)))
    strong$distance <- abs(pos_a - pos_b)

    strong[strong$distance <= 100000, ]
})))

In [6]:
# QTL identifier: element id and signal type joined by "_".
primary_qtls$qtl_id <- paste(primary_qtls$element_id, primary_qtls$type, sep = "_")

# The same lookup table twice, with column names for each side of an LD pair.
key1 <- setNames(
    primary_qtls[, c("qtl_id", "snpid", "tissue", "element_id")],
    c("QTL1_ID", "SNP_A", "Tissue1", "Element1")
)
key2 <- setNames(key1, c("QTL2_ID", "SNP_B", "Tissue2", "Element2"))

# Attach QTL information to both variants of every LD pair, then keep pairs
# that link two different elements within the same tissue.
ld_lead2 <- ld_lead %>%
    merge(key1) %>%
    merge(key2, by = "SNP_B") %>%
    filter(Tissue1 == Tissue2, Element1 != Element2)


### Account for qElements with same lead variant

In [8]:
# Build edges between QTLs of one tissue that share the same lead variant.
#
# Tissue: value matched against primary_qtls$tissue.
#
# Returns a data.table with columns QTL1_ID, QTL2_ID, R2 (always 1) and
# distance (always 0), one row per unordered pair of qtl_ids that have the same
# primary_qtls$id. When no lead variant is shared, a zero-row table with the
# same four columns is returned (instead of a zero-column table), so the result
# can always be row-bound onto the LD edges.
exact_match = function(Tissue){
    df = primary_qtls[ which(primary_qtls$tissue == Tissue), ]

    # ids carried by more than one row; table() is computed once and ignores NA ids
    id_counts = table(df$id)
    shared_ids = names(id_counts)[ id_counts > 1 ]
    em = df[ df$id %in% shared_ids, ]

    edge_list = lapply(unique(em$id), function(y){
        bv = em[ em$id == y, ]
        # all unordered pairs of QTLs sharing this variant
        bv_edges = as.data.frame(t(combn(bv$qtl_id, 2)), stringsAsFactors = FALSE)
        colnames(bv_edges) = c("QTL1_ID","QTL2_ID")
        bv_edges$R2 = 1
        bv_edges$distance = 0
        return(bv_edges)
    })

    if (length(edge_list) == 0) {
        empty_edges = data.frame(QTL1_ID = character(0), QTL2_ID = character(0),
                                 R2 = numeric(0), distance = numeric(0),
                                 stringsAsFactors = FALSE)
        return(rbindlist(list(empty_edges)))
    }

    return(rbindlist(edge_list))
}


# Shared-lead-variant edges, one table per tissue.
ipsc_exact <- exact_match("iPSC")
cvpc_exact <- exact_match("CVPC")
ppc_exact  <- exact_match("PPC")

### make edges and vertices for network construction

In [9]:
# Assemble the edge table for one tissue.
#
# tissue:     value matched against ld_lead2$Tissue1; "<tissue>_" is removed
#             from the QTL ids.
# same_edges: extra edges (columns QTL1_ID, QTL2_ID, R2, distance) to append,
#             e.g. the output of exact_match(); NULL or a zero-column table
#             means "nothing to append".
#
# Returns a data.frame with columns QTL1_ID, QTL2_ID, R2, distance.
# The "<tissue>_" removal is applied after the two sources are combined, so the
# appended edges get the same id normalisation as the LD edges and as the
# vertices from process_vertices(); previously only the LD edges were normalised.
process_edges = function( tissue, same_edges) {
    edges = ld_lead2[ ld_lead2$Tissue1 == tissue,c("QTL1_ID","QTL2_ID","R2","distance")]

    if (!is.null(same_edges) && ncol(same_edges) > 0) {
        edges = rbind(edges,same_edges)
    }
    edges2 = as.data.frame(edges)

    prefix = paste0(tissue,"_")
    edges2$QTL1_ID = gsub(prefix,"",edges2$QTL1_ID)
    edges2$QTL2_ID = gsub(prefix,"",edges2$QTL2_ID)
    return(edges2)
}

# Vertex table for one tissue: qtl_id (with "<Tissue>_" removed) and snpid of
# every row of primary_qtls whose tissue equals Tissue.
process_vertices <- function(Tissue) {
    tissue_prefix <- paste0(Tissue, "_")
    tissue_qtls <- filter(primary_qtls, tissue == Tissue)
    verts <- select(tissue_qtls, qtl_id, snpid)
    verts$qtl_id <- gsub(tissue_prefix, "", verts$qtl_id)
    as.data.frame(verts)
}

# Vertices and edges for each tissue; the edges combine the LD pairs with the
# shared-lead-variant pairs.
ipsc_verts <- process_vertices("iPSC")
ipsc_edges <- process_edges("iPSC", ipsc_exact)

cvpc_verts <- process_vertices("CVPC")
cvpc_edges <- process_edges("CVPC", cvpc_exact)

ppc_verts <- process_vertices("PPC")
ppc_edges <- process_edges("PPC", ppc_exact)

In [30]:
# Number of edges per tissue (LD pairs plus shared-lead-variant pairs).
nrow(ipsc_edges)
nrow(cvpc_edges)
nrow(ppc_edges)


### Construct networks and cluster to identify complex QTLs by module membership

In [11]:
# Tabulate a clustering result: one row per vertex with its cluster membership
# and its degree in the graph.
#
# net:      igraph graph.
# clusters: clustering object accepted by membership().
#
# Returns a data.frame with columns id, cluster, degree.
# Not called in the cells shown here; make_networks() repeats the same steps.
process_clusters <- function(net, clusters) {
    memb <- membership(clusters)
    node_degree <- degree(net)

    out <- data.frame(id = names(memb))
    out$cluster <- memb
    out$degree <- node_degree[out$id]
    out
}

# Build an undirected, R2-weighted graph from an edge table and cluster it with
# cluster_louvain().
#
# summaries: edge table with columns QTL1_ID, QTL2_ID, R2.
# qtl_df:    table with a qtl_id column; only ids that appear in at least one
#            edge become vertices.
#
# Returns a data.frame with one row per vertex: id, cluster_id, degree.
# NOTE(review): no RNG seed is set before cluster_louvain(); if the algorithm
# is randomised in the installed igraph version, memberships may differ
# between runs - confirm.
make_networks <- function(summaries, qtl_df) {
    edge_tbl <- setNames(
        summaries[, c("QTL1_ID", "QTL2_ID", "R2")],
        c("id1", "id2", "weight")
    )

    # vertex attribute: ids starting with "E" -> RNA, ids containing "atac" -> ATAC, else ChIP
    node_ids <- intersect(qtl_df$qtl_id, c(edge_tbl$id1, edge_tbl$id2))
    node_tbl <- data.frame(id = node_ids)
    is_rna <- grepl("^E", node_tbl$id)
    is_atac <- grepl("atac", node_tbl$id)
    node_tbl$datatype <- ifelse(is_rna, "RNA", ifelse(is_atac, "ATAC", "ChIP"))

    net <- graph_from_data_frame(edge_tbl, directed = FALSE, vertices = node_tbl)
    communities <- cluster_louvain(net)

    memb <- membership(communities)
    node_degree <- degree(net)

    out <- data.frame(id = names(memb))
    out$cluster_id <- memb
    out$degree <- node_degree[out$id]
    out
}


# Cluster each tissue's network and record the size of every cluster.
# The results stay grouped by cluster_id.
ipsc_clusters <- make_networks(ipsc_edges, ipsc_verts) %>%
    group_by(cluster_id) %>%
    mutate(n_members = n())
cvpc_clusters <- make_networks(cvpc_edges, cvpc_verts) %>%
    group_by(cluster_id) %>%
    mutate(n_members = n())
ppc_clusters <- make_networks(ppc_edges, ppc_verts) %>%
    group_by(cluster_id) %>%
    mutate(n_members = n())


# Renumber clusters by decreasing size and add unclustered QTLs as singletons.
#
# df:     clustered vertices with columns id, cluster_id, degree, n_members
#         (as produced from make_networks() plus the n_members column).
# qtl_df: table with a qtl_id column listing every QTL of the tissue; ids
#         missing from df become one-member clusters with degree 0.
#
# Returns a tibble grouped by (cluster_id, element_id) with columns
# element_id, element_cond, cluster_id, degree, n_members, n_occur, where
# cluster_id is the new numbering (1 = first cluster after sorting by
# decreasing n_members; singletons are numbered after all clusters).
#
# Compared with the earlier version this also works when there are no
# singletons or no clusters (seq(a, a - 1, 1) and max() of an empty vector
# used to fail), and columns are picked by name before being renamed.
new_ids = function(df, qtl_df) {
    df2 = df[ order(-df$n_members), ]
    cluster_ids = unique(df2$cluster_id)
    df3 = merge(df2,
                data.frame(cluster_id = cluster_ids,
                           new_id = as.numeric(seq_along(cluster_ids))))
    # fix the column order explicitly; the rename below relies on it
    df3 = df3[, c("id","degree","n_members","new_id")]

    add_start = if (nrow(df3) > 0) max(df3$new_id) + 1 else 1
    singletons = setdiff(qtl_df$qtl_id, df3$id)
    n_single = length(singletons)

    add_on = data.frame(id = singletons,
                        degree = rep(0, n_single),
                        n_members = rep(1, n_single),
                        new_id = add_start - 1 + seq_len(n_single))
    df4 = rbind(df3, add_on)
    df5 = df4[ order(df4$new_id, df4$degree),]
    colnames(df5) = c("element_cond","degree","n_members","cluster_id")
    # element id = QTL id without its trailing "_<0-3>" suffix
    df5$element_id = gsub("_[0-3]$","",df5$element_cond)
    df6 = df5[, c("element_id","element_cond","cluster_id","degree","n_members")] %>% 
                group_by(cluster_id, element_id) %>% 
                mutate(n_occur = length(element_id))
    return(df6)
}


# Renumbered clusters plus singleton QTLs, one table per tissue.
ipsc_clusters2 <- new_ids(ipsc_clusters, ipsc_verts)
cvpc_clusters2 <- new_ids(cvpc_clusters, cvpc_verts)
ppc_clusters2  <- new_ids(ppc_clusters, ppc_verts)

In [32]:
# Format one tissue's cluster table for output.
#
# tissue: label written to the Tissue column and used as the Cluster_ID prefix.
# df:     table with at least the columns cluster_id, element_id, n_members
#         (e.g. the output of new_ids()).
#
# Returns df reduced to the columns Tissue, Cluster_ID, Element_ID,
# qElement_Type, Complexity, qElements_inQTL. The columns are selected by name:
# the earlier positional slice (7:12) returned the wrong columns whenever df
# did not have exactly six input columns.
clean_modules = function(tissue, df) {
    df$Tissue = tissue
    df$Cluster_ID = paste(df$Tissue, df$cluster_id,sep="_")
    df$Element_ID = df$element_id
    # element ids containing "atac" -> caPeak, containing "chip" -> haPeak, otherwise eGene
    df$qElement_Type = ifelse(grepl("atac",df$element_id), "caPeak",
                                    ifelse(grepl("chip",df$element_id), "haPeak","eGene"))
    df$Complexity   = ifelse(df$n_members > 1, "Complex","Singleton")
    df$qElements_inQTL = df$n_members
    
    out_cols = c("Tissue","Cluster_ID","Element_ID","qElement_Type","Complexity","qElements_inQTL")
    return(df[, out_cols])
}

# Stack the per-tissue module tables in the order iPSC, CVPC, PPC.
all_modules <- Reduce(rbind, list(
    clean_modules("iPSC", ipsc_clusters2),
    clean_modules("CVPC", cvpc_clusters2),
    clean_modules("PPC", ppc_clusters2)
))
# Number of distinct modules containing at least five qElements.
large_module_ids <- all_modules$Cluster_ID[all_modules$qElements_inQTL >= 5]
length(unique(large_module_ids))

In [44]:
### this was updated to identify nominated QTL, see below
fwrite(
    all_modules,
    file = "analyses/tim/ld_modules/modules/all_modules_090424.txt",
    sep = "\t",
    row.names = FALSE,
    quote = FALSE
)

In [83]:
# Reload the saved module table and build a "<Tissue>_<Element_ID>" key.
all_modules <- fread(
    "analyses/tim/ld_modules/modules/all_modules_090424.txt",
    sep = "\t",
    data.table = FALSE
)
all_modules$tissue_element <- with(all_modules, paste(Tissue, Element_ID, sep = "_"))


In [88]:
# Attach the in1kg flag of each QTL to the module table (merge on the shared columns).
# NOTE(review): primary_qtls$tissue_element is not created in any cell shown in
# this notebook - presumably it was added in a cell that was not kept; confirm
# how it is defined before re-running.
all_modules2 <- merge(
    x = all_modules,
    y = primary_qtls[, c("tissue_element", "in1kg")]
)
# Count of module rows by in1kg value.
table(all_modules2$in1kg)


FALSE  TRUE 
  776 59530 

### Nominate one random signal per QTL module (singletons included), preferring signals flagged in1kg, for GWAS colocalization

In [110]:
# Pick one row per Cluster_ID: a random row among those with in1kg == TRUE when
# any exist, otherwise a random row of the cluster.
# The previous condition was written nrow(tmp[...] >= 1): the ">= 1" was applied
# to the whole data frame and only the row count of that comparison was tested.
# The comparison now sits outside nrow(), and which() keeps rows with a
# missing in1kg out of the preferred set.
# Note: sample() is called without a seed being set in this notebook, so the
# nominated rows can differ between runs.
all_modules3 = rbindlist(lapply(unique(all_modules2$Cluster_ID), function(cl_id){
    tmp = all_modules2 [ all_modules2$Cluster_ID == cl_id,]
    in_ref = tmp[ which(tmp$in1kg == "TRUE"), ]
    if (nrow(in_ref) >= 1) {
        out = in_ref[sample(nrow(in_ref), 1), ]
    } else {
        out = tmp [sample(nrow(tmp), 1), ]  
    }
                          
    return(out)
}))


In [113]:
# Flag ("TRUE"/"FALSE" as text) the rows whose tissue_element was nominated above.
is_nominated <- all_modules2$tissue_element %in% all_modules3$tissue_element
all_modules2$Nominated <- as.character(is_nominated)

In [115]:
# Per cluster, translate the element type into a QTL type (caPeak -> caQTL,
# haPeak -> haQTL, anything else -> eQTL) and record the sorted, "-"-joined set
# of QTL types present in that cluster.
all_modules4 <- rbindlist(lapply(unique(all_modules2$Cluster_ID), function(cl_id) {
    members <- all_modules2[all_modules2$Cluster_ID == cl_id, ]

    is_ca <- members$qElement_Type == "caPeak"
    is_ha <- members$qElement_Type == "haPeak"
    members$qtl_type <- ifelse(is_ca, "caQTL", ifelse(is_ha, "haQTL", "eQTL"))

    types_present <- sort(unique(members$qtl_type))
    members$qtl_combo <- paste(types_present, collapse = "-")
    members
}))

In [121]:
### Used in manuscript, cleaned up in supplemental table
fwrite(
    all_modules4,
    file = "analyses/tim/ld_modules/modules/all_modules_091124.txt",
    sep = "\t",
    row.names = FALSE,
    quote = FALSE
)