# MRAID For QTLs
This notebook runs MRAID analysis for a list of targets, using QTL summary statistics as the exposure data.


In [1]:
# Line count of the metabolite targets table (reading from stdin makes wc print the bare number)
wc -l < ~/Work/MR/2023.4_MR/output/metabolics/Metabolon_Bile_Biocrate_targets_df.csv

841


In [None]:
# Launch the MRAID workflow for the metabolite targets with metaQTL data as exposure.
# The number of targets is derived from the table (line count minus the header line)
# instead of being hard-coded, so it cannot drift from the file.
targets_df=~/Work/MR/2023.4_MR/output/metabolics/Metabolon_Bile_Biocrate_targets_df.csv
n=$(( $(wc -l < "$targets_df") - 1 ))
# The notebook path must not end with a slash: it is a regular file, not a directory.
nohup sos run ~/codes/xqtl-pipeline/pipeline/MRAID_QTL.ipynb mraid_qtl    \
    --n "$n" \
    --targets_df "$targets_df"  \
    --con "metabolics_pval_beta_0.001"    \
    --qtl "metaQTL"    \
    --p_cut 0.001 \
    --pval_beta 1 \
    -s build -J 200 -q csg -c ~/test/csg.yml   &> mraid_meta_1.2.log &

In [2]:
# Line count of the literature AD gene targets table (reading from stdin makes wc print the bare number)
wc -l < /mnt/vast/hpc/csg/rf2872/Work/MR/2023.4_MR/output/ADlist_lit/Causal_AD_genes_from_literature_targets_df.csv

19


In [None]:
# Run the eQTL MRAID workflow once per p-value cutoff.
targets_df=/mnt/vast/hpc/csg/rf2872/Work/MR/2023.4_MR/output/ADlist_lit/Causal_AD_genes_from_literature_targets_df.csv
# Number of targets = line count minus the header line (the table has 19 lines, i.e. 18 targets).
n=$(( $(wc -l < "$targets_df") - 1 ))
for i in 0.01 0.001 0.0001 0.00001;
do
    # The notebook path must not end with a slash: it is a regular file, not a directory.
    nohup sos run ~/codes/xqtl-pipeline/pipeline/MRAID_QTL.ipynb mraid_qtl \
        --n "$n" \
        --targets_df "$targets_df" \
        --con ADlist_lit_eQTL_GWAS_${i} \
        --qtl "eQTL" \
        --p_cut ${i} \
        --pval_beta 0 \
        -s build -J 200 -q csg -c ~/test/csg.yml &> mraid_eQTL_${i}.log &
done


In [5]:
# Run the eQTL MRAID workflow once per p-value cutoff.
targets_df=/mnt/vast/hpc/csg/rf2872/Work/MR/2023.4_MR/output/ADlist_lit/Causal_AD_genes_from_literature_targets_df.csv
# Number of targets = line count minus the header line; computed once because it does not depend on the cutoff.
n=$(( $(wc -l < "$targets_df") - 1 ))
for i in 0.01 0.001 0.0001 0.00001;
do
    # The notebook path must not end with a slash: it is a regular file, not a directory.
    nohup sos run ~/codes/xqtl-pipeline/pipeline/MRAID_QTL.ipynb mraid_qtl \
        --n "$n" \
        --targets_df "$targets_df" \
        --con ADlist_lit_eQTL_GWAS_${i} \
        --qtl "eQTL" \
        --p_cut ${i} \
        --pval_beta 0 \
        -s build -J 200 -q csg -c ~/test/csg.yml &> mraid_eQTL_${i}.log &
done


In [None]:
# Run the pQTL MRAID workflow once per p-value cutoff.
targets_df=/mnt/vast/hpc/csg/rf2872/Work/MR/2023.4_MR/output/ADlist_lit/Causal_AD_genes_from_literature_targets_df.csv
# Number of targets = line count minus the header line; computed once because it does not depend on the cutoff.
n=$(( $(wc -l < "$targets_df") - 1 ))
for i in 0.01 0.001 0.0001 0.00001;
do
    # The notebook path must not end with a slash: it is a regular file, not a directory.
    # Logs go to mraid_pQTL_*.log so they do not overwrite the logs of the eQTL runs.
    nohup sos run ~/codes/xqtl-pipeline/pipeline/MRAID_QTL.ipynb mraid_qtl \
        --n "$n" \
        --targets_df "$targets_df" \
        --con ADlist_lit_pQTL_GWAS_${i} \
        --qtl "pQTL" \
        --p_cut ${i} \
        --pval_beta 0 \
        -s build -J 200 -q csg -c ~/test/csg.yml &> mraid_pQTL_${i}.log &
done


## Global parameters

In [None]:
[global]
import os
import pandas as pd
# Work directory & output directory
parameter: cwd = path('./')
# The filename prefix for output data (not referenced by the steps shown in this notebook)
parameter: name = "test"
# Value passed to the task option trunk_size of mraid_qtl_1
parameter: job_size = 1
# Container option (not referenced by the steps shown in this notebook)
parameter: container = ''
# Not referenced by the steps shown in this notebook
parameter: table_name = ""
# Label of the analysis; used as an output sub-directory and inside result file names
parameter: con = str
# QTL type of the exposure data: eQTL, pQTL, mQTL, haQTL or metaQTL
parameter: qtl = str
# P-value cutoff used to select the SNPs associated with a target
parameter: p_cut = 0.001
# Table of targets; read in R and expected to provide the columns `targets` and `chromosome`
parameter: targets_df = path
# Not referenced by the steps shown in this notebook
parameter: per_chunk = 10
# Not referenced by the steps shown in this notebook
parameter: datadir = ""
# 1: select SNPs on the `pval_beta` column of the QTL table; any other value: select on `pvalue`
parameter: pval_beta = 0


In [None]:
# perform mraid analysis with QTL data
[mraid_qtl_1]
import os
import pandas as pd
# Number of substeps to run; substeps are indexed 1..n and each one writes its own result file
parameter: n = 10
n = list(range(1, n + 1))
input: for_each = 'n'
output: f"{cwd:a}/{con}/{qtl}/sig_{p_cut}/res_mraid_{con}_p_cut_{p_cut}_{_n}.csv"
task: trunk_workers = 1, trunk_size = job_size, walltime = '24h', mem = '80G', tags = f'{_output:bn}'

R: expand = "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    library(data.table)
    library(tidyverse)
    library(UpSetR)
    library(dplyr)
    library(stringr)
    library("data.table")
    library(stringr)
    library(RcppCNPy)
    library(arrow)
    library(reticulate)
    library(MRAID)    
    # Collect the summary-statistic rows of the requested SNPs from per-chromosome files.
    #
    # Arguments
    #   ref               file names of the per-chromosome tables (full paths when ref_with_fullpath is TRUE)
    #   refpath           directory prepended to the matched file name unless ref_with_fullpath is TRUE
    #   SNP               SNP IDs; the part before the first `IDsep` is taken as the chromosome
    #   ID                name of the SNP ID column in the tables
    #   full_chr          TRUE when file names carry the chromosome as it appears in the SNP IDs;
    #                     FALSE strips "chr" before matching file names
    #   snp.df            optional table to which the rows found are appended
    #   ref_with_fullpath TRUE when `ref` already holds full paths
    #   IDsep             separator between chromosome and the rest of the ID in `SNP`
    #   suffix            file name suffix following ".<chromosome>"
    #   IDsep_in_sum      separator used in the table IDs when it differs from `IDsep`
    #
    # Returns `snp.df` with the matching rows of every readable chromosome file appended
    # (unchanged `snp.df`, NULL by default, when nothing is found). A chromosome whose file
    # cannot be located or read is reported by try() and skipped.
    find_SNP_from_sum<-function(ref,refpath,SNP,ID="variant",full_chr=FALSE,snp.df=NULL,ref_with_fullpath=NULL,IDsep=":",suffix='.norminal.cis_long_table.txt',IDsep_in_sum="_"){
        # nothing to look up
        if (length(SNP) == 0) return(snp.df)
        chr <- unique(stringr::str_split(SNP, IDsep, simplify = TRUE)[, 1])
        chr.use <- if (isTRUE(full_chr)) chr else gsub("chr", "", chr)
        # seq_along() instead of 1:length() so that an empty vector is never iterated
        for (i in seq_along(chr.use)) try({
            use.ref <- ref[grep(paste0("[.]", chr.use[i], suffix), ref)]
            if (length(use.ref) != 1) {
                stop("Expected exactly one reference file for chromosome ", chr.use[i], ", found ", length(use.ref))
            }
            # `use.ref` is already restricted to this chromosome, so it must not be indexed by `i`
            ref.file <- if (isTRUE(ref_with_fullpath)) use.ref else paste0(refpath, "/", use.ref)
            tmp.f <- data.table::fread(ref.file, sep = "\t", header = TRUE)
            # harmonise the ID separator when the table does not use `IDsep` after the chromosome
            if (stringr::str_split_fixed(tmp.f[[ID]], IDsep, n = 2)[1, 1] != chr[i]) {
                tmp.f[[ID]] <- sub(IDsep_in_sum, IDsep, tmp.f[[ID]])
            }
            tmp.f <- tmp.f[tmp.f[[ID]] %in% SNP, ]
            snp.df <- rbind(snp.df, tmp.f)
        })
        return(snp.df)
    }
        # ---- helpers used by mraid() ----

        # Load the QTL summary-statistics table for `chr` of the requested QTL type and
        # bring its column names / SNP IDs into the layout used by mraid().
        .mraid_read_exposure <- function(qtl, chr) {
            if (qtl == "eQTL") {
                exp.chr <- fread(paste0("~/Work/MR/2023.4_MR/data/sumtats/eQTL/dlpfc_batch_all.rnaseqc.low_expression_filtered.outlier_removed.tmm.expression.bed.processed_phenotype.per_chrom_dlpfc_batch_all.rnaseqc.ROSMAP_covariates.ROSMAP_NIA_WGS.pca.PEER.txt.", chr, ".norminal.cis_long_table.txt"))
            } else if (qtl == "pQTL") {
                exp.chr <- fread(paste0("~/Work/MR/2023.4_MR/data/sumtats/pQTL/rf_standard/pheno_recipe_pheno.rosmap_cov.ROSMAP_NIA_WGS.leftnorm.filtered.filtered.prune.pca.resid.PEER.", chr, ".norminal.cis_long_table.txt"))
                # there are many duplicated rows in the pQTL table; a logical mask is used because
                # `-which(duplicated(...))` would return zero rows when nothing is duplicated
                exp.chr <- exp.chr[!duplicated(exp.chr), ]
                # in pQTL files, the snps are chrx_tacg_... but in most files should be chrx:tacg_...
                exp.chr$variant <- sub("_", ":", exp.chr$variant)
            } else if (qtl == "mQTL") {
                exp.chr <- fread(paste0("~/Work/MR/2023.4_MR/data/sumtats/mQTL/ROSMAP_arrayMethylation_covariates.sesame.methyl.beta.sample_matched.bed_BMIQ.bed.filter_na.bed.softImputed.bed.processed_phenotype.per_chrom_methy.peer.pca.chr", chr, ".norminal.cis_long_table.txt"))
                exp.chr <- exp.chr %>% rename("chromosome" = "chrom", "position" = "pos")
                exp.chr$chromosome <- gsub("chr", "", exp.chr$chromosome)
            } else if (qtl == "haQTL") {
                exp.chr <- fread(paste0("~/Work/MR/2023.4_MR/data/sumtats/haQTL/h3k9ac_bed_recipe_h3k9ac_whole.k9_cov.xqtl_protocol_data.filtered.related.filtered.extracted.pca.projected.resid.PEER.merged.", chr, ".norminal.cis_long_table.txt"))
                exp.chr <- exp.chr %>% rename("chromosome" = "chrom", "position" = "pos")
                exp.chr$chromosome <- gsub("chr", "", exp.chr$chromosome)
            } else if (qtl == "metaQTL") {
                exp.chr <- fread(list.files(paste0("~/Work/MR/2023.4_MR/data/sumtats/metaQTL/", chr,"/rf/"),"_rf.norminal.trans_long_table.txt$",full.names = T))
                exp.chr <- exp.chr %>% rename("chromosome" = "chrom", "position" = "pos")
                exp.chr$chromosome <- gsub("chr", "", exp.chr$chromosome)
                exp.chr$position <- str_split(exp.chr$position,":",simplify = T)[,2]%>%as.numeric()
            } else {
                stop("Unsupported qtl type: ", qtl)
            }
            exp.chr
        }

        # Build the block-diagonal LD matrix of `snps` from the LD block files that overlap
        # the positions in `exp`. Returns NULL when no LD block contains any of `snps`.
        .mraid_ld_matrix <- function(exp, snps) {
            # since the LD files are saved separately according to their positions, bedtools is used to map the SNPs to LD files
            LD <- read.table("/mnt/vast/hpc/csg/molecular_phenotype_calling/LD/output/1300_hg38_EUR_LD_blocks_LD/ROSMAP_NIA_WGS.leftnorm.filtered.filtered.ld.list", header = F, sep = "\t")
            LD.bed <- str_split(LD$V1, "_", simplify = T) %>% cbind(., LD)
            exp_LD.bed <- data.frame(chrome = paste0("chr", exp$chromosome), start = exp$position, end = (exp$position + nchar(exp$ref) - 1))
            options(bedtools.path = "/home/rf2872/software/bedtools2/bin/")
            LD.files <- bedtoolsr::bt.intersect(a = LD.bed, b = exp_LD.bed)

            # python tools read the npz files; reading npz1$f[["arr_1"]] raises an error, so the
            # bim file next to each npz file is read to label the SNPs (they are said to be in the same order)
            np <- import("numpy")
            LD_files <- unique(LD.files$V5)

            # a SNP set can span more than one LD file, so every file is read and the pieces are combined
            LD.list <- list()
            # seq_along() so that nothing is loaded when no LD block overlaps
            for (f in seq_along(LD_files)) {
                npz1 <- np$load(LD_files[f])
                ld.mtx <- npz1$f[["arr_0"]]
                ld.snps <- stringr::str_split(LD_files[f], "[.]", simplify = T) %>%
                    .[, -c(length(.), (length(.) - 1))] %>%
                    paste(., collapse = ".") %>%
                    paste0(., ".bim") %>%
                    read.table(.) %>%
                    .[, 2]
                colnames(ld.mtx) <- rownames(ld.mtx) <- ld.snps
                keep <- intersect(ld.snps, snps)
                if (length(keep) > 0) {
                    # drop = FALSE keeps a 1 x 1 matrix (with names) when only one SNP is left
                    ld.sub <- ld.mtx[keep, keep, drop = FALSE]
                    # blocks that hold nothing but NA are ignored
                    if (!all(is.na(ld.sub))) LD.list[[length(LD.list) + 1]] <- ld.sub
                }
            }
            if (length(LD.list) == 0) return(NULL)

            # place every block on the diagonal; SNPs of different blocks get LD 0
            all.snps <- unlist(lapply(LD.list, colnames))
            u <- matrix(0, nrow = length(all.snps), ncol = length(all.snps), dimnames = list(all.snps, all.snps))
            offset <- 0
            for (blk in LD.list) {
                idx <- offset + seq_len(ncol(blk))
                u[idx, idx] <- blk
                offset <- offset + ncol(blk)
            }
            u
        }

        # mraid function
        # Run MRAID for one target with QTL summary statistics as exposure and GWAS (or another
        # outcome folder) summary statistics as outcome.
        #
        # Arguments
        #   target      value looked up in `targets_df$targets` and in the `molecular_trait_id` column of the QTL table
        #   path        not used (kept so that existing calls keep working)
        #   qtl         "eQTL", "pQTL", "mQTL", "haQTL" or "metaQTL"; any other value is an error
        #   res         not used; the summary row is always rebuilt
        #   p_cut       SNPs with a p-value below this cutoff are kept
        #   targets_df  table with the columns `targets` and `chromosome`
        #   con         not used
        #   outcome     "GWAS", or the name of a folder below the sumtats directory
        #   pval_beta   1: filter on the `pval_beta` column; otherwise filter on `pvalue`
        #   samplen1    exposure sample size handed to MRAID (default 900, see note below)
        #   samplen2    outcome sample size handed to MRAID (default 788989, see note below)
        #
        # Returns a one-row data.table with target, n_snps_ana, p_cut, n_snps_asso, n_snps_real and,
        # when the merge succeeds, the MRAID estimates (NA when MRAID was not run or failed).
        mraid <- function(target,path, qtl, res = NULL, p_cut = 1, targets_df, con = "ADGWAS",outcome="GWAS",pval_beta=FALSE, samplen1 = 900, samplen2 = 788989) {
            # plain vector lookup, so that `chr` is a single value for data.frame and data.table alike
            chr <- unique(targets_df[["chromosome"]][which(targets_df$targets == target)])
            if (length(chr) != 1) {
                stop("Expected exactly one chromosome for target '", target, "' in targets_df, found ", length(chr))
            }
            exp.chr <- .mraid_read_exposure(qtl, chr)

            ## 2. prepare the input for MR
            ### 2.1 exposure sumstats
            # get qtl data
            exp <- exp.chr[molecular_trait_id == target]
            snps_ana_gene <- exp$variant
            if(pval_beta==1){exp <- exp[pval_beta < p_cut, ]} else exp <- exp[pvalue < p_cut, ]
            snps_asso_gene <- exp$variant
            res <- data.table(target = target, n_snps_ana = length(snps_ana_gene), p_cut = p_cut, n_snps_asso = length(snps_asso_gene))
            message(length(snps_asso_gene), " SNPs associated with ", target)

            # defaults for every case in which MRAID cannot be run
            res[, n_snps_real := 0L]
            result <- NA

            if (length(snps_asso_gene) > 0) {
                ### 2.2 outcome GWAS sumstats
                # get snps' GWAS data
                if (outcome == "GWAS") {
                    out.path <- "/home/rf2872/Work/MR/2023.4_MR/data/sumtats/GWAS/"
                    out <- find_SNP_from_sum(ref = list.files(out.path), refpath = out.path, SNP = snps_asso_gene, suffix = ".sumstat.tsv", full_chr = T, IDsep_in_sum = "_")
                } else {
                    out.path <- paste0("/home/rf2872/Work/MR/2023.4_MR/data/sumtats/",outcome)
                    out <- find_SNP_from_sum(ref = list.files(out.path), refpath = out.path, SNP = snps_asso_gene, full_chr = F, IDsep_in_sum = "_")
                }
                int_snps <- intersect(snps_asso_gene, out$variant)

                if (length(int_snps) > 0) {
                    ### 2.3/2.4 exposure LD matrix
                    u <- .mraid_ld_matrix(exp, int_snps)

                    if (!is.null(u)) {
                        # the snps in LD matrix and sumstats files are not exactly the same, so only the overlap is used
                        realsnps <- intersect(colnames(u), int_snps)
                        LD.out.m <- LD.exp.m <- u[realsnps, realsnps, drop = FALSE]
                        res[, n_snps_real := length(realsnps)]
                        message(length(realsnps), " SNPs in ", target, " analyzed in MRAID")

                        ### 2.5 exposure sample size (samplen1)
                        # from [DLPFC ROSMAP](https://github.com/cumc/fungen-xqtl-analysis/blob/main/data/descriptor/qtl/ROSMAP_DLPFC_expression_qtl.md) (The number on this page is wrong on 4/11/2023, it should be ~950, let's set 900 for now)
                        ### 2.6 outcome sample size (samplen2)
                        # from [Bellenguez GWAS paper](https://pubmed.ncbi.nlm.nih.gov/35379992/)(111,326 AD cases and 677,663 controls)

                        ## 3. RUN MRAID
                        # match() puts both z-score vectors in the SNP order of the LD matrix;
                        # subsetting with %in% would keep the row order of each table instead
                        exp.real <- exp[match(realsnps, exp$variant), ]
                        exp.z <- as.numeric(exp.real$beta) / as.numeric(exp.real$se)

                        out.real <- out[match(realsnps, out$variant), ]
                        out.z <- as.numeric(out.real$beta) / as.numeric(out.real$se)

                        # you can check the parameters info for MRAID  in [here](https://github.com/yuanzhongshang/MRAID/blob/main/man/MRAID.Rd)
                        message(target, " MRAID analysis...")

                        # running MRAID with a big LD matrix can take a long time, so the time used is reported
                        start.time <- Sys.time()
                        result <- tryCatch(MRAID(exp.z, out.z, LD.exp.m, LD.out.m, samplen1, samplen2,
                            Gibbsnumber = 1000, burninproportion = 0.2, pi_beta_shape = 0.5,
                            pi_beta_scale = 4.5, pi_c_shape = 0.5, pi_c_scale = 9.5, pi_1_shape = 0.5, pi_1_scale = 1.5, pi_0_shape = 0.05, pi_0_scale = 9.95
                        ), error = function(x) {
                            message("Can't get a valid LD matrix for ", target, " (", conditionMessage(x), ")")
                            return(NA)
                        })
                        end.time <- Sys.time()
                        message("MRAID for ", target, " took ", format(end.time - start.time))
                    }
                }
            }

            # MRAID was not run or failed: report a row of NA estimates
            if (is.na(result[[1]])) {
                result <- matrix(nrow = 1, ncol = 7, NA)
                colnames(result) <- c("causal_effect", "causal_pvalue", "correlated_pleiotropy_effect", "sigmabeta", "sigmaeta", "sigma_error_1", "sigma_error_2")
            }

            result <- as.data.table(result)[, target := target]

            # fall back to the SNP counts alone when the estimates cannot be merged
            res <- tryCatch(merge(res, result, by = "target"), error = function(x) {
                return(res)
            })
            return(res)
          }
      # Analyse the target handled by this substep and write its one-row result table.
      targets_df <- data.table::fread("${targets_df}")
      # The substep index runs from 1 to n and fread() has already consumed the header line,
      # so the target of this substep is the row with that index; adding 1 to the index
      # would skip the first target and run past the last one.
      target.idx <- ${_n}
      if (target.idx > nrow(targets_df)) {
          stop("Target index ", target.idx, " exceeds the ", nrow(targets_df), " targets listed in the targets table")
      }
      target <- targets_df$targets[target.idx]
      eqtl.mr <- mraid(target=target,path=paste0("${cwd:a}","/"),qtl="${qtl}",p_cut=${p_cut},con="${con}",targets_df=targets_df,pval_beta=${pval_beta})
    fwrite(eqtl.mr, ${_output:r})



In [None]:
[mraid_qtl_2]
# Concatenate the per-target result tables of the previous step into a single CSV file
input: group_by = "all"
# absolute path, consistent with the output of mraid_qtl_1
output: f"{cwd:a}/{con}/{qtl}/sig_{p_cut}/all_res_mraid_{con}_p_cut_{p_cut}.csv"
task: trunk_workers = 1, walltime = '2h', trunk_size = 1, mem = '16G', cores = 1, tags = f'{_output:bn}'

  
bash: expand ='${ }', stderr = f"{_output:n}.stderr", stdout = f"{_output:n}.stdout"
     # write the header once
     echo 'target,n_snps_ana,p_cut,n_snps_asso,n_snps_real,causal_effect,causal_pvalue,correlated_pleiotropy_effect,sigmabeta,sigmaeta,sigma_error_1,sigma_error_2' > ${_output:r}
     # append every input file without its own header line; only the files produced by the
     # previous step are read, so the combined file itself and stale CSV files in the same
     # directory are never picked up
     awk 'FNR > 1' ${_input:r} >> ${_output:r}
