# Simulate data for fine mapping and colocalization 

The genotype data currently used is the `N3finemapping` data set from the `susieR` package.

We generate synthetic outcomes $Y^p$ and $Y^e$ under the multiple-regression model $Y = X b + e,\ e \sim N_n(0, \sigma^2 I_n)$, with assumptions on $b$ specified by two parameters: the LD $r$ between the two causal variants, and the proportion of variance in $Y$ explained by $X$, denoted $\phi$. Given $r = c(0.1, 0.3, 0.5, 0.7, 0.9)$ and $\phi = c(0.05, 0.1, 0.2, 0.5, 0.7)$, we simulate $Y^p$ and $Y^e$ as follows.

1. We simulate two true causal variants, in a specified LD $r$, for each of $Y^p$ and $Y^e$: one causal variant is shared (colocalized between $Y^p$ and $Y^e$) and each outcome additionally has one unique causal variant. We first randomly pick the colocalized causal variant $X_c$ and then randomly select two other variants $X_p$ and $X_e$ whose LD with $X_c$ is approximately $r$ (within $\pm 0.01$).
2. We use fixed effects for the true causal variants. The true effect sizes of the three causal variants are set to $b^p_{(c, p)} = (1, 1)$ and $b^e_{(c,e)} = (1, 1)$, because the magnitude of $b$ matters little once $\phi$ is one of our parameters. (Even when $b$ is large, if the heritability is low, its ability to explain variance is still low.) Then, for all $j \not\in \mathcal{S}$, where $\mathcal{S}$ is the set of causal variants, set $b^p_j=0$ and $b^e_j=0$. To set the relative direction of the effect sizes, we assign signs to the $|b_j|$.
3. Set $\sigma^2$ to achieve the desired proportion of variance explained $\phi$; specifically, we solve for $\sigma^2$ in $\phi = \dfrac{var(Xb)}{\sigma^2 + var(Xb)}$, i.e. $\sigma^2 = var(Xb)\,\dfrac{1-\phi}{\phi}$, substituting $X_e$ and $X_p$ for $X$ in turn.
4. For each $i = 1, \cdots, n$, draw $Y^e_i \sim N(x_{e,i}^\top b^e, \sigma_e^2)$ and $Y^p_i \sim N(x_{p,i}^\top b^p, \sigma_p^2)$, where $x_{e,i}$ and $x_{p,i}$ denote the $i$-th rows of $X_e$ and $X_p$.

In [None]:
[simulation]
# Simulate `size` replicates of the two outcomes (Ye, Yp) on sub-samples of
# susieR's N3finemapping genotypes and save everything to a single RDS file.
# Number of simulated replicates
parameter: size = 10
# Proportion of variance explained (phi) for Ye and Yp, in that order
parameter: phi = [0.3, 0.3]
# Effect sizes: first two entries are used for Ye (shared variant, e-specific variant),
# last two for Yp (shared variant, p-specific variant)
parameter: beta = [1.0,1.0,1.0,1.0]
# Target absolute LD between the shared variant and the e-specific / p-specific variant
parameter: r = [0.1, 0.1]
# Sign selectors for the trait-specific effect of Ye and Yp (positive keeps the sign, otherwise it is flipped);
# the values are interpolated unquoted into the R call below
parameter: sign_b = ["1", "1"]
# Root directory for the output; results go to <cwd>/simulation_data/
parameter: cwd = path("output")
# For cluster jobs, number commands to run per job
parameter: job_size = 1
# Wall clock time expected
parameter: walltime = "5h"
# Memory expected
parameter: mem = "16G"
# Number of threads
parameter: numThreads = 20
# Container image handed to the R action (empty string by default)
parameter: container = ""
# The file name encodes the replicate count, both phi values and both LD targets
output: f'{cwd:a}/simulation_data/samplesize_{size}_heri_{phi[0]}_{phi[1]}_LD_{r[0]}_{r[1]}.simulation.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R:  expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container 
    # Count how many entries of `ls` (a vector of correlations) have an absolute
    # value within 0.01 of the LD threshold `r`. Missing entries are not counted.
    # Returns a single integer.
    get_pool <- function(ls, r){
      near_r <- abs(abs(ls) - r) < 0.01
      sum(near_r, na.rm = TRUE)
    }

    # Simulate `size` replicates of two outcomes, Ye and Yp, that share one causal
    # variant and each have one additional trait-specific causal variant.
    #
    # Arguments:
    #   size   number of replicates to simulate.
    #   Xe, Xp genotype matrices (samples x SNPs) for the two traits; they must have
    #          the same number of columns.
    #   r      target absolute LD between the shared variant and the e-specific
    #          (r[1]) / p-specific (r[2]) variant.
    #   phi    proportion of variance explained for Ye (phi[1]) and Yp (phi[2]);
    #          must lie in (0, 1].
    #   beta   effect sizes: beta[1:2] for Ye (shared, e-specific), beta[3:4] for Yp
    #          (shared, p-specific).
    #   sign_b sign selectors for the trait-specific effects (see the branch below).
    #
    # Returns a list with
    #   $parameter  the simulation settings,
    #   $X          list(Xe, Xp), the genotypes as supplied,
    #   $Y          a tibble with list-columns Ye, Yp (outcome vectors) and
    #               variant_e, variant_p (column indices of the causal variants,
    #               shared variant first).
    #
    # Relies on get_pool() being defined in the same script.
    simu_data <- function(size, Xe, Xp, r = c(0.1, 0.1), phi = c(0.2, 0.2), beta = c(1,1,1,1), sign_b = c(1, 1)){
      if (ncol(Xe) != ncol(Xp)){
        stop("Please restrict the genotype for the same P SNPs")
      }
      if (any(phi <= 0 | phi > 1)){
        stop("phi must lie in (0, 1]")
      }

      # sample(x, 1) draws from 1:x when x has length one; index explicitly instead.
      pick_one <- function(x) x[sample.int(length(x), 1L)]

      LDe <- cor(Xe)
      LDp <- cor(Xp)

      # set up the relative distribution of true effect sizes
      if (sign_b[1] > 0 && sign_b[2] > 0){
        be_c <- c(1,1) * c(beta[1], beta[2])
        bp_c <- c(1,1) * c(beta[3], beta[4])
      } else if (sign_b[1] > 0 && sign_b[2] < 0){
        be_c <- c(1,1) * c(beta[1], beta[2])
        bp_c <- c(1,-1) * c(beta[3], beta[4])
      } else if (sign_b[1] < 0 && sign_b[2] > 0){
        be_c <- c(1,-1) * c(beta[1], beta[2])
        bp_c <- c(1,1) * c(beta[3], beta[4])
      } else {
        be_c <- c(1,-1) * c(beta[1], beta[2])
        bp_c <- c(1,-1) * c(beta[3], beta[4])
      }

      # A SNP enters the pool when more than two SNPs are within 0.01 of the
      # target LD with it, in BOTH panels (LDe for trait e, LDp for trait p).
      e_num <- mapply(get_pool, asplit(LDe, 2), r[1])
      p_num <- mapply(get_pool, asplit(LDp, 2), r[2])
      pool <- intersect(which(e_num > 2), which(p_num > 2))
      if (length(pool) == 0){
        stop("No SNP has enough partners at the requested LD levels r")
      }

      Ye_list <- vector("list", size)
      Yp_list <- vector("list", size)
      variant_e <- vector("list", size)
      variant_p <- vector("list", size)
      for (i in seq_len(size)){
        # shared (colocalized) causal variant
        n1 <- pick_one(pool)
        lde <- abs(LDe[n1, ])
        ldp <- abs(LDp[n1, ])
        # NOTE(review): partners are accepted within 0.05 of r, whereas the pool
        # above is built with get_pool's 0.01 window - confirm this is intended.
        pos_e <- setdiff(which(abs(lde - r[1]) < 0.05), n1)
        if (length(pos_e) == 0){
          stop("No candidate e-specific variant for the selected shared variant")
        }
        ne <- pick_one(pos_e)
        # Exclude the variants already chosen up front, so the two trait-specific
        # variants always differ and no rejection loop is needed.
        pos_p <- setdiff(which(abs(ldp - r[2]) < 0.05), c(n1, ne))
        if (length(pos_p) == 0){
          stop("No candidate p-specific variant distinct from the e-specific one")
        }
        np <- pick_one(pos_p)

        # Genetic value and residual variance solving phi = var(Xb) / (sigma2 + var(Xb)).
        mu_e <- drop(Xe[, c(n1, ne)] %*% be_c)
        sigma2e <- var(mu_e) * (1 - phi[1]) / phi[1]
        # rnorm() expects a standard deviation, hence the square root.
        Ye <- rnorm(length(mu_e), mean = mu_e, sd = sqrt(sigma2e))

        mu_p <- drop(Xp[, c(n1, np)] %*% bp_c)
        sigma2p <- var(mu_p) * (1 - phi[2]) / phi[2]
        Yp <- rnorm(length(mu_p), mean = mu_p, sd = sqrt(sigma2p))

        Ye_list[[i]] <- Ye
        Yp_list[[i]] <- Yp
        variant_e[[i]] <- c(n1, ne)
        variant_p[[i]] <- c(n1, np)
      }

      list(
        "parameter" = list("total_size" = size, "r" = r, "phi" = phi, "beta" = beta, "sign_beta" = sign_b),
        "X" = list("Xe" = Xe, "Xp" = Xp),
        "Y" = tibble::tibble("Ye" = Ye_list, "Yp" = Yp_list,
                             "variant_e" = variant_e, "variant_p" = variant_p)
      )
    }

    # Load the example data set shipped with susieR; its X element is used as
    # the genotype matrix. The data set is accessed explicitly rather than
    # attach()-ed, so nothing is added to the search path.
    library(susieR)
    data(N3finemapping)
    Xmat <- N3finemapping$X

    # Draw two independent sub-samples of 550 rows (they may overlap), one per trait.
    n_sub <- 550
    Xe <- Xmat[sample(seq_len(nrow(Xmat)), n_sub), ]
    Xp <- Xmat[sample(seq_len(nrow(Xmat)), n_sub), ]

    # Workflow parameters are interpolated into the call by SoS before R runs.
    sim_out <- simu_data(size = ${size}, Xe, Xp, r = c(${paths(r):,}), phi = c(${paths(phi):,}), beta = c(${paths(beta):,}), sign_b = c(${paths(sign_b):,}))
    saveRDS(sim_out, ${_output:r})
    

## Example .sh file for generating simulation data

In [None]:
#!/bin/sh
# Run the `simulation` workflow step once for every combination of LD (r) and
# proportion of variance explained (phi), using the same value for both traits.

module load Singularity

notebook=/home/hs3393/coloc/Simulation.ipynb
out_dir=/home/hs3393/coloc/simulation
container=/mnt/vast/hpc/csg/molecular_phenotype_calling/eqtl/containers/stephenslab.sif

for r in 0.1 0.3 0.5 0.7 0.9
do
    for phi in 0.05 0.1 0.2 0.5 0.7
    do
        sos run "${notebook}" simulation \
        --cwd "${out_dir}" \
        --beta 1.0 1.0 1.0 1.0 \
        --sign_b 1 1 \
        --r "${r}" "${r}" \
        --phi "${phi}" "${phi}" \
        --size 1000 --container "${container}"
    done
done

## Generate fine-mapping results for the simulation data (work in progress)

In [None]:
[finemap]
# Run susieR fine mapping on every replicate stored in each simulation RDS file.
# Simulation RDS file(s) to process; declared without a default value
parameter: simufile = paths
# Directory the result files are written to
parameter: cwd = path("output")
# Container image handed to the R action (empty string by default)
parameter: container = ""
# For cluster jobs, number commands to run per job
parameter: job_size = 1
# Wall clock time expected
parameter: walltime = "5h"
# Memory expected
parameter: mem = "16G"
# Number of threads
parameter: numThreads = 20
# group_by = 1: each simulation file is handled on its own
input: simufile, group_by = 1
output: f"{cwd}/{_input:bn}.finemap_result.RDS"
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R:  expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container 
    library("dplyr")
    library("tibble")
    library("purrr")
    library("tidyr")
    library("readr")
    library("stringr")
    library("susieR")

    # Simulation object: $X holds the genotype matrices (Xe, Xp) and $Y the
    # list-columns of simulated outcomes, one entry per replicate.
    simu_result <- readRDS("${_input:a}")

    # The result lists must exist before they can be filled by index.
    n_rep <- length(simu_result$Y$Ye)
    out_e <- vector("list", n_rep)
    out_p <- vector("list", n_rep)
    for (i in seq_len(n_rep)) {
      out_e[[i]] <- susie(simu_result$X$Xe, simu_result$Y$Ye[[i]])
      out_p[[i]] <- susie(simu_result$X$Xp, simu_result$Y$Yp[[i]])
    }

    # Write the file declared as this step's output.
    saveRDS(list("out_e" = out_e, "out_p" = out_p), ${_output:r})
    

## Example .sh file for running fine mapping on simulation data

In [None]:
# Fine-map every simulated data set. The files are listed with a shell glob
# rather than by parsing `ls` output.
# NOTE(review): the trailing -n looks like SoS's dry-run switch; drop it to
# actually execute the workflow - confirm.
sos run /home/hs3393/coloc/Simulation.ipynb finemap \
    --cwd /home/hs3393/coloc/simulation \
    --simufile /home/hs3393/coloc/simulation/simulation_data/*.rds \
    --container /mnt/vast/hpc/csg/molecular_phenotype_calling/eqtl/containers/stephenslab.sif -n