# SuSiE RSS

Bayesian sum of single-effect (SuSiE) linear regression using z scores

After applying LD_Clumping.ipynb and Region_Extraction.ipynb to select regions that overlap between traits, the current pipeline uses SuSiE to fine-map those regions and check whether they contain anything of interest.

To run this notebook follow the example:

```
sos run SuSiE_RSS.ipynb \
    --cwd /gpfs/gibbs/pi/dewan/data/UKBiobank/results/fine_mapping/f3393_hearing_aid \
    --region_dir /gpfs/gibbs/pi/dewan/data/UKBiobank/results/region_extraction/f3393_hearing_aid \
    --region_file /gpfs/gibbs/pi/dewan/data/UKBiobank/results/region_extraction/f3393_hearing_aid/regions.txt \
    --sumstats_path /gpfs/gibbs/pi/dewan/data/UKBiobank/results/FastGWA_results/results_imputed_data/f3393_hearing_aid/*.snp_stats.gz \
    --container_lmm /home/dc2325/scratch60/lmm_v_1_4.sif \
    --container_marp /gpfs/gibbs/pi/dewan/data/UKBiobank/marp.sif -s build
```

In [1]:
[global]
# Path to region extraction files
parameter: region_dir = path
# The region file after LD clumping (one region per line)
parameter: region_file = path
# Summary statistics file; its base name is used to locate the per-region inputs and to name the outputs
parameter: sumstats_path = path
# The directory for output files
parameter: cwd = path
## The container with the lmm/marp software. Can be either a dockerhub image or a singularity `sif` file.
parameter: container_lmm = 'statisticalgenetics/lmm:2.0'
parameter: container_marp = 'gaow/marp'
# Specific number of threads to use
parameter: numThreads = 2
# Value passed to ``trunk_size`` in the ``task:`` directives of the analysis steps
parameter: job_size = 1
# the pip probability threshold for variant to be considered interesting
parameter: pip_cutoff = 0.1
# the coverage needed for a CS
parameter: coverage = 0.95

fail_if(not region_file.is_file(), msg = 'Cannot find regions to fine map. Please specify them using ``--region-file`` option.')
# Load all regions of interest. Each item is one non-empty line of the region
# file, stripped of surrounding whitespace and with spaces replaced by
# underscores. ``read_text`` closes the file and leaves no handle in the
# global namespace; blank lines are skipped so they cannot become empty
# region names.
regions = [x.strip().replace(' ', '_') for x in region_file.read_text().splitlines() if x.strip()]

In [1]:
[default_1]
# Fine-map every region: each input group is a (summary statistics, LD matrix) file pair for one region and each group produces one SuSiE RSS fit saved as an RDS file in cwd.
input: [(f"{region_dir}/{x}/{sumstats_path:bn}_{x}.sumstats.gz", f"{region_dir}/{x}/{sumstats_path:bn}_{x}.sample_ld.gz") for x in regions], group_by = 2
output: [f'{cwd}/{x}.{sumstats_path:bnn}.SuSiE_RSS.rds' for x in regions], group_by=1
task: trunk_workers = 1, trunk_size = job_size, walltime = '12h', mem = '20G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
R: container=container_lmm, expand = "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
  
    # Convert two-sided p-values to signed z-scores.
    #   pval: numeric vector of two-sided p-values.
    #   beta: numeric vector of effect estimates; only its sign is used.
    # Returns |z| for each p-value, negated wherever beta is below zero.
    # Entries whose beta is NA keep the positive magnitude.
    ConvertP2Z <- function(pval, beta) {
      magnitude <- abs(qnorm(pval / 2))
      negative <- which(beta < 0)
      magnitude[negative] <- -magnitude[negative]
      magnitude
    }

    # Prepare a z-score vector and its LD matrix the way the caller expects
    # them to be prepared for susie_rss.
    #   z            numeric vector of z-scores.
    #   R            LD (correlation or covariance) matrix matching z.
    #   maf          optional minor allele frequencies, same length as z.
    #   maf_thresh   variants with maf <= maf_thresh are dropped.
    #   z_ld_weight  if > 0, blend tcrossprod(z) into R (no longer recommended).
    #   null_weight  if non-zero, append an all-zero row/column to R and a 0 to z.
    #   is_cov       if TRUE, convert R from covariance to correlation.
    # Returns list(R = <adjusted matrix>, z = <adjusted z-scores>).
    # Stops if maf has the wrong length, R contains NA, or null_weight is
    # non-numeric or outside [0, 1).
    fixCorMatrix <- function(z, R, maf = NULL, maf_thresh = 0, z_ld_weight = 0, null_weight = NULL, is_cov = FALSE) {
        # ignoring the R dimension check because we assume susie_rss was able to properly run
        # use the same R here that you added to susie_rss

        # MAF filter.
        if (!is.null(maf)) {
            if (length(maf) != length(z))
                stop(paste0("The length of maf does not agree with expected ", length(z)))
            id <- which(maf > maf_thresh)
            # drop = FALSE keeps R a matrix even when a single variant survives;
            # without it R collapses to a scalar and later steps misbehave.
            R <- R[id, id, drop = FALSE]
            z <- z[id]
        }

        # Check for NAs in R.
        if (any(is.na(R)))
            stop("R matrix contains missing values")

        # Modify R as needed.
        # this is no longer recommended with current susieR implementation
        if (z_ld_weight > 0) {
            R <- susieR:::muffled_cov2cor((1 - z_ld_weight) * R + z_ld_weight * tcrossprod(z))
            # Force exact symmetry after the blend.
            R <- (R + t(R)) / 2
        }

        # A null weight of exactly zero means "no null component".
        if (is.numeric(null_weight) && null_weight == 0)
            null_weight <- NULL
        if (!is.null(null_weight)) {
            if (!is.numeric(null_weight))
                stop("Null weight must be numeric")
            if (null_weight < 0 || null_weight >= 1)
                stop("Null weight must be between 0 and 1")
            # Pad R with a zero row and column and z with a zero entry.
            R <- cbind(rbind(R, 0), 0)
            z <- c(z, 0)
        }
        if (is_cov) {
            # Convert any input R to correlation matrix.
            # If R has 0 columns and rows, cov2cor produces NaN and warning.
            X0 <- diag(R) == 0
            R <- susieR:::muffled_cov2cor(R)
            if (sum(X0) > 0)
                R[X0, ] <- R[, X0] <- 0
        }
        return(list(R = R, z = z))
    }
  
    # Read the per-region summary statistics and derive signed z-scores from
    # the P and BETA columns.
    sumstat <- read.csv(${_input[0]:r}, sep = '\t', header = TRUE, stringsAsFactors = FALSE)
    sumstat$Z <- as.double(ConvertP2Z(sumstat$P, sumstat$BETA))
    # LD matrix for the same region.
    ld <- as.matrix(read.csv(${_input[1]:r}, sep = '\t', header = TRUE, stringsAsFactors = FALSE))
    # With default arguments fixCorMatrix only checks the LD matrix for NA.
    R <- fixCorMatrix(sumstat$Z, ld)$R
    res <- susieR::susie_rss(as.double(sumstat$Z), ld, L = 10, coverage = ${coverage})

    # Attach per-variant annotations so later steps can plot and tabulate
    # straight from the saved fit.
    res$pos <- as.integer(sumstat$POS)
    res$z <- as.double(sumstat$Z)
    res$p <- as.double(sumstat$P)
    res$var_names <- sumstat$SNP
    res$chr <- as.integer(sumstat$CHR)
    res$ref <- sumstat$REF
    res$alt <- sumstat$ALT

    # Correlation between credible sets. The credible sets are stored in
    # res$sets$cs (res$cs does not exist), so take the labels from there, and
    # only label the result when it actually is a matrix.
    cs_names <- names(res$sets$cs)
    corr <- susieR:::get_cs_correlation(res, X = NULL, Xcorr = R, max = FALSE)
    if (is.matrix(corr)) {
        rownames(corr) <- cs_names
        colnames(corr) <- cs_names
    }

    res$cscorr <- corr

    # Number of variants shared by each pair of credible sets. Only the upper
    # triangle (including the diagonal) is filled; the lower triangle stays NA.
    n_cs <- length(res$sets$cs)
    if (n_cs > 1) {
        ovlp_mat <- matrix(NA_integer_, nrow = n_cs, ncol = n_cs,
                           dimnames = list(cs_names, cs_names))
        for (i in seq_len(n_cs)) {
            for (j in i:n_cs) {
                ovlp_mat[i, j] <- length(intersect(res$sets$cs[[i]], res$sets$cs[[j]]))
            }
        }
        print(ovlp_mat)
        res$sets[["ovlp_mat"]] <- ovlp_mat
    }

    saveRDS(res, ${_output:r})

In [1]:
[default_2]
# Draw one two-panel PNG per fine-mapped region from the fit saved by the previous step.
output: pip_plot = f"{cwd}/{_input:bn}.png"
task: trunk_workers = 1, trunk_size = job_size, walltime = '12h', mem = '20G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
R: container=container_lmm, expand = "${ }", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'
    # Fit object with the pos and z annotations attached by the previous step.
    fit <- readRDS(${_input:r})
    # Plotting window: from the first to the last stored position.
    pos_window <- list(attr = "pos", start = fit$pos[1], end = fit$pos[length(fit$pos)])

    png(${_output[0]:r}, width = 14, height = 6, units = "in", res = 300)
    # Two panels side by side: the PIP plot on the left, the y = "z" plot on the right.
    par(mfrow = c(1, 2))
    susieR::susie_plot(fit, y = "PIP", pos = pos_window, add_legend = TRUE, xlab = "position")
    susieR::susie_plot(fit, y = "z", pos = pos_window, add_legend = TRUE, xlab = "position", ylab = "-log10(p)")
    dev.off()

In [1]:
[default_3]
# Summarise all regions at once: write a Markdown report plus two CSV tables (variants of interest and all variants).
sep = "" #'\n\n---\n'
input: group_by = 'all'
output: analysis_summary = f'{cwd}/{sumstats_path:bnn}.analysis_summary.md', causalvariants_csv = f'{cwd}/{sumstats_path:bnn}.causalvariants.csv', allvars_csv = f'{cwd}/{sumstats_path:bnn}.allvariants.csv'
python: container=container_lmm, expand = "${ }"

    # Front matter (theme name and CSS rules) written at the very top of the
    # Markdown summary, ahead of the per-region text.
    theme = '''---
    theme: base-theme
    style: |
     p {
       font-size: 24px;
       height: 900px;
       margin-top:1cm;
      }
      img {
        height: 70%;
        display: block;
        margin-left: auto;
        margin-right: auto;
      }
      body {
       margin-top: auto;
       margin-bottom: auto;
       font-family: verdana;
      }
    ---    
    '''
    import numpy as np
    import pandas as pd
    
    # Reads an RDS file written by an earlier step into Python objects.
    def load_rds(filename, types=None):
        '''Load an RDS file through rpy2.

        filename: path of the RDS file; IOError is raised if it is not a file.
        types: optional type (or tuple of types); any non-list value that is
            not an instance of it is replaced by an empty numpy array.

        An R list becomes a (possibly nested) dict keyed by element name, or by
        1-based position when the list has no names. Any other object goes
        through ``load``. Length-one numpy arrays are unwrapped to their single
        element.
        '''
        import os
        import pandas as pd, numpy as np
        import rpy2.robjects as RO
        import rpy2.robjects.vectors as RV
        import rpy2.rinterface as RI
        from rpy2.robjects import numpy2ri
        numpy2ri.activate()
        from rpy2.robjects import pandas2ri
        pandas2ri.activate()

        def _convert_legacy(data):
            '''Type-by-type conversion kept for rpy2 version 2.9.X.'''
            if isinstance(data, RI.RNULLType):
                return None
            if isinstance(data, RV.BoolVector):
                flags = np.array(RO.r['as.integer'](data), dtype=int)
                # Handle c(NA, NA) situation
                if np.sum(np.logical_and(flags != 0, flags != 1)):
                    flags = flags.astype(float)
                    flags[flags < 0] = np.nan
                    flags[flags > 1] = np.nan
                return flags
            if isinstance(data, RV.FactorVector):
                return np.array(RO.r['as.character'](data), dtype=str)
            if isinstance(data, RV.IntVector):
                return np.array(data, dtype=int)
            if isinstance(data, RV.FloatVector):
                return np.array(data, dtype=float)
            if isinstance(data, RV.StrVector):
                return np.array(data, dtype=str)
            if isinstance(data, RV.DataFrame):
                return pd.DataFrame(data)
            if isinstance(data, RV.Matrix):
                return np.matrix(data)
            if isinstance(data, RV.Array):
                return np.array(data)
            # I do not know what to do for this
            # But I do not want to throw an error either
            return str(data)

        def load(data, types, rpy2_version=3):
            '''Convert one non-list R object.'''
            if types is not None and not isinstance(data, types):
                return np.array([])
            # FIXME: I'm not sure if I should keep two versions here
            # rpy2_version 2.9.X is more tedious but it handles BoolVector better
            # rpy2 version 3.0.1 converts bool to integer directly without dealing with
            # NA properly. It gives something like (0,1,-234235).
            # Possibly the best thing to do is to open an issue for it to the developers.
            if rpy2_version == 2:
                # below works for rpy2 version 2.9.X
                converted = _convert_legacy(data)
            elif isinstance(data, RI.NULLType):
                converted = None
            else:
                converted = data
            if isinstance(converted, np.ndarray) and converted.shape == (1, ):
                return converted[0]
            return converted

        def load_dict(res, data, types):
            '''load data to res'''
            if isinstance(data.names, RI.NULLType):
                keys = [position + 1 for position in range(len(data))]
            else:
                keys = data.names
            for key, item in zip(keys, list(data)):
                if isinstance(item, RV.ListVector):
                    res[key] = load_dict({}, item, types)
                else:
                    res[key] = load(item, types)
            return res

        if not os.path.isfile(filename):
            raise IOError('Cannot find file ``{}``!'.format(filename))
        rds = RO.r['readRDS'](filename)
        if isinstance(rds, RV.ListVector):
            return load_dict({}, rds, types)
        return load(rds, types)
    
    def f7(seq):
        '''Return the items of ``seq`` as a list with later duplicates dropped,
        keeping the order in which each item first appears.'''
        already_seen = set()
        unique_items = []
        for item in seq:
            if item in already_seen:
                continue
            already_seen.add(item)
            unique_items.append(item)
        return unique_items



    text = ""
    sep = '\n\n---\n'
    
    inp = "${_input:r}".split(" ")
    for i, each in enumerate(inp):
        inp[i] = ".".join(each.split(".")[:-1])

    r = f7("${_input:bn}".split(" "))
    
    num_csets = []
    region_info = []
    
    # this will be a 2d array that stores information about each variant of interest in the phenotype
    # this includes all the variants in a cs and all the variants past the cutoff
    causalvariant_info = []
    allvars_info = []

    for reg_i, each in enumerate(f7(inp)):
    
        rid = r[reg_i].split('.')[0]
        
        text_temp = ""
        text_temp += "#\n\n SuSiE RSS {region} \n".format(region=r[reg_i])
        text_temp += "![]({region}.png){sep} \n \n".format(region=r[reg_i], sep=sep)

        rd = load_rds(each[1:]+".rds")
        
        # find the number of cs in the current region
        if rd["sets"]["cs"] == None:
            num_csets.append(0)
        else:
            num_csets.append(len(rd["sets"]["cs"]))
        print(num_csets)
        
        # this will store the indicies of all variants that cross the threshold
        ind_p = []
        allvars = []

        pval = ${pip_cutoff}

        for i, each in enumerate(rd["pip"]):
            if each >= pval:
                ind_p.append(i)
            allvars.append(i)
                
        sumvars = 0
        
        # if we have at least one cs in the current region
        if num_csets[reg_i] > 0:
            tbl_header = "| chr number | pos at highest pip | ref | alt | region id | cs | highest pip |  \n"
            tbl_header += "| --- | --- | --- | --- | --- | --- | --- |  \n"

            table = ""
            
            sumpips = 0
            
            for cset in rd["sets"]["cs"].keys():
                print(cset)
                
                # if we have many variants in the cs
                if isinstance(rd["sets"]["cs"][cset], np.ndarray):
                    highestpip = 0
                    poswhighestpip = -1
                    for i in rd["sets"]["cs"][cset]:
                        i = i.item() - 1
                        
                        # we make sure that ind_p only stores the variants that aren't in any cs
                        if i in ind_p: ind_p.remove(i) 
                        if i in allvars: allvars.remove(i)
                        
                        # append variant info
                        causalvariant_info.append( [rd["chr"][i], rd["pos"][i], rd["ref"][i], rd["alt"][i], rid, cset, rd["pip"][i]] )
                        allvars_info.append( [rd["chr"][i], rd["pos"][i], rd["ref"][i], rd["alt"][i], rid, cset, rd["pip"][i]] )
                        
                        if rd["pip"][i] > highestpip:
                            highestpip = rd["pip"][i]
                            poswhighestpip = i
                            
                        sumpips += rd["pip"][i]
                        sumvars += 1
                        
                    if poswhighestpip > -1:
                        i = poswhighestpip
                        table += "| {chr} | {pos} | {ref} | {alt} | {rid} | {cs} | {pip:.2f} |  \n".format(chr=rd["chr"][i], pos=rd["pos"][i], ref=rd["ref"][i], alt=rd["alt"][i], rid=rid, cs=cset, pip=rd["pip"][i])
                
                else: # if we have only one variant in the cs
                    i =  rd["sets"]["cs"][cset]
                    i = i.item() - 1
                    
                    # we make sure that ind_p only stores the variants that aren't in any cs
                    if i in ind_p: ind_p.remove(i)
                    if i in allvars: allvars.remove(i)
                    
                    # append variant info
                    causalvariant_info.append( [rd["chr"][i], rd["pos"][i], rd["ref"][i], rd["alt"][i], rid, cset, rd["pip"][i]] )
                    allvars_info.append( [rd["chr"][i], rd["pos"][i], rd["ref"][i], rd["alt"][i], rid, cset, rd["pip"][i]] )
                    
                    table += "| {chr} | {pos} | {ref} | {alt} | {rid} | {cs} | {pip:.2f} |  \n".format(chr=rd["chr"][i], pos=rd["pos"][i], ref=rd["ref"][i], alt=rd["alt"][i], rid=rid, cs=cset, pip=rd["pip"][i])
                    
                    sumpips += rd["pip"][i]
                    sumvars += 1
            

            text_temp += "- Total number of variants: {}\n".format(len(rd["pip"]))
            text_temp += "- Expected number of causal variants: {:.2f}\n".format(sumpips)
            text_temp += "- Number of variants with PIP > {} and not in any CS: {}\n\n".format(pval, len(ind_p))
            text_temp += tbl_header + table + sep
            
            if num_csets[reg_i] > 1:
                text_temp += "#### CORR: Correlation between CS | OLAP: Overlap between CS\n"
                
                cs = list(rd["sets"]["cs"].keys())

                corrheader = "|  |"
                corrbreak = "| --- |"

                for i in cs:
                    corrheader += " CORR {} |".format(i)
                    corrbreak += " --- |"
                    
                corrheader += "  |"
                corrbreak += " --- |"
                    
                for i in cs:
                    corrheader += " OLAP {} |".format(i)
                    corrbreak += " --- |"

                corrheader += "\n"
                corrbreak += "\n"

                body = ""

                for en, i in enumerate(cs):
                    body += "| {} |".format(i)
                    for j in rd["cscorr"][en]:
                        body += " {:.2f} |".format(j)
                    body += "  |"
                    for j in rd["sets"]["cs"]:
                        body += " {} |".format(len(np.intersect1d(rd["sets"]["cs"][i], rd["sets"]["cs"][j])))
                    body += "\n"
                
                text_temp += corrheader + corrbreak + body + sep
            
        region_info.append(text_temp)
            
    f = open(${_output["analysis_summary"]:r}, "w")
    
    cset_order = np.argsort(num_csets)
    cset_order = cset_order.tolist()
    cset_order.reverse()
    for c in cset_order:
        text += region_info[c]
    
    f.write(theme + text)
    
    f.close()
    
    for i in ind_p:
        # append variant info
        causalvariant_info.append( [rd["chr"][i], rd["pos"][i], rd["ref"][i], rd["alt"][i], rid, "None", rd["pip"][i]] )
    for i in allvars:
        allvars_info.append( [rd["chr"][i], rd["pos"][i], rd["ref"][i], rd["alt"][i], rid, "None", rd["pip"][i]] )
        
    df = pd.DataFrame(causalvariant_info, columns=["chr", "pos", "ref", "alt", "rid", "cs", "pip"])
    df.to_csv(${_output["causalvariants_csv"]:r}, sep = ",", header = True, index = False)

    df = pd.DataFrame(allvars_info, columns=["chr", "pos", "ref", "alt", "rid", "cs", "pip"])
    df.to_csv(${_output["allvars_csv"]:r}, sep = ",", header = True, index = False)

In [None]:
# Generate analysis report from the Markdown summary: an HTML file (the declared step output) and a PPTX file written next to it
[default_4]
output: f"{_input['analysis_summary']:n}.html"
sh: container=container_marp, expand = "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    # Render the summary to HTML at the step's output path.
    node /opt/marp/.cli/marp-cli.js ${_input['analysis_summary']} -o ${_output:a} \
        --title '${region_file:bnn} fine mapping analysis' \
        --allow-local-files
    # Render the same summary to PPTX, using the output path with its extension replaced.
    node /opt/marp/.cli/marp-cli.js ${_input['analysis_summary']} -o ${_output:an}.pptx \
        --title '${region_file:bnn} fine mapping analysis' \
        --allow-local-files