# GWAS analysis with GLMM models using Regenie

This notebook implements pipelines for association analysis of binary and quantitative traits using REGENIE.

In [1]:
sos run regenie.ipynb -h

usage: sos run regenie.ipynb [workflow_name | -t targets] [options] [workflow_options]
  workflow_name:        Single or combined workflows defined in this script
  targets:              One or more targets to generate
  options:              Single-hyphen sos parameters (see "sos run -h" for details)
  workflow_options:     Double-hyphen workflow-specific parameters

Workflows:
  regenie_qc
  regenie
  regenie_burden
  regenie_vc

Global Workflow Options:
  --cwd VAL (as path, required)
                        the output directory for generated files
  --sampleFile . (as path)
                        Path to sample file
  --bfile VAL (as path, required)
                        Genotype files in plink binary this is used for
                        computing the GRM
  --genoFile  paths('.')

                        Path to bgen or bed files
  --phenoFile VAL (as path, required)
                        Phenotype file
  --phenoCol VAL VAL ... (as type, required)
                        P

In [None]:
[global]
# the output directory for generated files
parameter: cwd = path
# Path to sample file
parameter: sampleFile = path('.')
# Genotype files in plink binary format; this is used for computing the GRM
parameter: bfile = path
# Path to bgen or bed files
parameter: genoFile = paths('.')
# Phenotype file
parameter: phenoFile = path
# Phenotype to be analyzed (specify the column name)
parameter: phenoCol = list
# Covariate file path. Will use phenoFile if empty
parameter: covarFile = path('.')
# Summary statistics format file path used for unifying output column names. Will not unify names if empty
parameter: formatFile = path('.')
# Qualitative covariates to be used in the analysis
parameter: covarCol = []
# Quantitative covariates to be used in the analysis
parameter: qCovarCol = []
# Specific number of threads to use
parameter: numThreads = 2
# Minimum MAF to be used
parameter: bgenMinMAF = 0.001
# Minimum info score to be used
parameter: bgenMinINFO = 0.8
# Comma separated list of chromosomes to test in step 2
parameter: chrList = []
# For cluster jobs, number commands to run per job
parameter: job_size = 1
# Container to use
parameter: container = ''
# Fall back to the phenotype file when covarFile does not point to an existing file
if not covarFile.is_file():
    covarFile = phenoFile
# Rebuild cwd from its `:a` (absolute path) formatted form so later steps see an absolute directory
cwd = path(f"{cwd:a}")

In [None]:
# Select the SNPs and samples to be used based on maf, geno, hwe and mind options
[regenie_qc]
# Threshold passed to plink2 --maf; the flag is omitted when the value is 0
parameter: maf_filter = 0.0
# Threshold passed to plink2 --geno; the flag is omitted when the value is 0
parameter: geno_filter = 0.0
# Threshold passed to plink2 --hwe; the flag is omitted when the value is 0
parameter: hwe_filter = 0.0
# Threshold passed to plink2 --mind; the flag is omitted when the value is 0
parameter: mind_filter = 0.0
input: bfile
# Outputs: [0] list of samples passing QC (.id), [1] list of variants passing QC (.snplist)
output: f'{cwd}/cache/{bfile:bn}.qc_pass.id', f'{cwd}/cache/{bfile:bn}.qc_pass.snplist' 
task: trunk_workers = 1, walltime = '10h', mem = '30G', cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
bash: container= container, expand= "${ }", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout' 
    # --mac 1 is always applied; each optional filter flag is added only when its parameter is > 0.
    # --out uses the first output without its extension, so both output files share that prefix.
    plink2 \
      --bfile ${bfile:n} --mac 1 \
      ${('--maf %s' % maf_filter) if maf_filter > 0 else ''} ${('--geno %s' % geno_filter) if geno_filter > 0 else ''} ${('--hwe %s' % hwe_filter) if hwe_filter > 0 else ''} ${('--mind %s' % mind_filter) if mind_filter > 0 else ''} \
      --write-snplist --write-samples --no-id-header \
      --threads ${numThreads} \
      --out ${_output[0]:n} 

## Step 1. Fitting the null

In [None]:
# Run REGENIE step 1: fitting the null
[regenie_1,regenie_burden_1,regenie_vc_1]
# Size of the genotype blocks to be used 
parameter: bsize = 400
# Path to temporarily store block predictions
parameter: lowmem_dir = cwd
# Trait type. 'bt' (the default here) adds --bt to the regenie call: binary traits with 0=control,1=case,NA=missing. Any other value omits the flag.
parameter: trait = 'bt'
# extract and prepare phenotype & covariate files
import pandas as pd
import numpy as np
# Phenotype table: read everything as strings, mark missing values as "NA" and keep FID, IID and the requested phenotypes
dat = pd.read_csv(phenoFile, header=0, sep=r'\s+', dtype=str)
dat = dat.replace(to_replace =np.nan, value ="NA")
if len(phenoCol) > 0:    
    dat.to_csv(f"{cwd}/{phenoFile:bn}.regenie_phenotype", sep='\t', index=False, columns = ['FID', 'IID'] + phenoCol)
# Covariate table: only written when at least one covariate column was requested
dat = pd.read_csv(covarFile, header=0, sep=r'\s+', dtype=str)
if len(covarCol) > 0 or len(qCovarCol) > 0:
    # Rows with a missing value in any requested covariate are removed
    dat = dat.dropna(subset=covarCol)
    dat = dat.dropna(subset=qCovarCol)
    # replace() returns a new frame, so the result must be assigned for the substitution to take effect
    dat = dat.replace(to_replace =np.nan, value ="NA")
    dat1 = pd.DataFrame(dat, columns = ['FID','IID'] + covarCol)
    #dat1 = dat1.astype(int)
    dat2 = pd.DataFrame(dat, columns = ['IID'] + qCovarCol)
    merged_left = pd.merge(left=dat1, right=dat2, how='left', left_on='IID', right_on='IID')
    # NOTE: the covariate table is named after phenoFile (not covarFile); downstream steps must use the same name
    merged_left.to_csv(f"{cwd}/{phenoFile:bn}.regenie_covar", sep=' ', index=False)
depends: f'{cwd}/cache/{bfile:bn}.qc_pass.snplist', f'{cwd}/cache/{bfile:bn}.qc_pass.id'
input: geno = bfile, pheno = f"{cwd}/{phenoFile:bn}.regenie_phenotype", covar = f"{cwd}/{phenoFile:bn}.regenie_covar", qc = output_from("regenie_qc")
output: f'{cwd}/{phenoFile:bn}_' + "_".join([x for x in phenoCol]) + '.regenie_pred.list'
task: trunk_workers = 1, trunk_size = job_size, walltime = '12h', mem = '15G', cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
bash: container= container, expand = "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    # qc[0] is the sample list (--keep) and qc[1] the variant list (--extract) produced by regenie_qc
    regenie \
      --step 1 \
      --bed ${_input["geno"]:n} \
      --phenoFile ${_input["pheno"]} \
      --covarFile ${_input["covar"]} \
      --keep ${_input["qc"][0]} \
      --extract ${_input["qc"][1]} \
      ${('--' + trait) if trait in ['bt'] else ''} \
      --bsize ${bsize} \
      --lowmem --lowmem-prefix ${lowmem_dir:a}/${_output:bn} \
      --threads ${numThreads} \
      --out ${_output:nn}.regenie

## Step 2: association analysis

In [None]:
# Run REGENIE step 2: association analysis
[regenie_2]
# Size of the genotype blocks to be used 
parameter: bsize = 400
# Minimum allele count to be used
parameter: minMAC = int
# Trait type. 'bt' adds --bt to the regenie call (binary trait); any other value omits the flag
parameter: trait = 'bt'
# One substep per genotype file; every substep is paired with the step 1 prediction list as _input.info
input: genoFile, group_by = 1, group_with = dict(info=[(path(f'{cwd}/{phenoFile:bn}_' + "_".join([x for x in phenoCol]) + '.regenie_pred.list'))] * len(genoFile))
# bgen input needs the sample file; anything else is treated as a plink bed prefix
input_options = f"--bgen {_input} --sample {sampleFile}" if _input.suffix == ".bgen" else f"--bed {_input:n}"
output: [f'{cwd}/{_input:bn}_'+ str(phenoCol[i]) + '.regenie.gz' for i in range(len(phenoCol))]
task: trunk_workers = 1, trunk_size = job_size, walltime = '12h', mem = '15G', cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
bash: container= container, expand = "${ }", stderr = f'{cwd}/cache/{_input:bn}.stderr', stdout = f'{cwd}/cache/{_input:bn}.stdout'
    set -e
    # The covariate table is the one written by step 1, which names it after phenoFile.
    # --chrList is only passed when chromosomes were requested, so the flag never appears without a value.
    regenie \
     --step 2 \
     ${input_options} \
     --phenoFile ${cwd}/${phenoFile:bn}.regenie_phenotype \
     --covarFile ${cwd}/${phenoFile:bn}.regenie_covar \
     --phenoColList ${','.join(phenoCol)} \
     ${('--' + trait) if trait in ['bt'] else ''} \
     ${('--chrList ' + ','.join(['%s' % x for x in chrList if x is not None])) if chrList else ''} \
     --firth \
     --approx \
     --pred ${_input.info} \
     --bsize ${bsize} \
     --minMAC ${minMAC} \
     --minINFO ${bgenMinINFO} \
     --threads ${numThreads} \
     --out ${cwd}/${_input:bn} && \
     gzip -f --best ${_output:n}

## Regenie burden test

In [None]:
# Run regenie for rare variant aggregate tests
[regenie_burden_2,regenie_vc_2]
# Trait type. 'bt' adds --bt to the regenie call (binary traits with 0=control,1=case,NA=missing); any other value omits the flag
parameter: trait = 'bt'
# Size of the genotype blocks to be used 
parameter: bsize = 400
# Annotation file format: variantID, gene and functional annotation (space/tab delimited)
parameter: anno_file = path
# This file lists variants within each set/gene to use when building masks. Format: set/gene name, chromosome, physical pos set/gene, then by a comma-separated list of variants included in the set/gene.
parameter: set_list = path
# Select specific genes/sets to test
parameter: keep_gene = path(".")
# Allele frequency file. format: variantId, alternative allele frequency
parameter: aaf_file = path(".")
# Select the annotations to be used in the mask file. format: mask# annotation type
parameter: mask_file = path(".")
# Select the upper MAF to generate masks
parameter: aaf_bins =[0.001]
# Specify the type of test to use
parameter: vc_tests =['skat','skato','skato-acat','acatv','acato','acato-full']
# Specify if you would like to only include variants whose AAF is below a given threshold
parameter: vc_maxAAF = 0.001
# Specify the MAC of variants to be collapsed into a burden mask which is then included in the tests instead of the individual variants.
parameter: vc_MACthr = 10 ##10 is the default
# The way in which the alternative alleles are counted
parameter: build_mask = 'max'
# Minimum allele count to be used
parameter: minMAC = int
# One substep per genotype file; every substep is paired with the step 1 prediction list as _input.info
input: genoFile, group_by = 1, group_with = dict(info=[(path(f'{cwd}/{phenoFile:bn}_' + "_".join([x for x in phenoCol]) + '.regenie_pred.list'))] * len(genoFile))
# bgen input needs the sample file; anything else is treated as a plink bed prefix
input_options = f"--bgen {_input} --sample {sampleFile}" if _input.suffix == ".bgen" else f"--bed {_input:n}"
output: [f'{cwd}/{_input:bn}_rarevariant_'+ str(phenoCol[i]) + '.regenie.gz' for i in range(len(phenoCol))]
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = '15G', cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
bash:container= container, expand = "${ }", stderr = f'{cwd}/{_input:bn}.stderr', stdout = f'{cwd}/{_input:bn}.stdout'
    set -e
    # The covariate table is the one written by step 1, which names it after phenoFile.
    # --chrList and the --vc-* options are only passed when their lists are non-empty,
    # so no flag is ever emitted without a value.
    regenie \
      --step 2 \
      ${input_options} \
      --phenoFile ${cwd}/${phenoFile:bn}.regenie_phenotype \
      --covarFile ${cwd}/${phenoFile:bn}.regenie_covar \
      --phenoColList ${','.join(phenoCol)} \
      ${('--' + trait) if trait in ['bt'] else ''} \
      ${('--chrList ' + ','.join(['%s' % x for x in chrList if x is not None])) if chrList else ''} \
      ${("--extract-sets " + str(keep_gene)) if keep_gene.is_file() else ""} \
      --firth --approx \
      --pred ${_input.info} \
      --set-list ${set_list} \
      --anno-file ${anno_file} \
      --mask-def ${mask_file} \
      --aaf-bins ${",".join([str(x) for x in aaf_bins])} \
      ${('--build-mask ' + build_mask) if build_mask in ['max','sum','comphet'] else ''} \
      ${('--aaf-file ' + str(aaf_file)) if aaf_file.is_file() else ""} \
      ${('--vc-tests ' + ','.join(['%s' % x for x in vc_tests if x is not None])) if vc_tests else ''} \
      ${('--vc-maxAAF ' + str(vc_maxAAF)) if vc_tests else ''} \
      ${('--vc-MACthr ' + str(vc_MACthr)) if vc_tests else ''} \
      --singleton-carrier \
      --write-mask-snplist \
      --write-mask \
      --minMAC ${minMAC} \
      --bsize ${bsize} \
      --check-burden-files \
      --out  ${cwd}/${_input:bn}_rarevariant && \
      gzip -f --best ${_output:n}

In [None]:
[regenie_burden_3,regenie_vc_3]
# Select the annotations to be used in the mask file. format: mask# annotation type
parameter: mask_file = path(".")
# Select the upper MAF to generate masks
parameter: aaf_bins =[0.001]
# A 'singleton' bin is split out in addition to the requested AAF bins
aaf_bins = ['singleton'] + aaf_bins
# Mask names are the first space-separated field of each line of the mask file
with open(mask_file, "r") as mask_fh:
    masks = [i.split(" ")[0] for i in mask_fh.readlines()]
# One output per phenotype x mask x bin combination
output: [f'{cwd}/cache/{_input:bn}_'+ str(phenoCol[i]) + "_" + str(masks[j]) + "." + str(aaf_bins[k]) + '.regenie.gz' for i in range(len(phenoCol)) for j in range(len(masks)) for k in range(len(aaf_bins))] 
task: trunk_workers = 1, trunk_size = job_size, walltime = '12h', mem = '15G', cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
python: container= container, expand = "${ }", stderr = f'{cwd}/cache/{_input:bn}.stderr', stdout = f'{cwd}/cache/{_input:bn}.stdout'
    import pandas as pd 

    # Quoted, comma-separated paths give a proper list for one or many input files
    # and keep paths containing spaces intact.
    _input = [${_input:r,}]

    # Labels looked up in the ALLELE1 column: "<mask>.<bin>"
    allele_combos = [str(m) + "." + str(a) for m in ${masks} for a in ${aaf_bins}]
    for i, phen in enumerate(${phenoCol}):
        results = pd.read_csv(_input[i], compression='gzip', header=0, sep=r'\s+', quotechar='"', comment='#')
        results = results.fillna('NA')
        # Write the rows of every mask/bin combination to their own file
        for combo in allele_combos:
            subset = results[results["ALLELE1"] == combo]
            subset.to_csv(f'${cwd}/cache/${_input:bn}_{phen}'+ "_" + combo + '.regenie.gz', sep="\t", index=False)

## Merge results

In [None]:
# Merge results and log files
[regenie_3,regenie_burden_4,regenie_vc_4]
# Convert the P column from -log10(p) back to p-values; requires formatFile
parameter:reverse_log_p = False
depends: formatFile
# Inputs are regrouped so that each substep receives the files of one phenotype
input: group_by = lambda x: [x[i::len(phenoCol)] for i in range(len(phenoCol))], group_with='phenoCol'
output: f'{cwd}/{phenoFile:bn}_{_phenoCol}.{step_name.rsplit("_",1)[0]}.snp_stats.gz',
        f'{cwd}/{phenoFile:bn}_{_phenoCol}.{step_name.rsplit("_",1)[0]}.snp_counts.txt'
task: trunk_workers = 1, trunk_size = 1, walltime = '1h', mem = '36G', cores = 1, tags = f'{step_name}_{_output[0]:bn}'
python: container= container, expand ='${ }'
    import pandas as pd

    has_format_file = ${formatFile.is_file()}
    reverse_log_p = ${reverse_log_p}

    # Reversing the log transform relies on the unified column names from formatFile
    if reverse_log_p and not has_format_file:
        raise ValueError("formatFile is missing, but reverse_log_p is set to True.")

    # With a format file the merged table is first stored under its original column
    # names; the unified table then takes the declared output name.
    if has_format_file:
        output = '${_output[0]:n}' + '_original_columns' + '${_output[0]:x}'
    else:
        output = '${_output[0]}'

    data = pd.concat([pd.read_csv(f, compression='gzip', header=0, sep=r'\s+', quotechar='"', comment='#') for f in [${_input:r,}]], ignore_index=True)
    data = data.fillna('NA')
    data.to_csv(output, compression='gzip', sep='\t', header = True, index = False)
    # unify output format
    if has_format_file:
        import yaml
        sumstats = pd.read_csv(output, compression='gzip', header=0, sep=r'\s+', quotechar='"')
        with open(${formatFile:r}, 'r') as config_fh:
            config = yaml.safe_load(config_fh)
        # config maps unified column name -> column name in the merged table
        try:
            sumstats = sumstats.loc[:,list(config.values())]
        except KeyError as err:
            raise ValueError(f'According to ${formatFile}, input summary statistics should have the following columns: {list(config.values())}.') from err
        sumstats.columns = list(config.keys())
        if reverse_log_p:
            sumstats['P'] = sumstats['P'].apply(lambda row: 10**-row)
        sumstats.to_csv(${_output[0]:r}, compression='gzip', sep='\t', header = True, index = False)

bash: container= container, expand="$( )"
    # count result SNPs
    for f in $(_input); do echo "$f: `zcat $f | wc -l`"; done > $(_output[1])
    # merge stderr and stdout files
    # (each per-input log is appended to the combined log and then removed; missing logs are ignored)
    for f in $(_input); do 
        for ext in stderr stdout log; do
            echo "$f $ext:"
            cat ${f%.gz}.$ext 2>/dev/null || true
            rm -f ${f%.gz}.$ext 
        done
    done > $(_output[0]:n).log

In [None]:
# Separate the mask and MAF bins and remove the single variant genes
[regenie_burden_5]
# Top k genes to be annotated
parameter: k = 10
# P value limitation for annotation
parameter: plim = 2.5E-6
# A given list of gene for annotation
parameter: genelist = ""
# Select the annotations to be used in the mask file. format: mask# annotation type
parameter: mask_file = path(".")
# Select the upper MAF to generate masks
parameter: aaf_bins = [0.001]
# Mask names are the first space-separated field of each line of the mask file
with open(mask_file, "r") as mask_fh:
    masks = [i.split(" ")[0] for i in mask_fh.readlines()]
# "<phenotype>.<workflow>.<mask>.<bin>" labels used to name the per-bin outputs
bins=[str(phenoCol[i]) + f'.{step_name.rsplit("_",1)[0]}.' + str(masks[j]) + "." + str(aaf_bins[k]) for i in range(len(phenoCol)) for j in range(len(masks)) for k in range(len(aaf_bins))]
input: [f'{cwd}/{phenoFile:bn}_' + str(phenoCol[i]) +f'.{step_name.rsplit("_",1)[0]}.snp_stats.gz' for i in range(len(phenoCol))]+[f'{cwd}/{phenoFile:bn}_' + str(phenoCol[i]) +f'.{step_name.rsplit("_",1)[0]}.snp_counts.txt' for i in range(len(phenoCol))],group_by = lambda x: [x[i::len(phenoCol)] for i in range(len(phenoCol))], group_with='phenoCol'
output: [f'{cwd}/{phenoFile:bn}_' + bins[n] + '.snp_stats.gz' for n in range(len(bins))]+[f'{cwd}/{phenoFile:bn}_' + bins[n] + '.snp_counts.txt' for n in range(len(bins))]
task: trunk_workers = 1, trunk_size = job_size, walltime = '3h', mem = '64G', tags = f'{step_name}_{_input[0]:bn}'    
python: container= container, expand = "${ }", stderr = f'{cwd}/{step_name}.stderr', stdout = f'{cwd}/{step_name}.stdout'
    import pandas as pd
    data=pd.read_csv(${_input[0]:r}, compression='gzip', header=0, sep=r'\s+', quotechar='"', comment='#')
    # Every distinct ALT value is one "<mask>.<bin>" label
    binlist=pd.unique(data.ALT)

    for bin_label in binlist:
        # Separate regenie results into mask and MAF bins
        data[data['ALT']==bin_label].to_csv("${_input[0]:nn}."+bin_label+'.snp_stats.gz', compression='gzip', sep='\t', header = True, index = False)

    count=pd.read_csv(${_input[1]:r}, header=None, sep=r'\s+')
    for bin_label in binlist:
        # Literal substring match: the labels contain '.', which must not act as a regex wildcard
        count[count[0].str.contains(bin_label, regex=False)].to_csv("${_input[0]:nn}."+bin_label+'.snp_counts.txt', sep='\t', header = False, index = False)

In [None]:
# Separate the mask and MAF bins and remove the single variant genes
[regenie_vc_5]
# Top k genes to be annotated
parameter: k = 10
# P value limitation for annotation
parameter: plim = 2.5E-6
# A given list of gene for annotation
parameter: genelist = ""
# Select the annotations to be used in the mask file. format: mask# annotation type
parameter: mask_file = path(".")
# Select the upper MAF to generate masks
parameter: aaf_bins = [0.001]
import pandas as pd
# Mask names are the first space-separated field of each line of the mask file
with open(mask_file, "r") as mask_fh:
    masks = [i.split(" ")[0] for i in mask_fh.readlines()]
# "<phenotype>.<workflow>.<mask>.<bin>" labels used to name the per-bin outputs
bins=[str(phenoCol[i]) + f'.{step_name.rsplit("_",1)[0]}.' + str(masks[j]) + "." + str(aaf_bins[k]) for i in range(len(phenoCol)) for j in range(len(masks)) for k in range(len(aaf_bins))]
input: [f'{cwd}/{phenoFile:bn}_' + str(phenoCol[i]) + f'.{step_name.rsplit("_",1)[0]}.snp_stats.gz' for i in range(len(phenoCol))]+[f'{cwd}/{phenoFile:bn}_' + str(phenoCol[i]) +f'.{step_name.rsplit("_",1)[0]}.snp_counts.txt' for i in range(len(phenoCol))],group_by = lambda x: [x[i::len(phenoCol)] for i in range(len(phenoCol))], group_with='phenoCol'
# The merged results are read here as well, because the output names depend on the TEST values they contain
df=pd.read_csv(_input[0], compression='gzip', header=0, sep=r'\s+', quotechar='"', comment='#')
vc=pd.unique(df['TEST'])
output: [f'{cwd}/{phenoFile:bn}_' + bins[n] +'_'+ vc[k] +'.snp_stats.gz' for n in range(len(bins)) for k in range(len(vc))]
task: trunk_workers = 1, trunk_size = job_size, walltime = '3h', mem = '64G', tags = f'{step_name}_{_input[0]:bn}'    
python: container= container, expand = "${ }", stderr = f'{cwd}/{step_name}.stderr', stdout = f'{cwd}/{step_name}.stdout'
    import pandas as pd
    data=pd.read_csv(${_input[0]:r}, compression='gzip', header=0, sep=r'\s+', quotechar='"', comment='#')
    grouped = data.groupby(['TEST', 'ALT'])
    # Iterate over groups and save each group to a separate file
    for (test_value, alt_value), group in grouped:
        # Sort the data in the summary stats based on position
        group=group.sort_values(by=['POS'])
        # Define the output file name based on the ALT and TEST values
        output_file_name = f"${_input[0]:nn}.{alt_value}_{test_value}.snp_stats.gz"
        # Save the group to the output file
        group.to_csv(output_file_name, compression='gzip', sep='\t', index=False, header=True)


## Manhattan and QQ plots

Before running the pipeline make sure you have installed the necessary packages. We use the `qqman` package from R: https://www.r-graph-gallery.com/101_Manhattan_plot.html


In [None]:
# Manhattan and QQ plots using `qqman`
[regenie_4]
# Column name for BP
parameter: bp = 'POS'
# Column name for p-value
parameter: pval = 'P'
# Column name for SNP
parameter: snp = 'SNP'
# ylim set to 0 to use maximum -log10(p) in data
parameter: ylim = 0
# Slide separator appended to sections of the markdown summary
sep = '\n\n---\n'
depends: phenoFile
# Each substep receives the two merged files of one phenotype (summary statistics first)
input: group_by = 2, group_with = 'phenoCol'
output: manhattan = f'{_input[0]:nn}.manhattan.png',
        qq = f'{_input[0]:nn}.qq.png',
        analysis_summary = f'{_input[0]:nn}.analysis_summary.md'
task: trunk_workers = 1, trunk_size = job_size, walltime = '3h', mem = '64G', tags = f'{step_name}_{_output[0]:bn}'    
bash: container= container, expand = "${ }"
    echo '''---
    theme: base-theme
    style: |
      img {
        height: 80%;
        display: block;
        margin-left: auto;
        margin-right: auto;
      }
    ---    
    ''' > ${_output[2]}

R:  container= container , expand='${ }', stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'
    # some summary statistics for phenotype
    pheno <- read.table(${phenoFile:r}, header = TRUE, sep = "")$${_phenoCol}
    if (length(unique(pheno)) > 2) {
      # more than two distinct values: numeric summary
      out <- capture.output(summary(pheno))
    } else {
      # at most two distinct values: counts per level
      out <- as.data.frame(table(pheno))
      rownames(out) <- c('n_ctrl', 'n_case')
      out <- out[, 2, drop = FALSE]
    }
    write('# ${_phenoCol} result summary\n## Phenotype summary:\n```', ${_output[2]:r}, append = TRUE)
    write.table(out, ${_output[2]:r}, append = TRUE)
    write("```", ${_output[2]:r}, append = TRUE)

R:  container= container, expand='${ }', stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'
    library('qqman')
    data <- read.table(gzfile('${_input[0]}'), sep='\t', header = TRUE)
    lambda <- median(qchisq(1-data$${pval},1), na.rm=TRUE)/qchisq(0.5,1)
    # Upper limit of the -log10(p) axis: a non-zero ylim parameter is used as given;
    # otherwise the limit comes from the smallest p-value, or from the smallest
    # positive double when that p-value is exactly 0.
    min_p <- min(data$${pval}, na.rm = TRUE)
    if (${ylim} != 0) {
      ylim <- ${ylim}
    } else if (min_p != 0) {
      ylim <- abs(floor(log10(min_p)))
    } else {
      ylim <- abs(floor(log10(2.225074e-308)))
    }
    # Creating manhattan plot
    png('${_output[0]}', width = 6, height = 4, unit='in', res=300)
    manhattan_plot <- manhattan(data, chr='CHR', bp='${bp}', snp='${snp}', p='${pval}', main = 'Manhattan plot for ${_phenoCol} (${step_name.rsplit("_",1)[0]})', ylim = c(0, ylim), cex = 0.6, 
    cex.axis = 0.9, col = c("blue4", "orange3"), chrlabs = c(1:22))
    dev.off()
    # Creating qqplot
    png('${_output[1]}', width = 5, height = 5, unit='in', res=300)
    qq_plot <- qq(data$${pval}, main = 'QQ Plot for ${_phenoCol} (${step_name.rsplit("_",1)[0]})', xlim = c(0, 8), ylim = c(0, ylim), pch = 18, col = "blue4", cex = 1.5, las = 1)
    dev.off()
    write('## p-value summary:', ${_output[2]:r}, append = TRUE)
    write(paste("Genomic inflation factor is", round(lambda,3), "for", nrow(data), "variants analyzed.${sep}"), ${_output[2]:r}, append = TRUE)

bash: expand = True
  set -e
  echo -e "# QQ plot for {_phenoCol}\n" >> {_output[2]}
  echo -e "![]({_output[1]:bn}.png){sep}" >> {_output[2]}
  echo -e "# Manhattan plot for {_phenoCol}\n" >> {_output[2]}
  echo -e "![]({_output[0]:bn}.png){sep}" >> {_output[2]}
  echo -e "# Result files\n\`\`\`" >> {_output[2]}
  # -E is required for '|' to act as alternation when filtering out the log files
  ls {_input[0]:nn}.* | grep -vE 'stderr|stdout'>> {_output[2]}
  echo -e "\`\`\`" >> {_output[2]}

In [None]:
# Manhattan and QQ plots using `qqman`
[regenie_vc_6]
# Column name for BP
parameter: bp = 'POS'
# Column name for p-value
parameter: pval = 'P'
# Column name for genes
parameter: snp = 'SNP'
# ylim set to 0 to use maximum -log10(p) in data
parameter: ylim = 0
# Select the upper MAF to generate masks
parameter: aaf_bins =[0.001]
# The vc_tests performed
parameter: vc_tests_reg =['ADD-SKAT','ADD-SKATO']
# Select the annotations to be used in the mask file. format: mask# annotation_type
parameter: mask_file = path(".")
# Mask names are the first space-separated field of each line of the mask file
with open(mask_file, "r") as mask_fh:
    masks = [i.split(" ")[0] for i in mask_fh.readlines()]
# "<phenotype>.<workflow>.<mask>.<bin>" labels used to locate the per-bin inputs
bins=[str(phenoCol[i]) + f'.{step_name.rsplit("_",1)[0]}.' + str(masks[j]) + "." + str(aaf_bins[k]) for i in range(len(phenoCol)) for j in range(len(masks)) for k in range(len(aaf_bins))]
# Slide separator appended to sections of the markdown summary
sep = '\n\n---\n'
depends: phenoFile
# One substep per bin x test summary-statistics file
input:[f'{cwd}/{phenoFile:bn}_' + bins[n] +'_'+ vc_tests_reg[k] +'.snp_stats.gz' for n in range(len(bins)) for k in range(len(vc_tests_reg))], group_by=1
output: manhattan = f'{_input[0]:nn}.manhattan.png',
        qq = f'{_input[0]:nn}.qq.png',
        analysis_summary = f'{_input[0]:nn}.analysis_summary.md'
task: trunk_workers = 1, trunk_size = job_size, walltime = '3h', mem = '64G', tags = f'{step_name}_{_output[0]:bn}'   
bash: container= container,  expand = "${ }"
    echo '''---
    theme: base-theme
    style: |
      img {
        height: 80%;
        display: block;
        margin-left: auto;
        margin-right: auto;
      }
    ---    
    ''' > ${_output[2]}

R: container= container, expand='${ }', stderr = f'{cwd}/{step_name}.stderr', stdout = f'{cwd}/{step_name}.stdout'
    # some summary statistics for phenotype
    # The bin label and the phenotype name are recovered from the input file name;
    # the path pieces are matched literally (fixed = TRUE), not as regular expressions.
    tmp <- unlist(strsplit(${_input[0]:r}, "${cwd}/${phenoFile:bn}_", fixed = TRUE))[2]
    bins <- unlist(strsplit(tmp, ".snp_stats.gz", fixed = TRUE))[1]
    phenoCol <- unlist(strsplit(bins, "\\."))[1]
    # Select the phenotype this input belongs to (the R variable parsed above)
    pheno <- read.table(${phenoFile:r}, header = TRUE, sep = "")[[phenoCol]]
    if (length(unique(pheno)) > 2) {
      # more than two distinct values: numeric summary
      out <- capture.output(summary(pheno))
    } else {
      # at most two distinct values: counts per level
      out <- as.data.frame(table(pheno))
      rownames(out) <- c('n_ctrl', 'n_case')
      out <- out[, 2, drop = FALSE]
    }
    write(paste0('# ',bins,' result summary\n## Phenotype summary:\n```'), ${_output[2]:r}, append = TRUE)
    write.table(out, ${_output[2]:r}, append = TRUE)
    write("```", ${_output[2]:r}, append = TRUE)

R:  container= container, expand='${ }', stderr = f'{cwd}/{step_name}.stderr', stdout = f'{cwd}/{step_name}.stdout'
    tmp <- unlist(strsplit(${_input[0]:r}, "${cwd}/${phenoFile:bn}_", fixed = TRUE))[2]
    bins <- unlist(strsplit(tmp, ".snp_stats.gz", fixed = TRUE))[1]
    library('qqman')
    data <- read.table(gzfile('${_input[0]}'), sep='\t', header = TRUE)
    lambda <- median(qchisq(1-data$${pval},1), na.rm=TRUE)/qchisq(0.5,1)
    # Upper limit of the -log10(p) axis: a non-zero ylim parameter is used as given;
    # otherwise the limit comes from the smallest p-value, or from the smallest
    # positive double when that p-value is exactly 0.
    min_p <- min(data$${pval}, na.rm = TRUE)
    if (${ylim} != 0) {
      ylim <- ${ylim}
    } else if (min_p != 0) {
      ylim <- abs(floor(log10(min_p)))
    } else {
      ylim <- abs(floor(log10(2.225074e-308)))
    }
    # Creating manhattan plot
    png('${_output[0]}', width = 6, height = 4, unit='in', res=300)
    manhattan_plot <- manhattan(data, chr='CHR', bp='${bp}', snp='${snp}', p='${pval}', main = paste0('Manhattan plot for ',bins), ylim = c(0, ylim), cex = 0.6, 
    cex.axis = 0.9, col = c("blue4", "orange3"),  xlim=c(0,max(data$${bp})*1.2))  #, chrlabs = as.character(c(1:22))
    dev.off()
    # Creating qqplot
    png('${_output[1]}', width = 5, height = 5, unit='in', res=300)
    qq_plot <- qq(data$${pval}, main = paste0('QQ plot for ',bins), xlim = c(0, 8), ylim = c(0, ylim), pch = 18, col = "blue4", cex = 1.5, las = 1)
    dev.off()
    write('## p-value summary:', ${_output[2]:r}, append = TRUE)
    write(paste("Genomic inflation factor is", round(lambda,3), "for", nrow(data), "genes analyzed.${sep}"), ${_output[2]:r}, append = TRUE)

bash: container= container, expand = True
  set -e
  echo -e "# QQ plot for {_input[0]:bnnn}\n" >> {_output[2]}
  echo -e "![]({_output[1]:bn}.png){sep}" >> {_output[2]}
  echo -e "# Manhattan plot for {_input[0]:bnnn}\n" >> {_output[2]}
  echo -e "![]({_output[0]:bn}.png){sep}" >> {_output[2]}
  echo -e "# Result files\n\`\`\`" >> {_output[2]}
  ls {_input[0]:nnn}.* | grep -vP 'stderr|stdout'>> {_output[2]}
  echo -e "\`\`\`" >> {_output[2]}

In [None]:
# Manhattan and QQ plots using `qqman`
[regenie_burden_6]
# Column name for BP
parameter: bp = 'POS'
# Column name for p-value
parameter: pval = 'P'
# Column name for genes
parameter: snp = 'SNP'
# ylim set to 0 to use maximum -log10(p) in data
parameter: ylim = 0
# Select the upper MAF to generate masks
parameter: aaf_bins =[0.001]
# Select the annotations to be used in the mask file. format: mask# annotation_type
parameter: mask_file = path(".")
# Mask names are the first space-separated field of each line of the mask file
with open(mask_file, "r") as mask_fh:
    masks = [i.split(" ")[0] for i in mask_fh.readlines()]
# "<phenotype>.<workflow>.<mask>.<bin>" labels used to locate the per-bin inputs
bins=[str(phenoCol[i]) + f'.{step_name.rsplit("_",1)[0]}.' + str(masks[j]) + "." + str(aaf_bins[k]) for i in range(len(phenoCol)) for j in range(len(masks)) for k in range(len(aaf_bins))]
# Slide separator appended to sections of the markdown summary
sep = '\n\n---\n'
depends: phenoFile
# One substep per bin summary-statistics file
input: [f'{cwd}/{phenoFile:bn}_' + bins[n] + '.snp_stats.gz' for n in range(len(bins))], group_by=1
output: manhattan = f'{_input[0]:nn}.manhattan.png',
        qq = f'{_input[0]:nn}.qq.png',
        analysis_summary = f'{_input[0]:nn}.analysis_summary.md'
task: trunk_workers = 1, trunk_size = job_size, walltime = '3h', mem = '64G', tags = f'{step_name}_{_output[0]:bn}'   
bash: container= container,  expand = "${ }"
    echo '''---
    theme: base-theme
    style: |
      img {
        height: 80%;
        display: block;
        margin-left: auto;
        margin-right: auto;
      }
    ---    
    ''' > ${_output[2]}

R: container= container, expand='${ }', stderr = f'{cwd}/{step_name}.stderr', stdout = f'{cwd}/{step_name}.stdout'
    # some summary statistics for phenotype
    # The bin label and the phenotype name are recovered from the input file name;
    # the path pieces are matched literally (fixed = TRUE), not as regular expressions.
    tmp <- unlist(strsplit(${_input[0]:r}, "${cwd}/${phenoFile:bn}_", fixed = TRUE))[2]
    bins <- unlist(strsplit(tmp, ".snp_stats.gz", fixed = TRUE))[1]
    phenoCol <- unlist(strsplit(bins, "\\."))[1]
    # Select the phenotype this input belongs to (the R variable parsed above)
    pheno <- read.table(${phenoFile:r}, header = TRUE, sep = "")[[phenoCol]]
    if (length(unique(pheno)) > 2) {
      # more than two distinct values: numeric summary
      out <- capture.output(summary(pheno))
    } else {
      # at most two distinct values: counts per level
      out <- as.data.frame(table(pheno))
      rownames(out) <- c('n_ctrl', 'n_case')
      out <- out[, 2, drop = FALSE]
    }
    write(paste0('# ',bins,' result summary\n## Phenotype summary:\n```'), ${_output[2]:r}, append = TRUE)
    write.table(out, ${_output[2]:r}, append = TRUE)
    write("```", ${_output[2]:r}, append = TRUE)

R:  container= container, expand='${ }', stderr = f'{cwd}/{step_name}.stderr', stdout = f'{cwd}/{step_name}.stdout'
    tmp <- unlist(strsplit(${_input[0]:r}, "${cwd}/${phenoFile:bn}_", fixed = TRUE))[2]
    bins <- unlist(strsplit(tmp, ".snp_stats.gz", fixed = TRUE))[1]
    library('qqman')
    data <- read.table(gzfile('${_input[0]}'), sep='\t', header = TRUE)
    lambda <- median(qchisq(1-data$${pval},1), na.rm=TRUE)/qchisq(0.5,1)
    # Upper limit of the -log10(p) axis: a non-zero ylim parameter is used as given;
    # otherwise the limit comes from the smallest p-value, or from the smallest
    # positive double when that p-value is exactly 0.
    min_p <- min(data$${pval}, na.rm = TRUE)
    if (${ylim} != 0) {
      ylim <- ${ylim}
    } else if (min_p != 0) {
      ylim <- abs(floor(log10(min_p)))
    } else {
      ylim <- abs(floor(log10(2.225074e-308)))
    }
    # Creating manhattan plot
    png('${_output[0]}', width = 6, height = 4, unit='in', res=300)
    manhattan_plot <- manhattan(data, chr='CHR', bp='${bp}', snp='${snp}', p='${pval}', main = paste0('Manhattan plot for ',bins), ylim = c(0, ylim), cex = 0.6, 
    cex.axis = 0.9, col = c("blue4", "orange3"), xlim=c(0,max(data$${bp})*1.2))  #, chrlabs = as.character(c(1:22))
    dev.off()
    # Creating qqplot
    png('${_output[1]}', width = 5, height = 5, unit='in', res=300)
    qq_plot <- qq(data$${pval}, main = paste0('QQ plot for ',bins), xlim = c(0, 8), ylim = c(0, ylim), pch = 18, col = "blue4", cex = 1.5, las = 1)
    dev.off()
    write('## p-value summary:', ${_output[2]:r}, append = TRUE)
    write(paste("Genomic inflation factor is", round(lambda,3), "for", nrow(data), "genes analyzed.${sep}"), ${_output[2]:r}, append = TRUE)

bash: container= container, expand = True
  set -e
  echo -e "# QQ plot for {_input[0]:bnnn}\n" >> {_output[2]}
  echo -e "![]({_output[1]:bn}.png){sep}" >> {_output[2]}
  echo -e "# Manhattan plot for {_input[0]:bnnn}\n" >> {_output[2]}
  echo -e "![]({_output[0]:bn}.png){sep}" >> {_output[2]}
  echo -e "# Result files\n\`\`\`" >> {_output[2]}
  ls {_input[0]:nnn}.* | grep -vP 'stderr|stdout'>> {_output[2]}
  echo -e "\`\`\`" >> {_output[2]}