Skip to content

Commit

Permalink
Merge pull request #29 from genxnetwork/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
alex-medvedev-msc committed Sep 28, 2021
2 parents cfaa4cf + 33ded70 commit 4baa699
Show file tree
Hide file tree
Showing 36 changed files with 880 additions and 211 deletions.
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ results
vcf
test_data
.snakemake
datastore
43 changes: 43 additions & 0 deletions .dockstore.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
version: 1.2
workflows:
- name: GRAPE preprocessing
subclass: CWL
publish: true
primaryDescriptorPath: workflows/preprocess2/cwl/preprocess.cwl
testParameterFiles:
- workflows/preprocess2/cwl/config.json

- name: GRAPE reference downloading
subclass: CWL
publish: true
primaryDescriptorPath: workflows/reference/cwl/ref.cwl
testParameterFiles:
- workflows/reference/cwl/config.json

- name: GRAPE reference minimal downloading
subclass: CWL
publish: true
primaryDescriptorPath: workflows/reference/cwl/ref_min.cwl
testParameterFiles:
- workflows/reference/cwl/config.json

- name: GRAPE bundle downloading
subclass: CWL
publish: true
primaryDescriptorPath: workflows/bundle/cwl/bundle.cwl
testParameterFiles:
- workflows/bundle/cwl/config.json

- name: GRAPE bundle minimal downloading
subclass: CWL
publish: true
primaryDescriptorPath: workflows/bundle/cwl/bundle_min.cwl
testParameterFiles:
- workflows/bundle/cwl/config.json

- name: GRAPE simulation
subclass: CWL
publish: true
primaryDescriptorPath: workflows/pedsim/cwl/simulation.cwl
testParameterFiles:
- workflows/pedsim/cwl/config.json
48 changes: 30 additions & 18 deletions config.yaml
Original file line number Diff line number Diff line change
@@ -1,25 +1,29 @@
mode: all
flow: ibis
ibis_seg_len: 7.0
ibis_min_snp: 500
zero_seg_count: 0.5
zero_seg_len: 5.0
alpha: 0.01
samples_file: samples.tsv
vcf_file: vcf/merged.vcf.gz
use_simulated_ibd: False
use_rapid: False
use_ibis: False
ref_dir: /media/ref
azure_public_key: "?sv=2020-08-04&ss=f&srt=sco&sp=r&se=2022-08-08T14:35:53Z&st=2021-08-27T06:35:53Z&spr=https&sig=SjxrSn2KBuQYjYgT2ZZTHQ6IOhA%2BRUSvLIgog%2FH2Tnk%3D"
1000g_public_key: "?sv=2019-10-10&si=prod&sr=c&sig=9nzcxaQn0NprMPlSh4RhFQHcXedLQIcFgbERiooHEqM%3D"
reference:
GRCh37_fasta:
file: human_g1k_v37.fasta
url: ftp://ftp.ncbi.nlm.nih.gov/1000genomes/ftp/technical/reference/human_g1k_v37.fasta.gz
url: https://dataset1000genomes.blob.core.windows.net/dataset/technical/reference/human_g1k_v37.fasta.gz
filesize: 892331003
md5: 45f81df94f0408d082363e34a081ed81
mirror:
- ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz
GRCh37_fasta_fai:
file: human_g1k_v37.fasta.fai
url: ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/human_g1k_v37.fasta.fai
url: https://dataset1000genomes.blob.core.windows.net/dataset/technical/reference/human_g1k_v37.fasta.fai
filesize: 2746
md5: 772484cc07983aba1355c7fb50f176d4
mirror:
- ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.fai
GENETIC_MAP:
file: tables/genetic_map_hg19_withX.txt.gz
url: https://storage.googleapis.com/broad-alkesgroup-public/Eagle/downloads/tables/genetic_map_hg19_withX.txt.gz
Expand All @@ -29,12 +33,12 @@ reference:
- https://data.broadinstitute.org/alkesgroup/Eagle/downloads/tables/genetic_map_hg19_withX.txt.gz
genetic_map_GRCh37:
file: genetic_map_GRCh37/genetic_map_GRCh37_chr{chrom}.txt
url: ftp://ftp.ncbi.nlm.nih.gov/hapmap/recombination/2011-01_phaseII_B37/genetic_map_HapMapII_GRCh37.tar.gz
url: https://ftp-trace.ncbi.nih.gov/hapmap/recombination/2011-01_phaseII_B37/genetic_map_HapMapII_GRCh37.tar.gz
filesize: 37730100
md5: 1bc10a34d985e68e1f38ceb137b87929
vcfRef:
file: 1000genome/bcf/1000genome_chr{chrom}.bcf
url: ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/supporting/bcf_files/ALL.chr$chrom.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf
url: https://dataset1000genomes.blob.core.windows.net/dataset/release/20130502/supporting/bcf_files/ALL.chr$chrom.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf
md5: [
'01aa2085294f4ea536ffcbe10d358799', # 1
'a80e9a4c5803780a31af7f2792a8b477', # 2
Expand Down Expand Up @@ -67,7 +71,7 @@ reference:
file: Minimac/{chrom}.1000g.Phase3.v5.With.Parameter.Estimates.m3vcf.gz
lift_chain:
file: hg38ToHg19.over.chain.gz
url: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz
url: http://hgdownload.cse.ucsc.edu/goldenpath/hg38/liftOver/hg38ToHg19.over.chain.gz
filesize: 1246411
md5: ff3031d93792f4cbb86af44055efd903
cmmap:
Expand All @@ -77,7 +81,7 @@ reference:
md5: 8d7d0bf2bfd13a39cad0e8d5012666a2
SITE_1000GENOME:
file: 1000genome/allele_info/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5b.20130502.sites.only_rs.biallelic.tab
url: ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5c.20130502.sites.vcf.gz
url: https://dataset1000genomes.blob.core.windows.net/dataset/release/20130502/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5c.20130502.sites.vcf.gz
filesize: 1458224240
md5: d9c22df30ea810d5c7d98391e482d1a3
hapmap_ped:
Expand All @@ -86,21 +90,29 @@ reference:
file: hapmap/ceu.map
hapmap_fam:
file: hapmap/relations.txt
url: ftp://ftp.ncbi.nlm.nih.gov/hapmap/phase_3/relationships_w_pops_121708.txt
url: https://ftp-trace.ncbi.nih.gov/hapmap/phase_3/relationships_w_pops_121708.txt
filesize: 36765
md5: a70333f0e0bd9d5a72a3a253895228b1
hd_genotype_chip:
file: 1000genome/hd_genotype_chip/all.vcf.gz
url: http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/supporting/hd_genotype_chip/ALL.chip.omni_broad_sanger_combined.20140818.snps.genotypes.vcf.gz
filesize: 1331157654
md5: 620462f7d8ffc9b82e1fff90c3aecd37
affymetrix_chip:
file: 1000genome/affymetrix_chip/all.vcf.gz
url: http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/supporting/hd_genotype_chip/ALL.wgs.nhgri_coriell_affy_6.20140825.genotypes_has_ped.vcf.gz
url: https://dataset1000genomes.blob.core.windows.net/dataset/release/20130502/supporting/hd_genotype_chip/ALL.wgs.nhgri_coriell_affy_6.20140825.genotypes_has_ped.vcf.gz
filesize: 782884333
md5: 019d00c28c1ff60906f72c11f7997c4c
pedsim_map:
file: refined_mf.simmap
url: https://github.com/cbherer/Bherer_etal_SexualDimorphismRecombination/raw/master/Refined_genetic_map_b37.tar.gz
filesize: 41278300
md5: 39e6e8620d616362875f2538eae2f279
md5: 39e6e8620d616362875f2538eae2f279
bundle:
file: ref.tar.gz
url: https://bioinformatics.file.core.windows.net/bundles/ref.tar.gz
filesize: 18582485368
md5: 67278f83139f375e22bd56544d523fa3
bundle_min:
file: ref_min.tar.gz
url: https://bioinformatics.file.core.windows.net/bundles/ref_min.tar.gz
filesize: 3074816502
md5: 15002b7eb18e3b991c16ef1131c4cc42
picard:
file: picard.jar
url: https://github.com/broadinstitute/picard/releases/download/2.26.2/picard.jar
8 changes: 7 additions & 1 deletion containers/snakemake/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,20 @@ ENV SHELL /bin/bash

#ENV CONDA_PKGS_DIR /tmp/conda/pgks

RUN apt-get install -y wget bzip2 gnupg2 git && \
RUN apt-get install -y wget bzip2 gnupg2 git openjdk-11-jre openjdk-11-jdk libgomp1 && \
wget -nv https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \
rm Miniconda3-latest-Linux-x86_64.sh && \
conda install -c conda-forge mamba && \
mamba env create -f envs/snakemake.yaml && \
conda clean --all -y

RUN wget "https://bioinformatics.file.core.windows.net/bundles/Minimac3Executable.tar.gz?sv=2020-08-04&ss=f&srt=sco&sp=r&se=2022-08-08T14:35:53Z&st=2021-08-27T06:35:53Z&spr=https&sig=SjxrSn2KBuQYjYgT2ZZTHQ6IOhA%2BRUSvLIgog%2FH2Tnk%3D" -O Minimac3Executable.tar.gz

RUN tar -xzvf Minimac3Executable.tar.gz

ENV PATH "$PATH:/Minimac3Executable/bin"

# Workaround of NonWritableError when conda tries to create environments for the first time
# funnel launches docker containers with --read-only and snakemake cannot create conda envs
# because it has to do something with urls.txt
Expand Down
5 changes: 5 additions & 0 deletions envs/ersa.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
name: ersa
channels:
- b3bg
dependencies:
- ersa
5 changes: 5 additions & 0 deletions envs/ibis.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
name: ibis
channels:
- b3bg
dependencies:
- ibis
5 changes: 5 additions & 0 deletions envs/ped-sim.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
name: ped-sim
channels:
- b3bg
dependencies:
- ped-sim==1.3
3 changes: 2 additions & 1 deletion envs/snakemake.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ dependencies:
- seaborn==0.10.1
- biom-format==2.1.8
- scikit-bio==0.5.6
- docutils==0.16
- docutils==0.16
- mmh3==3.0.0
59 changes: 45 additions & 14 deletions launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import argparse
import shutil
import os
from inspect import getsourcefile

# Returns an integer value for total available memory, in GB.
def total_memory_gb():
Expand Down Expand Up @@ -203,6 +204,11 @@ def get_parser_args():
default="ceph_unrelated_all.tsv",
help="List of samples from 1000genomes for pedsim to use as founders. You can choose only from 'ceph_unrelated_all.tsv', 'all.tsv'")

parser.add_argument(
"--use-singularity",
help="If this argument is present, Snakemake will use Singularity for the containerization environment",
action="store_true")

# --singularity-prefix /tmp --singularity-args='-B /media:/media -B /tmp:/tmp -W /tmp' --conda-prefix /tmp
parser.add_argument(
'--singularity-prefix',
Expand All @@ -220,14 +226,24 @@ def get_parser_args():
help='Conda prefix for environments'
)

parser.add_argument(
'--use-bundle',
default=False,
help='Download all references as single file'
)

args = parser.parse_args()

valid_commands = ['preprocess', 'find', 'simulate', 'hapmap', 'reference']
valid_commands = ['preprocess', 'find', 'simulate', 'hapmap', 'reference', 'bundle']
if args.command not in valid_commands:
raise RuntimeError(f'command {args.command} not in list of valid commands: {valid_commands}')

if args.impute and not args.phase:
raise ValueError('If --impute is present, then --phase must also be present')

if args.command != 'reference' and args.use_bundle:
raise ValueError('--bundle option only available for reference downloading')

return args


Expand Down Expand Up @@ -262,36 +278,48 @@ def copy_input(input_dir, working_dir, samples_file):

start_time = datetime.datetime.now()

# in case when launcher.py is executed outside the Snakemake dir
current_path = os.path.dirname(getsourcefile(lambda: 0))

if not os.path.exists(args.directory):
os.makedirs(args.directory)

if args.command == 'simulate':
copy_input('workflows/pedsim/params', args.directory, os.path.join('workflows/pedsim/', args.sim_samples_file))
copy_input(
os.path.join(current_path, 'workflows/pedsim/params'),
args.directory, os.path.join(current_path, 'workflows/pedsim/', args.sim_samples_file)
)
# for some reason launching with docker from command line
# sets root directory for 'configfile' directive in Snakefile as snakemake.workdir
# sets root directory for 'configfile' directive in bundle.Snakefile as snakemake.workdir
# therefore config.yaml must be in snakemake.workdir
shutil.copy('workflows/pedsim/config.yaml', os.path.join(args.directory, 'config.yaml'))
shutil.copy(
os.path.join(current_path, 'workflows/pedsim/config.yaml'),
os.path.join(args.directory, 'config.yaml')
)

if args.command == 'hapmap':
# for some reason launching with docker from command line
# sets root directory for 'configfile' directive in Snakefile as snakemake.workdir
# therefore config.yaml must be in snakemake.workdir
shutil.copy('workflows/hapmap/config.yaml', os.path.join(args.directory, 'config.yaml'))

shutil.copy(
os.path.join(current_path, 'workflows/hapmap/config.yaml'),
os.path.join(args.directory, 'config.yaml')
)

if args.command == 'preprocess':
shutil.copy(args.vcf_file, os.path.join(args.directory, 'input.vcf.gz'))

if args.command in ['preprocess', 'find', 'reference']:
if args.command in ['preprocess', 'find', 'reference', 'bundle']:
if args.directory != '.':
shutil.copy('config.yaml', os.path.join(args.directory, 'config.yaml'))
shutil.copy(os.path.join(current_path, 'config.yaml'), os.path.join(args.directory, 'config.yaml'))

snakefiles = {
'preprocess': 'workflows/preprocess2/Snakefile',
'find': 'Snakefile',
'find': 'workflows/find/Snakefile',
'simulate': 'workflows/pedsim/Snakefile',
'hapmap': 'workflows/hapmap/Snakefile',
'reference': 'workflows/reference/Snakefile'
'reference': 'workflows/reference/Snakefile',
'bundle': 'workflows/bundle/Snakefile'
}

if args.client:
Expand All @@ -300,7 +328,10 @@ def copy_input(input_dir, working_dir, samples_file):
raise RuntimeError(f'Background data is missing for the client mode')

if not args.snakefile:
snakefile = snakefiles[args.command]
if args.command == 'reference' and args.use_bundle:
snakefile = os.path.join(current_path, snakefiles['bundle'])
else:
snakefile = os.path.join(current_path, snakefiles[args.command])
else:
snakefile = args.snakefile

Expand All @@ -321,7 +352,7 @@ def copy_input(input_dir, working_dir, samples_file):
if args.flow not in ['germline', 'ibis', 'ibis_king']:
raise ValueError(f'--flow can be one of the ["germline", "ibis", "ibis_king"] and not {args.flow}')
config_dict['flow'] = args.flow
if args.command in ['preprocess', 'simulate', 'hapmap', 'reference']:
if args.command in ['preprocess', 'simulate', 'hapmap', 'reference', 'bundle']:
config_dict['remove_imputation'] = args.remove_imputation
config_dict['impute'] = args.impute
config_dict['phase'] = args.phase
Expand All @@ -334,7 +365,7 @@ def copy_input(input_dir, working_dir, samples_file):

if not snakemake.snakemake(
snakefile=snakefile,
#configfiles=[args.configfile],
configfiles=[args.configfile or 'config.yaml'],
config=config_dict,
workdir=args.directory,
cores=args.cores,
Expand All @@ -347,7 +378,7 @@ def copy_input(input_dir, working_dir, samples_file):
until=[args.until] if args.until is not None else [],
use_conda=True,
conda_prefix=args.conda_prefix,
use_singularity=True,
use_singularity=args.use_singularity,
singularity_prefix=args.singularity_prefix,
singularity_args=args.singularity_args,
envvars=['CONDA_ENVS_PATH', 'CONDA_PKGS_DIRS']
Expand Down
Loading

0 comments on commit 4baa699

Please sign in to comment.