Skip to content

Commit

Permalink
Merge pull request #18 from genxnetwork/test_new_ref
Browse files Browse the repository at this point in the history
Update reference downloading
  • Loading branch information
superbsky committed Aug 23, 2021
2 parents 4a882c8 + 3b44396 commit cfaa4cf
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 9 deletions.
7 changes: 7 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@ reference:
md5: 45f81df94f0408d082363e34a081ed81
mirror:
- ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz
GRCh37_fasta_fai:
file: human_g1k_v37.fasta.fai
url: ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/human_g1k_v37.fasta.fai
filesize: 2746
md5: 772484cc07983aba1355c7fb50f176d4
mirror:
- ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.fai
GENETIC_MAP:
file: tables/genetic_map_hg19_withX.txt.gz
url: https://storage.googleapis.com/broad-alkesgroup-public/Eagle/downloads/tables/genetic_map_hg19_withX.txt.gz
Expand Down
4 changes: 3 additions & 1 deletion readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ Main features:

The pipeline is implemented with the Snakemake framework. All used components are wrapped in Singularity containers or isolated in a Conda environment.

The visualisation of execution graph: [svg](https://bitbucket.org/genxglobal/genx-relatives-snakemake/raw/077f33cfdd421ae17b5c02a3a5f8eb34bd20e1fd/dag.svg).
The visualisation of execution graph: [svg](https://raw.githubusercontent.com/genxnetwork/grape/master/dag.svg).

Multi-core parallelization is highly utilized due to the ability to split input data by each sample/chromosome.

Expand Down Expand Up @@ -62,6 +62,8 @@ launcher.py reference --ref-directory /media/ref
```

If you want to run simulation after reference downloading you should add either `--impute` or `--phase` flag

4. (Optional) Compile Funnel from https://github.com/messwith/funnel with go 1.12+ and make. Then one can just use bin/funnel binary.
This Funnel fork simply adds the ‘--privileged’ flag to all task docker commands.
Without ‘--privileged’ singularity containers do not work inside docker.
Expand Down
33 changes: 25 additions & 8 deletions workflows/reference/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ configfile: "config.yaml"
# it's where they will be stored ~40G
REF_DIR = config["ref_dir"]
GRCH37_FASTA = join(REF_DIR, config["reference"]["GRCh37_fasta"]["file"])
GRCH37_FASTA_FAI = join(REF_DIR, config["reference"]["GRCh37_fasta_fai"]["file"])
GENETIC_MAP = join(REF_DIR, config["reference"]["GENETIC_MAP"]["file"])
GENETIC_MAP_GRCH37 = join(REF_DIR, config["reference"]["genetic_map_GRCh37"]["file"])
REF_VCF = join(REF_DIR, config["reference"]["vcfRef"]["file"])
Expand All @@ -27,6 +28,11 @@ GRCH37_FASTA_mirror = config["reference"]["GRCh37_fasta"]["mirror"]
GRCH37_FASTA_basename = basename(GRCH37_FASTA_url)
GRCH37_FASTA_md5 = config["reference"]["GRCh37_fasta"]["md5"]

GRCH37_FASTA_FAI_url = config["reference"]["GRCh37_fasta_fai"]["url"]
GRCH37_FASTA_FAI_mirror = config["reference"]["GRCh37_fasta_fai"]["mirror"]
GRCH37_FASTA_FAI_basename = basename(GRCH37_FASTA_FAI_url)
GRCH37_FASTA_FAI_md5 = config["reference"]["GRCh37_fasta_fai"]["md5"]

GENETIC_MAP_url = config["reference"]["GENETIC_MAP"]["url"]
GENETIC_MAP_mirror = config["reference"]["GENETIC_MAP"]["mirror"]
GENETIC_MAP_md5 = config["reference"]["GENETIC_MAP"]["md5"]
Expand Down Expand Up @@ -74,6 +80,7 @@ if need_phase or need_imputation:
input:
GRCh37_fasta = GRCH37_FASTA,
GRCh37_fasta_dict = expand("{ref}.dict", ref=GRCH37_FASTA),
GRCh37_fasta_fai = GRCH37_FASTA_FAI,
GENETIC_MAP = GENETIC_MAP,
genetic_map_GRCh37 = expand(GENETIC_MAP_GRCH37, chrom=CHROMOSOMES),
cmmap = expand(CMMAP, chrom=CHROMOSOMES),
Expand All @@ -89,6 +96,7 @@ else:
input:
GRCh37_fasta = GRCH37_FASTA,
GRCh37_fasta_dict = expand("{ref}.dict", ref=GRCH37_FASTA),
GRCh37_fasta_fai= GRCH37_FASTA_FAI,
GENETIC_MAP = GENETIC_MAP,
genetic_map_GRCh37 = expand(GENETIC_MAP_GRCH37, chrom=CHROMOSOMES),
cmmap = expand(CMMAP, chrom=CHROMOSOMES),
Expand All @@ -100,7 +108,8 @@ else:

rule download_GRCh37_fasta:
output:
GRCh37_fasta = GRCH37_FASTA
GRCh37_fasta = GRCH37_FASTA,
GRCh37_fasta_fai = GRCH37_FASTA_FAI
conda:
"../../envs/download.yaml"
log:
Expand All @@ -118,6 +127,13 @@ rule download_GRCh37_fasta:
fi
# gzip exit with "decompression OK, trailing garbage ignored". need to silence
gzip -d {REF_DIR}/human_g1k_v37.fasta.gz |& tee -a {log}
wget {GRCH37_FASTA_FAI_url} -P {REF_DIR} --tries 50 -c |& tee -a {log} || wget {GRCH37_FASTA_FAI_mirror} -P {REF_DIR} --tries 50 -c |& tee -a {log}
sum2=$(md5sum {REF_DIR}/{GRCH37_FASTA_FAI_basename} | cut -d " " -f1)
if [ $sum2 != {GRCH37_FASTA_FAI_md5} ]; then
echo "md5 sum of GRCH37_FAST_FAI is invalid" |& tee -a {log}
exit 1
fi
exit 0
"""

Expand Down Expand Up @@ -175,7 +191,7 @@ rule download_genetic_map_GRCh37:
fi
tar -zxvf {REF_DIR}/genetic_map_HapMapII_GRCh37.tar.gz -C {REF_DIR}/genetic_map_GRCh37 --exclude=README.txt |& tee -a {log}
rm {REF_DIR}/genetic_map_HapMapII_GRCh37.tar.gz |& tee -a {log}
# consider postprocessing? https://github.com/arq5x/gemini/blob/master/gemini/annotation_provenance/make-recombination.sh
# consider postprocessing? https://github.com/arq5x/gemini/blob/master/gemini/annotation_provenance/make-recombination.sh
"""

rule download_vcfRef:
Expand All @@ -191,14 +207,15 @@ rule download_vcfRef:
set -e
set -x
md5array=({REF_VCF_md5})
mkdir -p {REF_DIR}/1000genome/bcf/
for chrom in {{1..22}}; do
wget {REF_VCF_url} -P {REF_DIR}/1000genome/bcf/ -O 1000genome_chr$chrom_tmp.bcf --tries 50 -c |& tee -a {log}
sum=$(md5sum {REF_DIR}/1000genome/bcf/1000genome_chr$chrom_tmp.bcf | cut -d " " -f1)
wget {REF_VCF_url} -O {REF_DIR}/1000genome/bcf/1000genome_chr$chrom.tmp.bcf --tries 50 -c |& tee -a {log}
sum=$(md5sum {REF_DIR}/1000genome/bcf/1000genome_chr$chrom.tmp.bcf | cut -d " " -f1)
if [ $sum != ${{md5array[$chrom-1]}} ]; then
echo "md5 sum of vcfRef is invalid" |& tee -a {log}
exit 1
fi
bcftools annotate --set-id '%CHROM:%POS:%REF:%FIRST_ALT' {REF_DIR}/1000genome/bcf/1000genome_chr$chrom_tmp.bcf -O z -o -Ob -o {REF_DIR}/1000genome/bcf/1000genome_chr$chrom.bcf |& tee -a {log}
bcftools annotate --set-id '%CHROM:%POS:%REF:%FIRST_ALT' {REF_DIR}/1000genome/bcf/1000genome_chr$chrom.tmp.bcf -O z -o -Ob -o {REF_DIR}/1000genome/bcf/1000genome_chr$chrom.bcf |& tee -a {log}
bcftools index {REF_DIR}/1000genome/bcf/1000genome_chr$chrom.bcf |& tee -a {log}
done
"""
Expand Down Expand Up @@ -331,7 +348,7 @@ rule download_hd_genotype_chip:
if [ $sum != {HD_GENOTYPE_CHIP_md5} ]; then
echo "md5 sum of HD_GENOTYPE_CHIP is invalid" |& tee -a {log}
exit 1
fi
fi
wget {HD_GENOTYPE_CHIP_url}.tbi -O {HD_GENOTYPE_CHIP}.tbi --tries 50 -c |& tee -a {log}
"""

Expand All @@ -350,7 +367,7 @@ rule download_affymetrix_chip:
if [ $sum != {AFFYMETRIX_CHIP_md5} ]; then
echo "md5 sum of AFFYMETRIX_CHIP is invalid" |& tee -a {log}
exit 1
fi
fi
wget {AFFYMETRIX_CHIP_url}.tbi -O {AFFYMETRIX_CHIP}.tbi --tries 50 -c |& tee -a {log}
"""

Expand All @@ -368,7 +385,7 @@ rule download_pedsim_map:
if [ $sum != {PEDSIM_MAP_md5} ]; then
echo "md5 sum of PEDSIM_MAP is invalid" |& tee -a {log}
exit 1
fi
fi
tar xvzf {REF_DIR}/Refined_genetic_map_b37.tar.gz -C {REF_DIR} |& tee -a {log}
printf "#chr\tpos\tmale_cM\tfemale_cM\n" > {REF_DIR}/refined_mf.simmap
for chr in {{1..22}}; do
Expand Down

0 comments on commit cfaa4cf

Please sign in to comment.