Skip to content

Commit

Permalink
Merge pull request #436 from pcjentsch/fix-compressed-input
Browse files Browse the repository at this point in the history
add decompression of input files, fixes #432
  • Loading branch information
aineniamh committed Apr 20, 2022
2 parents 4f1d58d + 723cd28 commit 7008798
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 19 deletions.
2 changes: 1 addition & 1 deletion pangolin/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def main(sysargs = sys.argv[1:]):
parse_qc_thresholds(args.maxambig, args.minlen, config[KEY_REFERENCE_FASTA], config)
config[KEY_QUERY_FASTA] = io.find_query_file(cwd, config[KEY_TEMPDIR], args.query)

io.quick_check_query_file(cwd, args.query, config[KEY_QUERY_FASTA])
config[KEY_INPUT_COMPRESSION_TYPE] = io.quick_check_query_file(cwd, args.query, config[KEY_QUERY_FASTA])

if config[KEY_ANALYSIS_MODE] == "usher":
# Find usher protobuf file (and if specified, assignment cache file too)
Expand Down
45 changes: 28 additions & 17 deletions pangolin/scripts/preprocessing.smk
Original file line number Diff line number Diff line change
Expand Up @@ -18,30 +18,41 @@ rule align_to_reference:
fasta = config[KEY_QUERY_FASTA],
reference = config[KEY_REFERENCE_FASTA]
params:
compression_type = config[KEY_INPUT_COMPRESSION_TYPE],
trim_start = config[KEY_TRIM_START],
trim_end = config[KEY_TRIM_END],
sam = os.path.join(config[KEY_TEMPDIR],"mapped.sam")
output:
fasta = config[KEY_ALIGNMENT_FILE]
log:
os.path.join(config[KEY_TEMPDIR], "logs/minimap2_sam.log")
shell:
# the first line of this streams through the fasta and replaces '-' in sequences with empty strings
# this could be replaced by a python script later
# {{ gsub(" ","_",$0); }} {{ gsub(",","_",$0); }}
"""
awk '{{ if ($0 !~ /^>/) {{ gsub("-", "",$0); }} print $0; }}' "{input.fasta}" | \
awk '{{ {{ gsub(" ", "_",$0); }} {{ gsub(",", "_",$0); }} print $0; }}' | \
minimap2 -a -x asm20 --sam-hit-only --secondary=no --score-N=0 -t {workflow.cores} {input.reference:q} - -o {params.sam:q} &> {log:q}
gofasta sam toMultiAlign \
-s {params.sam:q} \
-t {workflow.cores} \
--reference {input.reference:q} \
--trimstart {params.trim_start} \
--trimend {params.trim_end} \
--trim \
--pad > '{output.fasta}'
"""
run:
        # The awk stages stream the fasta before alignment: the first strips '-'
        # characters from sequence lines (leaving headers untouched), and the
        # second replaces spaces and commas with underscores throughout.
        # This preprocessing could be replaced by a Python script later.
shell_command = """ | awk '{{ if ($0 !~ /^>/) {{ gsub("-", "",$0); }} print $0; }}' | \
awk '{{ {{ gsub(" ", "_",$0); }} {{ gsub(",", "_",$0); }} print $0; }}' | \
minimap2 -a -x asm20 --sam-hit-only --secondary=no --score-N=0 -t {workflow.cores} {input.reference:q} - -o {params.sam:q} &> {log:q}
gofasta sam toMultiAlign \
-s {params.sam:q} \
-t {workflow.cores} \
--reference {input.reference:q} \
--trimstart {params.trim_start} \
--trimend {params.trim_end} \
--trim \
--pad > '{output.fasta}'
"""

        # If the input is plaintext, pipe its contents in directly; otherwise
        # prefix the pipeline with the appropriate decompression command.
if params.compression_type == "plaintext":
shell_command = "cat \"{input.fasta}\"" + shell_command
elif params.compression_type == "xz":
shell_command = "xz -c -d \"{input.fasta}\"" + shell_command
elif params.compression_type == "gz":
shell_command = "gzip -c -d \"{input.fasta}\"" + shell_command

shell(shell_command)


rule create_seq_hash:
input:
Expand Down
2 changes: 2 additions & 0 deletions pangolin/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
KEY_PLEARN_HEADER = "plearn_header"
KEY_DESIGNATION_CACHE="designation_cache"
KEY_ASSIGNMENT_CACHE = "assignment_cache"
KEY_INPUT_COMPRESSION_TYPE = "compression_type"


# Version KEYS

Expand Down
2 changes: 2 additions & 0 deletions pangolin/utils/initialising.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ def setup_config_dict(cwd):
KEY_EXPANDED_LINEAGE: False,

KEY_CONSTELLATION_FILES: [],

KEY_INPUT_COMPRESSION_TYPE: "plaintext",

KEY_PANGOLIN_VERSION: __version__,
KEY_PANGOLIN_DATA_VERSION: pangolin_data.__version__,
Expand Down
6 changes: 5 additions & 1 deletion pangolin/utils/io_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,14 @@ def find_query_file(cwd, tempdir, query_arg):


def quick_check_query_file(cwd, query_arg, query):

input_compression_type = "plaintext"
if os.path.exists(os.path.join(cwd, query_arg[0])):
file_ending = query.split(".")[-1]
if file_ending in ["gz","gzip","tgz"]:
input_compression_type = "gz"
query = gzip.open(query, 'rt')
elif file_ending in ["xz","lzma"]:
input_compression_type = "xz"
query = lzma.open(query, 'rt')
try:
parse= True
Expand All @@ -70,6 +72,8 @@ def quick_check_query_file(cwd, query_arg, query):
if parse == False:
break
parse = False

return input_compression_type
except UnicodeDecodeError:
sys.stderr.write(cyan(
f'Error: the input query fasta could not be parsed.\n' +
Expand Down
3 changes: 3 additions & 0 deletions pangolin/utils/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from Bio import SeqIO
import csv
import gzip
import lzma


def create_seq_hash(seq_file,hash_map,hashed_seqs):
Expand Down Expand Up @@ -154,6 +155,8 @@ def merge_files(fasta, qc_status, scorpio_report, designated, hash_map, out_merg
file_ending = fasta.split(".")[-1]
if file_ending in ["gz","gzip","tgz"]:
query = gzip.open(fasta, 'rt')
elif file_ending in ["xz","lzma"]:
query = lzma.open(fasta, 'rt')
else:
query = open(fasta,"r")

Expand Down

0 comments on commit 7008798

Please sign in to comment.