Skip to content

Commit

Permalink
Merge pull request #436 from pcjentsch/fix-compressed-input
Browse files Browse the repository at this point in the history
add decompression of input files, fixes #432
  • Loading branch information
aineniamh committed Apr 20, 2022
2 parents 4f1d58d + 723cd28 commit 7008798
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 19 deletions.
2 changes: 1 addition & 1 deletion pangolin/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def main(sysargs = sys.argv[1:]):
parse_qc_thresholds(args.maxambig, args.minlen, config[KEY_REFERENCE_FASTA], config)
config[KEY_QUERY_FASTA] = io.find_query_file(cwd, config[KEY_TEMPDIR], args.query)

io.quick_check_query_file(cwd, args.query, config[KEY_QUERY_FASTA])
config[KEY_INPUT_COMPRESSION_TYPE] = io.quick_check_query_file(cwd, args.query, config[KEY_QUERY_FASTA])

if config[KEY_ANALYSIS_MODE] == "usher":
# Find usher protobuf file (and if specified, assignment cache file too)
Expand Down
45 changes: 28 additions & 17 deletions pangolin/scripts/preprocessing.smk
Original file line number Diff line number Diff line change
Expand Up @@ -18,30 +18,41 @@ rule align_to_reference:
fasta = config[KEY_QUERY_FASTA],
reference = config[KEY_REFERENCE_FASTA]
params:
compression_type = config[KEY_INPUT_COMPRESSION_TYPE],
trim_start = config[KEY_TRIM_START],
trim_end = config[KEY_TRIM_END],
sam = os.path.join(config[KEY_TEMPDIR],"mapped.sam")
output:
fasta = config[KEY_ALIGNMENT_FILE]
log:
os.path.join(config[KEY_TEMPDIR], "logs/minimap2_sam.log")
shell:
# the first line of this streams through the fasta and replaces '-' in sequences with empty strings
# this could be replaced by a python script later
# {{ gsub(" ","_",$0); }} {{ gsub(",","_",$0); }}
"""
awk '{{ if ($0 !~ /^>/) {{ gsub("-", "",$0); }} print $0; }}' "{input.fasta}" | \
awk '{{ {{ gsub(" ", "_",$0); }} {{ gsub(",", "_",$0); }} print $0; }}' | \
minimap2 -a -x asm20 --sam-hit-only --secondary=no --score-N=0 -t {workflow.cores} {input.reference:q} - -o {params.sam:q} &> {log:q}
gofasta sam toMultiAlign \
-s {params.sam:q} \
-t {workflow.cores} \
--reference {input.reference:q} \
--trimstart {params.trim_start} \
--trimend {params.trim_end} \
--trim \
--pad > '{output.fasta}'
"""
run:
        # The awk stages stream the fasta before alignment: the first strips '-'
        # characters from sequence lines (leaving headers untouched), and the
        # second replaces spaces and commas with underscores throughout.
        # This preprocessing could be replaced by a Python script later.
shell_command = """ | awk '{{ if ($0 !~ /^>/) {{ gsub("-", "",$0); }} print $0; }}' | \
awk '{{ {{ gsub(" ", "_",$0); }} {{ gsub(",", "_",$0); }} print $0; }}' | \
minimap2 -a -x asm20 --sam-hit-only --secondary=no --score-N=0 -t {workflow.cores} {input.reference:q} - -o {params.sam:q} &> {log:q}
gofasta sam toMultiAlign \
-s {params.sam:q} \
-t {workflow.cores} \
--reference {input.reference:q} \
--trimstart {params.trim_start} \
--trimend {params.trim_end} \
--trim \
--pad > '{output.fasta}'
"""

        # If the input is plaintext, pipe its contents in directly; otherwise
        # prefix the pipeline with the appropriate decompression command.
if params.compression_type == "plaintext":
shell_command = "cat \"{input.fasta}\"" + shell_command
elif params.compression_type == "xz":
shell_command = "xz -c -d \"{input.fasta}\"" + shell_command
elif params.compression_type == "gz":
shell_command = "gzip -c -d \"{input.fasta}\"" + shell_command

shell(shell_command)


rule create_seq_hash:
input:
Expand Down
2 changes: 2 additions & 0 deletions pangolin/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
KEY_PLEARN_HEADER = "plearn_header"
KEY_DESIGNATION_CACHE="designation_cache"
KEY_ASSIGNMENT_CACHE = "assignment_cache"
KEY_INPUT_COMPRESSION_TYPE = "compression_type"


# Version KEYS

Expand Down
2 changes: 2 additions & 0 deletions pangolin/utils/initialising.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ def setup_config_dict(cwd):
KEY_EXPANDED_LINEAGE: False,

KEY_CONSTELLATION_FILES: [],

KEY_INPUT_COMPRESSION_TYPE: "plaintext",

KEY_PANGOLIN_VERSION: __version__,
KEY_PANGOLIN_DATA_VERSION: pangolin_data.__version__,
Expand Down
6 changes: 5 additions & 1 deletion pangolin/utils/io_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,14 @@ def find_query_file(cwd, tempdir, query_arg):


def quick_check_query_file(cwd, query_arg, query):

input_compression_type = "plaintext"
if os.path.exists(os.path.join(cwd, query_arg[0])):
file_ending = query.split(".")[-1]
if file_ending in ["gz","gzip","tgz"]:
input_compression_type = "gz"
query = gzip.open(query, 'rt')
elif file_ending in ["xz","lzma"]:
input_compression_type = "xz"
query = lzma.open(query, 'rt')
try:
parse= True
Expand All @@ -70,6 +72,8 @@ def quick_check_query_file(cwd, query_arg, query):
if parse == False:
break
parse = False

return input_compression_type
except UnicodeDecodeError:
sys.stderr.write(cyan(
f'Error: the input query fasta could not be parsed.\n' +
Expand Down
3 changes: 3 additions & 0 deletions pangolin/utils/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from Bio import SeqIO
import csv
import gzip
import lzma


def create_seq_hash(seq_file,hash_map,hashed_seqs):
Expand Down Expand Up @@ -154,6 +155,8 @@ def merge_files(fasta, qc_status, scorpio_report, designated, hash_map, out_merg
file_ending = fasta.split(".")[-1]
if file_ending in ["gz","gzip","tgz"]:
query = gzip.open(fasta, 'rt')
elif file_ending in ["xz","lzma"]:
query = lzma.open(fasta, 'rt')
else:
query = open(fasta,"r")

Expand Down

0 comments on commit 7008798

Please sign in to comment.