Skip to content

Commit

Permalink
Merge branch 'MatthiasvdBelt/master'
Browse files Browse the repository at this point in the history
  • Loading branch information
gamcil committed Feb 16, 2023
2 parents 997c0c3 + 0d11818 commit dd10442
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 45 deletions.
4 changes: 2 additions & 2 deletions cblaster/context.py
Expand Up @@ -493,9 +493,9 @@ def filter_session(
scaffold.subjects,
queries=session.queries,
gap=gap,
min_hits=min_hits,
min_hits=len(session.queries) if len(session.queries) < min_hits else min_hits,
require=require,
unique=unique,
unique=len(session.queries) if len(session.queries) < unique else unique,
percentage=percentage,
)
if len(scaffold.subjects) == 0: # indicates no hits in clusters and we should not attempt to call scaffold.add_clusters as this would fail
Expand Down
21 changes: 16 additions & 5 deletions cblaster/database.py
Expand Up @@ -59,14 +59,19 @@ def seqrecords_to_sqlite(tuples, database):
LOG.exception("Failed to insert %i records", len(tuples))


def sqlite_to_fasta(path, database):
def sqlite_to_fasta(path, database, compress):
    """Writes all proteins in `database` to `path` in FASTA format.

    Args:
        path (str): Path to output FASTA file
        database (str): Path to SQLite3 database
        compress (bool): gzip-compress the output FASTA file
    """
    # Open gzipped text or plain text output depending on the compress flag.
    handler = gzip.open(path, "wt") if compress else open(path, "w")

    # NOTE(review): the sqlite3 connection context manager commits but does
    # not close the connection — presumably acceptable here; confirm callers.
    with SQLITE.connect(str(database)) as con, handler as fasta:
        cur = con.cursor()
        # sql.FASTA is a project-defined query yielding one FASTA record per row
        for (record,) in cur.execute(sql.FASTA):
            fasta.write(record)
Expand Down Expand Up @@ -140,7 +145,7 @@ def diamond_makedb(fasta, name, cpus):
)


def makedb(paths, database, force=False, cpus=None, batch=None):
def makedb(paths, database, force=False, cpus=None, batch=None, compress=False):
"""makedb module entry point.
Will parse genome files in `paths` and create:
Expand Down Expand Up @@ -173,9 +178,15 @@ def makedb(paths, database, force=False, cpus=None, batch=None):
raise TypeError("cpus should be None or int")

sqlite_path = Path(f"{database}.sqlite3")
fasta_path = Path(f"{database}.fasta.gz")
dmnd_path = Path(f"{database}.dmnd")

if compress:
fasta_ext = '.fasta.gz'
else:
fasta_ext = '.fasta'

fasta_path = Path(f"{database}{fasta_ext}")

if sqlite_path.exists() or dmnd_path.exists():
if force:
LOG.info("Pre-existing files found, overwriting")
Expand Down Expand Up @@ -222,7 +233,7 @@ def makedb(paths, database, force=False, cpus=None, batch=None):
LOG.error("File parsing failed, exiting...", exc_info=True)

LOG.info("Writing FASTA to %s", fasta_path)
sqlite_to_fasta(fasta_path, sqlite_path)
sqlite_to_fasta(fasta_path, sqlite_path, compress)

LOG.info("Building DIAMOND database at %s", dmnd_path)
diamond_makedb(fasta_path, dmnd_path, cpus)
Expand Down
61 changes: 30 additions & 31 deletions cblaster/hmm_search.py
Expand Up @@ -8,11 +8,12 @@
import subprocess
import logging
import re
import tempfile

from datetime import datetime
from ftplib import FTP
from pathlib import Path
from typing import Union, List, Collection, Set, Tuple
from typing import Union, List, Collection, Set, Tuple, IO

from Bio import SearchIO
from cblaster.classes import Hit, Session
Expand All @@ -27,7 +28,11 @@ def check_pfam_db(path):
Args:
path: String, path where to check
"""
path = Path(path)

if not path:
path = Path.cwd()
else:
path = Path(path)

if path.exists() and not path.is_dir():
raise FileExistsError("Expected directory")
Expand All @@ -37,6 +42,7 @@ def check_pfam_db(path):

hmm = path / "Pfam-A.hmm.gz"
dat = path / "Pfam-A.hmm.dat.gz"
version = path / "Pfam.version.gz"

if hmm.exists() and dat.exists():
LOG.info("Pfam database found")
Expand All @@ -47,6 +53,7 @@ def check_pfam_db(path):
ftp.cwd("pub/databases/Pfam/current_release")
ftp.retrbinary(f"RETR {hmm.name}", hmm.open("wb").write)
ftp.retrbinary(f"RETR {dat.name}", dat.open("wb").write)
ftp.retrbinary(f"RETR {version.name}", version.open("wb").write)

return hmm, dat

Expand All @@ -66,16 +73,18 @@ def get_pfam_accession(
Return:
key_lines: List, string of full acc-number
"""
keys = set(keys)
valid_keys = set()
valid = set()
invalid = set(keys)
name_attrs = ("#=GF ID", "#=GF AC")
for line in gzip.open(dat_path, "rt"):
if not line.startswith(name_attrs):
continue
*_, accession = line.strip().split(" ")
if any(key in accession for key in keys if key not in valid_keys):
valid_keys.add(accession)
return valid_keys, keys.difference(valid_keys)
for key in keys:
if key in accession:
valid.add(accession)
invalid.remove(key)
return valid, invalid


def read_profiles(files: Collection[str]) -> Collection[str]:
Expand Down Expand Up @@ -120,24 +129,6 @@ def fetch_pfam_profiles(hmm, keys):
return profiles


def write_profiles(profiles: Collection[str], output: str=None) -> str:
    """Writes a collection of profile HMMs to disk.

    When no output path is given, a timestamped file name of the form
    cblaster_YYYYmmddHHMMSS.hmm is generated in the current working directory.

    Args:
        profiles: profile HMMs to write
        output: name of output file

    Returns:
        Path of the file the profiles were written to.
    """
    destination = output or datetime.now().strftime("cblaster_%Y%m%d%H%M%S.hmm")
    with open(destination, "w") as handle:
        handle.writelines(profiles)
    return destination


def run_hmmsearch(fasta, query):
"""Run the hmmsearch command
Expand All @@ -150,6 +141,9 @@ def run_hmmsearch(fasta, query):
"""
LOG.info("Performing hmmsearch")
output = Path(query).with_suffix(".txt")
# informat = "--informat fasta " if fasta.endswith("gz") else ""
# for unzipping the fasta file to be used as input for hmmsearch

try:
subprocess.run(
f"hmmsearch -o {output} {query} {fasta}",
Expand Down Expand Up @@ -182,8 +176,8 @@ def parse_hmmer_output(results):
hit_class = Hit(
query=record.id, # Pfam id
subject=hit.id, # Hit id
identity=None, # Not present
coverage=None, # Not present
identity=0, # Not present
coverage=0, # Not present
evalue=hit.evalue, # E-value of hit
bitscore=hit.bitscore, # Bit score of hit
)
Expand Down Expand Up @@ -215,6 +209,7 @@ def perform_hmmer(
query_profiles: List[str],
pfam: str,
session: Session,
hmm_out: str=None
) -> Union[Collection[Hit], None]:
"""Main of running a hmmer search
Expand Down Expand Up @@ -252,14 +247,18 @@ def perform_hmmer(
if not profiles:
LOG.error("No valid profiles could be selected")
return
query = write_profiles(profiles)
LOG.info("Profiles written to: %s", query)

# Save query profile HMM names
session.queries = get_profile_names(profiles)

# Run search
results = run_hmmsearch(fasta, query)
# Write profiles to file
with open(hmm_out, 'w+b') if hmm_out else tempfile.NamedTemporaryFile() as fp:
for profile in profiles:
fp.write(profile.encode())
LOG.info("Profiles written to: %s", fp.name)

# Run search
results = run_hmmsearch(fasta, fp.name)

# Parse results and return
return parse_hmmer_output(results)
12 changes: 6 additions & 6 deletions cblaster/main.py
Expand Up @@ -263,8 +263,8 @@ def cblaster(
organisms = context.search(
results,
sqlite_db=sqlite_db,
unique=unique,
min_hits=min_hits,
unique=len(session.queries) if len(session.queries) < unique else unique,
min_hits=len(session.queries) if len(session.queries) < min_hits else min_hits,
gap=gap,
require=require,
ipg_file=ipg_file,
Expand Down Expand Up @@ -300,8 +300,8 @@ def cblaster(
organisms = context.search(
results,
sqlite_db=sqlite_db,
unique=unique,
min_hits=min_hits,
unique=len(session.queries) if len(session.queries) < unique else unique,
min_hits=len(session.queries) if len(session.queries) < min_hits else min_hits,
gap=gap,
require=require,
ipg_file=ipg_file,
Expand Down Expand Up @@ -333,8 +333,8 @@ def cblaster(
LOG.info("Fetching genomic context of hits")
organisms = context.search(
results,
unique=unique,
min_hits=min_hits,
unique=len(session.queries) if len(session.queries) < unique else unique,
min_hits=len(session.queries) if len(session.queries) < min_hits else min_hits,
gap=gap,
require=require,
ipg_file=ipg_file,
Expand Down
14 changes: 13 additions & 1 deletion cblaster/remote.py
Expand Up @@ -122,7 +122,19 @@ def start(
LOG.debug("Search parameters: %s", parameters)
LOG.debug("Search URL: %s", response.url)

rid, rtoe = re.findall(r"(?:RID|RTOE) = (.+?)[\n\s]", response.text)
matches = re.findall(r"(?:RID|RTOE) = (.+?)[\n\s]", response.text)

if len(matches) == 2:
rid, rtoe = matches
else:
LOG.exception('Unable to parse NCBI response')
LOG.info('NCBI response:')
LOG.info('---------')
LOG.info(response.text)
LOG.info('---------')

raise IOError('Unable to parse NCBI response')

return rid, int(rtoe)


Expand Down

0 comments on commit dd10442

Please sign in to comment.