Skip to content

Commit

Permalink
Merge branch 'MatthiasvdBelt/master'
Browse files Browse the repository at this point in the history
  • Loading branch information
gamcil committed Feb 16, 2023
2 parents 997c0c3 + 0d11818 commit dd10442
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 45 deletions.
4 changes: 2 additions & 2 deletions cblaster/context.py
Expand Up @@ -493,9 +493,9 @@ def filter_session(
scaffold.subjects,
queries=session.queries,
gap=gap,
min_hits=min_hits,
min_hits=len(session.queries) if len(session.queries) < min_hits else min_hits,
require=require,
unique=unique,
unique=len(session.queries) if len(session.queries) < unique else unique,
percentage=percentage,
)
if len(scaffold.subjects) == 0: # indicates no hits in clusters and we should not attempt to call scaffold.add_clusters as this would fail
Expand Down
21 changes: 16 additions & 5 deletions cblaster/database.py
Expand Up @@ -59,14 +59,19 @@ def seqrecords_to_sqlite(tuples, database):
LOG.exception("Failed to insert %i records", len(tuples))


def sqlite_to_fasta(path, database):
def sqlite_to_fasta(path, database, compress):
    """Writes all proteins in `database` to `path` in FASTA format.

    Args:
        path (str): Path to output FASTA file
        database (str): Path to SQLite3 database
        compress (bool): gzip-compress the output FASTA file
    """
    # Open gzipped text or plain text output depending on the compress flag.
    handler = gzip.open(path, "wt") if compress else open(path, "w")

    # NOTE(review): the sqlite3 connection context manager commits but does
    # not close the connection — presumably acceptable here; confirm callers.
    with SQLITE.connect(str(database)) as con, handler as fasta:
        cur = con.cursor()
        # sql.FASTA is a project-defined query yielding one FASTA record per row
        for (record,) in cur.execute(sql.FASTA):
            fasta.write(record)
Expand Down Expand Up @@ -140,7 +145,7 @@ def diamond_makedb(fasta, name, cpus):
)


def makedb(paths, database, force=False, cpus=None, batch=None):
def makedb(paths, database, force=False, cpus=None, batch=None, compress=False):
"""makedb module entry point.
Will parse genome files in `paths` and create:
Expand Down Expand Up @@ -173,9 +178,15 @@ def makedb(paths, database, force=False, cpus=None, batch=None):
raise TypeError("cpus should be None or int")

sqlite_path = Path(f"{database}.sqlite3")
fasta_path = Path(f"{database}.fasta.gz")
dmnd_path = Path(f"{database}.dmnd")

if compress:
fasta_ext = '.fasta.gz'
else:
fasta_ext = '.fasta'

fasta_path = Path(f"{database}{fasta_ext}")

if sqlite_path.exists() or dmnd_path.exists():
if force:
LOG.info("Pre-existing files found, overwriting")
Expand Down Expand Up @@ -222,7 +233,7 @@ def makedb(paths, database, force=False, cpus=None, batch=None):
LOG.error("File parsing failed, exiting...", exc_info=True)

LOG.info("Writing FASTA to %s", fasta_path)
sqlite_to_fasta(fasta_path, sqlite_path)
sqlite_to_fasta(fasta_path, sqlite_path, compress)

LOG.info("Building DIAMOND database at %s", dmnd_path)
diamond_makedb(fasta_path, dmnd_path, cpus)
Expand Down
61 changes: 30 additions & 31 deletions cblaster/hmm_search.py
Expand Up @@ -8,11 +8,12 @@
import subprocess
import logging
import re
import tempfile

from datetime import datetime
from ftplib import FTP
from pathlib import Path
from typing import Union, List, Collection, Set, Tuple
from typing import Union, List, Collection, Set, Tuple, IO

from Bio import SearchIO
from cblaster.classes import Hit, Session
Expand All @@ -27,7 +28,11 @@ def check_pfam_db(path):
Args:
path: String, path where to check
"""
path = Path(path)

if not path:
path = Path.cwd()
else:
path = Path(path)

if path.exists() and not path.is_dir():
raise FileExistsError("Expected directory")
Expand All @@ -37,6 +42,7 @@ def check_pfam_db(path):

hmm = path / "Pfam-A.hmm.gz"
dat = path / "Pfam-A.hmm.dat.gz"
version = path / "Pfam.version.gz"

if hmm.exists() and dat.exists():
LOG.info("Pfam database found")
Expand All @@ -47,6 +53,7 @@ def check_pfam_db(path):
ftp.cwd("pub/databases/Pfam/current_release")
ftp.retrbinary(f"RETR {hmm.name}", hmm.open("wb").write)
ftp.retrbinary(f"RETR {dat.name}", dat.open("wb").write)
ftp.retrbinary(f"RETR {version.name}", version.open("wb").write)

return hmm, dat

Expand All @@ -66,16 +73,18 @@ def get_pfam_accession(
Return:
key_lines: List, string of full acc-number
"""
keys = set(keys)
valid_keys = set()
valid = set()
invalid = set(keys)
name_attrs = ("#=GF ID", "#=GF AC")
for line in gzip.open(dat_path, "rt"):
if not line.startswith(name_attrs):
continue
*_, accession = line.strip().split(" ")
if any(key in accession for key in keys if key not in valid_keys):
valid_keys.add(accession)
return valid_keys, keys.difference(valid_keys)
for key in keys:
if key in accession:
valid.add(accession)
invalid.remove(key)
return valid, invalid


def read_profiles(files: Collection[str]) -> Collection[str]:
Expand Down Expand Up @@ -120,24 +129,6 @@ def fetch_pfam_profiles(hmm, keys):
return profiles


def write_profiles(profiles: Collection[str], output: str=None) -> str:
    """Writes a collection of profile HMMs to disk.

    When no output path is given, a timestamped file name of the form
    cblaster_YYYYmmddHHMMSS.hmm is generated in the current working directory.

    Args:
        profiles: profile HMMs to write
        output: name of output file

    Returns:
        Path of the file the profiles were written to.
    """
    destination = output or datetime.now().strftime("cblaster_%Y%m%d%H%M%S.hmm")
    with open(destination, "w") as handle:
        handle.writelines(profiles)
    return destination


def run_hmmsearch(fasta, query):
"""Run the hmmsearch command
Expand All @@ -150,6 +141,9 @@ def run_hmmsearch(fasta, query):
"""
LOG.info("Performing hmmsearch")
output = Path(query).with_suffix(".txt")
# informat = "--informat fasta " if fasta.endswith("gz") else ""
# for unzipping the fasta file to be used as input for hmmsearch

try:
subprocess.run(
f"hmmsearch -o {output} {query} {fasta}",
Expand Down Expand Up @@ -182,8 +176,8 @@ def parse_hmmer_output(results):
hit_class = Hit(
query=record.id, # Pfam id
subject=hit.id, # Hit id
identity=None, # Not present
coverage=None, # Not present
identity=0, # Not present
coverage=0, # Not present
evalue=hit.evalue, # E-value of hit
bitscore=hit.bitscore, # Bit score of hit
)
Expand Down Expand Up @@ -215,6 +209,7 @@ def perform_hmmer(
query_profiles: List[str],
pfam: str,
session: Session,
hmm_out: str=None
) -> Union[Collection[Hit], None]:
"""Main of running a hmmer search
Expand Down Expand Up @@ -252,14 +247,18 @@ def perform_hmmer(
if not profiles:
LOG.error("No valid profiles could be selected")
return
query = write_profiles(profiles)
LOG.info("Profiles written to: %s", query)

# Save query profile HMM names
session.queries = get_profile_names(profiles)

# Run search
results = run_hmmsearch(fasta, query)
# Write profiles to file
with open(hmm_out, 'w+b') if hmm_out else tempfile.NamedTemporaryFile() as fp:
for profile in profiles:
fp.write(profile.encode())
LOG.info("Profiles written to: %s", fp.name)

# Run search
results = run_hmmsearch(fasta, fp.name)

# Parse results and return
return parse_hmmer_output(results)
12 changes: 6 additions & 6 deletions cblaster/main.py
Expand Up @@ -263,8 +263,8 @@ def cblaster(
organisms = context.search(
results,
sqlite_db=sqlite_db,
unique=unique,
min_hits=min_hits,
unique=len(session.queries) if len(session.queries) < unique else unique,
min_hits=len(session.queries) if len(session.queries) < min_hits else min_hits,
gap=gap,
require=require,
ipg_file=ipg_file,
Expand Down Expand Up @@ -300,8 +300,8 @@ def cblaster(
organisms = context.search(
results,
sqlite_db=sqlite_db,
unique=unique,
min_hits=min_hits,
unique=len(session.queries) if len(session.queries) < unique else unique,
min_hits=len(session.queries) if len(session.queries) < min_hits else min_hits,
gap=gap,
require=require,
ipg_file=ipg_file,
Expand Down Expand Up @@ -333,8 +333,8 @@ def cblaster(
LOG.info("Fetching genomic context of hits")
organisms = context.search(
results,
unique=unique,
min_hits=min_hits,
unique=len(session.queries) if len(session.queries) < unique else unique,
min_hits=len(session.queries) if len(session.queries) < min_hits else min_hits,
gap=gap,
require=require,
ipg_file=ipg_file,
Expand Down
14 changes: 13 additions & 1 deletion cblaster/remote.py
Expand Up @@ -122,7 +122,19 @@ def start(
LOG.debug("Search parameters: %s", parameters)
LOG.debug("Search URL: %s", response.url)

rid, rtoe = re.findall(r"(?:RID|RTOE) = (.+?)[\n\s]", response.text)
matches = re.findall(r"(?:RID|RTOE) = (.+?)[\n\s]", response.text)

if len(matches) == 2:
rid, rtoe = matches
else:
LOG.exception('Unable to parse NCBI response')
LOG.info('NCBI response:')
LOG.info('---------')
LOG.info(response.text)
LOG.info('---------')

raise IOError('Unable to parse NCBI response')

return rid, int(rtoe)


Expand Down

0 comments on commit dd10442

Please sign in to comment.