Merge pull request #1874 from cogent3/develop
Merging changes on develop
GavinHuttley committed May 23, 2024
2 parents 0dbf437 + 2f7c86a commit b832dd4
Showing 14 changed files with 788 additions and 1,036 deletions.
156 changes: 89 additions & 67 deletions src/cogent3/core/annotation_db.py
@@ -5,9 +5,7 @@
 import inspect
 import io
 import json
-import os
 import pathlib
-import re
 import sqlite3
 import sys
 import typing
@@ -19,7 +17,9 @@
 import cogent3.util.warning as c3warn

 from cogent3._version import __version__
+from cogent3.parse.gff import merged_gff_records
 from cogent3.util.deserialise import register_deserialiser
+from cogent3.util.io import PathType, iter_line_blocks
 from cogent3.util.misc import extend_docstring_from, get_object_provenance
 from cogent3.util.progress_display import display_wrap
 from cogent3.util.table import Table
@@ -28,6 +28,7 @@
 OptionalInt = typing.Optional[int]
 OptionalStr = typing.Optional[str]
 OptionalStrList = typing.Optional[typing.Union[str, typing.List[str]]]
+OptionalStrContainer = typing.Optional[typing.Union[str, typing.Sequence]]
 OptionalBool = typing.Optional[bool]
 OptionalDbCursor = typing.Optional[sqlite3.Cursor]
 ReturnType = typing.Tuple[str, tuple]  # the sql statement and corresponding values
@@ -1124,7 +1125,7 @@ def _update_db_from_other_db(
     def to_json(self) -> str:
         return json.dumps(self.to_rich_dict())

-    def write(self, path: os.PathLike) -> None:
+    def write(self, path: PathType) -> None:
         """writes db as bytes to path"""
         backup = sqlite3.connect(path)
         with self.db:
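The write method above serialises the database to disk via sqlite3's online backup API. A minimal standalone sketch of that pattern (the table and destination path are illustrative, not cogent3 API):

    import sqlite3

    src = sqlite3.connect(":memory:")
    src.execute("CREATE TABLE gff (name TEXT)")  # illustrative schema
    dest = sqlite3.connect("annotations.sqlitedb")  # hypothetical destination path
    with dest:
        src.backup(dest)  # Connection.backup copies every page of src into dest
    dest.close()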
@@ -1139,7 +1140,7 @@ def write(self, path: os.PathLike) -> None:
     def subset(
         self,
         *,
-        source: os.PathLike | str = ":memory:",
+        source: PathType = ":memory:",
         biotype: str = None,
         seqid: str = None,
         name: str = None,
@@ -1177,6 +1178,24 @@ def subset(

         return result

+    def close(self):
+        """closes the db"""
+        self.db.close()
+
+    def _make_index(self, *, table_name: str, col_names: tuple[str, ...]):
+        """index columns for faster search"""
+        sql = f"CREATE INDEX IF NOT EXISTS %s on {table_name}(%s)"
+        for col in col_names:
+            self._execute_sql(sql % (col, col))
+
+    def make_indexes(self):
+        """adds db indexes for core attributes"""
+        for table_name in self.table_names:
+            self._make_index(
+                table_name=table_name,
+                col_names=("biotype", "seqid", "name", "start", "stop", "parent_id"),
+            )
+
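_make_index interpolates each column name into both the index name and the indexed column, issuing one CREATE INDEX IF NOT EXISTS statement per column. A standalone sketch of the statements it generates (the schema is illustrative):

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE gff (biotype TEXT, seqid TEXT, name TEXT)")
    for col in ("biotype", "seqid", "name"):
        # one index per column, named after the column, as in _make_index
        conn.execute(f"CREATE INDEX IF NOT EXISTS {col} on gff({col})")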

 class BasicAnnotationDb(SqliteAnnotationDbMixin):
     """Provides a user table for annotations. This can be merged with
@@ -1227,6 +1246,15 @@ def add_records(self, data: T) -> None:
         self._execute_sql(cmnd=cmnd, values=vals)


+def _merge_spans(old: numpy.ndarray, new: list[list[int]]) -> numpy.ndarray:
+    """returns sorted, merged old and new spans"""
+    if len(new) == old.shape[0] and (old == new).all():
+        return old
+
+    new = numpy.array(sorted(new), dtype=old.dtype)
+    return numpy.unique(numpy.concatenate([old, new]), axis=0)
+
+
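_merge_spans returns the old array untouched when the new spans are identical, and otherwise the row-sorted union of both. A sketch of the numpy behaviour it relies on:

    import numpy

    old = numpy.array([[0, 10], [20, 30]], dtype=numpy.int64)
    new = [[20, 30], [40, 50]]  # one duplicate span, one novel span
    combined = numpy.concatenate([old, numpy.array(sorted(new), dtype=old.dtype)])
    # unique(..., axis=0) drops the duplicate row and returns rows sorted
    assert (numpy.unique(combined, axis=0) == [[0, 10], [20, 30], [40, 50]]).all()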
 class GffAnnotationDb(SqliteAnnotationDbMixin):
     """Support for annotations from gff files. Records that span multiple
     rows in the gff are merged into a single record."""
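In GFF3, a discontinuous feature can occupy several rows that share an ID attribute; a hypothetical pair of rows illustrating what gets merged:

    chr1	.	CDS	1000	1200	.	+	0	ID=cds1
    chr1	.	CDS	3000	3300	.	+	0	ID=cds1

Both rows become one db record named cds1 whose spans field carries the two intervals; rows lacking an ID are not merged.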
@@ -1262,7 +1290,7 @@ def __init__(
         self._serialisable["source"] = self.source
         self._db = None
         self._setup_db(db)
-        data = self._merged_data(data)
+        data, self._num_fakeids = merged_gff_records(data, self._num_fakeids)
         self.add_records(data)

     def add_records(self, reduced: dict) -> None:
@@ -1291,51 +1319,27 @@ def add_records(self, reduced: dict) -> None:
         self.db.commit()
         del reduced

-    def _merged_data(self, records) -> dict:
-        field_template = r"(?<={}=)[^;\s]+"
-        name = re.compile(field_template.format("ID"))
-        parent_id = re.compile(field_template.format("Parent"))
-
-        reduced = collections.OrderedDict()
-        # collapse records with ID's occurring in multiple rows into one
-        # row, converting their coordinates
-        # extract the name from ID and add this into the table
-        # I am not convinced we can rely on gff files to be ordered,
-        # if we could, we could do this as one pass over the data
-        # all keys need to be lower case
-        # NOTE only records which have an ID field get merged into a single
-        # record.
-        for record in records:
-            record["biotype"] = record.pop("Type")
-            record["stop"] = record.pop("End")
-
-            # we force all keys that map to table column names to be lower case
-            for key in tuple(record):
-                if key.lower() not in self._gff_schema:
-                    continue
-                record[key.lower()] = record.pop(key)
-
-            attrs = record["attributes"] or ""
-            if match := name.search(attrs):
-                record_id = match.group()
-            else:
-                record_id = f"unknown-{self._num_fakeids}"
-                self._num_fakeids += 1
-
-            record["name"] = record_id
-            if pid := parent_id.search(attrs):
-                record["parent_id"] = pid.group()
-
-            if record_id not in reduced:
-                reduced[record_id] = record
-                reduced[record_id]["spans"] = []
-
-            # should this just be an append?
-            reduced[record_id]["spans"].append((record["start"], record["stop"]))
-
-        del records
-
-        return reduced
+    def update_record_spans(self, *, name: str, spans: list[list[int]]) -> None:
+        """updates spans attribute of a gff table record if present
+
+        Notes
+        -----
+        Has no effect if name is not present.
+        """
+        if not len(spans):
+            return
+
+        result = self._execute_sql(
+            cmnd="SELECT spans from gff WHERE name = ?", values=(name,)
+        ).fetchone()
+
+        if result is None:
+            return
+
+        old_spans = _merge_spans(result["spans"], spans)
+        self._execute_sql(
+            cmnd="UPDATE gff SET spans = ? WHERE name = ?", values=(old_spans, name)
+        )
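Together with _merge_spans above, this method lets the gff loader extend a record inserted from an earlier block of the same file when a later block contributes more spans for an ID it has already seen (see _db_from_gff below).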


 # The GenBank format is less clear on the relationship between identifiers,
@@ -1608,7 +1612,9 @@ def convert_annotation_to_annotation_db(data: dict) -> SupportsFeatures:


 @display_wrap
-def _db_from_genbank(path: os.PathLike, db: SupportsFeatures, write_path, **kwargs):
+def _db_from_genbank(
+    path: PathType, db: typing.Optional[SupportsFeatures], write_path, **kwargs
+):
     from cogent3 import open_
     from cogent3.parse.genbank import MinimalGenbankParser
@@ -1630,52 +1636,64 @@ def _db_from_genbank(path: os.PathLike, db: SupportsFeatures, write_path, **kwargs):

     if not one_valid_path:
         raise IOError(f"{str(path)!r} not found")
+
+    db.make_indexes()
     return db


 def _leave_attributes(*attrs):
     return attrs[0]


-OptionalStrContainer = typing.Optional[typing.Union[str, typing.Sequence]]
-
-
 @display_wrap
 def _db_from_gff(
-    path: os.PathLike,
+    path: PathType,
     seqids: OptionalStrContainer,
-    db: SupportsFeatures,
-    write_path,
+    db: typing.Optional[SupportsFeatures],
+    write_path: PathType,
+    num_lines: OptionalInt,
     **kwargs,
-):
-    from cogent3.parse.gff import gff_parser
+) -> SupportsFeatures:
+    from cogent3.parse.gff import gff_parser, is_gff3

     paths = pathlib.Path(path)
     paths = list(paths.parent.glob(paths.name))

     ui = kwargs.pop("ui")
     one_valid_path = False
+    seen_ids = set()
     for path in ui.series(paths):
-        data = list(
-            gff_parser(
-                path,
-                seqids=seqids,
-                attribute_parser=_leave_attributes,
-            )
-        )
-        db = GffAnnotationDb(source=write_path, data=data, db=db)
+        num_fake_ids = 0
+        gff3 = is_gff3(path)
+        db = GffAnnotationDb(source=write_path, db=db)
+        for block in iter_line_blocks(path, num_lines=num_lines):
+            data = list(
+                gff_parser(
+                    block, seqids=seqids, attribute_parser=_leave_attributes, gff3=gff3
+                )
+            )
+            data, num_fake_ids = merged_gff_records(data, num_fake_ids)
+            if already_seen := seen_ids & data.keys():
+                for name in already_seen:
+                    db.update_record_spans(name=name, spans=data[name].spans)
+
+            seen_ids |= data.keys()
+            db.add_records(data)
         one_valid_path = True
     if not one_valid_path:
         raise IOError(f"{str(path)!r} not found")
+
+    db.make_indexes()
     return db
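The loader now streams each file in fixed-size line blocks rather than parsing it whole, keeping memory bounded. A sketch of the iteration pattern, assuming iter_line_blocks yields successive blocks of at most num_lines lines (the path is illustrative):

    from cogent3.util.io import iter_line_blocks

    # "annotations.gff3" is an illustrative path
    for block in iter_line_blocks("annotations.gff3", num_lines=500_000):
        ...  # parse this block and insert it before reading the next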


 def load_annotations(
     *,
-    path: os.PathLike,
+    path: PathType,
     seqids: OptionalStr = None,
-    db: SupportsFeatures = None,
-    write_path: os.PathLike = ":memory:",
+    db: typing.Optional[SupportsFeatures] = None,
+    write_path: PathType = ":memory:",
+    lines_per_block: OptionalInt = 500_000,
     show_progress: bool = False,
 ) -> SupportsFeatures:
     """loads annotations from flatfile into a db
@@ -1693,6 +1711,9 @@ def load_annotations(
     write_path
         where the constructed database should be written, defaults to
         memory only
+    lines_per_block
+        number of lines to insert into the db per iteration. This can help with
+        memory usage. Only applies to gff files.
     show_progress
         applied only if loading features from multiple files
@@ -1714,6 +1735,7 @@ def load_annotations(
             db=db,
             write_path=write_path,
             show_progress=show_progress,
+            num_lines=lines_per_block,
         )
     )
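Callers opt into block streaming via the new keyword; a usage sketch (the path is illustrative):

    from cogent3.core.annotation_db import load_annotations

    # arguments are keyword-only; lines_per_block only affects gff input
    db = load_annotations(path="annotations.gff3", lines_per_block=100_000)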

@@ -1743,7 +1765,7 @@ def _update_array_format(data: bytes) -> bytes:


 def update_file_format(
-    source_path: os.PathLike,
+    source_path: PathType,
     db_class: typing.Union[
         type[BasicAnnotationDb],
         type[GenbankAnnotationDb],
@@ -1768,7 +1790,7 @@

     Parameters
     ----------
-    source_path : os.PathLike
+    source_path : PathType
         The database file to reformat.
     db_class : typing.Union[ type[BasicAnnotationDb], type[GenbankAnnotationDb], type[GffAnnotationDb], ]
         The type of database the file is.
17 changes: 16 additions & 1 deletion src/cogent3/core/location.py
@@ -1007,7 +1007,22 @@ def _update_lengths(

 @dataclasses.dataclass
 class IndelMap(MapABC):
-    """store locations of deletions in a Aligned sequence"""
+    """store locations of deletions in an Aligned sequence
+
+    Parameters
+    ----------
+    gap_pos
+        start positions of gap segments in sequence (ungapped) coordinates
+    cum_gap_lengths
+        cumulative gap lengths per segment
+    gap_lengths
+        length of each gap segment
+    termini_unknown
+        if ``True``, returns new instance with terminal gaps indicated as
+        unknown character '?'
+    parent_length
+        length of parent sequence (i.e. aligned sequence with gaps)
+    """

     # gap data is gap positions, gap lengths on input, stored
     gap_pos: IntArrayTypes
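A hedged construction sketch for the documented parameters, assuming the dataclass accepts gap_lengths on input and stores cumulative lengths internally (per the comment above):

    import numpy
    from cogent3.core.location import IndelMap

    # a single gap of length 2 beginning at ungapped position 3
    imap = IndelMap(
        gap_pos=numpy.array([3]),
        gap_lengths=numpy.array([2]),
        parent_length=10,
    )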
21 changes: 20 additions & 1 deletion src/cogent3/core/moltype.py
@@ -472,6 +472,9 @@ def __init__(
         for i in self._items:
             i._complement_array = _make_complement_array(i, comps)

+    def iter_alphabets(self):
+        yield from self._items
+


 # colours for HTML representation

@@ -516,7 +519,7 @@ def _do_nothing(x):
     return x


-class MolType(object):
+class MolType:
     """MolType: Handles operations that depend on the sequence type (e.g. DNA).

     The MolType knows how to connect alphabets, sequences, alignments, and so
@@ -1357,6 +1360,22 @@ def resolve_ambiguity(

         return result

+    def is_compatible_alphabet(self, alphabet: Alphabet, strict: bool = True) -> bool:
+        """checks that characters in alphabet are equal to a bound alphabet
+
+        Parameters
+        ----------
+        alphabet
+            an Alphabet instance
+        strict
+            the order of elements must match
+        """
+        if not strict:
+            query = set(alphabet)
+            return any(set(alpha) == query for alpha in self.alphabets.iter_alphabets())
+
+        return any(alpha == alphabet for alpha in self.alphabets.iter_alphabets())
+

 def _convert_to_rna(seq: str) -> str:
     return seq.replace("t", "u").replace("T", "U")
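A usage sketch for the new method, assuming the DNA MolType's own base alphabet is among those yielded by iter_alphabets:

    from cogent3 import get_moltype

    dna = get_moltype("dna")
    # strict=True requires an exact, order-sensitive match to a bound alphabet
    assert dna.is_compatible_alphabet(dna.alphabet)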