From 6d25eb72ee221381d2bca898ba9e5d674de93585 Mon Sep 17 00:00:00 2001 From: Giorgio Gonnella Date: Wed, 5 Apr 2017 16:15:39 +0200 Subject: [PATCH] further cross-validations and errmsg --- gfapy/gfa.py | 11 +++++-- gfapy/line/edge/gfa2/gfa2.py | 3 +- gfapy/line/edge/gfa2/validation.py | 22 +++++++++++++ gfapy/line/fragment/fragment.py | 3 +- gfapy/line/fragment/validation.py | 21 ++++++++++++ gfapy/line/segment/length_gfa1.py | 1 + .../testdata/invalid/edge_wrong_lastpos.gfa2 | 12 +++++++ .../invalid/fragment_wrong_lastpos.gfa2 | 33 +++++++++++++++++++ .../testdata/invalid/inconsistent_length.gfa1 | 12 +++++++ 9 files changed, 114 insertions(+), 4 deletions(-) create mode 100644 gfapy/line/edge/gfa2/validation.py create mode 100644 gfapy/line/fragment/validation.py create mode 100644 tests/testdata/invalid/edge_wrong_lastpos.gfa2 create mode 100644 tests/testdata/invalid/fragment_wrong_lastpos.gfa2 create mode 100644 tests/testdata/invalid/inconsistent_length.gfa1 diff --git a/gfapy/gfa.py b/gfapy/gfa.py index 3cfd60a..b7eb7a5 100644 --- a/gfapy/gfa.py +++ b/gfapy/gfa.py @@ -101,7 +101,8 @@ def validate(self): """ self.__validate_segment_references() self.__validate_path_links() - self.__validate_group_references() + self.__validate_group_items() + self.__validate_gfa2_positions() def __str__(self): return "\n".join([str(line) for line in self.lines]) @@ -262,7 +263,7 @@ def __validate_path_links(self): "does not exist, but is required by the following paths:\n"+ l.refstr()) - def __validate_group_references(self): + def __validate_group_items(self): if self.version == "gfa1": return for group in self.sets + self.paths: @@ -275,6 +276,12 @@ def __validate_group_references(self): "does not exist, but is required by the following groups:\n"+ item.refstr()) + def __validate_gfa2_positions(self): + if self.version == "gfa1": + return + for line in self.edges + self.fragments: + line.validate_positions() + def _validate_version(self): if (self._version != None) and 
(self._version not in gfapy.VERSIONS): raise gfapy.VersionError("GFA specification version {} not supported". diff --git a/gfapy/line/edge/gfa2/gfa2.py b/gfapy/line/edge/gfa2/gfa2.py index cd6fafd..2702ae2 100644 --- a/gfapy/line/edge/gfa2/gfa2.py +++ b/gfapy/line/edge/gfa2/gfa2.py @@ -4,10 +4,11 @@ from ..gfa2.alignment_type import AlignmentType as GFA2_AlignmentType from ..gfa2.references import References from ..gfa2.other import Other +from ..gfa2.validation import Validation from ..edge import Edge class GFA2(Other, References, GFA2_AlignmentType, AlignmentType, FromTo, - ToGFA1, Edge): + ToGFA1, Validation, Edge): """An edge line of a GFA2 file.""" RECORD_TYPE = "E" diff --git a/gfapy/line/edge/gfa2/validation.py b/gfapy/line/edge/gfa2/validation.py new file mode 100644 index 0000000..2f8e20e --- /dev/null +++ b/gfapy/line/edge/gfa2/validation.py @@ -0,0 +1,22 @@ +import gfapy + +class Validation: + + def validate_positions(self): + "Checks that positions suffixed by $ are the last position of segments" + if self.is_connected(): + for n in ["1","2"]: + seg = self.get("sid"+n).line + seq = seg.sequence + if not gfapy.is_placeholder(seq): + seqlen = len(seq) + for pfx in ["beg", "end"]: + fn = pfx+n + pos = self.get(fn) + if gfapy.islastpos(pos): + if pos != seqlen: + raise gfapy.InconsistencyError( + "Edge: {}\n".format(str(self))+ + "Field {}: $ after ".format(fn)+ + "non-last position ({})\n".format(str(pos))+ + "Segment: {}".format(str(seg))) diff --git a/gfapy/line/fragment/fragment.py b/gfapy/line/fragment/fragment.py index 0ffa9ed..c8a9cb6 100644 --- a/gfapy/line/fragment/fragment.py +++ b/gfapy/line/fragment/fragment.py @@ -1,7 +1,8 @@ from .references import References +from .validation import Validation from ..line import Line -class Fragment(References, Line): +class Fragment(References, Validation, Line): """ A fragment line of a GFA2 file """ diff --git a/gfapy/line/fragment/validation.py b/gfapy/line/fragment/validation.py new file mode 100644 index
0000000..82811ff --- /dev/null +++ b/gfapy/line/fragment/validation.py @@ -0,0 +1,21 @@ +import gfapy + +class Validation: + + def validate_positions(self): + "Checks that positions suffixed by $ are the last position of segments" + if self.is_connected(): + seg = self.get("sid") + seq = seg.sequence + if not gfapy.is_placeholder(seq): + seqlen = len(seq) + for sfx in ["beg", "end"]: + fn = "s_"+sfx + pos = self.get(fn) + if gfapy.islastpos(pos): + if pos != seqlen: + raise gfapy.InconsistencyError( + "Fragment: {}\n".format(str(self))+ + "Field {}: $ after ".format(str(fn))+ + "non-last position ({})\n".format(str(pos))+ + "Segment: {}".format(str(seg))) diff --git a/gfapy/line/segment/length_gfa1.py b/gfapy/line/segment/length_gfa1.py index bcae369..4c3033a 100644 --- a/gfapy/line/segment/length_gfa1.py +++ b/gfapy/line/segment/length_gfa1.py @@ -51,6 +51,7 @@ def validate_length(self): if not gfapy.is_placeholder(self.sequence) and "LN" in self.tagnames: if self.LN != len(self.sequence): raise gfapy.InconsistencyError( + "Segment: {}\n".format(str(self))+ "Length in LN tag ({}) ".format(self.LN)+ "is different from length of sequence field ({})" .format(len(self.sequence))) diff --git a/tests/testdata/invalid/edge_wrong_lastpos.gfa2 b/tests/testdata/invalid/edge_wrong_lastpos.gfa2 new file mode 100644 index 0000000..90cd30b --- /dev/null +++ b/tests/testdata/invalid/edge_wrong_lastpos.gfa2 @@ -0,0 +1,12 @@ +H VN:Z:2.0 +H ul:Z:https://github.com/sjackman/assembly-graph/blob/master/sample.gfa +S 1 8 CGATGCAA +S 2 10 TGCAAAGTAC +S 3 21 TGCAACGTATAGACTTGTCAC RC:i:4 +S 4 7 GCATATA +S 5 8 CGATGATA +S 6 4 ATGA +E * 1+ 2+ 3 9$ 0 5 5M +E * 3+ 2+ 21$ 21$ 0 0 0M +E * 3+ 4- 17 21$ 3 7$ 1M1D2M +E * 4- 5+ 0 0 0 0 0M diff --git a/tests/testdata/invalid/fragment_wrong_lastpos.gfa2 b/tests/testdata/invalid/fragment_wrong_lastpos.gfa2 new file mode 100644 index 0000000..9018c52 --- /dev/null +++ b/tests/testdata/invalid/fragment_wrong_lastpos.gfa2 @@ -0,0 +1,33 @@ +# File used for
the collections test +# similar but NOT equivalent to the gfa1 file! +S 1 122 * +S 3 29 TGCTAGCTGACTGTCGATGCTGTGTG +E 1_to_2 1+ 2+ 110 122$ 0 12 12M +S 5 130 * +S 13 150 * +E 2_to_6 2+ 6+ 0 122$ 10 132 122M +O 14 11+ 12+ +S 11 140 * xx:i:11 +F 3 read1+ 0 42$ 12 55 * id:Z:read1_in_3 +F 2 read2+ 45 62 0 18 * id:Z:read2_in_2 +U 16 1 3 15 2_to_6 16sub +H ac:Z:test2 +# another comment +S 12 150 * +S 4 120 * +H VN:Z:2.0 +E 1_to_3 1+ 3+ 112 122$ 0 12 10M +G 1_to_11 1+ 11- 120 * +E 11_to_12 11+ 12+ 18 140$ 0 122 122M +S 6 150 * +X custom_record xx:Z:testtag +X custom_record X2 +E 11_to_13 11+ 13+ 20 140$ 0 120 120M +G 2_to_12 2- 12+ 500 50 +O 15 11+ 11_to_13+ 13+ xx:i:-1 +Y another_custom_record +U 16sub 2 3 +S 2 120 * xx:Z:sometag +H aa:i:12 ab:Z:test1 +H aa:i:15 +E 1_to_5 1+ 5+ 0 122$ 2 124 * zz:Z:tag diff --git a/tests/testdata/invalid/inconsistent_length.gfa1 b/tests/testdata/invalid/inconsistent_length.gfa1 new file mode 100644 index 0000000..ad912e0 --- /dev/null +++ b/tests/testdata/invalid/inconsistent_length.gfa1 @@ -0,0 +1,12 @@ +H VN:Z:1.0 +H ul:Z:https://github.com/sjackman/assembly-graph/blob/master/sample.gfa +S 1 CGATGCAA LN:i:12 +S 2 TGCAAAGTAC +S 3 TGCAACGTATAGACTTGTCAC RC:i:4 +S 4 GCATATA +S 5 CGATGATA +S 6 ATGA +L 1 + 2 + 5M +L 3 + 2 + 0M +L 3 + 4 - 1M1D2M1S +L 4 - 5 + 0M