Skip to content

Commit

Permalink
Assign an id to L/C lines when converting to E
Browse files Browse the repository at this point in the history
Closes #4
  • Loading branch information
Giorgio Gonnella committed Apr 6, 2017
1 parent 43a8c2f commit 9b1644b
Show file tree
Hide file tree
Showing 12 changed files with 79 additions and 27 deletions.
1 change: 1 addition & 0 deletions gfapy/gfa.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def __init__(self, *args, vlevel = 1, version = None):
if not version in ['gfa1', 'gfa2', None]:
raise gfapy.VersionError("GFA version unknown ({})".format(version))
self._vlevel = vlevel
self._unused_name_cache = 1
self._records = defaultdict(dict)
self._records["H"] = gfapy.line.Header(["H"], vlevel = vlevel)
self._records["H"].connect(self)
Expand Down
5 changes: 4 additions & 1 deletion gfapy/line/common/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,10 @@ def _apply_definitions(cls):
def _define_field_accessors(cls):
if not cls.PREDEFINED_TAGS:
cls.PREDEFINED_TAGS = list(set(cls.DATATYPE.keys()) - set(cls.POSFIELDS))
for fieldname in cls.POSFIELDS + cls.PREDEFINED_TAGS:
fieldnames = cls.POSFIELDS + cls.PREDEFINED_TAGS
if cls.NAME_FIELD and cls.NAME_FIELD not in fieldnames:
fieldnames.append(cls.NAME_FIELD)
for fieldname in fieldnames:
def get_method(self, fieldname):
return self.get(fieldname)
def set_method(self, value, fieldname):
Expand Down
1 change: 1 addition & 0 deletions gfapy/line/edge/containment/containment.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class Containment(Containment_ToGFA2, Pos, Canonical, Other,
"container_orient" : "from_orient",
"contained_orient" : "to_orient"}
PREDEFINED_TAGS = ["MQ", "NM"]
NAME_FIELD = "id"
DATATYPE = {
"from_segment" : "segment_name_gfa1",
"from_orient" : "orientation",
Expand Down
13 changes: 7 additions & 6 deletions gfapy/line/edge/gfa1/to_gfa2.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,10 @@ class ToGFA2:

@property
def eid(self):
    """Return the content of the ``id`` tag, or a placeholder if unset.

    This accessor only reads the tag; it does not assign an identifier
    to the line.

    Returns:
      The value of the ``id`` tag when present, otherwise a
      ``gfapy.Placeholder`` instance.
    """
    i = self.get("id")
    if i is None:
        return gfapy.Placeholder()
    return i

name = eid
Expand Down Expand Up @@ -55,8 +52,12 @@ def alignment(self):

def _to_gfa2_a(self):
a = ["E"]
i = self.get("id")
a.append(str(i) if i else "*")
if not self.get("id") and self.is_connected():
self.set("id", self._gfa.unused_name())
if self.get("id"):
a.append(str(self.get("id")))
else:
a.append("*")
a.append(str(self.sid1))
a.append(str(self.sid2))
a += [ str(x) for x in self.from_coords ]
Expand Down
1 change: 1 addition & 0 deletions gfapy/line/edge/link/link.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class Link(Link_ToGFA2, GFA1_ToGFA2, Link_References, Equivalence, Complement, \
"FC" : "i",
"KC" : "i",
}
NAME_FIELD = "id"
REFERENCE_FIELDS = ["from_segment", "to_segment"]
BACKREFERENCE_RELATED_FIELDS = ["to_orient", "from_orient", "overlap"]
DEPENDENT_LINES = ["paths"]
Expand Down
7 changes: 1 addition & 6 deletions gfapy/line/group/path/to_gfa2.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@ def _to_gfa2_a(self):
if isinstance(oline.line, gfapy.line.segment.GFA1):
items.append(str(oline))
elif isinstance(oline.line, gfapy.line.edge.Link):
eid = oline.line.eid
if gfapy.is_placeholder(eid):
raise gfapy.ValueError(
"Link {} has no identifier\n".format(oline.line)+
"Path conversion to GFA2 failed")
items.append(eid + str(oline.orient))
items.append(oline.line.eid + str(oline.orient))
a = ["O"]
a.append(self.field_to_s("path_name"))
a.append(" ".join(items))
Expand Down
42 changes: 39 additions & 3 deletions gfapy/lines/collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,9 +201,17 @@ def segment_names(self):

@property
def edge_names(self):
    """List of the names of the edge (E, L, C) lines.

    For the L and C lines, the content of the custom tag id
    is taken as name.

    Returns:
      For version "gfa1" the link names followed by the containment
      names; for version "gfa2" the E line names; for any other version
      value the E line names followed by link and containment names.
    """
    if self._version == "gfa1":
        return self._link_names + self._containment_names
    if self._version == "gfa2":
        return self._gfa2_edge_names
    # Version is neither "gfa1" nor "gfa2" (presumably not yet determined):
    # report the names from every kind of edge record.
    return self._gfa2_edge_names + self._link_names + self._containment_names

@property
def path_names(self):
Expand All @@ -215,13 +223,31 @@ def path_names(self):
def names(self):
    """All identifiers in the GFA identifiers namespace.

    External sequence identifiers in F records are not included.

    Notes:
      GFA1: in Gfapy the P and S namespaces are joined (i.e. paths with
      the same name as segments are not accepted). Furthermore, to simplify
      the conversion to/from GFA2, the id tag is used in L and C lines,
      and their content is also included in the same namespace as the S/P
      identifiers. GFA2: the namespace for identifiers is described in
      the specification and includes all the S, E, G, U and O lines; the
      external sequence identifiers in F lines are not included.

    Returns:
      The concatenation of the segment, edge, gap, path and set names,
      in that order.
    """
    return (self.segment_names
            + self.edge_names
            + self.gap_names
            + self.path_names
            + self.set_names)

def unused_name(self):
    """Compute a GFA identifier not yet in use in the Gfa object.

    Candidates are the decimal strings of an integer counter kept in
    ``self._unused_name_cache``. The counter is advanced past every
    candidate that is already taken and past the returned one, so
    successive calls never return the same name.

    Returns:
      A string that is not contained in ``self.names``.
    """
    # Snapshot the names once as a set: each membership test is then O(1)
    # instead of a scan of the whole list (assumes all names are hashable).
    taken = set(self.names)
    while str(self._unused_name_cache) in taken:
        self._unused_name_cache += 1
    name = str(self._unused_name_cache)
    # Step past the name being handed out so the next call starts after it.
    self._unused_name_cache += 1
    return name

@property
def external_names(self):
"""List of the identifiers of external sequences mentioned in F records.
Expand All @@ -234,6 +260,16 @@ def _gfa2_edge_names(self):
d = self._records["E"]
return list([k for k in d.keys() if isinstance(k, str)])

@property
def _link_names(self):
    """List of the string keys under which L lines are stored.

    Non-string keys of the L record collection are skipped
    (presumably they index lines that have no name).
    """
    d = self._records["L"]
    return [k for k in d if isinstance(k, str)]

@property
def _containment_names(self):
    """List of the string keys under which C lines are stored.

    Non-string keys of the C record collection are skipped
    (presumably they index lines that have no name).
    """
    d = self._records["C"]
    return [k for k in d if isinstance(k, str)]

@property
def _gfa2_path_names(self):
d = self._records["O"]
Expand Down
2 changes: 2 additions & 0 deletions gfapy/placeholder.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ def __eq__(self, other):
def is_placeholder(object):
if object is Placeholder:
return True
elif object is None:
return True
elif object == "*":
return True
elif isinstance(object, list) and len(object) == 0:
Expand Down
6 changes: 3 additions & 3 deletions tests/test_api_lines_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def test_gfa1_collections(self):
self.assertRegex(gfa.comments[0].content, r'collections')
# containments
self.assertEqual(2, len(gfa.containments))
self.assertEqual(["2_to_6", "1_to_5"], [x.name for x in gfa.containments])
self.assertEqual({"2_to_6", "1_to_5"}, {x.name for x in gfa.containments})
# dovetails
self.assertEqual(4, len(gfa.dovetails))
self.assertEqual(set(["1_to_2", "1_to_3", "11_to_12", "11_to_13"]),
Expand All @@ -31,8 +31,8 @@ def test_gfa1_collections(self):
# path_names
self.assertSetEqual(set(["14", "15"]), set(gfa.path_names))
# names
self.assertSetEqual(set(gfa.segment_names + gfa.path_names),
set(gfa.names))
self.assertSetEqual(set(gfa.segment_names + gfa.path_names +
gfa.edge_names), set(gfa.names))
# lines
self.assertEqual(set([str(x) for x in gfa.comments + gfa.headers + gfa.segments + gfa.edges +
gfa.paths]), set([str(x) for x in gfa.lines]))
Expand Down
16 changes: 8 additions & 8 deletions tests/test_api_version_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def test_segment_conversion(self):
def test_link_conversion(self):
gfa1str = "L\tA\t+\tB\t-\t100M"
gfa1str_noov = "L\tA\t+\tB\t+\t*"
gfa2str = "E\t*\tA+\tB-\t100\t200$\t100\t200$\t100M"
gfa2str = "E\t1\tA+\tB-\t100\t200$\t100\t200$\t100M"
# not connected
self.assertRaises(gfapy.RuntimeError,gfapy.Line(gfa1str).to_gfa2)
# connected
Expand All @@ -55,15 +55,15 @@ def test_link_conversion(self):
gfa1line_noov = gfapy.Line(gfa1str_noov)
g.add_line(gfa1line_noov)
self.assertEqual(gfa2str,str(gfa1line.to_gfa2()))
self.assertEqual(gfa1str,str(gfa1line.to_gfa1()))
self.assertEqual(gfa1str+"\tid:Z:1",str(gfa1line.to_gfa1()))
# placeholder overlap
self.assertRaises(gfapy.ValueError,gfa1line_noov.to_gfa2)
# TODO check if the alignment is compatible with the segment length

def test_containment_conversion(self):
gfa1str = "C\tA\t+\tB\t-\t20\t100M"
gfa1str_noov = "C\tA\t+\tB\t+\t20\t*"
gfa2str = "E\t*\tA+\tB-\t20\t120\t0\t100$\t100M"
gfa2str = "E\t1\tA+\tB-\t20\t120\t0\t100$\t100M"
# not connected
self.assertRaises(gfapy.RuntimeError,gfapy.Line(gfa1str).to_gfa2)
# connected
Expand All @@ -74,8 +74,8 @@ def test_containment_conversion(self):
g.add_line(gfa1line)
gfa1line_noov = gfapy.Line(gfa1str_noov)
g.add_line(gfa1line_noov)
self.assertEqual(gfa2str,str( gfa1line.to_gfa2()))
self.assertEqual(gfa1str,str( gfa1line.to_gfa1()))
self.assertEqual(gfa2str,str(gfa1line.to_gfa2()))
self.assertEqual(gfa1str+"\tid:Z:1",str(gfa1line.to_gfa1()))
# placeholder overlap
self.assertRaises(gfapy.ValueError,gfa1line_noov.to_gfa2)
# TODO check if the alignment is compatible with the segment length
Expand All @@ -100,9 +100,9 @@ def test_L_to_E(self):
g.add_line("L\t1\t-\t2\t-\t20M")
g.add_line("L\t3\t-\t4\t+\t30M")
g.add_line("L\t3\t+\t4\t-\t40M")
expected_dovetails_gfa2 = {"E * 1+ 2+ 90 100$ 0 10 10M",
"E * 1- 2- 0 20 80 100$ 20M", "E * 3- 4+ 0 30 0 30 30M",
"E * 3+ 4- 60 100$ 60 100$ 40M"}
expected_dovetails_gfa2 = {"E 5 1+ 2+ 90 100$ 0 10 10M",
"E 6 1- 2- 0 20 80 100$ 20M", "E 7 3- 4+ 0 30 0 30 30M",
"E 8 3+ 4- 60 100$ 60 100$ 40M"}
dovetails_gfa2 = {g.dovetails[0].to_gfa2_s(),
g.dovetails[1].to_gfa2_s(), g.dovetails[2].to_gfa2_s(),
g.dovetails[3].to_gfa2_s()}
Expand Down
8 changes: 8 additions & 0 deletions tests/testdata/unnamed_and_named_links.gfa
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
S A AAAAAAACGT
S B ACGTCCACGT
S C CACGTCCGGG
S D GGGGGGGGGG
L A + B + 4M id:Z:2
L B + C + 5M
L C + D + 3M
P P1 A+,B+ 4M
4 changes: 4 additions & 0 deletions tests/testdata/unnamed_link.gfa
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
S A AAAAAAACGT
S B ACGTCCACGT
L A + B + 4M
P P1 A+,B+ 4M

0 comments on commit 9b1644b

Please sign in to comment.