Permalink
Browse files

Add phase information to CDS features from GenBank and add start/end for annotation information.

Thanks to Hugo A.M. Torres and Sascha Steinbiss: http://genometools.lighthouseapp.com/projects/17855/tickets/61-error-parsing-gff3-file
  • Loading branch information...
1 parent 5352c68 commit 540b030c40e17a6a76b173aadefa8beb56a660f0 @chapmanb committed Mar 10, 2012
Showing with 35 additions and 5 deletions.
  1. +14 −5 gff/BCBio/GFF/GFFOutput.py
  2. +21 −0 gff/Scripts/gff/genbank_to_gff.py
View
19 gff/BCBio/GFF/GFFOutput.py
@@ -78,7 +78,7 @@ def write(self, recs, out_handle, include_fasta=False):
recs = [recs]
for rec in recs:
self._write_rec(rec, out_handle)
- self._write_annotations(rec.annotations, rec.id, out_handle)
+ self._write_annotations(rec.annotations, rec.id, len(rec.seq), out_handle)
for sf in rec.features:
sf = self._clean_feature(sf)
id_handler = self._write_feature(sf, rec.id, out_handle,
@@ -105,6 +105,15 @@ def _write_rec(self, rec, out_handle):
if len(rec.seq) > 0:
out_handle.write("##sequence-region %s 1 %s\n" % (rec.id, len(rec.seq)))
+ def _get_phase(self, feature):
+ if feature.qualifiers.has_key("phase"):
+ phase = feature.qualifiers["phase"][0]
+ elif feature.type == "CDS":
+ phase = int(feature.qualifiers.get("codon_start", [1])[0]) - 1
+ else:
+ phase = "."
+ return str(phase)
+
def _write_feature(self, feature, rec_id, out_handle, id_handler,
parent_id=None):
"""Write a feature with location information.
@@ -137,7 +146,7 @@ def _write_feature(self, feature, rec_id, out_handle, id_handler,
str(feature.location.nofuzzy_end),
feature.qualifiers.get("score", ["."])[0],
strand,
- str(feature.qualifiers.get("phase", ["."])[0]),
+ self._get_phase(feature),
self._format_keyvals(quals)]
out_handle.write("\t".join(parts) + "\n")
for sub_feature in feature.sub_features:
@@ -159,13 +168,13 @@ def _format_keyvals(self, keyvals):
format_kvs.append("%s=%s" % (key, ",".join(format_vals)))
return ";".join(format_kvs)
- def _write_annotations(self, anns, rec_id, out_handle):
+ def _write_annotations(self, anns, rec_id, size, out_handle):
"""Add annotations which refer to an entire sequence.
"""
format_anns = self._format_keyvals(anns)
if format_anns:
- parts = [rec_id, "annotation", "remark", ".", ".", ".", ".", ".",
- format_anns]
+ parts = [rec_id, "annotation", "remark", "1", str(size if size > 1 else 1),
+ ".", ".", ".", format_anns]
out_handle.write("\t".join(parts) + "\n")
def _write_header(self, out_handle):
View
21 gff/Scripts/gff/genbank_to_gff.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+"""Convert a GenBank file into GFF format.
+
+Usage:
+ genbank_to_gff.py <genbank_file>
+"""
+import sys
+import os
+
+from Bio import SeqIO
+from Bio import Seq
+
+from BCBio import GFF
+
+def main(gb_file):
+ out_file = "%s.gff" % os.path.splitext(gb_file)[0]
+ with open(out_file, "w") as out_handle:
+ GFF.write(SeqIO.parse(gb_file, "genbank"), out_handle)
+
+if __name__ == "__main__":
+ main(*sys.argv[1:])

0 comments on commit 540b030

Please sign in to comment.