Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Newer
Older
100644 682 lines (621 sloc) 28.515 kB
1a00b56 @chapmanb Initial implementation of GFF parsing to Biopython SeqFeatures.
authored
1 """Test decoration of existing SeqRecords with GFF through a SeqIO interface.
2 """
3 import sys
4 import os
5 import unittest
6 import pprint
bacaa8b @chapmanb Initial implementation of GFF3 output writer
authored
7 import StringIO
1a00b56 @chapmanb Initial implementation of GFF parsing to Biopython SeqFeatures.
authored
8
9 from Bio import SeqIO
da864d9 @chapmanb Simplify top level GFF parsing and writing with high level functions.
authored
10 from BCBio import GFF
1a0f7c8 @chapmanb Write out sequence-region directive; add test for writing from SeqRec…
authored
11 from Bio.Seq import Seq
12 from Bio.SeqRecord import SeqRecord
13 from Bio.SeqFeature import SeqFeature, FeatureLocation
51e7f27 @chapmanb Convert sequence region directives to Python 0-based indexing. Thanks…
authored
14 from BCBio.GFF import (GFFExaminer, GFFParser, DiscoGFFParser)
7e976e2 @chapmanb GFF Parsing framework using map-reduce with disco
authored
15
class MapReduceGFFTest(unittest.TestCase):
    """Tests GFF parsing using a map-reduce framework for parallelization.

    NOTE(review): test methods use a ``t_`` prefix rather than unittest's
    default ``test`` prefix; how they are collected is not visible in this
    section of the file.
    """
    def setUp(self):
        """Locate the GFF test data directory and set the disco host URL."""
        self._test_dir = os.path.join(os.path.dirname(__file__), "GFF")
        self._test_gff_file = os.path.join(
            self._test_dir, "c_elegans_WS199_shortened_gff.txt")
        self._disco_host = "http://localhost:7000"

    def t_local_map_reduce(self):
        """General map reduce framework without parallelization.
        """
        cds_limit_info = dict(
            gff_type=["gene", "mRNA", "CDS"],
            gff_id=['I'],
        )
        records = GFF.parse(self._test_gff_file, limit_info=cds_limit_info)
        rec_dict = SeqIO.to_dict(records)
        test_rec = rec_dict['I']
        self.assertEqual(len(test_rec.features), 32)

    def t_disco_map_reduce(self):
        """Map reduce framework parallelized using disco.

        Returns early (after printing a notice) when disco or simplejson
        cannot be imported.
        """
        # this needs to be more generalized but fails okay with no disco
        try:
            # Imported only to probe availability; the names are not used.
            import disco
            import simplejson
        except ImportError:
            # Parenthesised single-argument form prints identically on
            # Python 2 and Python 3.
            print("Skipping -- disco and json not found")
            return
        cds_limit_info = dict(
            gff_source_type=[('Non_coding_transcript', 'gene'),
                             ('Coding_transcript', 'gene'),
                             ('Coding_transcript', 'mRNA'),
                             ('Coding_transcript', 'CDS')],
            gff_id=['I'],
        )
        parser = DiscoGFFParser(disco_host=self._disco_host)
        records = parser.parse(self._test_gff_file, limit_info=cds_limit_info)
        rec_dict = SeqIO.to_dict(records)
        final_rec = rec_dict['I']
        # second gene feature is multi-parent
        self.assertEqual(len(final_rec.features), 2)  # two gene features
1a00b56 @chapmanb Initial implementation of GFF parsing to Biopython SeqFeatures.
authored
60
0bf5217 @chapmanb Fix parsing error with key/value attributes lacking values
authored
class GFF3Test(unittest.TestCase):
    """Real live GFF3 tests from WormBase and NCBI.

    Uses GFF3 data from:

    ftp://ftp.wormbase.org/pub/wormbase/genomes/c_elegans/
    genome_feature_tables/GFF3/
    ftp://ftp.wormbase.org/pub/wormbase/genomes/c_elegans/sequences/dna/

    and from NCBI.
    """
    def setUp(self):
        """Build the paths to the test data files used by the tests below."""
        self._test_dir = os.path.join(os.path.dirname(__file__), "GFF")
        self._test_seq_file = os.path.join(
            self._test_dir, "c_elegans_WS199_dna_shortened.fa")
        self._test_gff_file = os.path.join(
            self._test_dir, "c_elegans_WS199_shortened_gff.txt")
        self._test_gff_ann_file = os.path.join(
            self._test_dir, "c_elegans_WS199_ann_gff.txt")
        # Machine-specific location of the full data set; only read by
        # not_t_full_celegans.
        self._full_dir = "/usr/home/chapmanb/mgh/ruvkun_rnai/wormbase/" + \
                         "data_files_WS198"
        self._test_ncbi = os.path.join(self._test_dir, "ncbi_gff3.txt")

    def not_t_full_celegans(self):
        """Test the full C elegans chromosome and GFF files.

        This is used to test GFF on large files and is not run as a standard
        test. You will need to download the files and adjust the paths
        to run this.
        """
        # read the sequence information
        seq_file = os.path.join(self._full_dir, "c_elegans.WS199.dna.fa")
        gff_file = os.path.join(self._full_dir, "c_elegans.WS199.gff3")
        # 'with' guarantees the handle is closed even if parsing raises.
        with open(seq_file) as seq_handle:
            seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
        rnai_types = [('Orfeome', 'PCR_product'),
                      ('GenePair_STS', 'PCR_product'),
                      ('Promoterome', 'PCR_product')]
        gene_types = [('Non_coding_transcript', 'gene'),
                      ('Coding_transcript', 'gene'),
                      ('Coding_transcript', 'mRNA'),
                      ('Coding_transcript', 'CDS')]
        limit_info = dict(gff_source_type=rnai_types + gene_types)
        # Exhaust the parser; the test only checks that nothing raises.
        for rec in GFF.parse(gff_file, seq_dict, limit_info=limit_info):
            pass

    def _get_seq_dict(self):
        """Internal reusable function to get the sequence dictionary.

        Returns the result of SeqIO.to_dict over the FASTA records in
        self._test_seq_file.
        """
        with open(self._test_seq_file) as seq_handle:
            seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
        return seq_dict

    def t_possible_limits(self):
        """Calculate possible queries to limit a GFF file.
        """
        gff_examiner = GFFExaminer()
        possible_limits = gff_examiner.available_limits(self._test_gff_file)
        # Blank separator line; a bare `print` does nothing on Python 3.
        print("")
        pprint.pprint(possible_limits)

    def t_parent_child(self):
        """Summarize parent-child relationships in a GFF file.
        """
        gff_examiner = GFFExaminer()
        pc_map = gff_examiner.parent_child_map(self._test_gff_file)
        # Blank separator line; a bare `print` does nothing on Python 3.
        print("")
        pprint.pprint(pc_map)

    def t_flat_features(self):
        """Check addition of flat non-nested features to multiple records.
        """
        seq_dict = self._get_seq_dict()
        pcr_limit_info = dict(
            gff_source_type=[('Orfeome', 'PCR_product'),
                             ('GenePair_STS', 'PCR_product'),
                             ('Promoterome', 'PCR_product')],
        )
        parser = GFFParser()
        rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_file, seq_dict,
                                              limit_info=pcr_limit_info))
        self.assertEqual(len(rec_dict['I'].features), 4)
        self.assertEqual(len(rec_dict['X'].features), 5)

    def t_nested_features(self):
        """Check three-deep nesting of features with gene, mRNA and CDS.
        """
        seq_dict = self._get_seq_dict()
        cds_limit_info = dict(
            gff_source_type=[('Coding_transcript', 'gene'),
                             ('Coding_transcript', 'mRNA'),
                             ('Coding_transcript', 'CDS')],
            gff_id=['I'],
        )
        parser = GFFParser()
        rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_file, seq_dict,
                                              limit_info=cds_limit_info))
        final_rec = rec_dict['I']
        # first gene feature is plain
        self.assertEqual(len(final_rec.features), 2)  # two gene features
        first_gene = final_rec.features[0]
        self.assertEqual(len(first_gene.sub_features), 1)  # one transcript
        # 15 final CDS regions
        self.assertEqual(len(first_gene.sub_features[0].sub_features), 15)

    def t_nested_multiparent_features(self):
        """Verify correct nesting of features with multiple parents.
        """
        seq_dict = self._get_seq_dict()
        cds_limit_info = dict(
            gff_source_type=[('Coding_transcript', 'gene'),
                             ('Coding_transcript', 'mRNA'),
                             ('Coding_transcript', 'CDS')],
            gff_id=['I'],
        )
        parser = GFFParser()
        rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_file, seq_dict,
                                              limit_info=cds_limit_info))
        final_rec = rec_dict['I']
        # second gene feature is multi-parent
        self.assertEqual(len(final_rec.features), 2)  # two gene features
        cur_subs = final_rec.features[1].sub_features
        self.assertEqual(len(cur_subs), 3)  # three transcripts
        # the first and second transcript have the same CDSs
        self.assertEqual(len(cur_subs[0].sub_features), 6)
        self.assertEqual(len(cur_subs[1].sub_features), 6)
        # Identity, not equality: the same CDS object sits under both.
        self.assertTrue(
            cur_subs[0].sub_features[0] is cur_subs[1].sub_features[0])

    def t_no_dict_error(self):
        """Ensure an error is raised when no dictionary to map to is present.
        """
        parser = GFFParser(create_missing=False)
        try:
            for rec in parser.parse(self._test_gff_file):
                pass
        except KeyError:
            pass
        else:
            # no error -- problem
            raise AssertionError('Did not complain with missing dictionary')

    def t_unknown_seq(self):
        """Prepare unknown base sequences with the correct length.
        """
        rec_dict = SeqIO.to_dict(GFF.parse(self._test_gff_file))
        self.assertEqual(len(rec_dict["I"].seq), 12766937)
        self.assertEqual(len(rec_dict["X"].seq), 17718531)

    def t_gff_annotations(self):
        """Check GFF annotations placed on an entire sequence.
        """
        parser = GFFParser()
        rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_ann_file))
        final_rec = rec_dict['I']
        self.assertEqual(len(final_rec.annotations.keys()), 2)
        self.assertEqual(final_rec.annotations['source'], ['Expr_profile'])
        self.assertEqual(final_rec.annotations['expr_profile'], ['B0019.1'])

    def t_gff3_iterator(self):
        """Iterated parsing in GFF3 files with nested features.
        """
        parser = GFFParser()
        recs = list(parser.parse_in_parts(self._test_gff_file,
                                          target_lines=70))
        # NOTE(review): the comment previously here read "should be one big
        # set because we don't have a good place to split", but the check
        # below expects six records -- confirm that comment was stale.
        self.assertEqual(len(recs), 6)
        self.assertEqual(len(recs[0].features), 59)

    def t_gff3_iterator_limit(self):
        """Iterated interface using a limit query on GFF3 files.
        """
        cds_limit_info = dict(
            gff_source_type=[('Coding_transcript', 'gene'),
                             ('Coding_transcript', 'mRNA'),
                             ('Coding_transcript', 'CDS')],
            gff_id=['I'],
        )
        parser = GFFParser()
        rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_file,
                                              limit_info=cds_limit_info))
        self.assertEqual(len(rec_dict), 1)
        tfeature = rec_dict["I"].features[0].sub_features[0]
        for sub_test in tfeature.sub_features:
            self.assertEqual(sub_test.type, "CDS", sub_test)

    def t_gff3_noval_attrib(self):
        """Parse GFF3 file from NCBI with a key/value pair with no value.
        """
        parser = GFFParser()
        rec_dict = SeqIO.to_dict(parser.parse(self._test_ncbi))
        self.assertEqual(len(rec_dict), 1)
        # dict.values() is not indexable on Python 3; materialise it first.
        t_feature = list(rec_dict.values())[0].features[0]
        self.assertEqual(t_feature.qualifiers["pseudo"], ["true"])

    def t_gff3_multiple_ids(self):
        """Deal with GFF3 with non-unique ID attributes, using NCBI example.
        """
        parser = GFFParser()
        rec_dict = SeqIO.to_dict(parser.parse(self._test_ncbi))
        self.assertEqual(len(rec_dict), 1)
        # dict.values() is not indexable on Python 3; materialise it first.
        t_features = list(rec_dict.values())[0].features[1:]
        # 4 feature sets, same ID, different positions, different attributes
        self.assertEqual(len(t_features), 4)
        for f in t_features:
            self.assertEqual(len(f.sub_features), 3)

    def t_simple_parsing(self):
        """Parse GFF into a simple line by line dictionary without nesting.
        """
        parser = GFFParser()
        num_lines = 0
        for line_info in parser.parse_simple(self._test_gff_file):
            num_lines += 1
        self.assertEqual(num_lines, 177)
        # line_info still holds the last item yielded by the loop above.
        line_info = line_info['child'][0]
        self.assertEqual(line_info['quals']['confirmed_est'],
                         ['yk1055g06.5', 'OSTF085G5_1'])
        self.assertEqual(line_info['location'], [4582718, 4583189])

    def t_simple_parsing_nesting(self):
        """Simple parsing for lines with nesting, using the simplified API.
        """
        test_gff = os.path.join(self._test_dir, "transcripts.gff3")
        num_lines = sum(1 for _ in GFF.parse_simple(test_gff))
        self.assertEqual(num_lines, 16)

    def t_extra_comma(self):
        """Correctly handle GFF3 files with extra trailing commas.
        """
        tfile = os.path.join(self._test_dir, "mouse_extra_comma.gff3")
        # 'with' guarantees the handle is closed even if parsing raises.
        with open(tfile) as in_handle:
            for rec in GFF.parse(in_handle):
                pass
        # rec is the last record parsed from the file.
        tested = False
        for sub_top in rec.features[0].sub_features:
            for sub in sub_top.sub_features:
                name = sub.qualifiers.get("Name", "")
                if name == ["CDS:NC_000083.5:LOC100040603"]:
                    tested = True
                    self.assertEqual(len(sub.qualifiers["Parent"]), 1)
        self.assertTrue(tested, "Did not find sub-feature to test")

    def t_novalue_key(self):
        """Handle GFF3 files with keys and no values.
        """
        tfile = os.path.join(self._test_dir, "glimmer_nokeyval.gff3")
        # The builtin next() works on Python 2.6+ and 3; .next() is 2-only.
        rec = next(GFF.parse(tfile))
        f1, f2 = rec.features
        self.assertEqual(f1.qualifiers['ID'], ['GL0000006'])
        self.assertEqual(len(f1.sub_features), 2)
        self.assertEqual(f1.sub_features[0].qualifiers["Lack 3'-end"],
                         ["true"])
        self.assertTrue("ID" not in f1.sub_features[0].qualifiers)
        self.assertEqual(f2.qualifiers["Complete"], ["true"])

    def t_key_whitespace(self):
        """Fix keys with problematic whitespace.
        """
        tfile = os.path.join(self._test_dir, "spaces.gff3")
        for i, line_info in enumerate(GFF.parse_simple(tfile)):
            # Only items after the first three are checked.
            if i > 2:
                self.assertEqual(line_info["quals"]["foo"], ["bar"])

    # NOTE(review): "spliicing" is misspelled; the name is kept because
    # code outside this section may refer to it.
    def t_trans_spliicing(self):
        """Parsing of transspliced genes from GFF3 spec where child locations
        don't match to parents.
        """
        fname = os.path.join(self._test_dir, "trans_splicing.gff3")
        with open(fname) as in_handle:
            # The builtin next() works on Python 2.6+ and 3.
            rec = next(GFF.parse(in_handle))
        self.assertEqual(len(rec.features), 2)
        gene83, gene84 = rec.features
        self.assertEqual(gene83.id, "gene83")
        self.assertEqual(len(gene83.sub_features), 2)
        self.assertEqual(len(gene83.sub_features[0].sub_features), 7)

        self.assertEqual(gene84.id, "gene84")
        self.assertEqual(len(gene84.sub_features), 2)
        self.assertEqual(len(gene84.sub_features[0].sub_features), 7)
c690f32 @chapmanb bcbio-gff: version 0.5 with test for whitespace issues. Fixes #88
authored
343
be2f4f1 Remove old GFF parser in favor of map-reduce; add additional tests in…
Brad Chapman authored
class SolidGFFTester(unittest.TestCase):
    """Test reading output from SOLiD analysis, as GFF3.

    See more details on SOLiD GFF here:

    http://solidsoftwaretools.com/gf/project/matogff/
    """
    def setUp(self):
        """Build the path to the SOLiD GFF test file."""
        self._test_dir = os.path.join(os.path.dirname(__file__), "GFF")
        self._test_gff_file = os.path.join(self._test_dir,
                                           "F3-unique-3.v2.gff")

    def t_basic_solid_parse(self):
        """Basic parsing of SOLiD GFF results files.
        """
        parser = GFFParser()
        rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_file))
        test_feature = rec_dict['3_341_424_F3'].features[0]
        quals = test_feature.qualifiers
        self.assertEqual(test_feature.location.nofuzzy_start, 102716)
        self.assertEqual(test_feature.location.nofuzzy_end, 102736)
        self.assertEqual(len(quals), 7)
        self.assertEqual(quals['score'], ['10.6'])
        self.assertEqual(quals['source'], ['solid'])
        self.assertEqual(test_feature.strand, -1)
        self.assertEqual(test_feature.type, 'read')
        self.assertEqual(quals['g'], ['T2203031313223113212'])
        self.assertEqual(len(quals['q']), 20)

    def t_solid_iterator(self):
        """Iterated parsing in a flat file without nested features.
        """
        parser = GFFParser()
        feature_sizes = [
            len(rec.features)
            for rec in parser.parse_in_parts(self._test_gff_file,
                                             target_lines=5)
        ]
        self.assertEqual(len(feature_sizes), 112)
        self.assertEqual(max(feature_sizes), 1)

    def t_line_adjust(self):
        """Adjust lines during parsing to fix potential GFF problems.
        """
        def adjust_fn(results):
            # Swap identifiers: the 'i' qualifier becomes the record id and
            # the original record id is kept as a 'read_name' qualifier.
            rec_index = results['quals']['i'][0]
            read_name = results['rec_id']
            results['quals']['read_name'] = [read_name]
            results['rec_id'] = rec_index
            return results
        parser = GFFParser(line_adjust_fn=adjust_fn)
        recs = list(parser.parse(self._test_gff_file))
        self.assertEqual(len(recs), 1)
        work_rec = recs[0]
        self.assertEqual(work_rec.id, '1')
        self.assertEqual(len(work_rec.features), 112)
        self.assertEqual(work_rec.features[0].qualifiers['read_name'],
                         ['3_336_815_F3'])
400
c314940 @chapmanb Add GFF2 parsing capability with Tests; update gff_to_biosql script t…
authored
401 class GFF2Tester(unittest.TestCase):
402 """Parse GFF2 and GTF files, building features.
403 """
404 def setUp(self):
f216dba @chapmanb Support writing FASTA in GFF3 files using directive; thanks to Victor…
authored
405 self._test_dir = os.path.join(os.path.dirname(__file__), "GFF")
c314940 @chapmanb Add GFF2 parsing capability with Tests; update gff_to_biosql script t…
authored
406 self._ensembl_file = os.path.join(self._test_dir, "ensembl_gtf.txt")
407 self._wormbase_file = os.path.join(self._test_dir, "wormbase_gff2.txt")
408 self._jgi_file = os.path.join(self._test_dir, "jgi_gff2.txt")
3cd5760 @chapmanb Update to work with python2.4; include parsing of alternative GFF2 ne…
authored
409 self._wb_alt_file = os.path.join(self._test_dir,
410 "wormbase_gff2_alt.txt")
c314940 @chapmanb Add GFF2 parsing capability with Tests; update gff_to_biosql script t…
authored
411
412 def t_basic_attributes(self):
413 """Parse out basic attributes of GFF2 from Ensembl GTF.
414 """
415 limit_info = dict(
ec8790d @chapmanb Provide simple non-iterating interface to GFF; clean up methods for l…
authored
416 gff_source_type = [('snoRNA', 'exon')]
c314940 @chapmanb Add GFF2 parsing capability with Tests; update gff_to_biosql script t…
authored
417 )
da864d9 @chapmanb Simplify top level GFF parsing and writing with high level functions.
authored
418 rec_dict = SeqIO.to_dict(GFF.parse(self._ensembl_file,
b2abb08 Refactor interface to be more Biopython-like with parse function; upd…
Brad Chapman authored
419 limit_info=limit_info))
420 work_rec = rec_dict['I']
421 assert len(work_rec.features) == 1
422 test_feature = work_rec.features[0]
c314940 @chapmanb Add GFF2 parsing capability with Tests; update gff_to_biosql script t…
authored
423 qual_keys = test_feature.qualifiers.keys()
424 qual_keys.sort()
20007fd Provide iterated parsing interface for GFF; support nested features w…
Brad Chapman authored
425 assert qual_keys == ['Parent', 'exon_number', 'gene_id', 'gene_name',
426 'source', 'transcript_id', 'transcript_name']
c314940 @chapmanb Add GFF2 parsing capability with Tests; update gff_to_biosql script t…
authored
427 assert test_feature.qualifiers['source'] == ['snoRNA']
428 assert test_feature.qualifiers['transcript_name'] == ['NR_001477.2']
429 assert test_feature.qualifiers['exon_number'] == ['1']
430
431 def t_tricky_semicolons(self):
432 """Parsing of tricky semi-colon positions in WormBase GFF2.
433 """
434 limit_info = dict(
ec8790d @chapmanb Provide simple non-iterating interface to GFF; clean up methods for l…
authored
435 gff_source_type = [('Genomic_canonical', 'region')]
c314940 @chapmanb Add GFF2 parsing capability with Tests; update gff_to_biosql script t…
authored
436 )
da864d9 @chapmanb Simplify top level GFF parsing and writing with high level functions.
authored
437 rec_dict = SeqIO.to_dict(GFF.parse(self._wormbase_file,
b2abb08 Refactor interface to be more Biopython-like with parse function; upd…
Brad Chapman authored
438 limit_info=limit_info))
439 work_rec = rec_dict['I']
440 assert len(work_rec.features) == 1
441 test_feature = work_rec.features[0]
c314940 @chapmanb Add GFF2 parsing capability with Tests; update gff_to_biosql script t…
authored
442 assert test_feature.qualifiers['Note'] == \
32cdd10 @chapmanb Handle unescaped semi-colons within quotes and add test case. #83
authored
443 ['Clone cTel33B; Genbank AC199162', 'Clone cTel33B; Genbank AC199162'], test_feature.qualifiers["Note"]
444
445 def t_unescaped_semicolons(self):
446 """Parse inputs with unescaped semi-colons.
447 This is a band-aid to not fail rather than correct parsing, since
448 the combined feature will not be maintained.
449 """
450 f = os.path.join(self._test_dir, "unescaped-semicolon.gff3")
451 rec_dict = SeqIO.to_dict(GFF.parse(f))
452 f = rec_dict['chr1'].features[0]
453 assert f.qualifiers["Description"][0].startswith('osFTL6')
454 assert f.qualifiers["Description"][0].endswith('protein, expressed')
c314940 @chapmanb Add GFF2 parsing capability with Tests; update gff_to_biosql script t…
authored
455
456 def t_jgi_gff(self):
20007fd Provide iterated parsing interface for GFF; support nested features w…
Brad Chapman authored
457 """Parsing of JGI formatted GFF2, nested using transcriptId and proteinID
c314940 @chapmanb Add GFF2 parsing capability with Tests; update gff_to_biosql script t…
authored
458 """
da864d9 @chapmanb Simplify top level GFF parsing and writing with high level functions.
authored
459 rec_dict = SeqIO.to_dict(GFF.parse(self._jgi_file))
b2abb08 Refactor interface to be more Biopython-like with parse function; upd…
Brad Chapman authored
460 tfeature = rec_dict['chr_1'].features[0]
c314940 @chapmanb Add GFF2 parsing capability with Tests; update gff_to_biosql script t…
authored
461 assert tfeature.location.nofuzzy_start == 37060
20007fd Provide iterated parsing interface for GFF; support nested features w…
Brad Chapman authored
462 assert tfeature.location.nofuzzy_end == 38216
463 assert tfeature.type == 'inferred_parent'
464 assert len(tfeature.sub_features) == 6
465 sfeature = tfeature.sub_features[1]
466 assert sfeature.qualifiers['proteinId'] == ['873']
467 assert sfeature.qualifiers['phase'] == ['0']
c314940 @chapmanb Add GFF2 parsing capability with Tests; update gff_to_biosql script t…
authored
468
20007fd Provide iterated parsing interface for GFF; support nested features w…
Brad Chapman authored
469 def t_ensembl_nested_features(self):
470 """Test nesting of features with GFF2 files using transcript_id.
471 """
da864d9 @chapmanb Simplify top level GFF parsing and writing with high level functions.
authored
472 rec_dict = SeqIO.to_dict(GFF.parse(self._ensembl_file))
20007fd Provide iterated parsing interface for GFF; support nested features w…
Brad Chapman authored
473 assert len(rec_dict["I"].features) == 2
474 t_feature = rec_dict["I"].features[0]
475 assert len(t_feature.sub_features) == 32
476
def t_wormbase_nested_features(self):
    """WormBase GFF2: nesting driven by the Transcript attribute alone.
    """
    records_by_id = SeqIO.to_dict(GFF.parse(self._wormbase_file))
    assert len(records_by_id) == 3
    chrom_features = records_by_id["I"].features

    # Exactly one real Transcript parent should exist ...
    transcripts = [feat for feat in chrom_features
                   if feat.type == "Transcript"]
    assert len(transcripts) == 1
    # ... and no placeholder parent should have been created.
    inferred = [feat for feat in chrom_features
                if feat.type == "inferred_parent"]
    assert len(inferred) == 0

    transcript = transcripts[0]
    assert transcript.qualifiers["WormPep"][0] == "WP:CE40797"
    assert len(transcript.sub_features) == 46
491
3cd5760 @chapmanb Update to work with python2.4; include parsing of alternative GFF2 ne…
authored
def t_wb_cds_nested_features(self):
    """GFF2 nesting where children point at a flat CDS key/value pair.
    """
    records_by_id = SeqIO.to_dict(GFF.parse(self._wb_alt_file))
    assert len(records_by_id) == 2
    # NOTE(review): the record is selected by its position in the dict's
    # values, so this relies on dictionary ordering -- confirm that is
    # stable for the two ids in the fixture file.
    second_record = list(records_by_id.values())[1]
    top_level = second_record.features
    assert len(top_level) == 1
    parent = top_level[0]
    assert parent.id == "cr01.sctg102.wum.2.1"
    assert len(parent.sub_features) == 7
502
20007fd Provide iterated parsing interface for GFF; support nested features w…
Brad Chapman authored
def t_gff2_iteration(self):
    """Iterate over a GFF2 file in chunks, breaking where no parents span.
    """
    # Ask the parser for roughly 15 lines at a time and collect every
    # record it yields.
    chunks = list(GFF.parse(self._wormbase_file, target_lines=15))
    assert len(chunks) == 4
    leading_features = chunks[0].features
    assert leading_features[0].type == 'region'
    assert leading_features[1].type == 'SAGE_tag'
    assert len(leading_features[2].sub_features) == 29
c314940 @chapmanb Add GFF2 parsing capability with Tests; update gff_to_biosql script t…
authored
513
8b591ba @chapmanb Add support for directives and FASTA in GFF3
authored
class DirectivesTest(unittest.TestCase):
    """Checks handling of GFF3 directives and other file-level meta-data.
    """
    def setUp(self):
        # Fixture files live in a "GFF" directory next to this module.
        here = os.path.dirname(__file__)
        self._test_dir = os.path.join(here, "GFF")
        self._gff_file = os.path.join(self._test_dir, "hybrid1.gff3")

    def t_basic_directives(self):
        """Top-level directives end up in the record's annotations.
        """
        records = SeqIO.to_dict(GFF.parse(self._gff_file))
        annotations = records['chr17'].annotations
        # Checked in order; the sequence-region coordinates are the values
        # the parser is expected to report for the fixture file.
        expected = [
            ('gff-version', ['3']),
            ('attribute-ontology', ['baz']),
            ('feature-ontology', ['bar']),
            ('source-ontology', ['boo']),
            ('sequence-region', [('foo', 0, 100),
                                 ('chr17', 62467933, 62469545)]),
        ]
        for directive, value in expected:
            assert annotations[directive] == value

    def t_fasta_directive(self):
        """FASTA sequence embedded in the GFF3 file is attached to the record.
        """
        records = SeqIO.to_dict(GFF.parse(self._gff_file))
        assert len(records) == 1
        chr17 = records['chr17']
        assert str(chr17.seq) == "GATTACAGATTACA"

    def t_examiner_with_fasta(self):
        """High level examination works on files with FASTA directives.
        """
        examiner = GFFExaminer()
        parent_children = examiner.parent_child_map(self._gff_file)
        assert parent_children[('UCSC', 'mRNA')] == [('UCSC', 'CDS')]

        limits = examiner.available_limits(self._gff_file)
        first_id_key = list(limits['gff_id'].keys())[0]
        assert first_id_key[0] == 'chr17'
        source_types = sorted(limits['gff_source_type'].keys())
        assert source_types == [('UCSC', 'CDS'), ('UCSC', 'mRNA')]
8b591ba @chapmanb Add support for directives and FASTA in GFF3
authored
551
bacaa8b @chapmanb Initial implementation of GFF3 output writer
authored
class OutputTest(unittest.TestCase):
    """Tests to write SeqFeatures to GFF3 output format.
    """
    def setUp(self):
        # Fixture files live in a "GFF" directory next to this module.
        self._test_dir = os.path.join(os.path.dirname(__file__), "GFF")
        self._test_seq_file = os.path.join(self._test_dir,
                "c_elegans_WS199_dna_shortened.fa")
        self._test_gff_file = os.path.join(self._test_dir,
                "c_elegans_WS199_shortened_gff.txt")
        self._test_gff_ann_file = os.path.join(self._test_dir,
                "c_elegans_WS199_ann_gff.txt")
        self._wormbase_file = os.path.join(self._test_dir, "wormbase_gff2.txt")

    def _build_gene_record(self):
        """Return a 20bp SeqRecord "ID1" carrying one top-level gene feature.

        Shared by the writer tests so each starts from the same input.
        """
        seq = Seq("GATCGATCGATCGATCGATC")
        rec = SeqRecord(seq, "ID1")
        qualifiers = {"source": "prediction", "score": 10.0,
                      "other": ["Some", "annotations"], "ID": "gene1"}
        rec.features = [SeqFeature(FeatureLocation(0, 20), type="gene",
                                   strand=1, qualifiers=qualifiers)]
        return rec

    def t_gff3_to_gff3(self):
        """Read in and write out GFF3 without any loss of information.
        """
        recs = SeqIO.to_dict(GFF.parse(self._test_gff_file))
        out_handle = StringIO.StringIO()
        GFF.write(recs.values(), out_handle)
        wrote_handle = StringIO.StringIO(out_handle.getvalue())
        recs_two = SeqIO.to_dict(GFF.parse(wrote_handle))

        # Compare an original record with its re-parsed counterpart. The
        # counterpart is looked up by id so both sides describe the same
        # sequence regardless of dictionary ordering. (Previously both
        # sides were taken from ``recs``, so the round trip was never
        # actually checked.)
        orig_rec = list(recs.values())[0]
        re_rec = recs_two[orig_rec.id]
        assert len(orig_rec.features) == len(re_rec.features)
        for orig_f, re_f in zip(orig_rec.features, re_rec.features):
            assert str(orig_f) == str(re_f)

    def t_gff2_to_gff3(self):
        """Read in GFF2 and write out as GFF3.
        """
        recs = SeqIO.to_dict(GFF.parse(self._wormbase_file))
        out_handle = StringIO.StringIO()
        GFF.write(recs.values(), out_handle)
        wrote_handle = StringIO.StringIO(out_handle.getvalue())
        # check some tricky lines in the GFF2 file
        checks = 0
        for line in wrote_handle:
            if "Interpolated_map_position" in line:
                checks += 1
                assert "RFLP=No" in line
            if "Gene=WBGene00000138" in line:
                checks += 1
                assert "ID=B0019.1" in line
            if "translated_nucleotide_match\t12762127" in line:
                checks += 1
                assert "Note=MSP:FADFSPLDVSDVNFATDDLAK" in line
        # Each of the three marker lines must have been seen exactly once.
        assert checks == 3, "Missing check line"

    def t_write_from_recs(self):
        """Write out GFF3 from SeqRecord inputs.
        """
        rec = self._build_gene_record()
        top_feature = rec.features[0]
        # Both exons deliberately share one qualifiers dictionary.
        sub_qualifiers = {"source": "prediction"}
        top_feature.sub_features = [
            SeqFeature(FeatureLocation(0, 5), type="exon", strand=1,
                       qualifiers=sub_qualifiers),
            SeqFeature(FeatureLocation(15, 20), type="exon", strand=1,
                       qualifiers=sub_qualifiers)]
        out_handle = StringIO.StringIO()
        GFF.write([rec], out_handle)
        wrote_info = out_handle.getvalue().split("\n")
        assert wrote_info[0] == "##gff-version 3"
        assert wrote_info[1] == "##sequence-region ID1 1 20"
        assert wrote_info[2].split("\t") == ['ID1', 'prediction', 'gene', '1',
                                             '20', '10.0', '+', '.',
                                             'ID=gene1;other=Some,annotations']
        assert wrote_info[3].split("\t") == ['ID1', 'prediction', 'exon', '1',
                                             '5', '.', '+', '.',
                                             'Parent=gene1']

    def t_write_fasta(self):
        """Include FASTA records in GFF output.
        """
        rec = self._build_gene_record()
        out_handle = StringIO.StringIO()
        GFF.write([rec], out_handle, include_fasta=True)
        wrote_info = out_handle.getvalue().split("\n")
        # Lines 0-2 are the two header directives and the gene line; the
        # FASTA section follows.
        fasta_parts = wrote_info[3:]
        assert fasta_parts[0] == "##FASTA"
        assert fasta_parts[1] == ">ID1 <unknown description>"
        assert fasta_parts[2] == str(rec.seq)

    def t_write_seqrecord(self):
        """Write single SeqRecords.
        """
        rec = self._build_gene_record()
        out_handle = StringIO.StringIO()
        # NOTE(review): the docstring talks about a single SeqRecord, yet a
        # one-element list is passed here, exactly as in t_write_fasta.
        # Confirm whether a bare record was intended before changing it.
        GFF.write([rec], out_handle, include_fasta=True)
        wrote_info = out_handle.getvalue().split("\n")
        gff_line = wrote_info[2]
        assert gff_line.split("\t")[0] == "ID1"
659
c690f32 @chapmanb bcbio-gff: version 0.5 with test for whitespace issues. Fixes #88
authored
660
1a00b56 @chapmanb Initial implementation of GFF parsing to Biopython SeqFeatures.
authored
def run_tests(argv):
    """Run the whole suite verbosely on stdout and report an exit status.

    ``argv`` is accepted so the function can be handed ``sys.argv``; it is
    not consulted. Returns 0 when every test passed and 1 otherwise, so
    that ``sys.exit(run_tests(...))`` signals failures to the caller
    instead of always exiting successfully.
    """
    test_suite = testing_suite()
    runner = unittest.TextTestRunner(sys.stdout, verbosity=2)
    result = runner.run(test_suite)
    if result.wasSuccessful():
        return 0
    return 1
665
def testing_suite():
    """Assemble one suite from every test case class in this module.
    """
    loader = unittest.TestLoader()
    # Test methods in this file are named t_* rather than test_*.
    loader.testMethodPrefix = 't_'
    # Trim this tuple (e.g. to just GFF3Test) to run a subset while debugging.
    case_classes = (GFF3Test, MapReduceGFFTest, SolidGFFTester, GFF2Tester,
                    DirectivesTest, OutputTest)
    suite = unittest.TestSuite()
    for case_class in case_classes:
        suite.addTest(loader.loadTestsFromTestCase(case_class))
    return suite
679
# Script entry point: run the suite and hand its result to the interpreter
# as the process exit status.
if __name__ == "__main__":
    exit_status = run_tests(sys.argv)
    sys.exit(exit_status)
Something went wrong with that request. Please try again.