diff --git a/lib/bio-assembly.rb b/lib/bio-assembly.rb index 6dbd894..7c950b6 100644 --- a/lib/bio-assembly.rb +++ b/lib/bio-assembly.rb @@ -1,169 +1,55 @@ - -require 'bio/sequence' +require 'bio/sequence' require 'bio-assembly/contig' require 'bio-assembly/read' module Bio - class Assembly - attr_accessor :contigs - - def initialize(path) - @file = File.new(path, 'r') - @contigs = Array.new - parse_as - end - - def contigs - # use each_contig to stream large files - parse_whole_file if @contigs.empty? - @contigs - end - - def each_contig - # check if file is already parsed - if @total_num_contigs.to_i == @contigs.size - @contigs.each{ |contig| yield contig } - else - each_identifier do |identifier, attrs| - next unless identifier == 'CO' - contig = parse_contig(attrs) - @contigs.push contig - yield(contig) - end + class Assembly + attr_accessor :contigs + + @@formats = { } + + def self.create(path, format) + streamer = @@formats[format] + if streamer + streamer.new(path) + else + raise "Format type '#{format}' is not supported" + end end - end - - def to_ace - ace = "" - ace += "AS " + num_contigs.to_s + " " + num_reads.to_s + "\n\n" - each_contig { |contig| ace += contig.to_ace + "\n" } - ace - end - - private - - def parse_contig(attrs) - contig = Bio::Assembly::Contig.new - contig.name, base_num, @num_reads, base_segments_num, contig.orientation = attrs.split(" ") - # keep track of the number of RD identifiers parsed - @num_rds_parsed = 0 - # get sequence - seq = @file.gets("\n\n").tr(" \r\n", "") - contig.seq = seq - - # loop through identifiers (e.g AF, RD, etc) - each_identifier do |identifier, attrs| - case identifier - when "BQ" then parse_bq(contig) - when "AF" then parse_af(contig, attrs) - when "BS" then parse_bs(contig, attrs) - when "RD" then parse_rd(contig, attrs); break if @num_rds_parsed == @num_reads.to_i - when "WR" then parse_wr(contig, attrs) - when "RT" then parse_rt(contig, attrs) - when "CT" then parse_ct(contig, attrs) - when "WA" then parse_wa(contig, attrs) - end + def self.register_parser name + @@formats[name] = self end - - contig - end - - # Finds the next_identifier - def each_identifier - @file.each do |line| - next if line !~ /^[ABCDQRW][ADFOQRST][\s\n].*/ - yield(line[0..1], line[3..-1]) + + def contigs + # use each_contig to stream large files + parse_whole_file if @contigs.empty? + @contigs end - end - - # parse assembly meta data - def parse_as - line = @file.gets - identifier, @total_num_contigs, total_num_reads = line.split(" ") - end - - # parse contig sequence quality data - def parse_bq(contig) - contig.quality = @file.gets("\n\n").tr("\r\n", "").gsub(/^\s/, "").split(' ') - end - - # parse read meta data - def parse_af(contig, attrs) - read = Bio::Assembly::Read.new - read.name , read.orientation, read.from = attrs.split(" ") - contig.add_read read - end - - # parse base sequence data - def parse_bs(contig, attrs) - from, to, read_name = attrs.split(" ") - read = contig.find_read_by_name( read_name ) - read.add_base_sequence(from, to, read_name) - end - - # parse read sequence and position data - def parse_rd(contig, attrs) - # increment counter - @num_rds_parsed += 1 - # parse read - read_name, num_padded_bases, num_read_infos, num_read_tags = attrs.split(" ") - seq = @file.gets("\n\n").tr( " \r\n", "") - - # get read with matching name - read = contig.find_read_by_name( read_name ) - read.seq = seq - read.to = read.from.to_i + read.seq.length - # set read.to to contig length if read runs off contig - read.to = contig.seq.length if read.to > contig.seq.length - - # if present parse QA and DS associated with this read - each_identifier do |identifier, attrs| - case identifier - when "QA" then parse_qa(read, attrs) - when "DS" then parse_ds(read, attrs); break - end + def each_contig + # implemented by each format subclass + end + + private + + def num_contigs + contigs.size end - - end - - # parse a read's clear ranges (the part of the read that contributes to the contig) - def parse_qa(read, attrs) - start, stop, clear_range_from, clear_range_to = attrs.split(" ") - read.clear_range_from = clear_range_from - read.clear_range_to = clear_range_to - end - - # parse file data - ignored - def parse_ds(read, attrs) - end - - # parse run meta data - ignored - def parse_wa(contig, attrs) - end - - # parse run meta data - ignored - def parse_ct(contig, attrs) - end - - def num_contigs - contigs.size - end - - def num_reads - read_num = 0 - each_contig { |contig| read_num += contig.num_reads } - read_num - end - - def parse_whole_file - each_contig { |x| 1 } - end - - end -end + def num_reads + read_num = 0 + each_contig { |contig| read_num += contig.num_reads } + read_num + end + def parse_whole_file + each_contig { |x| 1 } + end + + end + +end +require 'bio-assembly/ace' \ No newline at end of file diff --git a/lib/bio-assembly/ace.rb b/lib/bio-assembly/ace.rb new file mode 100644 index 0000000..f9c0319 --- /dev/null +++ b/lib/bio-assembly/ace.rb @@ -0,0 +1,146 @@ + +module Bio +class Assembly + +class Ace < Bio::Assembly + + # register parser with superclass + register_parser :ace + + def initialize(path) + @file = File.new(path, 'r') + @contigs = Array.new + parse_as + end + + def each_contig + # check if file is already parsed + if @total_num_contigs.to_i == @contigs.size + @contigs.each{ |contig| yield contig } + else + each_identifier do |identifier, attrs| + next unless identifier == 'CO' + contig = parse_contig(attrs) + @contigs.push contig + yield(contig) + end + end + end + + def to_ace + ace = "" + ace += "AS " + num_contigs.to_s + " " + num_reads.to_s + "\n\n" + each_contig { |contig| ace += contig.to_ace + "\n" } + ace + end + + private + def parse_contig(attrs) + contig = Bio::Assembly::Contig.new + contig.name, base_num, @num_reads, base_segments_num, contig.orientation = attrs.split(" ") + # keep track of the number of RD identifiers parsed + @num_rds_parsed = 0 + + # get sequence + seq = @file.gets("\n\n").tr(" \r\n", "") + contig.seq = seq + + # loop through identifiers (e.g AF, RD, etc) + each_identifier do |identifier, attrs| + case identifier + when "BQ" then parse_bq(contig) + when "AF" then parse_af(contig, attrs) + when "BS" then parse_bs(contig, attrs) + when "RD" then parse_rd(contig, attrs); break if @num_rds_parsed == @num_reads.to_i + when "WR" then parse_wr(contig, attrs) + when "RT" then parse_rt(contig, attrs) + when "CT" then parse_ct(contig, attrs) + when "WA" then parse_wa(contig, attrs) + end + end + + contig + end + + # Finds the next_identifier + def each_identifier + @file.each do |line| + next if line !~ /^[ABCDQRW][ADFOQRST][\s\n].*/ + yield(line[0..1], line[3..-1]) + end + end + + # parse assembly meta data + def parse_as + line = @file.gets + identifier, @total_num_contigs, total_num_reads = line.split(" ") + end + + # parse contig sequence quality data + def parse_bq(contig) + contig.quality = @file.gets("\n\n").tr("\r\n", "").gsub(/^\s/, "").split(' ') + end + + # parse read meta data + def parse_af(contig, attrs) + read = Bio::Assembly::Read.new + read.name , read.orientation, read.from = attrs.split(" ") + contig.add_read read + end + + # parse base sequence data + def parse_bs(contig, attrs) + from, to, read_name = attrs.split(" ") + read = contig.find_read_by_name( read_name ) + read.add_base_sequence(from, to, read_name) + end + + # parse read sequence and position data + def parse_rd(contig, attrs) + # increment counter + @num_rds_parsed += 1 + + # parse read + read_name, num_padded_bases, num_read_infos, num_read_tags = attrs.split(" ") + seq = @file.gets("\n\n").tr( " \r\n", "") + + # get read with matching name + read = contig.find_read_by_name( read_name ) + read.seq = seq + read.to = read.from.to_i + read.seq.length + # set read.to to contig length if read runs off contig + read.to = contig.seq.length if read.to > contig.seq.length + + # if present parse QA and DS associated with this read + each_identifier do |identifier, attrs| + case identifier + when "QA" then parse_qa(read, attrs) + when "DS" then parse_ds(read, attrs); break + end + end + + end + + # parse a read's clear ranges (the part of the read that contributes to the contig) + def parse_qa(read, attrs) + start, stop, clear_range_from, clear_range_to = attrs.split(" ") + read.clear_range_from = clear_range_from + read.clear_range_to = clear_range_to + end + + # parse file data - ignored + def parse_ds(read, attrs) + end + + # parse run meta data - ignored + def parse_wa(contig, attrs) + end + + # parse run meta data - ignored + def parse_ct(contig, attrs) + end + +end # => end class Ace + +end # => end module Assembly +end # => end module Bio \ No newline at end of file diff --git a/lib/bio-assembly/contig.rb b/lib/bio-assembly/contig.rb index e629e7b..70f7abb 100644 --- a/lib/bio-assembly/contig.rb +++ b/lib/bio-assembly/contig.rb @@ -1,7 +1,10 @@ +require 'bio-assembly/contig/ace' + module Bio class Assembly class Contig + include Bio::Assembly::Contig::Ace attr_accessor :seq, :orientation, :quality, :to, :from, :name, :reads alias consensus_seq seq @@ -59,39 +62,9 @@ def num_base_segments end num_base_sequences end - - def to_ace - ace = "" - ace += ['CO', name, num_bases, num_reads, num_base_segments, orientation].join(' ') + "\n" - ace += seq.to_s.gsub(Regexp.new(".{1,50}"), "\\0\n") + "\n" - ace += "BQ\n" - last_stop = quality.size - 1 - (quality.size/50+1).times do |i| - start = i * 50 - stop = (i+1) * 50 - 1 - stop = last_stop if stop > last_stop - ace += ' ' + quality[start..stop].join(' ') + "\n" - end - ace += "\n" - - # holds BS data for reads - bs_str = "" - # holds RD, QA, and DS data for reads - rest_str = "" - @reads.values.sort.each do |read| - ace += read.to_ace_af - bs_str += read.to_ace_bs - rest_str += read.to_ace_rest - end - - # compile data in correct order - ace += bs_str - ace += "\n" - ace += rest_str - ace - end end end -end \ No newline at end of file +end + diff --git a/lib/bio-assembly/contig/ace.rb b/lib/bio-assembly/contig/ace.rb new file mode 100644 index 0000000..cfc11b0 --- /dev/null +++ b/lib/bio-assembly/contig/ace.rb @@ -0,0 +1,43 @@ +module Bio + class Assembly + class Contig + + # ace specific methods for contig objects + module Ace + + def to_ace + ace = "" + ace += ['CO', name, num_bases, num_reads, num_base_segments, orientation].join(' ') + "\n" + ace += seq.to_s.gsub(Regexp.new(".{1,50}"), "\\0\n") + "\n" + ace += "BQ\n" + last_stop = quality.size - 1 + (quality.size/50+1).times do |i| + start = i * 50 + stop = (i+1) * 50 - 1 + stop = last_stop if stop > last_stop + ace += ' ' + quality[start..stop].join(' ') + "\n" + end + ace += "\n" + + # holds BS data for reads + bs_str = "" + # holds RD, QA, and DS data for reads + rest_str = "" + @reads.values.sort.each do |read| + ace += read.to_ace_af + bs_str += read.to_ace_bs + rest_str += read.to_ace_rest + end + + # compile data in correct order + ace += bs_str + ace += "\n" + ace += rest_str + ace + end + + end + + end + end +end \ No newline at end of file diff --git a/lib/bio-assembly/read.rb b/lib/bio-assembly/read.rb index 17bdd81..9d10418 100644 --- a/lib/bio-assembly/read.rb +++ b/lib/bio-assembly/read.rb @@ -34,24 +34,7 @@ def clear_range_from=(new_clear_range_from) def clear_range_to=(new_clear_range_to) @clear_range_to = new_clear_range_to.to_i end - - def to_ace - ace += "" - # holds BS data for reads - bs_str = "" - # holds RD, QA, and DS data for reads - rest_str = "" - ace += to_ace_af - bs_str += to_ace_bs - rest_str = to_ace_rest - - # compile data in correct order - ace += bs_str - ace += "\n" - ace += rest_str - ace - end - + def <=>(other) unless other.kind_of?(Bio::Assembly::Read) raise "[Error] markers are not comparable" @@ -64,29 +47,6 @@ def <=>(other) end end - def to_ace_bs - bs_str = "" - unless base_sequences.nil? - base_sequences.each do |bs| - bs_str += ['BS', bs.from, bs.to, bs.read_name].join(' ') + "\n" - end - end - bs_str - end - - def to_ace_af - ['AF', name, orientation, from].join(' ') + "\n" - end - - def to_ace_rest - rest_str = "" - rest_str += ['RD', name, num_bases, 0, 0].join(' ') + "\n" - rest_str += seq.to_s.gsub(Regexp.new(".{1,50}"), "\\0\n") + "\n" - rest_str += ['QA', clear_range_from, clear_range_to, clear_range_from, clear_range_to].join(' ') + "\n" - rest_str += ['DS', 'CHROMAT_FILE:', name, 'PHD_FILE:', "#{name}.phd.1", 'TIME:', Time.now].join(' ') + "\n" - rest_str - end - end end diff --git a/lib/bio-assembly/read/ace.rb b/lib/bio-assembly/read/ace.rb index 040b2e8..d9fa1bd 100644 --- a/lib/bio-assembly/read/ace.rb +++ b/lib/bio-assembly/read/ace.rb @@ -2,9 +2,50 @@ module Bio class Assembly class Read + # ace specific methods for read objects module Ace attr_accessor :base_sequences + def to_ace + ace += "" + # holds BS data for reads + bs_str = "" + # holds RD, QA, and DS data for reads + rest_str = "" + ace += to_ace_af + bs_str += to_ace_bs + rest_str = to_ace_rest + + # compile data in correct order + ace += bs_str + ace += "\n" + ace += rest_str + ace + end + + def to_ace_bs + bs_str = "" + unless base_sequences.nil? + base_sequences.each do |bs| + bs_str += ['BS', bs.from, bs.to, bs.read_name].join(' ') + "\n" + end + end + bs_str + end + + def to_ace_af + ['AF', name, orientation, from].join(' ') + "\n" + end + + def to_ace_rest + rest_str = "" + rest_str += ['RD', name, num_bases, 0, 0].join(' ') + "\n" + rest_str += seq.to_s.gsub(Regexp.new(".{1,50}"), "\\0\n") + "\n" + rest_str += ['QA', clear_range_from, clear_range_to, clear_range_from, clear_range_to].join(' ') + "\n" + rest_str += ['DS', 'CHROMAT_FILE:', name, 'PHD_FILE:', "#{name}.phd.1", 'TIME:', Time.now].join(' ') + "\n" + rest_str + end + def add_base_sequence(from, to, read_name) @base_sequences = Array.new if @base_sequences.nil? @base_sequences.push BaseSequence.new(from, to, read_name) diff --git a/test/test_bio-assembly.rb b/test/test_bio-assembly.rb index 5b2034d..3c70501 100644 --- a/test/test_bio-assembly.rb +++ b/test/test_bio-assembly.rb @@ -4,7 +4,7 @@ class TestBioAssembly < Test::Unit::TestCase def setup ace_filename = File.join('data', 'example1.ace') - @obj = Bio::Assembly.new(ace_filename) + @obj = Bio::Assembly.create(ace_filename, :ace) # pick a contig to do in depth tests on @contig = nil