Skip to content

Commit

Permalink
refactored assembly class to be parser agnostic
Browse files Browse the repository at this point in the history
  • Loading branch information
chmille4 committed Dec 27, 2010
1 parent 68483f6 commit 50f3f06
Show file tree
Hide file tree
Showing 7 changed files with 278 additions and 229 deletions.
196 changes: 41 additions & 155 deletions lib/bio-assembly.rb
@@ -1,169 +1,55 @@

require 'bio/sequence'
require 'bio/sequence'
require 'bio-assembly/contig'
require 'bio-assembly/read'

module Bio

class Assembly
attr_accessor :contigs

def initialize(path)
@file = File.new(path, 'r')
@contigs = Array.new
parse_as
end

def contigs
# use each_contig to stream large files
parse_whole_file if @contigs.empty?
@contigs
end

def each_contig
# check if file is already parsed
if @total_num_contigs.to_i == @contigs.size
@contigs.each{ |contig| yield contig }
else
each_identifier do |identifier, attrs|
next unless identifier == 'CO'
contig = parse_contig(attrs)
@contigs.push contig
yield(contig)
end
class Assembly
attr_accessor :contigs

@@formats = { }

def self.create(path, format)
streamer = @@formats[format]
if streamer
streamer.new(path)
else
raise "Format type '#{format}' is not supported"
end
end
end

def to_ace
ace = ""
ace += "AS " + num_contigs.to_s + " " + num_reads.to_s + "\n\n"
each_contig { |contig| ace += contig.to_ace + "\n" }
ace
end

private

def parse_contig(attrs)
contig = Bio::Assembly::Contig.new
contig.name, base_num, @num_reads, base_segments_num, contig.orientation = attrs.split(" ")
# keep track of the number of RD identifiers parsed
@num_rds_parsed = 0

# get sequence
seq = @file.gets("\n\n").tr(" \r\n", "")
contig.seq = seq

# loop through identifiers (e.g AF, RD, etc)
each_identifier do |identifier, attrs|
case identifier
when "BQ" then parse_bq(contig)
when "AF" then parse_af(contig, attrs)
when "BS" then parse_bs(contig, attrs)
when "RD" then parse_rd(contig, attrs); break if @num_rds_parsed == @num_reads.to_i
when "WR" then parse_wr(contig, attrs)
when "RT" then parse_rt(contig, attrs)
when "CT" then parse_ct(contig, attrs)
when "WA" then parse_wa(contig, attrs)
end
def self.register_parser name
@@formats[name] = self
end

contig
end

# Finds the next_identifier
def each_identifier
@file.each do |line|
next if line !~ /^[ABCDQRW][ADFOQRST][\s\n].*/
yield(line[0..1], line[3..-1])

def contigs
# use each_contig to stream large files
parse_whole_file if @contigs.empty?
@contigs
end
end

# parse assembly meta data
def parse_as
line = @file.gets
identifier, @total_num_contigs, total_num_reads = line.split(" ")
end

# parse contig sequence quality data
def parse_bq(contig)
contig.quality = @file.gets("\n\n").tr("\r\n", "").gsub(/^\s/, "").split(' ')
end

# parse read meta data
def parse_af(contig, attrs)
read = Bio::Assembly::Read.new
read.name , read.orientation, read.from = attrs.split(" ")
contig.add_read read
end

# parse base sequence data
def parse_bs(contig, attrs)
from, to, read_name = attrs.split(" ")
read = contig.find_read_by_name( read_name )
read.add_base_sequence(from, to, read_name)
end

# parse read sequence and position data
def parse_rd(contig, attrs)
# increment counter
@num_rds_parsed += 1

# parse read
read_name, num_padded_bases, num_read_infos, num_read_tags = attrs.split(" ")
seq = @file.gets("\n\n").tr( " \r\n", "")

# get read with matching name
read = contig.find_read_by_name( read_name )
read.seq = seq
read.to = read.from.to_i + read.seq.length
# set read.to to contig length if read runs off contig
read.to = contig.seq.length if read.to > contig.seq.length

# if present parse QA and DS associated with this read
each_identifier do |identifier, attrs|
case identifier
when "QA" then parse_qa(read, attrs)
when "DS" then parse_ds(read, attrs); break
end
def each_contig
# implemented by each format subclass
end

private

def num_contigs
contigs.size
end

end

# parse a read's clear ranges (the part of the read that contributes to the contig)
def parse_qa(read, attrs)
start, stop, clear_range_from, clear_range_to = attrs.split(" ")
read.clear_range_from = clear_range_from
read.clear_range_to = clear_range_to
end

# parse file data - ignored
def parse_ds(read, attrs)
end

# parse run meta data - ignored
def parse_wa(contig, attrs)
end

# parse run meta data - ignored
def parse_ct(contig, attrs)
end

def num_contigs
contigs.size
end

def num_reads
read_num = 0
each_contig { |contig| read_num += contig.num_reads }
read_num
end

def parse_whole_file
each_contig { |x| 1 }
end

end

end
def num_reads
read_num = 0
each_contig { |contig| read_num += contig.num_reads }
read_num
end

def parse_whole_file
each_contig { |x| 1 }
end

end

end

require 'bio-assembly/ace'
146 changes: 146 additions & 0 deletions lib/bio-assembly/ace.rb
@@ -0,0 +1,146 @@

module Bio
class Assembly

class Ace < Bio::Assembly

# register parser with superclass
register_parser :ace

def initialize(path)
@file = File.new(path, 'r')
@contigs = Array.new
parse_as
end

def each_contig
# check if file is already parsed
if @total_num_contigs.to_i == @contigs.size
@contigs.each{ |contig| yield contig }
else
each_identifier do |identifier, attrs|
next unless identifier == 'CO'
contig = parse_contig(attrs)
@contigs.push contig
yield(contig)
end
end
end

def to_ace
ace = ""
ace += "AS " + num_contigs.to_s + " " + num_reads.to_s + "\n\n"
each_contig { |contig| ace += contig.to_ace + "\n" }
ace
end

private
def parse_contig(attrs)
contig = Bio::Assembly::Contig.new
contig.name, base_num, @num_reads, base_segments_num, contig.orientation = attrs.split(" ")
# keep track of the number of RD identifiers parsed
@num_rds_parsed = 0

# get sequence
seq = @file.gets("\n\n").tr(" \r\n", "")
contig.seq = seq

# loop through identifiers (e.g AF, RD, etc)
each_identifier do |identifier, attrs|
case identifier
when "BQ" then parse_bq(contig)
when "AF" then parse_af(contig, attrs)
when "BS" then parse_bs(contig, attrs)
when "RD" then parse_rd(contig, attrs); break if @num_rds_parsed == @num_reads.to_i
when "WR" then parse_wr(contig, attrs)
when "RT" then parse_rt(contig, attrs)
when "CT" then parse_ct(contig, attrs)
when "WA" then parse_wa(contig, attrs)
end
end

contig
end

# Finds the next_identifier
def each_identifier
@file.each do |line|
next if line !~ /^[ABCDQRW][ADFOQRST][\s\n].*/
yield(line[0..1], line[3..-1])
end
end

# parse assembly meta data
def parse_as
line = @file.gets
identifier, @total_num_contigs, total_num_reads = line.split(" ")
end

# parse contig sequence quality data
def parse_bq(contig)
contig.quality = @file.gets("\n\n").tr("\r\n", "").gsub(/^\s/, "").split(' ')
end

# parse read meta data
def parse_af(contig, attrs)
read = Bio::Assembly::Read.new
read.name , read.orientation, read.from = attrs.split(" ")
contig.add_read read
end

# parse base sequence data
def parse_bs(contig, attrs)
from, to, read_name = attrs.split(" ")
read = contig.find_read_by_name( read_name )
read.add_base_sequence(from, to, read_name)
end

# parse read sequence and position data
def parse_rd(contig, attrs)
# increment counter
@num_rds_parsed += 1

# parse read
read_name, num_padded_bases, num_read_infos, num_read_tags = attrs.split(" ")
seq = @file.gets("\n\n").tr( " \r\n", "")

# get read with matching name
read = contig.find_read_by_name( read_name )
read.seq = seq
read.to = read.from.to_i + read.seq.length
# set read.to to contig length if read runs off contig
read.to = contig.seq.length if read.to > contig.seq.length

# if present parse QA and DS associated with this read
each_identifier do |identifier, attrs|
case identifier
when "QA" then parse_qa(read, attrs)
when "DS" then parse_ds(read, attrs); break
end
end

end

# parse a read's clear ranges (the part of the read that contributes to the contig)
def parse_qa(read, attrs)
start, stop, clear_range_from, clear_range_to = attrs.split(" ")
read.clear_range_from = clear_range_from
read.clear_range_to = clear_range_to
end

# parse file data - ignored
def parse_ds(read, attrs)
end

# parse run meta data - ignored
def parse_wa(contig, attrs)
end

# parse run meta data - ignored
def parse_ct(contig, attrs)
end

end # => end class Ace

end # => end module Assembly
end # => end module Bio

0 comments on commit 50f3f06

Please sign in to comment.