From d9edf2a74e4d453b583ef255d1b0559c83c296c9 Mon Sep 17 00:00:00 2001 From: georgeG Date: Mon, 11 Mar 2013 15:42:17 +0300 Subject: [PATCH] write the parser --- lib/bio-cd-hit-report.rb | 12 ++++++- lib/bio-cd-hit-report/cd-hit-report.rb | 49 +++++--------------------- lib/bio-cd-hit-report/cluster.rb | 30 ++++++++-------- lib/bio-cd-hit-report/parser.rb | 19 ++++++++++ 4 files changed, 54 insertions(+), 56 deletions(-) create mode 100644 lib/bio-cd-hit-report/parser.rb diff --git a/lib/bio-cd-hit-report.rb b/lib/bio-cd-hit-report.rb index 8f7c311..d0f5c5e 100644 --- a/lib/bio-cd-hit-report.rb +++ b/lib/bio-cd-hit-report.rb @@ -1 +1,11 @@ -require_relative "bio-cd-hit-report/cd-hit-report" +require_relative 'bio-cd-hit-report/cd-hit-report' + +#report = Bio::CdHitReport.new('bin/test.clstr') +##report.report_file = 'bin/test.clstr' +#report.parse.each do |cluster| + #puts cluster.id + ##puts cluster.members + #puts cluster.rep_seq +#end + +#puts report.parse.length diff --git a/lib/bio-cd-hit-report/cd-hit-report.rb b/lib/bio-cd-hit-report/cd-hit-report.rb index 12d6b05..6f10ca1 100644 --- a/lib/bio-cd-hit-report/cd-hit-report.rb +++ b/lib/bio-cd-hit-report/cd-hit-report.rb @@ -1,49 +1,16 @@ module Bio - - require_relative 'cluster.rb' + require_relative 'cluster' + require_relative 'parser' class CdHitReport - def initialize(file) @file = file end - def each_cluster(&block) - cluster_objs.each(&block) - end - - def total_clusters - cluster_objs.size - end - - def get_cluster(name) - cluster_objs.select{|cluster| cluster.name == name.to_s}.pop.members - end - - def max_members - cluster_objs.map{|c|c.size}.max + def parse + report = CdHitParser.new + report.report_file = @file + report end - - def min_members - cluster_objs.map{|c| c.size}.min - end - - private - def cluster_objs - d = raw_data.map do |line| - cluster = line.split("\n").delete_if{|x| x == ">Cluster "} - id = cluster.first - cluster.shift - #puts id.inspect - Cluster.new(id,cluster) - end - d.delete_if {|obj| obj.id.nil?} - end - - - def raw_data - File.open(@file).readlines - end - - end #class -end #module + end +end diff --git a/lib/bio-cd-hit-report/cluster.rb b/lib/bio-cd-hit-report/cluster.rb index 14c67fd..c883a55 100644 --- a/lib/bio-cd-hit-report/cluster.rb +++ b/lib/bio-cd-hit-report/cluster.rb @@ -1,28 +1,30 @@ +class Cluster + attr_accessor :name, :data -class Cluster < Struct.new(:name,:data) - $/ = ">Cluster " - - def id - name + def initialize(arg={}) + @name = arg[:name] + @data = arg[:data] end - def size - entries.size + def id + name.scan(/Cluster\s(.)/).join end def members entries.join(',') end - def get_seqs(file) - seqs = Bio::FlatFile.auto(file).map{ |f| f} - puts entries.map{|entry| seqs.select {|seq| seq.definition == entry }} + def representative + @data.split("\n").map{|line|line.scan(/>(.+)\.{3}\s\*/)}.flatten end + alias :rep_seq :representative - private - def entries - data.map {|entry| entry.scan(/>(.+)\.{3}/)}.flatten + def size + entries.size end + alias :length :size + def entries + @data.split("\n").map{|line|line.scan(/>(.+)\.{3}/)} + end end - diff --git a/lib/bio-cd-hit-report/parser.rb b/lib/bio-cd-hit-report/parser.rb new file mode 100644 index 0000000..294f8f7 --- /dev/null +++ b/lib/bio-cd-hit-report/parser.rb @@ -0,0 +1,19 @@ +class CdHitParser + attr_accessor :report_file + + include Enumerable + + def each + data,header = nil, nil + File.open(report_file).each do |line| + if line[0].chr == '>' + yield Cluster.new(:name => header,:data => data) if data + data = '' + header = line[1..-1].strip + else + data << line + end + end + yield Cluster.new(:name => header, :data => data) + end +end