Skip to content

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks.
...
  • 6 commits
  • 38 files changed
  • 0 commit comments
  • 1 contributor
Commits on Jul 27, 2012
@felipecsl Refactoring property locators.
Turning iterator (for_each) into a property locator.
Integration specs are failing. Need to refactor Parser
since it doesn't have select_nodes and locate methods anymore.
Need to refactor the DSL to support new iterator format.
Need to get rid of Crawler#for_each and move that logic into
a property factory.
b5d186e
@felipecsl Adding new specialized locators b4ebff7
@felipecsl Major refactoring in progress 92838a2
Commits on Jul 30, 2012
@felipecsl Integration specs passing. 7 tests failing. Refactoring almost done 23bd1ae
@felipecsl Removing more obsolete code 0b8f821
@felipecsl Removing obsolete specs 5ae021b
Showing with 505 additions and 555 deletions.
  1. +5 −25 lib/wombat/crawler.rb
  2. +18 −0 lib/wombat/dsl/follower.rb
  3. +22 −0 lib/wombat/dsl/iterator.rb
  4. +26 −0 lib/wombat/dsl/metadata.rb
  5. +18 −0 lib/wombat/dsl/property.rb
  6. +50 −0 lib/wombat/dsl/property_container.rb
  7. +0 −5 lib/wombat/follower.rb
  8. +0 −43 lib/wombat/iterator.rb
  9. +0 −24 lib/wombat/metadata.rb
  10. +0 −10 lib/wombat/node_selector.rb
  11. +0 −59 lib/wombat/parser.rb
  12. +12 −0 lib/wombat/processing/node_selector.rb
  13. +47 −0 lib/wombat/processing/parser.rb
  14. +0 −21 lib/wombat/property.rb
  15. +30 −0 lib/wombat/property/locators/base.rb
  16. +39 −0 lib/wombat/property/locators/factory.rb
  17. +12 −0 lib/wombat/property/locators/follow.rb
  18. +14 −0 lib/wombat/property/locators/html.rb
  19. +23 −0 lib/wombat/property/locators/iterator.rb
  20. +17 −0 lib/wombat/property/locators/list.rb
  21. +20 −0 lib/wombat/property/locators/property_container.rb
  22. +22 −0 lib/wombat/property/locators/text.rb
  23. +0 −70 lib/wombat/property_container.rb
  24. +0 −32 lib/wombat/property_locator.rb
  25. +4 −21 spec/crawler_spec.rb
  26. +2 −2 spec/{ → dsl}/property_spec.rb
  27. +18 −19 spec/integration/integration_spec.rb
  28. +0 −52 spec/iterator_spec.rb
  29. +0 −20 spec/metadata_spec.rb
  30. +3 −15 spec/{ → processing}/parser_spec.rb
  31. +18 −0 spec/property/locators/factory_spec.rb
  32. +4 −0 spec/property/locators/follow_spec.rb
  33. +15 −0 spec/property/locators/html_spec.rb
  34. +4 −0 spec/property/locators/iterator_spec.rb
  35. +13 −0 spec/property/locators/list_spec.rb
  36. +49 −0 spec/property/locators/text_spec.rb
  37. +0 −62 spec/property_container_spec.rb
  38. +0 −75 spec/property_locator_spec.rb
View
30 lib/wombat/crawler.rb
@@ -1,13 +1,13 @@
#coding: utf-8
-require 'wombat/metadata'
-require 'wombat/property'
-require 'wombat/parser'
+require 'wombat/dsl/metadata'
+require 'wombat/dsl/property'
+require 'wombat/processing/parser'
require 'active_support'
require 'date'
module Wombat
module Crawler
- include Parser
+ include Processing::Parser
extend ActiveSupport::Concern
def crawl(&block)
@@ -37,37 +37,17 @@ def method_missing(method, *args, &block)
self.class.send method, *args, &block
end
- def for_each(selector, &block)
- self.class.for_each selector, &block
- end
-
- def follow_links(selector, options, &block)
- self.class.follow_links selector, options, &block
- end
-
module ClassMethods
def method_missing(method, *args, &block)
metadata.send method, *args, &block
end
- def for_each(selector, &block)
- metadata.for_each(selector).instance_eval(&block) if block
- end
-
- def follow_links(selector, options, &block)
-
- end
-
- def follow_links(selector)
-
- end
-
def to_ary
end
private
def metadata
- @metadata ||= Metadata.new
+ @metadata ||= DSL::Metadata.new
end
end
end
View
18 lib/wombat/dsl/follower.rb
@@ -0,0 +1,18 @@
+module Wombat
+ module DSL
+ class Follower < PropertyContainer
+ attr_accessor :name, :selector
+
+ def initialize(name, selector)
+ @name = name
+ @selector = selector
+
+ # Explicitly send 0 arguments to superclass constructor
+ super()
+ end
+
+ def parse(context)
+ end
+ end
+ end
+end
View
22 lib/wombat/dsl/iterator.rb
@@ -0,0 +1,22 @@
+require 'wombat/processing/node_selector'
+
+module Wombat
+ module DSL
+ class Iterator < PropertyContainer
+ attr_accessor :name, :selector
+
+ def initialize(name, selector)
+ @selector = selector
+
+ # Explicitly pass only the name to the superclass constructor
+ super(name)
+ end
+
+ # So that Property::Locators::Factory can identify this class
+ # as an iterator property.
+ def format
+ :iterator
+ end
+ end
+ end
+end
View
26 lib/wombat/dsl/metadata.rb
@@ -0,0 +1,26 @@
+#coding: utf-8
+require 'wombat/dsl/property_container'
+require 'wombat/dsl/iterator'
+
+module Wombat
+ module DSL
+ class Metadata < PropertyContainer
+ def initialize
+ self[:document_format] = :html
+ super
+ end
+
+ def base_url(url)
+ self[:base_url] = url
+ end
+
+ def list_page(url)
+ self[:list_page] = url
+ end
+
+ def document_format(format)
+ self[:document_format] = format
+ end
+ end
+ end
+end
View
18 lib/wombat/dsl/property.rb
@@ -0,0 +1,18 @@
+module Wombat
+ module DSL
+ class Property
+ attr_accessor :name, :selector, :format, :namespaces, :callback
+
+ # TODO: This class should receive method_name, args and block
+ # and do the assignment of properties itself, instead of receiving
+ # an options hash.
+ def initialize(options)
+ @name = options[:name]
+ @selector = options[:selector]
+ @format = options[:format] || :text
+ @namespaces = options[:namespaces]
+ @callback = options[:callback]
+ end
+ end
+ end
+end
View
50 lib/wombat/dsl/property_container.rb
@@ -0,0 +1,50 @@
+#coding: utf-8
+
+module Wombat
+ module DSL
+ class PropertyContainer < Hash
+ attr_accessor :name
+
+ def initialize(name = nil)
+ @name = name
+ end
+
+ def method_missing(method, *args, &block)
+ property_name = method.to_s
+
+ if args.empty? && block
+ self[property_name] = PropertyContainer.new(property_name) unless self[property_name]
+ block.call self[property_name]
+ else
+ unless args[1] == :iterator
+ self[property_name] = Property.new(
+ name: property_name,
+ selector: args.first,
+ format: args[1],
+ namespaces: args[2],
+ callback: block)
+ else
+ it = Iterator.new(property_name, args.first)
+ self[property_name] = it
+ it.instance_eval(&block) if block
+ end
+ end
+ end
+
+ def to_ary
+ end
+
+ # So that Property::Locators::Factory can identify this class
+ # as a container property.
+ # TODO: Called by NodeSelector. Fix this
+ def format
+ :container
+ end
+
+ def namespaces
+ # TODO: Called by NodeSelector. Fix this
+ nil
+ end
+ end
+ end
+end
View
5 lib/wombat/follower.rb
@@ -1,5 +0,0 @@
-module Wombat
- class Follower < Iterator
-
- end
-end
View
43 lib/wombat/iterator.rb
@@ -1,43 +0,0 @@
-module Wombat
- # Each iterator property keeps an array
- # with the results of each iteration pass.
- class Iterator < PropertyContainer
- attr_accessor :selector
-
- def initialize(selector)
- @selector = selector
- super()
- end
-
- def parse
- raise ArgumentError.new('Must provide a block to locate property values') unless block_given?
-
- all_properties.each do |p|
- p.result ||= []
- result = yield p
- if result
- result = p.callback ? p.callback.call(result) : result
- p.result << result
- end
- end
- end
-
- def reset
- all_properties.each { |p| p.reset }
- end
-
- def flatten(depth = nil)
- # Determine the iterator array length by the biggest property result array that we have
- length = all_properties.map(&:result).sort { |a| a.length }.last.size
-
- # Allocate an array and fall back to default
- # flatten implementation to fill the resulting hash
- # based on the current property depth.
- Array.new.tap do |a|
- length.times do |i|
- a << super(i)
- end
- end
- end
- end
-end
View
24 lib/wombat/metadata.rb
@@ -1,24 +0,0 @@
-#coding: utf-8
-require 'wombat/property_container'
-require 'wombat/iterator'
-
-module Wombat
- class Metadata < PropertyContainer
- def initialize
- self[:document_format] = :html
- super
- end
-
- def base_url(url)
- self[:base_url] = url
- end
-
- def list_page(url)
- self[:list_page] = url
- end
-
- def document_format(format)
- self[:document_format] = format
- end
- end
-end
View
10 lib/wombat/node_selector.rb
@@ -1,10 +0,0 @@
-module Wombat
- module NodeSelector
- def select_nodes(selector, namespaces = nil)
- return [selector.to_s] if selector.is_a? Symbol
- return context.xpath selector[6..-1], namespaces if selector.start_with? "xpath="
- return context.css selector[4..-1] if selector.start_with? "css="
- [selector]
- end
- end
-end
View
59 lib/wombat/parser.rb
@@ -1,59 +0,0 @@
-#coding: utf-8
-require 'wombat/property_locator'
-require 'mechanize'
-require 'restclient'
-
-module Wombat
- module Parser
- include PropertyLocator
- attr_accessor :mechanize, :context, :response_code, :page
-
- def initialize
- @mechanize = Mechanize.new
- end
-
- def parse(metadata)
- @context = parser_for metadata
- original_context = @context
-
- metadata.iterators.each do |it|
- it.reset # Clean up iterator results before starting
- select_nodes(it.selector).each do |node|
- @context = node
- it.parse { |p| locate p }
- end
- end
-
- @context = original_context
-
- metadata.parse { |p| locate p }
-
- metadata.flatten
- end
-
- private
- def parser_for(metadata)
- url = "#{metadata[:base_url]}#{metadata[:list_page]}"
- page = nil
- parser = nil
- begin
- if metadata[:document_format] == :html
- @page = @mechanize.get(url)
- parser = @page.parser
- else
- @page = RestClient.get(url)
- parser = Nokogiri::XML @page
- end
- @response_code = @page.code.to_i if @page.respond_to? :code
- parser
- rescue
- if $!.respond_to? :http_code
- @response_code = $!.http_code.to_i
- elsif $!.respond_to? :response_code
- @response_code = $!.response_code.to_i
- end
- raise $!
- end
- end
- end
-end
View
12 lib/wombat/processing/node_selector.rb
@@ -0,0 +1,12 @@
+module Wombat
+ module Processing
+ module NodeSelector
+ def select_nodes(selector, namespaces = nil)
+ return [selector.to_s] if selector.is_a? Symbol
+ return @context.xpath selector[6..-1], namespaces if selector.start_with? "xpath="
+ return @context.css selector[4..-1] if selector.start_with? "css="
+ [selector]
+ end
+ end
+ end
+end
View
47 lib/wombat/processing/parser.rb
@@ -0,0 +1,47 @@
+#coding: utf-8
+require 'wombat/property/locators/factory'
+require 'mechanize'
+require 'restclient'
+
+module Wombat
+ module Processing
+ module Parser
+ attr_accessor :mechanize, :context, :response_code, :page
+
+ def initialize
+ @mechanize = Mechanize.new
+ end
+
+ def parse(metadata)
+ @context = parser_for metadata
+
+ Wombat::Property::Locators::Factory.locator_for(metadata, @context).locate
+ end
+
+ private
+ def parser_for(metadata)
+ url = "#{metadata[:base_url]}#{metadata[:list_page]}"
+ page = nil
+ parser = nil
+ begin
+ if metadata[:document_format] == :html
+ @page = @mechanize.get(url)
+ parser = @page.parser
+ else
+ @page = RestClient.get(url)
+ parser = Nokogiri::XML @page
+ end
+ @response_code = @page.code.to_i if @page.respond_to? :code
+ parser
+ rescue
+ if $!.respond_to? :http_code
+ @response_code = $!.http_code.to_i
+ elsif $!.respond_to? :response_code
+ @response_code = $!.response_code.to_i
+ end
+ raise $!
+ end
+ end
+ end
+ end
+end
View
21 lib/wombat/property.rb
@@ -1,21 +0,0 @@
-module Wombat
- class Property
- attr_accessor :name, :selector, :format, :namespaces, :callback, :result
-
- def initialize(options)
- @name = options[:name]
- @selector = options[:selector]
- @format = options[:format] || :text
- @namespaces = options[:namespaces]
- @callback = options[:callback]
- end
-
- def flatten(depth = nil)
- depth ? result[depth] : result
- end
-
- def reset
- self.result = nil
- end
- end
-end
View
30 lib/wombat/property/locators/base.rb
@@ -0,0 +1,30 @@
+#coding: utf-8
+require 'wombat/processing/node_selector'
+
+module Wombat
+ module Property
+ module Locators
+ # Abstract base class
+ class Base
+ include Wombat::Processing::NodeSelector
+
+ def initialize(property, context)
+ @property = property
+ @context = context
+ end
+
+ def locate
+ raw_data = yield if block_given?
+ data = @property.respond_to?(:callback) && @property.callback ? @property.callback.call(raw_data) : raw_data
+
+ @property.name ? { @property.name => data } : data
+ end
+
+ protected
+ def locate_nodes
+ select_nodes @property.selector, @property.namespaces
+ end
+ end
+ end
+ end
+end
View
39 lib/wombat/property/locators/factory.rb
@@ -0,0 +1,39 @@
+#coding: utf-8
+require 'wombat/property/locators/base'
+require 'wombat/property/locators/follow'
+require 'wombat/property/locators/html'
+require 'wombat/property/locators/iterator'
+require 'wombat/property/locators/property_container'
+require 'wombat/property/locators/list'
+require 'wombat/property/locators/text'
+
+class Wombat::Property::Locators::UnknownTypeException < Exception; end;
+
+module Wombat
+ module Property
+ module Locators
+ module Factory
+ def self.locator_for(property, context)
+ klass = case(property.format)
+ when :text
+ Text
+ when :list
+ List
+ when :html
+ Html
+ when :iterator
+ Iterator
+ when :container
+ PropertyContainer
+ when :follow
+ Follow
+ else
+ raise Wombat::Property::Locators::UnknownTypeException.new("Unknown property format #{property.format}.")
+ end
+
+ klass.new(property, context)
+ end
+ end
+ end
+ end
+end
View
12 lib/wombat/property/locators/follow.rb
@@ -0,0 +1,12 @@
+#coding: utf-8
+
+module Wombat
+ module Property
+ module Locators
+ class Follow < Base
+ def locate
+ end
+ end
+ end
+ end
+end
View
14 lib/wombat/property/locators/html.rb
@@ -0,0 +1,14 @@
+#coding: utf-8
+
+module Wombat
+ module Property
+ module Locators
+ class Html < Base
+ def locate
+ node = locate_nodes.first
+ super { node.inner_html.strip }
+ end
+ end
+ end
+ end
+end
View
23 lib/wombat/property/locators/iterator.rb
@@ -0,0 +1,23 @@
+#coding: utf-8
+require 'wombat/property/locators/property_container'
+
+module Wombat
+ module Property
+ module Locators
+ class Iterator < Base
+ def locate
+ super do
+ locate_nodes.flat_map do |node|
+ Hash.new.tap do |h|
+ @property.values
+ .select { |v| v.is_a?(Wombat::DSL::Property) || v.is_a?(Wombat::DSL::PropertyContainer) }
+ .map { |p| Factory.locator_for(p, node).locate }
+ .map { |p| h.merge! p }
+ end
+ end
+ end
+ end
+ end
+ end
+ end
+end
View
17 lib/wombat/property/locators/list.rb
@@ -0,0 +1,17 @@
+#coding: utf-8
+
+module Wombat
+ module Property
+ module Locators
+ class List < Base
+ def locate
+ super do
+ locate_nodes.map do |n|
+ n.is_a?(String) ? n.strip : n.inner_text.strip
+ end
+ end
+ end
+ end
+ end
+ end
+end
View
20 lib/wombat/property/locators/property_container.rb
@@ -0,0 +1,20 @@
+#coding: utf-8
+
+module Wombat
+ module Property
+ module Locators
+ class PropertyContainer < Base
+ def locate
+ super do
+ Hash.new.tap do |h|
+ @property.values
+ .select { |v| v.is_a?(Wombat::DSL::Property) || v.is_a?(Wombat::DSL::PropertyContainer) }
+ .map { |p| Factory.locator_for(p, @context).locate }
+ .map { |p| h.merge! p }
+ end
+ end
+ end
+ end
+ end
+ end
+end
View
22 lib/wombat/property/locators/text.rb
@@ -0,0 +1,22 @@
+#coding: utf-8
+
+module Wombat
+ module Property
+ module Locators
+ class Text < Base
+ def locate
+ node = locate_nodes.first
+
+ value =
+ unless node
+ nil
+ else
+ node.is_a?(String) ? node.strip : node.inner_text.strip
+ end
+
+ super { value }
+ end
+ end
+ end
+ end
+end
View
70 lib/wombat/property_container.rb
@@ -1,70 +0,0 @@
-#coding: utf-8
-
-module Wombat
- class PropertyContainer < Hash
- attr_accessor :iterators
-
- def initialize
- @iterators = []
- end
-
- def method_missing(method, *args, &block)
- if args.empty? && block
- self["#{method.to_s}"] = PropertyContainer.new unless self["#{method.to_s}"]
- block.call(self["#{method.to_s}"])
- else
- self[method.to_s] = Property.new(
- name: method.to_s,
- selector: args.first,
- format: args[1],
- namespaces: args[2],
- callback: block)
- end
- end
-
- def to_ary
- end
-
- def all_properties
- values.flat_map { |v|
- if v.kind_of? PropertyContainer
- v.all_properties
- elsif v.kind_of? Property
- v
- else
- nil
- end
- }.compact
- end
-
- def parse
- all_properties.each do |p|
- result = yield p if block_given?
- p.result = p.callback ? p.callback.call(result) : result
- end
- end
-
- def flatten(depth = nil)
- properties = Hash.new.tap do |h|
- keys.map do |k|
- val = self[k]
- if val.is_a?(PropertyContainer) || val.is_a?(Property)
- h[k] = val.flatten depth
- end
- end
- end
-
- iters = iterators.reduce({}) do |memo, i|
- memo.merge("iterator#{iterators.index(i)}" => i.flatten)
- end
-
- properties.merge iters
- end
-
- def for_each(selector)
- Iterator.new(selector).tap do |i|
- iterators << i
- end
- end
- end
-end
View
32 lib/wombat/property_locator.rb
@@ -1,32 +0,0 @@
-#coding: utf-8
-require 'wombat/node_selector'
-
-module Wombat
- class PropertyLocatorException < Exception; end;
-
- module PropertyLocator
- include NodeSelector
-
- SUPPORTED_PROPERTY_TYPES = [:text, :html, :list, :follow]
-
- def locate(property)
- raise Wombat::PropertyLocatorException.new("Unknown property format #{property.format}: #{property.name}") unless SUPPORTED_PROPERTY_TYPES.include?(property.format)
-
- props = _locate property
- property.format != :list ? props.first : props
- end
-
- private
-
- def _locate(property)
- result = select_nodes(property.selector, property.namespaces).to_a
-
- if property.format == :follow
- result.each { |r| p r }
- end
-
- result.map! {|r| r.inner_html.strip } if property.format == :html
- result.map {|r| r.kind_of?(String) ? r : r.inner_text }.map(&:strip)
- end
- end
-end
View
25 spec/crawler_spec.rb
@@ -30,7 +30,10 @@
e.time Time.now
end
- @crawler.venue { |v| v.name "Scooba" }
+ @crawler.venue do |v|
+ v.name "Scooba"
+ end
+
@crawler.location { |v| v.latitude -50.2323 }
@crawler_instance.should_receive(:parse) do |arg|
@@ -93,26 +96,6 @@
@crawler.event
end
- it 'should iterate on elements inside for_each block' do
- @crawler.for_each "css=.element" do
- title "css=.title"
- body "css=.body"
- event do |e|
- e.all "yeah"
- end
- end
-
- @crawler_instance.should_receive(:parse) do |arg|
- it = arg.iterators.first
- it.selector.should == "css=.element"
- it["title"].selector.should == "css=.title"
- it["body"].selector.should == "css=.body"
- it["event"]["all"].selector.should == "yeah"
- end
-
- @crawler_instance.crawl
- end
-
it 'should assign metadata format' do
@crawler_instance.should_receive(:parse) do |arg|
arg[:document_format].should == :xml
View
4 spec/property_spec.rb → spec/dsl/property_spec.rb
@@ -1,8 +1,8 @@
require 'spec_helper'
-describe Wombat::Property do
+describe Wombat::DSL::Property do
it 'should store property data' do
- property = Wombat::Property.new(
+ property = Wombat::DSL::Property.new(
name: "title",
selector: "/some/selector",
format: :html,
View
37 spec/integration/integration_spec.rb
@@ -15,7 +15,7 @@
s.twitter "css=.ctn-bar li.last"
end
- crawler.for_each "css=.ctn-links" do
+ crawler.links "css=.ctn-links", :iterator do
menu "css=a"
end
@@ -28,7 +28,7 @@
results = crawler_instance.crawl
results["search"].should == "Buscar"
- results["iterator0"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
+ results["links"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
results["subheader"].should == "Londres 2012"
results["social"]["twitter"].should == "Verão"
end
@@ -41,7 +41,7 @@
crawler.base_url "http://www.terra.com.br"
crawler.list_page '/portal'
- crawler.for_each "css=.ctn-links" do
+ crawler.links "css=.ctn-links", :iterator do
menu "css=a"
end
@@ -53,13 +53,13 @@
results = crawler_instance.crawl
end
- results["iterator0"].should == result_hash
+ results["links"].should == result_hash
VCR.use_cassette('basic_crawler_page') do
results = crawler_instance.crawl
end
- results["iterator0"].should == result_hash
+ results["links"].should == result_hash
end
it 'should crawl page through block to class instance crawl method' do
@@ -77,7 +77,7 @@
s.twitter "css=.ctn-bar li.last"
end
- for_each "css=.ctn-links" do
+ links "css=.ctn-links", :iterator do
menu "css=a"
end
@@ -87,7 +87,7 @@
end
results["search"].should == "Buscar"
- results["iterator0"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
+ results["links"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
results["subheader"].should == "Londres 2012"
results["social"]["twitter"].should == "Verão"
end
@@ -105,7 +105,7 @@
s.twitter "css=.ctn-bar li.last"
end
- for_each "css=.ctn-links" do
+ links "css=.ctn-links", :iterator do
menu "css=a"
end
@@ -115,7 +115,7 @@
end
results["search"].should == "Buscar"
- results["iterator0"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
+ results["links"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
results["subheader"].should == "Londres 2012"
results["social"]["twitter"].should == "Verão"
end
@@ -129,22 +129,22 @@
crawler.base_url "https://www.github.com"
crawler.list_page "/explore"
- crawler.for_each "css=ol.ranked-repositories li" do
+ crawler.repos "css=ol.ranked-repositories>li", :iterator do
project do |p|
p.repo 'css=h3'
- p.description('css=p.description') { |d| d.gsub(/for/, '') }
+ p.description('css=p.description') { |d| d ? d.gsub(/for/, '') : nil }
end
end
- crawler_instance = crawler.new
- results = crawler_instance.crawl
+ results = crawler.new.crawl
- results.should == { "iterator0" => [
+ results.should == { "repos" => [
{ "project" => { "repo" => "jairajs89 / Touchy.js", "description" => "A simple light-weight JavaScript library dealing with touch events" } },
{ "project" => { "repo" => "mcavage / node-restify", "description" => "node.js REST framework specifically meant web service APIs" } },
{ "project" => { "repo" => "notlion / streetview-stereographic", "description" => "Shader Toy + Google Map + Panoramic Explorer" } },
{ "project" => { "repo" => "twitter / bootstrap", "description" => "HTML, CSS, and JS toolkit from Twitter" } },
- { "project" => { "repo" => "stolksdorf / Parallaxjs", "description" => "a Library Javascript that allows easy page parallaxing" } }
+ { "project" => { "repo" => "stolksdorf / Parallaxjs", "description" => "a Library Javascript that allows easy page parallaxing" } },
+ { "project" => { "repo" => nil, "description" => nil}}
]}
end
end
@@ -160,14 +160,14 @@
crawler.artist "xpath=//title", :list
- crawler.for_each 'xpath=//event' do
+ crawler.location 'xpath=//event', :iterator do
latitude "xpath=./venue/location/geo:point/geo:lat", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
longitude "xpath=./venue/location/geo:point/geo:long", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
end
crawler_instance = crawler.new
results = crawler_instance.crawl
- iterator = results['iterator0']
+ iterator = results['location']
iterator.should == [
{"latitude"=>"37.807775", "longitude"=>"-122.272736"},
@@ -187,7 +187,6 @@
end
it 'should follow links' do
- pending('Not implemented yet.')
VCR.use_cassette('follow_links') do
crawler = Class.new
crawler.send(:include, Wombat::Crawler)
@@ -197,7 +196,7 @@
crawler.list_page "/"
crawler.github 'xpath=//ul[@class="footer_nav"][1]//a', :follow do
- heading 'css=h1'
+ #heading 'css=h1'
end
crawler_instance = crawler.new
View
52 spec/iterator_spec.rb
@@ -1,52 +0,0 @@
-require 'spec_helper'
-
-describe Wombat::Iterator do
- let(:it) { Wombat::Iterator.new "it_selector" }
-
- context 'parse' do
- it 'should iterate in for_each properties' do
- it.prop_1 "some_selector"
- it.prop_2 "another_selector"
-
- it['prop_1'].should_receive(:result).twice.and_return([])
- it['prop_2'].should_receive(:result).twice.and_return([])
-
- parser = double :parser
- parser.should_receive(:locate).with(it['prop_1']).twice
- parser.should_receive(:locate).with(it['prop_2']).twice
-
- it.parse { |p| parser.locate p }
- it.parse { |p| parser.locate p }
- end
-
- it 'should raise if no block given' do
- expect{
- it.parse
- }.to raise_error(ArgumentError)
- end
- end
-
- context 'reset' do
- it 'should clean up properties results' do
- it.prop_1 'some_selector'
- it['prop_1'].result = [1, 2]
- it.reset
- it['prop_1'].result.should be_nil
- end
- end
-
- it 'should flatten properties to plain hash format' do
- it.prop_1 "some_selector"
- it.prop_2 "another_selector"
-
- it.parse {|p| }
- it.parse {|p| }
- it['prop_1'].result = ['result 1', 'result 2']
- it['prop_2'].result = ['result 3', 'result 4']
-
- it.flatten.should == [
- { "prop_1" => "result 1", "prop_2" => "result 3" },
- { "prop_1" => "result 2", "prop_2" => "result 4" }
- ]
- end
-end
View
20 spec/metadata_spec.rb
@@ -1,20 +0,0 @@
-require 'spec_helper'
-
-describe Wombat::Metadata do
- before(:each) do
- @metadata = Wombat::Metadata.new
- end
-
- it 'should not include non-properties in all properties list' do
- @metadata.another_property "/some/selector", :text
- @metadata.base_url "felipecsl.com"
- @metadata.list_page "/yeah"
- @metadata.all_properties.should == [@metadata['another_property']]
- end
-
- it 'should store iterators' do
- @metadata.for_each("some_selector").kind_of?(Wombat::Iterator).should be_true
- @metadata.iterators.size.should == 1
- @metadata.iterators.first.selector.should == "some_selector"
- end
-end
View
18 spec/parser_spec.rb → spec/processing/parser_spec.rb
@@ -1,11 +1,11 @@
require 'spec_helper'
-describe Wombat::Parser do
+describe Wombat::Processing::Parser do
before(:each) do
crawler = Class.new
- crawler.send(:include, Wombat::Parser)
+ crawler.send(:include, Wombat::Processing::Parser)
@parser = crawler.new
- @metadata = Wombat::Metadata.new
+ @metadata = Wombat::DSL::Metadata.new
end
it 'should request page document with correct url' do
@@ -72,18 +72,6 @@
block_called.should be_true
end
- it 'should return hash with requested properties' do
- hash = double :results
- fake_parser = double :parser
- fake_document = double :document
-
- fake_document.should_receive(:parser).and_return fake_parser
- @parser.mechanize.stub(:get).and_return fake_document
- @metadata.should_receive(:flatten).and_return hash
-
- @parser.parse(@metadata).should == hash
- end
-
it 'should not include null results in iterated block' do
fake_parser = double :parser
fake_document = double :document
View
18 spec/property/locators/factory_spec.rb
@@ -0,0 +1,18 @@
+require 'spec_helper'
+
+describe Wombat::Property::Locators::Factory do
+ it 'should instantiate correct locator according to property type' do
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(format: :text), nil).should be_a(Wombat::Property::Locators::Text)
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(format: :html), nil).should be_a(Wombat::Property::Locators::Html)
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(format: :list), nil).should be_a(Wombat::Property::Locators::List)
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(format: :follow), nil).should be_a(Wombat::Property::Locators::Follow)
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(format: :iterator), nil).should be_a(Wombat::Property::Locators::Iterator)
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(format: :container), nil).should be_a(Wombat::Property::Locators::PropertyContainer)
+ end
+
+ it 'should raise correct exception if provided property is of unknown type' do
+ lambda {
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(format: :weird), nil)
+ }.should raise_error(Wombat::Property::Locators::UnknownTypeException, "Unknown property format weird.")
+ end
+end
View
4 spec/property/locators/follow_spec.rb
@@ -0,0 +1,4 @@
+require 'spec_helper'
+
+describe Wombat::Property::Locators::Follow do
+end
View
15 spec/property/locators/html_spec.rb
@@ -0,0 +1,15 @@
+require 'spec_helper'
+
+describe Wombat::Property::Locators::Html do
+ it 'should locate html property' do
+ fake_elem = double :element
+ context = double :context
+ fake_elem.stub inner_html: "Something cool "
+ context.stub(:xpath).with("/abc", nil).and_return [fake_elem]
+ property = Wombat::DSL::Property.new(name: 'data1', selector: 'xpath=/abc', format: :html)
+
+ locator = Wombat::Property::Locators::Html.new(property, context)
+
+ locator.locate.should == { "data1" => "Something cool" }
+ end
+end
View
4 spec/property/locators/iterator_spec.rb
@@ -0,0 +1,4 @@
+require 'spec_helper'
+
+describe Wombat::Property::Locators::Iterator do
+end
View
13 spec/property/locators/list_spec.rb
@@ -0,0 +1,13 @@
+require 'spec_helper'
+
+describe Wombat::Property::Locators::List do
+ it 'should locate a list of nodes' do
+ context = double :context
+ context.stub(:css).with(".selector").and_return %w(1 2 3 4 5)
+ property = Wombat::DSL::Property.new(name: 'data1', selector: 'css=.selector', format: :list)
+
+ locator = Wombat::Property::Locators::List.new(property, context)
+
+ locator.locate.should == { "data1" => %w(1 2 3 4 5) }
+ end
+end
View
49 spec/property/locators/text_spec.rb
@@ -0,0 +1,49 @@
+require 'spec_helper'
+
+describe Wombat::Property::Locators::Text do
+ it 'should locate text property with xpath selector and namespaces' do
+ fake_elem = double :element
+ context = double :context
+ fake_elem.stub inner_text: "Something cool "
+ context.stub(:xpath).with("/abc", 'boom').and_return [fake_elem]
+ property = Wombat::DSL::Property.new(name: 'data1', selector: 'xpath=/abc', namespaces: 'boom', format: :text)
+
+ locator = Wombat::Property::Locators::Text.new(property, context)
+
+ locator.locate.should == { "data1" => "Something cool" }
+ end
+
+ it 'should locate text property with css selector' do
+ fake_elem = double :element
+ context = double :context
+ fake_elem.stub inner_text: "My name"
+ context.stub(:css).with("/def").and_return [fake_elem]
+ property = Wombat::DSL::Property.new(name: 'data1', selector: 'css=/def', format: :text)
+
+ locator = Wombat::Property::Locators::Text.new(property, context)
+
+ locator.locate.should == { "data1" => "My name" }
+ end
+
+ it 'should return plain symbols as strings' do
+ fake_elem = double :element
+ context = double :context
+ property = Wombat::DSL::Property.new(name: 'data_2', selector: :hardcoded_value, format: :text)
+
+ locator = Wombat::Property::Locators::Text.new(property, context)
+
+ locator.locate.should == { "data_2" => "hardcoded_value" }
+ end
+
+ it 'should invoke property callback' do
+ fake_elem = double :element
+ context = double :context
+ fake_elem.stub inner_text: "My name"
+ context.stub(:css).with("/def").and_return [fake_elem]
+ property = Wombat::DSL::Property.new(name: 'data1', selector: 'css=/def', format: :text, callback: Proc.new { |s| s.gsub(/name/, 'ass') })
+
+ locator = Wombat::Property::Locators::Text.new(property, context)
+
+ locator.locate.should == { "data1" => "My ass" }
+ end
+end
View
62 spec/property_container_spec.rb
@@ -1,62 +0,0 @@
-require 'spec_helper'
-
-describe Wombat::PropertyContainer do
- before(:each) do
- @metadata = Wombat::PropertyContainer.new
- end
-
- it 'should return an array with all the metadata properties excluding iterators' do
- @metadata["event"] = Wombat::PropertyContainer.new
- @metadata["venue"] = Wombat::PropertyContainer.new
- @metadata.another_property "/some/selector", :text
- @metadata["event"]["something"] = Wombat::PropertyContainer.new
- @metadata["event"]["something"].else "Wohooo"
- @metadata["venue"].awesome "whooea"
- it = Wombat::Iterator.new "it_selector"
- it.felipe "lima"
- @metadata.iterators << it
-
- all_propes = @metadata.all_properties
-
- all_propes.should =~ [
- @metadata["another_property"],
- @metadata["event"]["something"]["else"],
- @metadata["venue"]["awesome"]
- ]
- end
-
- it 'should be able to change properties via all_properties' do
- @metadata.another_property "/some/selector", :text
- @metadata.all_properties.first.selector = "abc"
- @metadata["another_property"].selector.should == "abc"
- end
-
- it 'should return metadata in plain hash format including iterators' do
- @metadata.title "/some/selector"
- @metadata["title"].result = "Gogobot Inc."
- @metadata["holder"] = Wombat::PropertyContainer.new
- @metadata["holder"].heading "css=.heading"
- @metadata["holder"]["heading"].result = 123456
- @metadata["holder"]["subheader"] = Wombat::PropertyContainer.new
- @metadata["holder"]["subheader"].section "/blah"
- @metadata["holder"]["subheader"]["section"].result = "Lorem Ipsum"
- it = Wombat::Iterator.new "it_selector"
- it.felipe "lima"
- it["felipe"].result = ["correa", "de souza", "lima"]
- @metadata.iterators = [it]
- @metadata.footer("another thing", :html) { |a| true }
- @metadata["footer"].result = "bla bla bla"
-
- @metadata.flatten.should == {
- "title" => "Gogobot Inc.",
- "holder" => {
- "heading" => 123456,
- "subheader" => {
- "section" => "Lorem Ipsum"
- }
- },
- "iterator0"=>[{"felipe"=>"correa"}, {"felipe"=>"de souza"}, {"felipe"=>"lima"}],
- "footer" => "bla bla bla"
- }
- end
-end
View
75 spec/property_locator_spec.rb
@@ -1,75 +0,0 @@
-require 'spec_helper'
-
-describe Wombat::PropertyLocator do
- before(:each) do
- @locator = Class.new
- @locator.send(:include, Wombat::PropertyLocator)
- @locator_instance = @locator.new
- @metadata = Wombat::Metadata.new
- @metadata["event"] = Wombat::PropertyContainer.new
- @metadata["venue"] = Wombat::PropertyContainer.new
- @metadata["location"] = Wombat::PropertyContainer.new
- end
-
- it 'should locate metadata properties' do
- context = double :context
- abc = double :abc
-
- abc.stub(:inner_text).and_return("Something cool")
-
- context.stub(:xpath).with("/abc", nil).and_return([abc])
- context.stub(:xpath).with("/bah", nil).and_return(["abc"])
- context.stub(:css).with("/ghi").and_return(["Another stuff"])
-
- @metadata["event"].data1 "xpath=/abc"
- @metadata["venue"].data2 :farms
- @metadata["location"].data3 "css=/ghi"
- @metadata.blah "xpath=/bah"
-
- @locator_instance.stub(:context).and_return context
-
- @metadata.all_properties.each { |p| p.result = @locator_instance.locate p }
-
- @metadata["blah"].result.should == "abc"
- @metadata["event"]["data1"].result.should == "Something cool"
- @metadata["venue"]["data2"].result.should == "farms"
- @metadata["location"]["data3"].result.should == "Another stuff"
- end
-
- it 'should support properties with html format' do
- context = double :context
- html_info = double :html_info
-
- html_info.should_receive(:inner_html).and_return("some another info ")
- context.should_receive(:xpath).with("/anotherData", nil).and_return([html_info])
-
- @locator_instance.stub(:context).and_return context
-
- @metadata["event"].another_info "xpath=/anotherData", :html
-
- @metadata.all_properties.each { |p| p.result = @locator_instance.locate p }
-
- @metadata["event"]["another_info"].result.should == "some another info"
- end
-
- it 'should trim property contents and use namespaces if present' do
- context = double :context
- context.should_receive(:xpath).with("/event/some/description", "blah").and_return([" awesome event "])
-
- @locator_instance.stub(:context).and_return context
- @metadata["event"].description "xpath=/event/some/description", :text, "blah"
-
- @metadata.all_properties.each { |p| p.result = @locator_instance.locate p }
-
- @metadata["event"]["description"].result.should == "awesome event"
- end
-
- it 'should return array of matching nodes for list properties' do
- context = double :context
- @metadata.list_prop "css=.selector", :list
- @locator_instance.stub(:context).and_return context
- @locator_instance.should_receive(:select_nodes).with("css=.selector", nil).and_return %w(1 2 3 4 5)
-
- @locator_instance.locate(@metadata["list_prop"]).should == %w(1 2 3 4 5)
- end
-end

No commit comments for this range

Something went wrong with that request. Please try again.