Skip to content

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks.
...
  • 17 commits
  • 34 files changed
  • 0 commit comments
  • 1 contributor
View
21 README.md
@@ -19,29 +19,25 @@ Obs: Requires ruby 1.9
The simplest way to use Wombat is by calling ``Wombat.crawl`` and passing it a block:
```ruby
-
-# => github_scraper.rb
-
-#coding: utf-8
require 'wombat'
Wombat.crawl do
base_url "http://www.github.com"
- list_page "/"
+ path "/"
headline "xpath=//h1"
-
what_is "css=.column.secondary p", :html
+ repositories "css=a.repo", :list
explore "xpath=//ul/li[2]/a" do |e|
e.gsub(/Explore/, "LOVE")
end
- benefits do |b|
- b.first_benefit "css=.column.leftmost h3"
- b.second_benefir "css=.column.leftmid h3"
- b.third_benefit "css=.column.rightmid h3"
- b.fourth_benefit "css=.column.rightmost h3"
+ benefits do
+ first_benefit "css=.column.leftmost h3"
+ second_benefit "css=.column.leftmid h3"
+ third_benefit "css=.column.rightmid h3"
+ fourth_benefit "css=.column.rightmost h3"
end
end
```
@@ -53,6 +49,7 @@ end
"headline" => "1,316,633 people hosting over 3,951,378 git repositories",
"what_is" => "GitHub is the best way to collaborate with others. Fork, send pull requests and manage all your <strong>public</strong> and <strong>private</strong> git repositories.",
"explore" => "LOVE GitHub",
+ "repositories" => ["jQuery", "reddit", "Sparkle", "curl", "Ruby on Rails", "node.js", "ClickToFlash", "Erlang/OTP", "CakePHP", "Redis"],
"benefits" => {
"first_benefit" => "Team management",
"second_benefit" => "Code review",
@@ -63,7 +60,7 @@ end
```
### This is just a sneak peek of what Wombat can do. For the complete documentation, please check the [project Wiki](http://github.com/felipecsl/wombat/wiki).
-### [API Documentation](http://rubydoc.info/gems/wombat/1.0.0/frames)
+### [API Documentation](http://rubydoc.info/gems/wombat/2.0.0/frames)
### [Changelog](https://github.com/felipecsl/wombat/wiki/Changelog)
View
2 VERSION
@@ -1 +1 @@
-1.0.0
+2.0.0
View
46 examples/iterator.rb
@@ -0,0 +1,46 @@
+#coding: utf-8
+require 'wombat'
+
+class IteratorCrawler
+ include Wombat::Crawler
+
+ base_url "https://www.github.com"
+ path "/explore"
+
+ repos "css=ol.ranked-repositories>li", :iterator do
+ repo 'css=h3'
+ description 'css=p.description'
+ end
+end
+
+=begin
+p IteratorCrawler.new.crawl
+{"repos"=>
+ [
+ {
+ "repo"=>"bernii / gauge.js",
+ "description"=>"100% native and cool looking JavaScript gauge"
+ },
+ {
+ "repo"=>"ZeitOnline / briefkasten",
+ "description"=>"a reasonably secure web application for submitting content anonymously"
+ },
+ {
+ "repo"=>"nothingmagical / cheddar-ios",
+ "description"=>"Cheddar for iOS"
+ },
+ {
+ "repo"=>"nathanmarz / storm-mesos",
+ "description"=>"Run Storm on top of the Mesos cluster resource manager"
+ },
+ {
+ "repo"=>"Netflix / SimianArmy",
+ "description"=>"Tools for keeping your cloud operating in top form. Chaos Monkey is a resiliency tool that helps ..."
+ },
+ {
+ "repo"=>nil,
+ "description"=>nil
+ }
+ ]
+}
+=end
View
44 examples/list.rb
@@ -0,0 +1,44 @@
+#coding: utf-8
+require 'wombat'
+
+class ListCrawler
+ include Wombat::Crawler
+
+ base_url "http://www.rubygems.org"
+ path "/"
+
+ gems do
+ new "css=#new_gems li", :list
+ most_downloaded "css=#most_downloaded li", :list
+ just_updated "css=#just_updated li", :list
+ end
+end
+
+=begin
+pp ListCrawler.new.crawl
+{
+ "gems"=>{
+ "new"=>[
+ "buffer (0.0.1)",
+ "resque-telework (0.2.0)",
+ "my_string_extend_lyk (0.0.1)",
+ "specr (0.0.1)",
+ "array-frequency (1.0.0)"
+ ],
+ "most_downloaded"=> [
+ "rake-0.9.2.2 (7,128)",
+ "mime-types-1.19 (5,331)",
+ "tilt-1.3.3 (5,146)",
+ "rack-1.4.1 (5,124)",
+ "multi_json-1.3.6 (5,093)"
+ ],
+ "just_updated"=>[
+ "wombat (2.0.0)",
+ "pdf-reader-turtletext (0.2.1)",
+ "minitest-reporters (0.10.0)",
+ "cloudprint (0.1.3)",
+ "greenletters (0.2.0)"
+ ]
+ }
+}
+=end
View
41 examples/no_class.rb
@@ -0,0 +1,41 @@
+#coding: utf-8
+require 'wombat'
+
+data = Wombat.crawl do
+ base_url "http://www.github.com"
+ path "/"
+
+ headline "xpath=//h1"
+ what_is "css=.column.secondary p", :html
+
+ explore "xpath=//ul/li[2]/a" do |e|
+ e.gsub(/Explore/, "LOVE")
+ end
+
+ benefits do
+ team_mgmt "css=.column.leftmost h3"
+ code_review "css=.column.leftmid h3"
+ hosting "css=.column.rightmid h3"
+ collaboration "css=.column.rightmost h3"
+
+ links do
+ team_mgmt "xpath=//div[@class='column leftmost']//a/@href"
+ end
+ end
+end
+
+=begin
+pp data
+{
+ "headline"=>"1,900,094\n people hosting over\n 3,371,168\n repositories",
+ "what_is"=>"GitHub is the best way to collaborate with others. Fork, send pull requests and manage all your <strong>public</strong> and <strong>private</strong> git repositories.",
+ "explore"=>"LOVE GitHub",
+ "benefits"=> {
+ "team_mgmt"=>"Team management",
+ "code_review"=>"Code review",
+ "hosting"=>"Reliable code hosting",
+ "collaboration"=>"Open source collaboration",
+ "links"=>{"team_mgmt"=>"/features/projects/collaboration"}
+ }
+}
+=end
View
38 examples/xml.rb
@@ -0,0 +1,38 @@
+#coding: utf-8
+require 'wombat'
+
+class XmlCrawler
+ include Wombat::Crawler
+
+ base_url "http://ws.audioscrobbler.com"
+ path "/2.0/?method=geo.getevents&location=#{URI.escape('San Francisco')}&api_key=ENV['API_KEY']"
+
+ document_format :xml
+
+ title "xpath=//event/title"
+
+ locations 'xpath=//event', :iterator do
+ latitude "xpath=./venue/location/geo:point/geo:lat", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
+ longitude "xpath=./venue/location/geo:point/geo:long", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
+ end
+end
+
+=begin
+pp XmlCrawler.new.crawl
+
+{
+ "title"=>"Sinéad O'Connor",
+ "locations"=>[
+ {"latitude"=>"37.807717", "longitude"=>"-122.270059"},
+ {"latitude"=>"37.76213", "longitude"=>"-122.419032"},
+ {"latitude"=>"37.771491", "longitude"=>"-122.413241"},
+ {"latitude"=>"37.776227", "longitude"=>"-122.42044"},
+ {"latitude"=>"37.766588", "longitude"=>"-122.430391"},
+ {"latitude"=>"37.788978", "longitude"=>"-122.40664"},
+ {"latitude"=>"37.769715", "longitude"=>"-122.420427"},
+ {"latitude"=>"37.78832", "longitude"=>"-122.446692"},
+ {"latitude"=>"37.787583", "longitude"=>"-122.421665"},
+ {"latitude"=>"37.776227", "longitude"=>"-122.42044"}
+ ]
+}
+=end
View
2,186 fixtures/vcr_cassettes/follow_links.yml
2,041 additions, 145 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
View
15 lib/wombat/dsl/follower.rb
@@ -1,17 +1,18 @@
module Wombat
module DSL
- class Follower < PropertyContainer
- attr_accessor :name, :selector
+ class Follower < PropertyGroup
+ attr_accessor :wombat_property_selector
def initialize(name, selector)
- @name = name
- @selector = selector
+ @wombat_property_selector = selector
- # Explicitly send 0 arguments to superclass constructor
- super()
+ super(name)
end
- def parse(context)
+ # So that Property::Locators::Factory can identify this class
+ # as a follow property.
+ def wombat_property_format
+ :follow
end
end
end
View
11 lib/wombat/dsl/iterator.rb
@@ -1,20 +1,17 @@
-require 'wombat/processing/node_selector'
-
module Wombat
module DSL
- class Iterator < PropertyContainer
- attr_accessor :name, :selector
+ class Iterator < PropertyGroup
+ attr_accessor :wombat_property_selector
def initialize(name, selector)
- @selector = selector
+ @wombat_property_selector = selector
- # Explicitly send 0 arguments to superclass constructor
super(name)
end
# So that Property::Locators::Iterator can identify this class
# as an iterator property.
- def format
+ def wombat_property_format
:iterator
end
end
View
9 lib/wombat/dsl/metadata.rb
@@ -1,10 +1,11 @@
#coding: utf-8
-require 'wombat/dsl/property_container'
+require 'wombat/dsl/property_group'
require 'wombat/dsl/iterator'
+require 'wombat/dsl/follower'
module Wombat
module DSL
- class Metadata < PropertyContainer
+ class Metadata < PropertyGroup
def initialize
self[:document_format] = :html
super
@@ -14,8 +15,8 @@ def base_url(url)
self[:base_url] = url
end
- def list_page(url)
- self[:list_page] = url
+ def path(url)
+ self[:path] = url
end
def document_format(format)
View
29 lib/wombat/dsl/property.rb
@@ -1,17 +1,26 @@
module Wombat
module DSL
class Property
- attr_accessor :name, :selector, :format, :namespaces, :callback
+ attr_accessor :wombat_property_name, :wombat_property_selector, :wombat_property_format, :wombat_property_namespaces, :callback
- # TODO: This class should receive method_name, args and block
- # and do the assignment of properties itself, instead of receiving
- # an options hash.
- def initialize(options)
- @name = options[:name]
- @selector = options[:selector]
- @format = options[:format] || :text
- @namespaces = options[:namespaces]
- @callback = options[:callback]
+ def initialize(name, *args, &block)
+ @wombat_property_name = name
+ @wombat_property_selector = args[0]
+ @wombat_property_format = args[1] || :text
+ @wombat_property_namespaces = args[2]
+ @callback = block
+ end
+
+ def selector
+ @wombat_property_selector
+ end
+
+ def namespaces
+ @wombat_property_namespaces
+ end
+
+ def format
+ @wombat_property_format
end
end
end
View
50 lib/wombat/dsl/property_container.rb
@@ -1,50 +0,0 @@
-#coding: utf-8
-
-module Wombat
- module DSL
- class PropertyContainer < Hash
- attr_accessor :name
-
- def initialize(name = nil)
- @name = name
- end
-
- def method_missing(method, *args, &block)
- property_name = method.to_s
-
- if args.empty? && block
- self[property_name] = PropertyContainer.new(property_name) unless self[property_name]
- block.call self[property_name]
- else
- unless args[1] == :iterator
- self[property_name] = Property.new(
- name: property_name,
- selector: args.first,
- format: args[1],
- namespaces: args[2],
- callback: block)
- else
- it = Iterator.new(property_name, args.first)
- self[property_name] = it
- it.instance_eval(&block) if block
- end
- end
- end
-
- def to_ary
- end
-
- # So that Property::Locators::Iterator can identify this class
- # as an iterator property.
- # TODO: Called by NodeSelector. Fix this
- def format
- :container
- end
-
- def namespaces
- # TODO: Called by NodeSelector. Fix this
- nil
- end
- end
- end
-end
View
48 lib/wombat/dsl/property_group.rb
@@ -0,0 +1,48 @@
+#coding: utf-8
+
+module Wombat
+ module DSL
+ class PropertyGroup < Hash
+ attr_accessor :wombat_property_name
+
+ def initialize(name = nil)
+ @wombat_property_name = name
+ end
+
+ def method_missing(method, *args, &block)
+ property_name = method.to_s
+
+ if args.empty? && block
+ # TODO: Verify if another property with same name already exists
+ # before overwriting
+ property_group = self[property_name] || PropertyGroup.new(property_name)
+ self[property_name] = property_group
+ property_group.instance_eval(&block)
+ else
+ if args[1] == :iterator
+ it = Iterator.new(property_name, args.first)
+ self[property_name] = it
+ it.instance_eval(&block) if block
+ elsif args[1] == :follow
+ it = Follower.new(property_name, args.first)
+ self[property_name] = it
+ it.instance_eval(&block) if block
+ else
+ self[property_name] = Property.new(property_name, *args, &block)
+ end
+ end
+ end
+
+ def to_ary
+ end
+
+ def wombat_property_format
+ :container
+ end
+
+ def wombat_property_namespaces
+ nil
+ end
+ end
+ end
+end
View
5 lib/wombat/processing/parser.rb
@@ -1,5 +1,6 @@
#coding: utf-8
require 'wombat/property/locators/factory'
+require 'wombat/processing/node_selector'
require 'mechanize'
require 'restclient'
@@ -15,12 +16,12 @@ def initialize
def parse(metadata)
@context = parser_for metadata
- Wombat::Property::Locators::Factory.locator_for(metadata, @context).locate
+ Wombat::Property::Locators::Factory.locator_for(metadata).locate(@context, @mechanize)
end
private
def parser_for(metadata)
- url = "#{metadata[:base_url]}#{metadata[:list_page]}"
+ url = "#{metadata[:base_url]}#{metadata[:path]}"
page = nil
parser = nil
begin
View
17 lib/wombat/property/locators/base.rb
@@ -8,21 +8,24 @@ module Locators
class Base
include Wombat::Processing::NodeSelector
- def initialize(property, context)
+ def initialize(property)
@property = property
- @context = context
end
- def locate
+ def locate(context, page = nil)
+ @context = context
+
raw_data = yield if block_given?
data = @property.respond_to?(:callback) && @property.callback ? @property.callback.call(raw_data) : raw_data
-
- @property.name ? { @property.name => data } : data
+
+ @property.wombat_property_name ? { @property.wombat_property_name => data } : data
end
protected
- def locate_nodes
- select_nodes @property.selector, @property.namespaces
+ def locate_nodes(context)
+ @context = context
+
+ select_nodes @property.wombat_property_selector, @property.wombat_property_namespaces
end
end
end
View
10 lib/wombat/property/locators/factory.rb
@@ -3,7 +3,7 @@
require 'wombat/property/locators/follow'
require 'wombat/property/locators/html'
require 'wombat/property/locators/iterator'
-require 'wombat/property/locators/property_container'
+require 'wombat/property/locators/property_group'
require 'wombat/property/locators/list'
require 'wombat/property/locators/text'
@@ -13,8 +13,8 @@ module Wombat
module Property
module Locators
module Factory
- def self.locator_for(property, context)
- klass = case(property.format)
+ def self.locator_for(property)
+ klass = case(property.wombat_property_format)
when :text
Text
when :list
@@ -24,14 +24,14 @@ def self.locator_for(property, context)
when :iterator
Iterator
when :container
- PropertyContainer
+ PropertyGroup
when :follow
Follow
else
raise Wombat::Property::Locators::UnknownTypeException.new("Unknown property format #{property.format}.")
end
- klass.new(property, context)
+ klass.new(property)
end
end
end
View
15 lib/wombat/property/locators/follow.rb
@@ -4,7 +4,20 @@ module Wombat
module Property
module Locators
class Follow < Base
- def locate
+ def locate(context, page = nil)
+ super do
+ locate_nodes(context).flat_map do |node|
+ target_page = page.click node
+ context = target_page.parser
+
+ Hash.new.tap do |h|
+ @property.values
+ .select { |v| v.is_a?(Wombat::DSL::Property) || v.is_a?(Wombat::DSL::PropertyGroup) }
+ .map { |p| Factory.locator_for(p).locate(context, page) }
+ .map { |p| h.merge! p }
+ end
+ end
+ end
end
end
end
View
4 lib/wombat/property/locators/html.rb
@@ -4,8 +4,8 @@ module Wombat
module Property
module Locators
class Html < Base
- def locate
- node = locate_nodes.first
+ def locate(context, page = nil)
+ node = locate_nodes(context).first
super { node.inner_html.strip }
end
end
View
10 lib/wombat/property/locators/iterator.rb
@@ -1,17 +1,17 @@
#coding: utf-8
-require 'wombat/property/locators/property_container'
+require 'wombat/property/locators/property_group'
module Wombat
module Property
module Locators
class Iterator < Base
- def locate
+ def locate(contex, page = nil)
super do
- locate_nodes.flat_map do |node|
+ locate_nodes(contex).flat_map do |node|
Hash.new.tap do |h|
@property.values
- .select { |v| v.is_a?(Wombat::DSL::Property) || v.is_a?(Wombat::DSL::PropertyContainer) }
- .map { |p| Factory.locator_for(p, node).locate }
+ .select { |v| v.is_a?(Wombat::DSL::Property) || v.is_a?(Wombat::DSL::PropertyGroup) }
+ .map { |p| Factory.locator_for(p).locate(node, page) }
.map { |p| h.merge! p }
end
end
View
4 lib/wombat/property/locators/list.rb
@@ -4,9 +4,9 @@ module Wombat
module Property
module Locators
class List < Base
- def locate
+ def locate(context, page = nil)
super do
- locate_nodes.map do |n|
+ locate_nodes(context).map do |n|
n.is_a?(String) ? n.strip : n.inner_text.strip
end
end
View
8 ...t/property/locators/property_container.rb → ...ombat/property/locators/property_group.rb
@@ -3,13 +3,13 @@
module Wombat
module Property
module Locators
- class PropertyContainer < Base
- def locate
+ class PropertyGroup < Base
+ def locate(context, page = nil)
super do
Hash.new.tap do |h|
@property.values
- .select { |v| v.is_a?(Wombat::DSL::Property) || v.is_a?(Wombat::DSL::PropertyContainer) }
- .map { |p| Factory.locator_for(p, @context).locate }
+ .select { |v| v.is_a?(Wombat::DSL::Property) || v.is_a?(Wombat::DSL::PropertyGroup) }
+ .map { |p| Factory.locator_for(p).locate(context, page) }
.map { |p| h.merge! p }
end
end
View
4 lib/wombat/property/locators/text.rb
@@ -4,8 +4,8 @@ module Wombat
module Property
module Locators
class Text < Base
- def locate
- node = locate_nodes.first
+ def locate(context, page = nil)
+ node = locate_nodes(context).first
value =
unless node
View
42 spec/crawler_spec.rb
@@ -17,8 +17,8 @@
end
it 'should provide metadata to yielded block' do
- @crawler.event do |e|
- e.should_not be_nil
+ @crawler.event do
+ self.class.should == Wombat::DSL::PropertyGroup
end
end
@@ -65,7 +65,7 @@
@crawler_instance.should_receive(:parse) do |arg|
prop = arg['some_data']
- prop.name.should == "some_data"
+ prop.wombat_property_name.should == "some_data"
prop.selector.should == "/event/list"
prop.format.should == :html
prop.namespaces.should == "geo"
@@ -76,12 +76,12 @@
end
it 'should be able to specify arbitrary block structure more than once' do
- @crawler.structure do |s|
- s.data "xpath=/xyz"
+ @crawler.structure do
+ data "xpath=/xyz"
end
- @crawler.structure do |s|
- s.another "css=.information"
+ @crawler.structure do
+ another "css=.information"
end
@crawler_instance.should_receive(:parse) do |arg|
@@ -106,54 +106,54 @@
it 'should crawl with block' do
@crawler.base_url "danielnc.com"
- @crawler.list_page "/itens"
+ @crawler.path "/itens"
@crawler_instance.should_receive(:parse) do |arg|
arg[:base_url].should == "danielnc.com"
- arg[:list_page].should == "/itens/1"
+ arg[:path].should == "/itens/1"
end
@crawler_instance.crawl do
- list_page "/itens/1"
+ path "/itens/1"
end
another_instance = @crawler.new
another_instance.should_receive(:parse) do |arg|
arg[:base_url].should == "danielnc.com"
- arg[:list_page].should == "/itens"
+ arg[:path].should == "/itens"
end
another_instance.crawl
end
it 'should remove created method missing' do
@crawler.base_url "danielnc.com"
- @crawler.list_page "/itens"
+ @crawler.path "/itens"
@crawler_instance.should_receive(:parse) do |arg|
arg[:base_url].should == "danielnc.com"
- arg[:list_page].should == "/itens/1"
+ arg[:path].should == "/itens/1"
end
@crawler_instance.crawl do
- list_page "/itens/1"
+ path "/itens/1"
end
lambda { @craler_intance.undefined_method }.should raise_error(NoMethodError)
end
it 'should remove created instance variable' do
@crawler.base_url "danielnc.com"
- @crawler.list_page "/itens"
+ @crawler.path "/itens"
@crawler_instance.should_receive(:parse) do |arg|
arg[:base_url].should == "danielnc.com"
- arg[:list_page].should == "/itens/1"
+ arg[:path].should == "/itens/1"
end
@crawler_instance.crawl do
- list_page "/itens/1"
+ path "/itens/1"
end
@crawler_instance.instance_variables.index(:@metadata_dup).should be_nil
@@ -164,7 +164,7 @@
VCR.use_cassette('basic_crawler_page') do
@crawler.base_url "http://www.terra.com.br"
- @crawler.list_page '/portal'
+ @crawler.path '/portal'
@crawler.search "css=.btn-search"
@@ -177,7 +177,7 @@
VCR.use_cassette('basic_crawler_page') do
@crawler.base_url "http://www.terra.com.br"
- @crawler.list_page '/portal'
+ @crawler.path '/portal'
@crawler.search "css=.btn-search"
@crawler.document_format :xml
@@ -191,7 +191,7 @@
VCR.use_cassette('error_page') do
@crawler.base_url "http://www.terra.com.br"
- @crawler.list_page '/portal'
+ @crawler.path '/portal'
@crawler.search "css=.btn-search"
@@ -204,7 +204,7 @@
VCR.use_cassette('error_page') do
@crawler.base_url "http://www.terra.com.br"
- @crawler.list_page '/portal'
+ @crawler.path '/portal'
@crawler.search "css=.btn-search"
@crawler.document_format :xml
View
10 spec/dsl/property_spec.rb
@@ -2,15 +2,11 @@
describe Wombat::DSL::Property do
it 'should store property data' do
- property = Wombat::DSL::Property.new(
- name: "title",
- selector: "/some/selector",
- format: :html,
- callback: lambda {})
+ property = Wombat::DSL::Property.new("title", *["/some/selector", :html]) { false }
- property.name.should == "title"
+ property.wombat_property_name.should == "title"
property.selector.should == "/some/selector"
property.format.should == :html
- property.callback.should == lambda {}
+ property.callback.should == lambda { false }
end
end
View
17 spec/helpers/sample_crawler.rb
@@ -5,9 +5,9 @@ class SampleCrawler
include Wombat::Crawler
base_url "http://www.obaoba.com.br"
- list_page "/porto-alegre/agenda"
+ path "/porto-alegre/agenda"
- for_each "css=div.title-agenda" do
+ event_group "css=div.title-agenda", :iterator do
event do |e|
e.title("xpath=.") { |t| t.split(" | ")[1].strip }
e.date "xpath=//div[@class='scrollable-items']/div[@class='s-item active']//a" do |d|
@@ -19,18 +19,5 @@ class SampleCrawler
venue do |v|
v.name("xpath=.") { |n| name.split(" | ")[2].strip }
end
-
- # follow_links "xpath=.//a[1]/@href" do
- # event { |e| e.description "css=#main-node-content", :html }
- # venue do |v|
- # v.phone "css=span.tel .value"
- # v.image "xpath=//div[@id='article-image']/div/img/@src"
- # end
-
- # location do |l|
- # l.city "css=span.locality"
- # l.street("css=span.street-address") { |s| s.gsub(/\n/, '').gsub(/ /, '') }
- # end
- # end
end
end
View
57 spec/integration/integration_spec.rb
@@ -8,17 +8,15 @@
crawler.send(:include, Wombat::Crawler)
crawler.base_url "http://www.terra.com.br"
- crawler.list_page '/portal'
+ crawler.path '/portal'
crawler.search "css=.btn-search"
- crawler.social do |s|
- s.twitter "css=.ctn-bar li.last"
+ crawler.social do
+ twitter "css=.ctn-bar li.last"
end
-
crawler.links "css=.ctn-links", :iterator do
menu "css=a"
end
-
crawler.subheader "css=h2.ttl-dynamic" do |h|
h.gsub("London", "Londres")
end
@@ -39,7 +37,7 @@
crawler.send(:include, Wombat::Crawler)
crawler.base_url "http://www.terra.com.br"
- crawler.list_page '/portal'
+ crawler.path '/portal'
crawler.links "css=.ctn-links", :iterator do
menu "css=a"
@@ -69,12 +67,12 @@
crawler_instance = crawler.new
results = crawler_instance.crawl do
base_url "http://www.terra.com.br"
- list_page '/portal'
+ path '/portal'
search "css=.btn-search"
- social do |s|
- s.twitter "css=.ctn-bar li.last"
+ social do
+ twitter "css=.ctn-bar li.last"
end
links "css=.ctn-links", :iterator do
@@ -97,12 +95,12 @@
VCR.use_cassette('basic_crawler_page') do
results = Wombat.crawl do
base_url "http://www.terra.com.br"
- list_page '/portal'
+ path '/portal'
search "css=.btn-search"
- social do |s|
- s.twitter "css=.ctn-bar li.last"
+ social do
+ twitter "css=.ctn-bar li.last"
end
links "css=.ctn-links", :iterator do
@@ -127,12 +125,12 @@
crawler.send(:include, Wombat::Crawler)
crawler.base_url "https://www.github.com"
- crawler.list_page "/explore"
+ crawler.path "/explore"
crawler.repos "css=ol.ranked-repositories>li", :iterator do
- project do |p|
- p.repo 'css=h3'
- p.description('css=p.description') { |d| d ? d.gsub(/for/, '') : nil }
+ project do
+ repo 'css=h3'
+ description('css=p.description') { |d| d ? d.gsub(/for/, '') : nil }
end
end
@@ -156,7 +154,7 @@
crawler.document_format :xml
crawler.base_url "http://ws.audioscrobbler.com"
- crawler.list_page "/2.0/?method=geo.getevents&location=#{URI.escape('San Francisco')}&api_key=060decb474b73437d5bbec37f527ae7b"
+ crawler.path "/2.0/?method=geo.getevents&location=#{URI.escape('San Francisco')}&api_key=060decb474b73437d5bbec37f527ae7b"
crawler.artist "xpath=//title", :list
@@ -191,26 +189,27 @@
crawler = Class.new
crawler.send(:include, Wombat::Crawler)
- crawler.document_format :html
crawler.base_url "https://www.github.com"
- crawler.list_page "/"
+ crawler.path "/"
crawler.github 'xpath=//ul[@class="footer_nav"][1]//a', :follow do
- #heading 'css=h1'
+ heading 'css=h1'
end
crawler_instance = crawler.new
results = crawler_instance.crawl
- results.should == { "github" => [
- { "heading"=>"GitHub helps people build software together."},
- { "heading"=>""},
- { "heading"=>"Features"},
- { "heading"=>"Contact GitHub"},
- { "heading"=>"GitHub Training — Git Training from the Experts"},
- { "heading"=>"GitHub on Your Servers"},
- { "heading"=>"Battle station fully operational"}
- ]}
+ results.should == {
+ "github" => [
+ { "heading"=>"GitHub helps people build software together." },
+ { "heading"=>nil },
+ { "heading"=>"Features" },
+ { "heading"=>"Contact GitHub" },
+ { "heading"=>"GitHub Training — Git Training from the Experts" },
+ { "heading"=>"GitHub on Your Servers" },
+ { "heading"=>"Loading..." }
+ ]
+ }
end
end
end
View
77 spec/processing/parser_spec.rb
@@ -10,7 +10,7 @@
it 'should request page document with correct url' do
@metadata.base_url "http://www.google.com"
- @metadata.list_page "/search"
+ @metadata.path "/search"
fake_document = double :document
fake_parser = double :parser
fake_document.should_receive(:parser).and_return(fake_parser)
@@ -19,81 +19,6 @@
@parser.parse @metadata
end
- it 'should send correct data to locate method' do
- fake_document = double :document
- fake_parser = double :parser
- fake_document.should_receive(:parser).and_return(fake_parser)
- @parser.mechanize.stub(:get).and_return fake_document
- @parser.should_not_receive :locate
- @parser.parse @metadata
- end
-
- it 'should invoke metadata callbacks' do
- fake_document = double :document
- fake_parser = double :parser
- property = double :property
- block_called = false
- block = lambda { |p| block_called = true }
-
- property.stub(:result)
- fake_document.should_receive(:parser).and_return(fake_parser)
- property.should_receive(:callback).twice.and_return(block)
- property.should_receive(:result=).with(true)
-
- @parser.mechanize.stub(:get).and_return fake_document
- @metadata.stub(:all_properties).and_return [property]
- @parser.should_receive(:locate).with(property)
-
- @parser.parse @metadata
-
- block_called.should be_true
- end
-
- it 'should invoke callback with parsed data' do
- fake_document = double :document
- fake_parser = double :parser
- property = double :property
- block_called = false
- block = lambda { |p|
- block_called = true
- p.should == "blah"
- }
-
- fake_document.should_receive(:parser).and_return(fake_parser)
- property.should_receive(:callback).twice.and_return(block)
- property.should_receive(:result=).with(true)
-
- @parser.mechanize.stub(:get).and_return fake_document
- @metadata.stub(:all_properties).and_return [property]
- @parser.should_receive(:locate).with(property).and_return("blah")
-
- @parser.parse @metadata
-
- block_called.should be_true
- end
-
- it 'should not include null results in iterated block' do
- fake_parser = double :parser
- fake_document = double :document
- c1 = double :context
- c2 = double :context
- it = Wombat::Iterator.new "it_selector"
- it.prop_1 "some_selector"
-
- @metadata.should_receive(:iterators).and_return [it]
- @metadata.should_receive(:flatten)
- fake_document.should_receive(:parser).and_return(fake_parser)
- @parser.mechanize.stub(:get).and_return fake_document
- @parser.should_receive(:select_nodes).with("it_selector").and_return [c1, c2]
- @parser.should_receive(:locate).with(it['prop_1']).and_return(12)
- @parser.should_receive(:locate).with(it['prop_1']).and_return(nil)
- @parser.stub(:locate)
-
- @parser.parse(@metadata)
-
- it["prop_1"].result.should == [12]
- end
-
it 'should correctly parse xml documents' do
fake_document = double :xml
fake_parser = double :parser
View
14 spec/property/locators/factory_spec.rb
@@ -2,17 +2,17 @@
describe Wombat::Property::Locators::Factory do
it 'should instantiate correct locator according to property type' do
- Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(format: :text), nil).should be_a(Wombat::Property::Locators::Text)
- Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(format: :html), nil).should be_a(Wombat::Property::Locators::Html)
- Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(format: :list), nil).should be_a(Wombat::Property::Locators::List)
- Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(format: :follow), nil).should be_a(Wombat::Property::Locators::Follow)
- Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(format: :iterator), nil).should be_a(Wombat::Property::Locators::Iterator)
- Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(format: :container), nil).should be_a(Wombat::Property::Locators::PropertyContainer)
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :text)).should be_a(Wombat::Property::Locators::Text)
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :html)).should be_a(Wombat::Property::Locators::Html)
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :list)).should be_a(Wombat::Property::Locators::List)
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :follow)).should be_a(Wombat::Property::Locators::Follow)
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :iterator)).should be_a(Wombat::Property::Locators::Iterator)
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :container)).should be_a(Wombat::Property::Locators::PropertyGroup)
end
it 'should raise correct exception if provided property is of unknown type' do
lambda {
- Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(format: :weird), nil)
+ Wombat::Property::Locators::Factory.locator_for(Wombat::DSL::Property.new(nil, nil, :weird))
}.should raise_error(Wombat::Property::Locators::UnknownTypeException, "Unknown property format weird.")
end
end
View
6 spec/property/locators/html_spec.rb
@@ -6,10 +6,10 @@
context = double :context
fake_elem.stub inner_html: "Something cool "
context.stub(:xpath).with("/abc", nil).and_return [fake_elem]
- property = Wombat::DSL::Property.new(name: 'data1', selector: 'xpath=/abc', format: :html)
+ property = Wombat::DSL::Property.new('data1', 'xpath=/abc', :html)
- locator = Wombat::Property::Locators::Html.new(property, context)
+ locator = Wombat::Property::Locators::Html.new(property)
- locator.locate.should == { "data1" => "Something cool" }
+ locator.locate(context).should == { "data1" => "Something cool" }
end
end
View
6 spec/property/locators/list_spec.rb
@@ -4,10 +4,10 @@
it 'should locate a list of nodes' do
context = double :context
context.stub(:css).with(".selector").and_return %w(1 2 3 4 5)
- property = Wombat::DSL::Property.new(name: 'data1', selector: 'css=.selector', format: :list)
+ property = Wombat::DSL::Property.new('data1', 'css=.selector', :list)
- locator = Wombat::Property::Locators::List.new(property, context)
+ locator = Wombat::Property::Locators::List.new(property)
- locator.locate.should == { "data1" => %w(1 2 3 4 5) }
+ locator.locate(context).should == { "data1" => %w(1 2 3 4 5) }
end
end
View
24 spec/property/locators/text_spec.rb
@@ -6,44 +6,44 @@
context = double :context
fake_elem.stub inner_text: "Something cool "
context.stub(:xpath).with("/abc", 'boom').and_return [fake_elem]
- property = Wombat::DSL::Property.new(name: 'data1', selector: 'xpath=/abc', namespaces: 'boom', format: :text)
+ property = Wombat::DSL::Property.new('data1', 'xpath=/abc', :text, 'boom')
- locator = Wombat::Property::Locators::Text.new(property, context)
+ locator = Wombat::Property::Locators::Text.new(property)
- locator.locate.should == { "data1" => "Something cool" }
+ locator.locate(context).should == { "data1" => "Something cool" }
end
it 'should locate text property with css selector' do
fake_elem = double :element
context = double :context
fake_elem.stub inner_text: "My name"
context.stub(:css).with("/def").and_return [fake_elem]
- property = Wombat::DSL::Property.new(name: 'data1', selector: 'css=/def', format: :text)
+ property = Wombat::DSL::Property.new('data1', 'css=/def', :text)
- locator = Wombat::Property::Locators::Text.new(property, context)
+ locator = Wombat::Property::Locators::Text.new(property)
- locator.locate.should == { "data1" => "My name" }
+ locator.locate(context).should == { "data1" => "My name" }
end
it 'should return plain symbols as strings' do
fake_elem = double :element
context = double :context
- property = Wombat::DSL::Property.new(name: 'data_2', selector: :hardcoded_value, format: :text)
+ property = Wombat::DSL::Property.new('data_2', :hardcoded_value, :text)
- locator = Wombat::Property::Locators::Text.new(property, context)
+ locator = Wombat::Property::Locators::Text.new(property)
- locator.locate.should == { "data_2" => "hardcoded_value" }
+ locator.locate(context).should == { "data_2" => "hardcoded_value" }
end
it 'should invoke property callback' do
fake_elem = double :element
context = double :context
fake_elem.stub inner_text: "My name"
context.stub(:css).with("/def").and_return [fake_elem]
- property = Wombat::DSL::Property.new(name: 'data1', selector: 'css=/def', format: :text, callback: Proc.new { |s| s.gsub(/name/, 'ass') })
+ property = Wombat::DSL::Property.new('data1', 'css=/def', :text) { |s| s.gsub(/name/, 'ass') }
- locator = Wombat::Property::Locators::Text.new(property, context)
+ locator = Wombat::Property::Locators::Text.new(property)
- locator.locate.should == { "data1" => "My ass" }
+ locator.locate(context).should == { "data1" => "My ass" }
end
end
View
18 spec/sample_crawler_spec.rb
@@ -8,19 +8,15 @@
it 'should correctly assign event metadata' do
@sample_crawler.should_receive(:parse) do |args|
- # args["event"]["description"].selector.should == "css=#main-node-content"
-
- # args["venue"]["address"].selector.should == "324 Dom Pedro II Street"
-
- it = args.iterators.first
- it.selector.should == "css=div.title-agenda"
- it["event"]["title"].selector.should == "xpath=."
- it["event"]["date"].selector.should == "xpath=//div[@class='scrollable-items']/div[@class='s-item active']//a"
- it["event"]["type"].selector.should == "xpath=.type"
- it["venue"]["name"].selector.should == "xpath=."
+ args['event_group'].wombat_property_selector.should == "css=div.title-agenda"
+ it = args['event_group']
+ it["event"]["title"].wombat_property_selector.should == "xpath=."
+ it["event"]["date"].wombat_property_selector.should == "xpath=//div[@class='scrollable-items']/div[@class='s-item active']//a"
+ it["event"]["type"].wombat_property_selector.should == "xpath=.type"
+ it["venue"]["name"].wombat_property_selector.should == "xpath=."
args[:base_url].should == 'http://www.obaoba.com.br'
- args[:list_page].should == '/porto-alegre/agenda'
+ args[:path].should == '/porto-alegre/agenda'
end
@sample_crawler.crawl
View
2 spec/wombat_spec.rb
@@ -22,7 +22,7 @@
lambda {
Wombat.crawl do
base_url "http://www.github.com"
- list_page "/"
+ path "/"
source :obaoba
description 'Oba Oba'
View
47 wombat.gemspec
@@ -5,12 +5,12 @@
Gem::Specification.new do |s|
s.name = "wombat"
- s.version = "1.0.0"
+ s.version = "2.0.0"
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
s.authors = ["Felipe Lima"]
- s.date = "2012-06-25"
- s.description = "Web scraper with a DSL that parses structured data from web pages"
+ s.date = "2012-07-31"
+ s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
s.email = "felipe.lima@gmail.com"
s.extra_rdoc_files = [
"LICENSE.txt",
@@ -30,37 +30,48 @@ Gem::Specification.new do |s|
"fixtures/vcr_cassettes/basic_crawler_page.yml",
"fixtures/vcr_cassettes/broken_selector.yml",
"fixtures/vcr_cassettes/error_page.yml",
+ "fixtures/vcr_cassettes/follow_links.yml",
"fixtures/vcr_cassettes/for_each_page.yml",
"fixtures/vcr_cassettes/xml_with_namespace.yml",
"lib/wombat.rb",
"lib/wombat/crawler.rb",
- "lib/wombat/iterator.rb",
- "lib/wombat/metadata.rb",
- "lib/wombat/node_selector.rb",
- "lib/wombat/parser.rb",
- "lib/wombat/property.rb",
- "lib/wombat/property_container.rb",
- "lib/wombat/property_locator.rb",
+ "lib/wombat/dsl/follower.rb",
+ "lib/wombat/dsl/iterator.rb",
+ "lib/wombat/dsl/metadata.rb",
+ "lib/wombat/dsl/property.rb",
+ "lib/wombat/dsl/property_group.rb",
+ "lib/wombat/processing/node_selector.rb",
+ "lib/wombat/processing/parser.rb",
+ "lib/wombat/property/locators/base.rb",
+ "lib/wombat/property/locators/factory.rb",
+ "lib/wombat/property/locators/follow.rb",
+ "lib/wombat/property/locators/html.rb",
+ "lib/wombat/property/locators/iterator.rb",
+ "lib/wombat/property/locators/list.rb",
+ "lib/wombat/property/locators/property_group.rb",
+ "lib/wombat/property/locators/text.rb",
"spec/crawler_spec.rb",
+ "spec/dsl/property_spec.rb",
"spec/helpers/sample_crawler.rb",
"spec/integration/integration_spec.rb",
- "spec/iterator_spec.rb",
- "spec/metadata_spec.rb",
- "spec/parser_spec.rb",
- "spec/property_container_spec.rb",
- "spec/property_locator_spec.rb",
- "spec/property_spec.rb",
+ "spec/processing/parser_spec.rb",
+ "spec/property/locators/factory_spec.rb",
+ "spec/property/locators/follow_spec.rb",
+ "spec/property/locators/html_spec.rb",
+ "spec/property/locators/iterator_spec.rb",
+ "spec/property/locators/list_spec.rb",
+ "spec/property/locators/text_spec.rb",
"spec/sample_crawler_spec.rb",
"spec/spec_helper.rb",
"spec/wombat_spec.rb",
"wombat.gemspec"
]
- s.homepage = "http://github.com/felipecsl/wombat"
+ s.homepage = "http://felipecsl.github.com/wombat"
s.licenses = ["MIT"]
s.require_paths = ["lib"]
s.required_ruby_version = Gem::Requirement.new(">= 1.9")
s.rubygems_version = "1.8.24"
- s.summary = "Ruby DSL to crawl web pages"
+ s.summary = "Ruby DSL to scrape web pages"
if s.respond_to? :specification_version then
s.specification_version = 3

No commit comments for this range

Something went wrong with that request. Please try again.