Skip to content

You can clone with the HTTPS clone URL or the Subversion checkout URL.
Download ZIP
Browse files

use per-crawl options instead of specifying options at the module level

  • Loading branch information...
commit 32153103240b1c34b8384b5eb691164c83efd1d6 1 parent 378a15e
@chriskite authored
View
2  CHANGELOG.rdoc
@@ -13,5 +13,5 @@
* Minor enhancements
- *HTTP request response time recorded in Page.
+ * HTTP request response time recorded in Page.
* Use of persistent HTTP connections.
View
3  anemone.gemspec
@@ -1,6 +1,6 @@
spec = Gem::Specification.new do |s|
s.name = "anemone"
- s.version = "0.2.2"
+ s.version = "0.2.3"
s.author = "Chris Kite"
s.homepage = "http://anemone.rubyforge.org"
s.rubyforge_project = "anemone"
@@ -20,7 +20,6 @@ spec = Gem::Specification.new do |s|
README.rdoc
bin/anemone
lib/anemone.rb
- lib/anemone/anemone.rb
lib/anemone/core.rb
lib/anemone/http.rb
lib/anemone/page.rb
View
2  lib/anemone.rb
@@ -1,2 +1,2 @@
require 'rubygems'
-require 'anemone/anemone'
+require 'anemone/core'
View
44 lib/anemone/anemone.rb
@@ -1,44 +0,0 @@
-require 'ostruct'
-require 'robots'
-require 'anemone/core'
-
-module Anemone
- # Version number
- VERSION = '0.2.2'
-
- # default options
- DEFAULTS = {
- # run 4 Tentacle threads to fetch pages
- :threads => 4,
- # disable verbose output
- :verbose => false,
- # don't throw away the page response body after scanning it for links
- :discard_page_bodies => false,
- # identify self as Anemone/VERSION
- :user_agent => "Anemone/#{VERSION}",
- # no delay between requests
- :delay => 0,
- # don't obey the robots exclusion protocol
- :obey_robots_txt => false,
- # by default, don't limit the depth of the crawl
- :depth_limit => false,
- # number of times HTTP redirects will be followed
- :redirect_limit => 5
- }
-
- def self.options
- @options ||= OpenStruct.new(DEFAULTS)
- end
-
- #
- # Convenience method to start a crawl using Core
- #
- def Anemone.crawl(urls, options = {}, &block)
- options.each { |key, value| Anemone.options.send("#{key}=", value) }
-
- #use a single thread if a delay was requested
- Anemone.options.threads = 1 if Anemone.options.delay > 0
-
- Core.crawl(urls, &block)
- end
-end
View
75 lib/anemone/core.rb
@@ -1,19 +1,51 @@
-require 'net/http'
require 'thread'
+require 'robots'
require 'anemone/tentacle'
require 'anemone/page'
require 'anemone/page_hash'
module Anemone
+
+ VERSION = '0.2.3';
+
+ #
+ # Convenience method to start a crawl
+ #
+ def Anemone.crawl(urls, options = {}, &block)
+ Core.crawl(urls, options, &block)
+ end
+
class Core
# PageHash storing all Page objects encountered during the crawl
attr_reader :pages
-
+
+ # Hash of options for the crawl
+ attr_accessor :opts
+
+ DEFAULT_OPTS = {
+ # run 4 Tentacle threads to fetch pages
+ :threads => 4,
+ # disable verbose output
+ :verbose => false,
+ # don't throw away the page response body after scanning it for links
+ :discard_page_bodies => false,
+ # identify self as Anemone/VERSION
+ :user_agent => "Anemone/#{Anemone::VERSION}",
+ # no delay between requests
+ :delay => 0,
+ # don't obey the robots exclusion protocol
+ :obey_robots_txt => false,
+ # by default, don't limit the depth of the crawl
+ :depth_limit => false,
+ # number of times HTTP redirects will be followed
+ :redirect_limit => 5
+ }
+
#
# Initialize the crawl with starting *urls* (single URL or Array of URLs)
# and optional *block*
#
- def initialize(urls)
+ def initialize(urls, opts = {})
@urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
@urls.each{ |url| url.path = '/' if url.path.empty? }
@@ -23,10 +55,8 @@ def initialize(urls)
@on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
@skip_link_patterns = []
@after_crawl_blocks = []
-
- if Anemone.options.obey_robots_txt
- @robots = Robots.new(Anemone.options.user_agent)
- end
+
+ process_options opts
yield self if block_given?
end
@@ -34,8 +64,8 @@ def initialize(urls)
#
# Convenience method to start a new crawl
#
- def self.crawl(root)
- self.new(root) do |core|
+ def self.crawl(urls, opts = {})
+ self.new(urls, opts) do |core|
yield core if block_given?
core.run
end
@@ -104,8 +134,8 @@ def run
link_queue = Queue.new
page_queue = Queue.new
- Anemone.options.threads.times do
- @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
+ @opts[:threads].times do
+ @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
end
@urls.each{ |url| link_queue.enq(url) }
@@ -115,12 +145,12 @@ def run
@pages[page.url] = page
- puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
+ puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
# perform the on_every_page blocks for this page
do_page_blocks(page)
- page.discard_doc! if Anemone.options.discard_page_bodies
+ page.discard_doc! if @opts[:discard_page_bodies]
links_to_follow(page).each do |link|
link_queue.enq([link, page])
@@ -158,7 +188,15 @@ def run
end
private
-
+
+ def process_options(options)
+ @opts = DEFAULT_OPTS.merge options
+
+ @opts[:threads] = 1 if @opts[:delay] > 0
+
+ @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
+ end
+
#
# Execute the after_crawl blocks
#
@@ -199,10 +237,10 @@ def links_to_follow(page)
# Returns +false+ otherwise.
#
def visit_link?(link, from_page = nil)
- allowed = Anemone.options.obey_robots_txt ? @robots.allowed?(link) : true
+ allowed = @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
- if from_page
- too_deep = from_page.depth >= Anemone.options.depth_limit rescue false
+ if from_page && @opts[:depth_limit]
+ too_deep = from_page.depth >= @opts[:depth_limit]
else
too_deep = false
end
@@ -215,8 +253,7 @@ def visit_link?(link, from_page = nil)
# its URL matches a skip_link pattern.
#
def skip_link?(link)
- @skip_link_patterns.each { |p| return true if link.path =~ p}
- false
+ @skip_link_patterns.any? { |p| link.path =~ p }
end
end
View
23 lib/anemone/http.rb
@@ -4,10 +4,11 @@
module Anemone
class HTTP
# Maximum number of redirects to follow on each get_response
- REDIRECTION_LIMIT = 5
+ REDIRECT_LIMIT = 5
- def initialize
+ def initialize(opts = {})
@connections = {}
+ @opts = opts
end
#
@@ -31,7 +32,7 @@ def fetch_page(url, from_page = nil)
return Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
rescue => e
- if Anemone.options.verbose
+ if verbose?
puts e.inspect
puts e.backtrace
end
@@ -50,7 +51,7 @@ def get(url, referer = nil)
code = Integer(response.code)
loc = url
- limit = REDIRECTION_LIMIT
+ limit = redirect_limit
while response.is_a?(Net::HTTPRedirection) and limit > 0
loc = URI(response['location'])
loc = url.merge(loc) if loc.relative?
@@ -66,7 +67,6 @@ def get(url, referer = nil)
#
def get_response(url, referer = nil)
full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
- user_agent = Anemone.options.user_agent rescue nil
opts = {}
opts['User-Agent'] = user_agent if user_agent
@@ -104,5 +104,18 @@ def refresh_connection(url)
end
@connections[url.host][url.port] = http.start
end
+
+ def redirect_limit
+ @opts[:redirect_limit] || REDIRECT_LIMIT
+ end
+
+ def user_agent
+ @opts[:user_agent]
+ end
+
+ def verbose?
+ @opts[:verbose]
+ end
+
end
end
View
15 lib/anemone/tentacle.rb
@@ -6,10 +6,11 @@ class Tentacle
#
# Create a new Tentacle
#
- def initialize(link_queue, page_queue)
+ def initialize(link_queue, page_queue, opts = {})
@link_queue = link_queue
@page_queue = page_queue
- @http = Anemone::HTTP.new
+ @http = Anemone::HTTP.new(opts)
+ @opts = opts
end
#
@@ -22,11 +23,17 @@ def run
break if link == :END
- @page_queue.enq @http.fetch_page(link, from_page)
+ @page_queue << @http.fetch_page(link, from_page)
- sleep Anemone.options.delay
+ delay
end
end
+ private
+
+ def delay
+ sleep @opts[:delay] if @opts[:delay]
+ end
+
end
end
View
33 spec/anemone_spec.rb
@@ -1,42 +1,11 @@
require File.dirname(__FILE__) + '/spec_helper'
describe Anemone do
-
- after(:each) do
- # reset global options object to defaults
- Anemone::DEFAULTS.each { |key, value| Anemone.options.send("#{key}=", value) }
- end
-
+
it "should have a version" do
Anemone.const_defined?('VERSION').should == true
end
- it "should have options" do
- Anemone.should respond_to(:options)
- end
-
- it "should accept options for the crawl" do
- Anemone.crawl(SPEC_DOMAIN, :verbose => false,
- :threads => 2,
- :discard_page_bodies => true,
- :user_agent => 'test',
- :obey_robots_txt => true,
- :depth_limit => 3)
-
- Anemone.options.verbose.should == false
- Anemone.options.threads.should == 2
- Anemone.options.discard_page_bodies.should == true
- Anemone.options.delay.should == 0
- Anemone.options.user_agent.should == 'test'
- Anemone.options.obey_robots_txt.should == true
- Anemone.options.depth_limit.should == 3
- end
-
- it "should use 1 thread if a delay is requested" do
- Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2)
- Anemone.options.threads.should == 1
- end
-
it "should return a Anemone::Core from the crawl, which has a PageHash" do
result = Anemone.crawl(SPEC_DOMAIN)
result.should be_an_instance_of(Anemone::Core)
View
24 spec/core_spec.rb
@@ -173,5 +173,29 @@ module Anemone
core.should have(4).pages
end
end
+
+ describe "options" do
+ it "should accept options for the crawl" do
+ core = Anemone.crawl(SPEC_DOMAIN, :verbose => false,
+ :threads => 2,
+ :discard_page_bodies => true,
+ :user_agent => 'test',
+ :obey_robots_txt => true,
+ :depth_limit => 3)
+
+ core.opts[:verbose].should == false
+ core.opts[:threads].should == 2
+ core.opts[:discard_page_bodies].should == true
+ core.opts[:delay].should == 0
+ core.opts[:user_agent].should == 'test'
+ core.opts[:obey_robots_txt].should == true
+ core.opts[:depth_limit].should == 3
+ end
+
+ it "should use 1 thread if a delay is requested" do
+ Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2).opts[:threads].should == 1
+ end
+ end
+
end
end
Please sign in to comment.
Something went wrong with that request. Please try again.