Skip to content

Commit

Permalink
Merge branch 'next'
Browse files Browse the repository at this point in the history
  • Loading branch information
chriskite committed Sep 2, 2010
2 parents 7ba9e1a + 2c974c2 commit 734e3bb
Show file tree
Hide file tree
Showing 20 changed files with 550 additions and 65 deletions.
16 changes: 16 additions & 0 deletions CHANGELOG.rdoc
@@ -1,3 +1,19 @@
== 0.5.0 / 2010-09-01

* Major enhancements

* Added page storage engines for MongoDB and Redis

* Minor enhancements

* Use xpath for link parsing instead of CSS (faster) (Marc Seeger)
* Added skip_query_strings option to skip links with query strings (Joost Baaij)

* Bug fixes

* Only consider status code 300..307 a redirect (Marc Seeger)
* Canonicalize redirect links (Marc Seeger)

== 0.4.0 / 2010-04-08

* Major enhancements
Expand Down
4 changes: 4 additions & 0 deletions CONTRIBUTORS
@@ -0,0 +1,4 @@
Many thanks to the following folks who have contributed code to Anemone. In no particular order:

Marc Seeger
Joost Baaij
14 changes: 12 additions & 2 deletions README.rdoc
Expand Up @@ -8,19 +8,29 @@ See http://anemone.rubyforge.org for more information.

== Features
* Multi-threaded design for high performance
* Tracks 301 HTTP redirects to understand a page's aliases
* Tracks 301 HTTP redirects
* Built-in BFS algorithm for determining page depth
* Allows exclusion of URLs based on regular expressions
* Choose the links to follow on each page with focus_crawl()
* HTTPS support
* Records response time for each page
* CLI program can list all pages in a domain, calculate page depths, and more
* Obey robots.txt
* In-memory or persistent storage of pages during crawl, using TokyoCabinet or PStore
* In-memory or persistent storage of pages during crawl, using TokyoCabinet, MongoDB, or Redis

== Examples
See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.

== Requirements
* nokogiri
* robots

== Development
To test and develop this gem, additional requirements are:
* rspec
* fakeweb
* tokyocabinet
* mongo
* redis

You will need to have {Tokyo Cabinet}[http://fallabs.com/tokyocabinet/], {MongoDB}[http://www.mongodb.org/], and {Redis}[http://code.google.com/p/redis/] installed on your system and running.
26 changes: 26 additions & 0 deletions Rakefile
@@ -0,0 +1,26 @@
# Rakefile for the Anemone gem: defines the spec, rcov, and rdoc tasks.
require 'rubygems'
require 'rake'

# RSpec 1.x rake integration; the :spec task runs every *_spec.rb under spec/.
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
spec.libs << 'lib' << 'spec'
spec.spec_files = FileList['spec/**/*_spec.rb']
end

# Same spec suite, but with rcov code-coverage reporting enabled.
Spec::Rake::SpecTask.new(:rcov) do |spec|
spec.libs << 'lib' << 'spec'
spec.pattern = 'spec/**/*_spec.rb'
spec.rcov = true
end

# Running `rake` with no arguments runs the spec suite.
task :default => :spec

# Generates RDoc into rdoc/, titled with the contents of the VERSION file
# (empty title suffix when VERSION is absent).
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
version = File.exist?('VERSION') ? File.read('VERSION') : ""

rdoc.rdoc_dir = 'rdoc'
rdoc.title = "anemone #{version}"
rdoc.rdoc_files.include('README*')
rdoc.rdoc_files.include('lib/**/*.rb')
end
1 change: 1 addition & 0 deletions VERSION
@@ -0,0 +1 @@
0.5.0
35 changes: 5 additions & 30 deletions anemone.gemspec
@@ -1,6 +1,6 @@
spec = Gem::Specification.new do |s|
s.name = "anemone"
s.version = "0.4.0"
s.version = "0.5.0"
s.author = "Chris Kite"
s.homepage = "http://anemone.rubyforge.org"
s.rubyforge_project = "anemone"
Expand All @@ -15,37 +15,12 @@ spec = Gem::Specification.new do |s|
s.add_dependency("robots", ">= 0.7.2")

s.files = %w[
VERSION
LICENSE.txt
CHANGELOG.rdoc
README.rdoc
bin/anemone
lib/anemone.rb
lib/anemone/cookie_store.rb
lib/anemone/core.rb
lib/anemone/http.rb
lib/anemone/page.rb
lib/anemone/page_store.rb
lib/anemone/tentacle.rb
lib/anemone/storage.rb
lib/anemone/storage/pstore.rb
lib/anemone/storage/tokyo_cabinet.rb
lib/anemone/cli.rb
lib/anemone/cli/url_list.rb
lib/anemone/cli/cron.rb
lib/anemone/cli/count.rb
lib/anemone/cli/pagedepth.rb
lib/anemone/cli/serialize.rb
]
Rakefile
] + Dir['lib/**/*.rb']

s.test_files = %w[
spec/anemone_spec.rb
spec/cookie_store_spec.rb
spec/core_spec.rb
spec/page_spec.rb
spec/page_store_spec.rb
spec/http_spec.rb
spec/storage_spec.rb
spec/fakeweb_helper.rb
spec/spec_helper.rb
]
s.test_files = Dir['spec/*.rb']
end
46 changes: 38 additions & 8 deletions lib/anemone/core.rb
Expand Up @@ -2,12 +2,14 @@
require 'robots'
require 'anemone/tentacle'
require 'anemone/page'
require 'anemone/exceptions'
require 'anemone/page_store'
require 'anemone/storage'
require 'anemone/storage/base'

module Anemone

VERSION = '0.4.0';
VERSION = '0.5.0';

#
# Convenience method to start a crawl
Expand Down Expand Up @@ -45,7 +47,9 @@ class Core
# Hash of cookie name => value to send with HTTP requests
:cookies => nil,
# accept cookies from the server and send them back?
:accept_cookies => false
:accept_cookies => false,
# skip any link with a query string? e.g. http://foo.com/?u=user
:skip_query_strings => false
}

# Create setter methods for all options to be called from the crawl block
Expand Down Expand Up @@ -187,7 +191,8 @@ def run
def process_options
@opts = DEFAULT_OPTS.merge @opts
@opts[:threads] = 1 if @opts[:delay] > 0
@pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
@pages = PageStore.new(storage)
@robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]

freeze_options
Expand Down Expand Up @@ -241,15 +246,40 @@ def links_to_follow(page)
# Returns +false+ otherwise.
#
# Returns +true+ if +link+ has not already been visited, is not excluded
# by a skip pattern or the query-string rule, is allowed by robots.txt,
# and does not exceed the configured depth limit; +false+ otherwise.
# NOTE: a leftover pre-merge line (`allowed = ...`) performed a redundant
# robots.txt lookup here; it was dead code and has been removed.
def visit_link?(link, from_page = nil)
  !@pages.has_page?(link) &&
    !skip_link?(link) &&
    !skip_query_string?(link) &&
    allowed(link) &&
    !too_deep?(from_page)
end

#
# Robots.txt gate. When the crawler is configured to obey robots.txt,
# defer to the parsed robots rules for +link+; otherwise every link
# is permitted and +true+ is returned unconditionally.
#
def allowed(link)
  return true unless @opts[:obey_robots_txt]
  @robots.allowed?(link)
end

#
# Returns +true+ if we are over the page depth limit.
# This only works when coming from a page and with the +depth_limit+ option set.
# When neither is the case, will always return +false+.
# NOTE: the merged hunk left duplicate pre-merge assignment lines and a
# stray trailing expression referencing an undefined +link+; this is the
# reconstructed post-merge body.
#
def too_deep?(from_page)
  if from_page && @opts[:depth_limit]
    from_page.depth >= @opts[:depth_limit]
  else
    false
  end
end

#
# Query-string gate. When the +skip_query_strings+ option is enabled,
# reports the link's query component (a truthy String) so the caller
# rejects the link; otherwise returns a falsy value.
#
def skip_query_string?(link)
  skipping_enabled = @opts[:skip_query_strings]
  skipping_enabled && link.query
end

#
Expand Down
5 changes: 5 additions & 0 deletions lib/anemone/exceptions.rb
@@ -0,0 +1,5 @@
module Anemone
  # Base error type for Anemone. Carries the underlying exception that
  # triggered it (when one exists) in +wrapped_exception+.
  class Error < ::StandardError
    # Assign the original exception being wrapped.
    attr_writer :wrapped_exception

    # The original exception wrapped by this error, or +nil+.
    def wrapped_exception
      @wrapped_exception
    end
  end
end
2 changes: 1 addition & 1 deletion lib/anemone/http.rb
Expand Up @@ -91,7 +91,7 @@ def get(url, referer = nil)

response, response_time = get_response(loc, referer)
code = Integer(response.code)
redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
yield response, code, loc, redirect_to, response_time
limit -= 1
end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
Expand Down
39 changes: 36 additions & 3 deletions lib/anemone/page.rb
Expand Up @@ -59,8 +59,8 @@ def links
@links = []
return @links if !doc

doc.css('a').each do |a|
u = a.attributes['href'].content rescue nil
doc.search("//a[@href]").each do |a|
u = a['href']
next if u.nil? or u.empty?
abs = to_absolute(URI(u)) rescue next
@links << abs if in_domain?(abs)
Expand Down Expand Up @@ -120,7 +120,7 @@ def html?
#
# Returns +true+ if the page's HTTP status code is a redirect
# (300..307), +false+ otherwise. Codes 308+ are intentionally not
# treated as redirects here.
# NOTE: a dead leftover line checking the old (300..399) range was
# removed; its value was discarded.
#
def redirect?
  (300..307).include?(@code)
end

#
Expand Down Expand Up @@ -165,5 +165,38 @@ def marshal_load(ary)
@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
end

# Serialize this page into a plain Hash of primitives, suitable for the
# document-store backends. Headers and user data are Marshal-dumped;
# URIs are stringified (nil URIs become the empty string).
def to_hash
  serialized = {}
  serialized['url']           = @url.to_s
  serialized['headers']       = Marshal.dump(@headers)
  serialized['data']          = Marshal.dump(@data)
  serialized['body']          = @body
  serialized['links']         = links.map { |link| link.to_s }
  serialized['code']          = @code
  serialized['visited']       = @visited
  serialized['depth']         = @depth
  serialized['referer']       = @referer.to_s
  serialized['redirect_to']   = @redirect_to.to_s
  serialized['response_time'] = @response_time
  serialized['fetched']       = @fetched
  serialized
end

# Rebuild a page object from a Hash previously produced by #to_hash.
# NOTE(review): Marshal.load on the stored headers/data is unsafe for
# untrusted input — assumes the hash comes from our own storage backend.
def self.from_hash(hash)
  page = self.new(URI(hash['url']))
  page.instance_variable_set('@headers',       Marshal.load(hash['headers']))
  page.instance_variable_set('@data',          Marshal.load(hash['data']))
  page.instance_variable_set('@body',          hash['body'])
  page.instance_variable_set('@links',         hash['links'].map { |link| URI(link) })
  page.instance_variable_set('@code',          hash['code'].to_i)
  page.instance_variable_set('@visited',       hash['visited'])
  page.instance_variable_set('@depth',         hash['depth'].to_i)
  page.instance_variable_set('@referer',       hash['referer'])
  page.instance_variable_set('@redirect_to',   URI(hash['redirect_to']))
  page.instance_variable_set('@response_time', hash['response_time'].to_i)
  page.instance_variable_set('@fetched',       hash['fetched'])
  page
end
end
end
19 changes: 17 additions & 2 deletions lib/anemone/storage.rb
Expand Up @@ -2,18 +2,33 @@ module Anemone
module Storage

# In-memory page storage: a plain ::Hash augmented with a no-op #close
# so it satisfies the same interface as the persistent backends.
# NOTE: a leftover pre-merge line allocating a second, immediately
# discarded Hash was removed.
def self.Hash(*args)
  hash = Hash.new(*args)
  # add close method for compatibility with Storage::Base
  class << hash; def close; end; end
  hash
end

# Lazily-loaded PStore-backed page storage; *args are passed straight
# through to Anemone::Storage::PStore.new.
def self.PStore(*args)
require 'anemone/storage/pstore'
self::PStore.new(*args)
end

def self.TokyoCabinet(file)
# Lazily-loaded TokyoCabinet-backed storage; +file+ is the database
# path (defaults to 'anemone.tch').
def self.TokyoCabinet(file = 'anemone.tch')
require 'anemone/storage/tokyo_cabinet'
self::TokyoCabinet.new(file)
end

# Lazily-loaded MongoDB-backed storage. When +mongo_db+ is nil, a default
# connection to the local 'anemone' database is opened; otherwise the
# caller-supplied object must be a Mongo::DB instance.
def self.MongoDB(mongo_db = nil, collection_name = 'pages')
  require 'anemone/storage/mongodb'
  db = mongo_db || Mongo::Connection.new.db('anemone')
  unless db.is_a?(Mongo::DB)
    raise "First argument must be an instance of Mongo::DB"
  end
  self::MongoDB.new(db, collection_name)
end

# Lazily-loaded Redis-backed storage; +opts+ are passed through to
# Anemone::Storage::Redis.new (and ultimately the redis client).
def self.Redis(opts = {})
require 'anemone/storage/redis'
self::Redis.new(opts)
end

end
end

0 comments on commit 734e3bb

Please sign in to comment.