Skip to content

Commit

Permalink
Merge branch 'next'
Browse files Browse the repository at this point in the history
  • Loading branch information
chriskite committed Sep 2, 2010
2 parents 7ba9e1a + 2c974c2 commit 734e3bb
Show file tree
Hide file tree
Showing 20 changed files with 550 additions and 65 deletions.
16 changes: 16 additions & 0 deletions CHANGELOG.rdoc
@@ -1,3 +1,19 @@
== 0.5.0 / 2010-09-01

* Major enhancements

* Added page storage engines for MongoDB and Redis

* Minor enhancements

* Use xpath for link parsing instead of CSS (faster) (Marc Seeger)
* Added skip_query_strings option to skip links with query strings (Joost Baaij)

* Bug fixes

* Only consider status code 300..307 a redirect (Marc Seeger)
* Canonicalize redirect links (Marc Seeger)

== 0.4.0 / 2010-04-08

* Major enhancements
Expand Down
4 changes: 4 additions & 0 deletions CONTRIBUTORS
@@ -0,0 +1,4 @@
Many thanks to the following folks who have contributed code to Anemone. In no particular order:

Marc Seeger
Joost Baaij
14 changes: 12 additions & 2 deletions README.rdoc
Expand Up @@ -8,19 +8,29 @@ See http://anemone.rubyforge.org for more information.

== Features
* Multi-threaded design for high performance
* Tracks 301 HTTP redirects to understand a page's aliases
* Tracks 301 HTTP redirects
* Built-in BFS algorithm for determining page depth
* Allows exclusion of URLs based on regular expressions
* Choose the links to follow on each page with focus_crawl()
* HTTPS support
* Records response time for each page
* CLI program can list all pages in a domain, calculate page depths, and more
* Obey robots.txt
* In-memory or persistent storage of pages during crawl, using TokyoCabinet or PStore
* In-memory or persistent storage of pages during crawl, using TokyoCabinet, MongoDB, or Redis

== Examples
See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.

== Requirements
* nokogiri
* robots

== Development
To test and develop this gem, additional requirements are:
* rspec
* fakeweb
* tokyocabinet
* mongo
* redis

You will need to have {Tokyo Cabinet}[http://fallabs.com/tokyocabinet/], {MongoDB}[http://www.mongodb.org/], and {Redis}[http://code.google.com/p/redis/] installed on your system and running.
26 changes: 26 additions & 0 deletions Rakefile
@@ -0,0 +1,26 @@
# Rakefile for the Anemone gem: defines the spec, rcov, and rdoc tasks.
require 'rubygems'
require 'rake'

# RSpec 1.x rake integration; the :spec task runs every *_spec.rb under spec/.
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
spec.libs << 'lib' << 'spec'
spec.spec_files = FileList['spec/**/*_spec.rb']
end

# Same spec suite, but with rcov code-coverage reporting enabled.
Spec::Rake::SpecTask.new(:rcov) do |spec|
spec.libs << 'lib' << 'spec'
spec.pattern = 'spec/**/*_spec.rb'
spec.rcov = true
end

# Running `rake` with no arguments runs the spec suite.
task :default => :spec

# Generates RDoc into rdoc/, titled with the contents of the VERSION file
# (empty title suffix when VERSION is absent).
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
version = File.exist?('VERSION') ? File.read('VERSION') : ""

rdoc.rdoc_dir = 'rdoc'
rdoc.title = "anemone #{version}"
rdoc.rdoc_files.include('README*')
rdoc.rdoc_files.include('lib/**/*.rb')
end
1 change: 1 addition & 0 deletions VERSION
@@ -0,0 +1 @@
0.5.0
35 changes: 5 additions & 30 deletions anemone.gemspec
@@ -1,6 +1,6 @@
spec = Gem::Specification.new do |s|
s.name = "anemone"
s.version = "0.4.0"
s.version = "0.5.0"
s.author = "Chris Kite"
s.homepage = "http://anemone.rubyforge.org"
s.rubyforge_project = "anemone"
Expand All @@ -15,37 +15,12 @@ spec = Gem::Specification.new do |s|
s.add_dependency("robots", ">= 0.7.2")

s.files = %w[
VERSION
LICENSE.txt
CHANGELOG.rdoc
README.rdoc
bin/anemone
lib/anemone.rb
lib/anemone/cookie_store.rb
lib/anemone/core.rb
lib/anemone/http.rb
lib/anemone/page.rb
lib/anemone/page_store.rb
lib/anemone/tentacle.rb
lib/anemone/storage.rb
lib/anemone/storage/pstore.rb
lib/anemone/storage/tokyo_cabinet.rb
lib/anemone/cli.rb
lib/anemone/cli/url_list.rb
lib/anemone/cli/cron.rb
lib/anemone/cli/count.rb
lib/anemone/cli/pagedepth.rb
lib/anemone/cli/serialize.rb
]
Rakefile
] + Dir['lib/**/*.rb']

s.test_files = %w[
spec/anemone_spec.rb
spec/cookie_store_spec.rb
spec/core_spec.rb
spec/page_spec.rb
spec/page_store_spec.rb
spec/http_spec.rb
spec/storage_spec.rb
spec/fakeweb_helper.rb
spec/spec_helper.rb
]
s.test_files = Dir['spec/*.rb']
end
46 changes: 38 additions & 8 deletions lib/anemone/core.rb
Expand Up @@ -2,12 +2,14 @@
require 'robots'
require 'anemone/tentacle'
require 'anemone/page'
require 'anemone/exceptions'
require 'anemone/page_store'
require 'anemone/storage'
require 'anemone/storage/base'

module Anemone

VERSION = '0.4.0';
VERSION = '0.5.0';

#
# Convenience method to start a crawl
Expand Down Expand Up @@ -45,7 +47,9 @@ class Core
# Hash of cookie name => value to send with HTTP requests
:cookies => nil,
# accept cookies from the server and send them back?
:accept_cookies => false
:accept_cookies => false,
# skip any link with a query string? e.g. http://foo.com/?u=user
:skip_query_strings => false
}

# Create setter methods for all options to be called from the crawl block
Expand Down Expand Up @@ -187,7 +191,8 @@ def run
def process_options
@opts = DEFAULT_OPTS.merge @opts
@opts[:threads] = 1 if @opts[:delay] > 0
@pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
@pages = PageStore.new(storage)
@robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]

freeze_options
Expand Down Expand Up @@ -241,15 +246,40 @@ def links_to_follow(page)
# Returns +false+ otherwise.
#
# Returns +true+ if +link+ has not already been visited, is not excluded
# by a skip pattern or the query-string rule, is allowed by robots.txt,
# and does not exceed the configured depth limit; +false+ otherwise.
# NOTE: a leftover pre-merge line (`allowed = ...`) performed a redundant
# robots.txt lookup here; it was dead code and has been removed.
def visit_link?(link, from_page = nil)
  !@pages.has_page?(link) &&
    !skip_link?(link) &&
    !skip_query_string?(link) &&
    allowed(link) &&
    !too_deep?(from_page)
end

#
# Robots.txt gate. When the crawler is configured to obey robots.txt,
# defer to the parsed robots rules for +link+; otherwise every link
# is permitted and +true+ is returned unconditionally.
#
def allowed(link)
  return true unless @opts[:obey_robots_txt]
  @robots.allowed?(link)
end

#
# Returns +true+ if we are over the page depth limit.
# This only works when coming from a page and with the +depth_limit+ option set.
# When neither is the case, will always return +false+.
# NOTE: the merged hunk left duplicate pre-merge assignment lines and a
# stray trailing expression referencing an undefined +link+; this is the
# reconstructed post-merge body.
#
def too_deep?(from_page)
  if from_page && @opts[:depth_limit]
    from_page.depth >= @opts[:depth_limit]
  else
    false
  end
end

#
# Query-string gate. When the +skip_query_strings+ option is enabled,
# reports the link's query component (a truthy String) so the caller
# rejects the link; otherwise returns a falsy value.
#
def skip_query_string?(link)
  skipping_enabled = @opts[:skip_query_strings]
  skipping_enabled && link.query
end

#
Expand Down
5 changes: 5 additions & 0 deletions lib/anemone/exceptions.rb
@@ -0,0 +1,5 @@
module Anemone
  # Base error type for Anemone. Carries the underlying exception that
  # triggered it (when one exists) in +wrapped_exception+.
  class Error < ::StandardError
    # Assign the original exception being wrapped.
    attr_writer :wrapped_exception

    # The original exception wrapped by this error, or +nil+.
    def wrapped_exception
      @wrapped_exception
    end
  end
end
2 changes: 1 addition & 1 deletion lib/anemone/http.rb
Expand Up @@ -91,7 +91,7 @@ def get(url, referer = nil)

response, response_time = get_response(loc, referer)
code = Integer(response.code)
redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
yield response, code, loc, redirect_to, response_time
limit -= 1
end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
Expand Down
39 changes: 36 additions & 3 deletions lib/anemone/page.rb
Expand Up @@ -59,8 +59,8 @@ def links
@links = []
return @links if !doc

doc.css('a').each do |a|
u = a.attributes['href'].content rescue nil
doc.search("//a[@href]").each do |a|
u = a['href']
next if u.nil? or u.empty?
abs = to_absolute(URI(u)) rescue next
@links << abs if in_domain?(abs)
Expand Down Expand Up @@ -120,7 +120,7 @@ def html?
#
# Returns +true+ if the page's HTTP status code is a redirect
# (300..307), +false+ otherwise. Codes 308+ are intentionally not
# treated as redirects here.
# NOTE: a dead leftover line checking the old (300..399) range was
# removed; its value was discarded.
#
def redirect?
  (300..307).include?(@code)
end

#
Expand Down Expand Up @@ -165,5 +165,38 @@ def marshal_load(ary)
@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
end

# Serialize this page into a plain Hash of primitives, suitable for the
# document-store backends. Headers and user data are Marshal-dumped;
# URIs are stringified (nil URIs become the empty string).
def to_hash
  serialized = {}
  serialized['url']           = @url.to_s
  serialized['headers']       = Marshal.dump(@headers)
  serialized['data']          = Marshal.dump(@data)
  serialized['body']          = @body
  serialized['links']         = links.map { |link| link.to_s }
  serialized['code']          = @code
  serialized['visited']       = @visited
  serialized['depth']         = @depth
  serialized['referer']       = @referer.to_s
  serialized['redirect_to']   = @redirect_to.to_s
  serialized['response_time'] = @response_time
  serialized['fetched']       = @fetched
  serialized
end

# Rebuild a page object from a Hash previously produced by #to_hash.
# NOTE(review): Marshal.load on the stored headers/data is unsafe for
# untrusted input — assumes the hash comes from our own storage backend.
def self.from_hash(hash)
  page = self.new(URI(hash['url']))
  page.instance_variable_set('@headers',       Marshal.load(hash['headers']))
  page.instance_variable_set('@data',          Marshal.load(hash['data']))
  page.instance_variable_set('@body',          hash['body'])
  page.instance_variable_set('@links',         hash['links'].map { |link| URI(link) })
  page.instance_variable_set('@code',          hash['code'].to_i)
  page.instance_variable_set('@visited',       hash['visited'])
  page.instance_variable_set('@depth',         hash['depth'].to_i)
  page.instance_variable_set('@referer',       hash['referer'])
  page.instance_variable_set('@redirect_to',   URI(hash['redirect_to']))
  page.instance_variable_set('@response_time', hash['response_time'].to_i)
  page.instance_variable_set('@fetched',       hash['fetched'])
  page
end
end
end
19 changes: 17 additions & 2 deletions lib/anemone/storage.rb
Expand Up @@ -2,18 +2,33 @@ module Anemone
module Storage

# In-memory page storage: a plain ::Hash augmented with a no-op #close
# so it satisfies the same interface as the persistent backends.
# NOTE: a leftover pre-merge line allocating a second, immediately
# discarded Hash was removed.
def self.Hash(*args)
  hash = Hash.new(*args)
  # add close method for compatibility with Storage::Base
  class << hash; def close; end; end
  hash
end

# Lazily-loaded PStore-backed page storage; *args are passed straight
# through to Anemone::Storage::PStore.new.
def self.PStore(*args)
require 'anemone/storage/pstore'
self::PStore.new(*args)
end

def self.TokyoCabinet(file)
# Lazily-loaded TokyoCabinet-backed storage; +file+ is the database
# path (defaults to 'anemone.tch').
def self.TokyoCabinet(file = 'anemone.tch')
require 'anemone/storage/tokyo_cabinet'
self::TokyoCabinet.new(file)
end

# Lazily-loaded MongoDB-backed storage. When +mongo_db+ is nil, a default
# connection to the local 'anemone' database is opened; otherwise the
# caller-supplied object must be a Mongo::DB instance.
def self.MongoDB(mongo_db = nil, collection_name = 'pages')
  require 'anemone/storage/mongodb'
  db = mongo_db || Mongo::Connection.new.db('anemone')
  unless db.is_a?(Mongo::DB)
    raise "First argument must be an instance of Mongo::DB"
  end
  self::MongoDB.new(db, collection_name)
end

# Lazily-loaded Redis-backed storage; +opts+ are passed through to
# Anemone::Storage::Redis.new (and ultimately the redis client).
def self.Redis(opts = {})
require 'anemone/storage/redis'
self::Redis.new(opts)
end

end
end

0 comments on commit 734e3bb

Please sign in to comment.