initial import

commit 08366d4e70d679a88e3022381f0433806af11aa6 (0 parents), committed by Chris Kite
19 LICENSE.txt
@@ -0,0 +1,19 @@
+Copyright (c) 2009 Vertive, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
18 README.txt
@@ -0,0 +1,18 @@
+= Anemone
+
+== DESCRIPTION
+Anemone is a web spider framework that can spider a domain and collect useful
+information about the pages it visits. It is versatile, allowing you to
+write your own specialized spider tasks quickly and easily.
+
+== FEATURES
+* Multi-threaded design for high performance
+* Tracks 301 HTTP redirects to understand a page's aliases
+* Built-in BFS algorithm for determining page depth
+* Allows exclusion of URLs based on regular expressions
+
+== REQUIREMENTS
+* hpricot
+
+== EXAMPLES
+See the +bin+ directory for several examples of useful Anemone tasks.
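For a quick sense of the API, a minimal crawl that prints the URL of every page it visits might look like the sketch below (the target URL is a placeholder; bin/anemone_url_list.rb does essentially the same thing):

  require 'anemone'

  # crawl a site and print each page's URL as it is encountered
  Anemone.crawl("http://www.example.com/") do |anemone|
    anemone.on_every_page do |page|
      puts page.url
    end
  end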
14 anemone.gemspec
@@ -0,0 +1,14 @@
+spec = Gem::Specification.new do |s|
+ s.name = "anemone"
+ s.version = "0.0.1"
+ s.author = "Chris Kite"
+ s.platform = Gem::Platform::RUBY
+ s.summary = "Anemone web-spider framework"
+ s.files = Dir["{bin,lib}/**/*"] + Dir["README.txt"]
+ s.executables = %w[anemone_count.rb anemone_cron.rb anemone_pagedepth.rb anemone_serialize.rb anemone_url_list.rb]
+ s.require_path = "lib"
+ s.has_rdoc = true
+ s.rdoc_options << '-m' << 'README.txt' << '-t' << 'Anemone'
+ s.extra_rdoc_files = ["README.txt"]
+ s.add_dependency("hpricot", ">= 0.7.0")
+end
31 bin/anemone_count.rb
@@ -0,0 +1,31 @@
+#! /usr/bin/env ruby
+# == Synopsis
+# Crawls a site starting at the given URL, and outputs the total number
+# of unique pages on the site.
+#
+# == Usage
+# anemone_count.rb url
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+require 'rdoc/usage'
+
+# make sure that the first option is a URL we can crawl
+begin
+ URI(ARGV[0])
+rescue
+ RDoc::usage()
+ Process.exit
+end
+
+Anemone.crawl(ARGV[0]) do |anemone|
+ anemone.after_crawl do |pages|
+ puts pages.uniq.size
+ end
+end
+
+
99 bin/anemone_cron.rb
@@ -0,0 +1,99 @@
+#! /usr/bin/env ruby
+# == Synopsis
+# Performs the pagedepth, url list, and count functions in a single crawl.
+# Meant to be run daily as a cron job.
+#
+# == Usage
+# anemone_cron.rb [options] url
+#
+# == Options
+# -r, --relative Output relative URLs (rather than absolute)
+# -o, --output filename Filename to save URL list to. Defaults to urls.txt.
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+require 'optparse'
+require 'rdoc/usage'
+require 'ostruct'
+
+options = OpenStruct.new
+options.relative = false
+options.output_file = 'urls.txt'
+
+# make sure that the last option is a URL we can crawl
+begin
+ URI(ARGV.last)
+rescue
+ RDoc::usage()
+ Process.exit
+end
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-r', '--relative') { options.relative = true }
+opts.on('-o', '--output filename') {|o| options.output_file = o }
+opts.parse!(ARGV)
+
+root = ARGV.last
+
+Anemone.crawl(root) do |anemone|
+
+ anemone.after_crawl do |pages|
+ puts "Crawl results for #{root}\n"
+
+ # print a list of 404's
+ not_found = []
+ pages.each_value do |page|
+ url = page.url.to_s
+ not_found << url if page.not_found?
+ end
+ if !not_found.empty?
+ puts "\n404's:"
+ not_found.each do |url|
+ if options.relative
+ puts URI(url).path.to_s
+ else
+ puts url
+ end
+ num_linked_from = 0
+ pages.urls_linking_to(url).each do |u|
+ u = u.path if options.relative
+ num_linked_from += 1
+ puts " linked from #{u}"
+ if num_linked_from > 10
+ puts " ..."
+ break
+ end
+ end
+ end
+
+ print "\n"
+ end
+
+ # remove redirect aliases, and calculate pagedepths
+ pages = pages.shortest_paths!(root).uniq
+ depths = pages.values.inject({}) do |depths, page|
+ depths[page.depth] ||= 0
+ depths[page.depth] += 1
+ depths
+ end
+
+ # print the page count
+ puts "Total pages: #{pages.size}\n"
+
+ # print a list of depths
+ depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+
+ # output a list of urls to file
+ file = open(options.output_file, 'w')
+ pages.each_key do |url|
+ url = options.relative ? url.path.to_s : url.to_s
+ file.puts url
+ end
+
+ end
+end
39 bin/anemone_pagedepth.rb
@@ -0,0 +1,39 @@
+#! /usr/bin/env ruby
+# == Synopsis
+# Crawls a site starting at the given URL, and outputs a count of
+# the number of Pages at each depth in the site.
+#
+# == Usage
+# anemone_pagedepth.rb url
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+require 'rdoc/usage'
+
+# make sure that the first option is a URL we can crawl
+begin
+ URI(ARGV[0])
+rescue
+ RDoc::usage()
+ Process.exit
+end
+
+root = ARGV[0]
+Anemone.crawl(root) do |anemone|
+ anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+ anemone.after_crawl do |pages|
+ pages = pages.shortest_paths!(root).uniq
+ depths = pages.values.inject({}) do |depths, page|
+ depths[page.depth] ||= 0
+ depths[page.depth] += 1
+ depths
+ end
+
+ depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+ end
+end
43 bin/anemone_serialize.rb
@@ -0,0 +1,43 @@
+#! /usr/bin/env ruby
+# == Synopsis
+# Crawls a site starting at the given URL, and saves the resulting
+# PageHash object to a file using Marshal serialization.
+#
+# == Usage
+# anemone_serialize.rb [options] url
+#
+# == Options
+# -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+require 'optparse'
+require 'rdoc/usage'
+require 'ostruct'
+
+# make sure that the first option is a URL we can crawl
+begin
+ URI(ARGV[0])
+rescue
+ RDoc::usage()
+ Process.exit
+end
+
+options = OpenStruct.new
+options.output_file = "crawl.#{Time.now.to_i}"
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-o', '--output filename') {|o| options.output_file = o }
+opts.parse!(ARGV)
+
+root = ARGV[0]
+Anemone.crawl(root) do |anemone|
+ anemone.after_crawl do |pages|
+ open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
+ end
+end
46 bin/anemone_url_list.rb
@@ -0,0 +1,46 @@
+#! /usr/bin/env ruby
+# == Synopsis
+# Crawls a site starting at the given URL, and outputs the URL of each page
+# in the domain as they are encountered.
+#
+# == Usage
+# anemone_url_list.rb [options] url
+#
+# == Options
+# -r, --relative Output relative URLs (rather than absolute)
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+require 'optparse'
+require 'rdoc/usage'
+require 'ostruct'
+
+options = OpenStruct.new
+options.relative = false
+
+# make sure that the last option is a URL we can crawl
+begin
+ URI(ARGV.last)
+rescue
+ RDoc::usage()
+ Process.exit
+end
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-r', '--relative') { options.relative = true }
+opts.parse!(ARGV)
+
+Anemone.crawl(ARGV.last) do |anemone|
+ anemone.on_every_page do |page|
+ if options.relative
+ puts page.url.path
+ else
+ puts page.url
+ end
+ end
+end
2 lib/anemone.rb
@@ -0,0 +1,2 @@
+require 'rubygems'
+require 'anemone/anemone'
16 lib/anemone/anemone.rb
@@ -0,0 +1,16 @@
+require 'anemone/core'
+
+module Anemone
+ # Version number
+ VERSION = '0.0.1'
+
+ # User-Agent string used for HTTP requests
+ USER_AGENT = "Anemone/#{self::VERSION}"
+
+ #
+ # Convenience method to start a crawl using Core
+ #
+ def Anemone.crawl(url, options = {}, &block)
+ Core.crawl(url, options, &block)
+ end
+end
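A brief sketch of this convenience method in use: the options hash is passed straight through to Core, which in this commit recognizes :threads and :verbose (the URL below is illustrative):

  require 'anemone'

  # crawl with 2 worker threads and verbose queue output,
  # then report how many pages ended up in the PageHash
  Anemone.crawl("http://www.example.com/", :threads => 2, :verbose => true) do |anemone|
    anemone.after_crawl do |pages|
      puts "Crawled #{pages.size} pages"
    end
  end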
183 lib/anemone/core.rb
@@ -0,0 +1,183 @@
+require 'net/http'
+require 'thread'
+require 'anemone/tentacle'
+require 'anemone/page_hash'
+
+module Anemone
+ class Core
+ # PageHash storing all Page objects encountered during the crawl
+ attr_reader :pages
+
+ #
+ # Initialize the crawl with a starting *url*, *options*, and optional *block*
+ #
+ def initialize(url, options={}, &block)
+ url = URI(url) if url.is_a?(String)
+ @url = url
+ @options = options
+ @tentacles = []
+ @pages = PageHash.new
+ @on_every_page_blocks = []
+ @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
+ @skip_link_patterns = []
+ @after_crawl_blocks = []
+
+ @options[:threads] ||= 4
+ @options[:verbose] ||= false
+
+ block.call(self) if block
+ end
+
+ #
+ # Convenience method to start a new crawl
+ #
+ def self.crawl(root, options={}, &block)
+ self.new(root, options) do |core|
+ block.call(core) if block
+ core.run
+ core.do_after_crawl_blocks
+ return core
+ end
+ end
+
+ #
+ # Add a block to be executed on the PageHash after the crawl
+ # is finished
+ #
+ def after_crawl(&block)
+ @after_crawl_blocks << block
+ self
+ end
+
+ #
+ # Add one or more Regex patterns for URLs which should not be
+ # followed
+ #
+ def skip_links_like(*patterns)
+ if patterns
+ patterns.each do |pattern|
+ @skip_link_patterns << pattern
+ end
+ end
+ self
+ end
+
+ #
+ # Add a block to be executed on every Page as it is encountered
+ # during the crawl
+ #
+ def on_every_page(&block)
+ @on_every_page_blocks << block
+ self
+ end
+
+ #
+ # Add a block to be executed on Page objects with a URL matching
+ # one or more patterns
+ #
+ def on_pages_like(*patterns, &block)
+ if patterns
+ patterns.each do |pattern|
+ @on_pages_like_blocks[pattern] << block
+ end
+ end
+ self
+ end
+
+ #
+ # Perform the crawl
+ #
+ def run
+ link_queue = Queue.new
+ page_queue = Queue.new
+
+ @options[:threads].times do |id|
+ @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
+ end
+
+ return if !visit_link?(@url)
+
+ link_queue.enq(@url)
+
+ while true do
+ page = page_queue.deq
+
+ @pages[page.url] = page
+
+ puts "#{page.url} Queue: #{link_queue.size}" if @options[:verbose]
+
+ do_page_blocks(page)
+
+ page.links.each do |link|
+ if visit_link?(link)
+ link_queue.enq(link)
+ @pages[link] = nil
+ end
+ end
+
+ page.aliases.each do |aka|
+ if !@pages.has_key?(aka) or @pages[aka].nil?
+ @pages[aka] = page.alias_clone(aka)
+ end
+ @pages[aka].add_alias!(page.url)
+ end
+
+ # if we are done with the crawl, tell the threads to end
+ if link_queue.empty? and page_queue.empty?
+ until link_queue.num_waiting == @tentacles.size
+ Thread.pass
+ end
+
+ if page_queue.empty?
+ @tentacles.size.times { |i| link_queue.enq(:END)}
+ break
+ end
+ end
+
+ end
+
+ @tentacles.each { |t| t.join }
+
+ self
+ end
+
+ #
+ # Execute the after_crawl blocks
+ #
+ def do_after_crawl_blocks
+ @after_crawl_blocks.each {|b| b.call(@pages)}
+ end
+
+ #
+ # Execute the on_every_page blocks for *page*
+ #
+ def do_page_blocks(page)
+ @on_every_page_blocks.each do |blk|
+ blk.call(page)
+ end
+
+ @on_pages_like_blocks.each do |pattern, blk|
+ blk.call(page) if page.url.to_s =~ pattern
+ end
+ end
+
+ #
+ # Returns +true+ if *link* has not been visited already,
+ # and is not excluded by a skip_link pattern. Returns
+ # +false+ otherwise.
+ #
+ def visit_link?(link)
+ !@pages.has_key?(link) and !skip_link?(link)
+ end
+
+ #
+ # Returns +true+ if *link* should not be visited because
+ # its URL matches a skip_link pattern.
+ #
+ def skip_link?(link)
+ @skip_link_patterns.each { |p| return true if link.path =~ p}
+ return false
+ end
+
+ end
+end
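A sketch of how the registration methods above can be combined in one crawl; the patterns and URL are invented for illustration. Note that skip_links_like matches against the link's path, while on_pages_like matches against the full URL string:

  require 'anemone'

  Anemone.crawl("http://www.example.com/") do |anemone|
    # never follow links whose path matches these patterns
    anemone.skip_links_like %r{^/login}, %r{\.pdf$}

    # run a block only on pages whose URL matches a pattern
    anemone.on_pages_like(%r{/articles/}) do |page|
      puts "article: #{page.url}"
    end

    # inspect the completed PageHash
    anemone.after_crawl do |pages|
      puts "#{pages.size} pages crawled"
    end
  end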
37 lib/anemone/http.rb
@@ -0,0 +1,37 @@
+require 'net/http'
+
+module Anemone
+ class HTTP < Net::HTTP
+ # Maximum number of redirects to follow on each get_response
+ REDIRECTION_LIMIT = 5
+
+ #
+ # Retrieve an HTTP response for *url*, following redirects.
+ # Returns the response object, response code, and final URI location.
+ #
+ def self.get(url)
+ response = get_response(url)
+ code = Integer(response.code)
+ loc = url
+
+ limit = REDIRECTION_LIMIT
+ while response.is_a?(Net::HTTPRedirection) and limit > 0
+ loc = URI(response['location'])
+ loc = url.merge(loc) if loc.relative?
+ response = get_response(loc)
+ limit -= 1
+ end
+
+ return response, code, loc
+ end
+
+ #
+ # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
+ #
+ def self.get_response(url)
+ Net::HTTP.start(url.host, url.port) do |http|
+ return http.get(url.path, {'User-Agent' => Anemone::USER_AGENT })
+ end
+ end
+ end
+end
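A small sketch of calling this helper directly, assuming 'anemone' has been required so that Anemone::USER_AGENT is defined (the URL is a placeholder). Note that the returned code is taken from the first response, before any redirects are followed:

  require 'anemone'

  response, code, location = Anemone::HTTP.get(URI("http://www.example.com/"))

  puts "original response code: #{code}"
  puts "final location: #{location}"
  puts "body length: #{response.body.length}" if response.is_a?(Net::HTTPSuccess)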
165 lib/anemone/page.rb
@@ -0,0 +1,165 @@
+require 'anemone/http'
+require 'hpricot'
+
+module Anemone
+ class Page
+ # The URL of the page
+ attr_reader :url
+ # Array of distinct A tag HREFs from the page
+ attr_reader :links
+ # Integer response code of the page
+ attr_reader :code
+
+ # Array of redirect-aliases for the page
+ attr_accessor :aliases
+ # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
+ attr_accessor :visited
+ # Used by PageHash#shortest_paths! to store depth of the page
+ attr_accessor :depth
+
+ #
+ # Create a new Page from the response of an HTTP request to *url*
+ #
+ def self.fetch(url)
+ begin
+ url = URI(url) if url.is_a?(String)
+
+ response, code, location = Anemone::HTTP.get(url)
+
+ aka = nil
+ if !url.eql?(location)
+ aka = location
+ end
+
+ return Page.new(url, response, code, aka)
+ rescue
+ return Page.new(url)
+ end
+ end
+
+ #
+ # Create a new page
+ #
+ def initialize(url, response = nil, code = nil, aka = nil)
+ @url = url
+ @response = response
+ @code = code
+ @links = []
+ @aliases = []
+
+ @aliases << aka if !aka.nil?
+
+ #get a list of distinct links on the page, in absolute url form
+ if @response and @response.body
+ Hpricot(@response.body).search('a').each do |a|
+ u = a['href']
+ next if u.nil?
+
+ begin
+ u = URI(u)
+ rescue
+ next
+ end
+
+ abs = to_absolute(u)
+ @links << abs if in_domain?(abs)
+ end
+
+ @links.uniq!
+ end
+ end
+
+
+ #
+ # Return a new page with the same *response* and *url*, but
+ # with a 200 response code
+ #
+ def alias_clone(url)
+ Page.new(url, @response, 200, @url)
+ end
+
+ #
+ # Add a redirect-alias String *aka* to the list of the page's aliases
+ #
+ # Returns *self*
+ #
+ def add_alias!(aka)
+ @aliases << aka if !@aliases.include?(aka)
+ self
+ end
+
+ #
+ # Returns an Array of all links from this page, and all the
+ # redirect-aliases of those pages, as String objects.
+ #
+ # *page_hash* is a PageHash object with the results of the current crawl.
+ #
+ def links_and_their_aliases(page_hash)
+ @links.inject([]) do |results, link|
+ results.concat([link].concat(page_hash[link].aliases))
+ end
+ end
+
+ #
+ # Returns the response body for the page
+ #
+ def body
+ @response.body
+ end
+
+ #
+ # Returns the +Content-Type+ header for the page
+ #
+ def content_type
+ @response['Content-Type']
+ end
+
+ #
+ # Returns +true+ if the page is an HTML document, returns +false+
+ # otherwise.
+ #
+ def html?
+ (content_type =~ /text\/html/) == 0
+ end
+
+ #
+ # Returns +true+ if the page is an HTTP redirect, returns +false+
+ # otherwise.
+ #
+ def redirect?
+ (300..399).include?(@code)
+ end
+
+ #
+ # Returns +true+ if the page was not found (returned 404 code),
+ # returns +false+ otherwise.
+ #
+ def not_found?
+ 404 == @code
+ end
+
+ #
+ # Converts relative URL *link* into an absolute URL based on the
+ # location of the page
+ #
+ def to_absolute(link)
+ # remove anchor
+ link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
+
+ relative = URI(link)
+ absolute = @url.merge(relative)
+
+ absolute.path = '/' if absolute.path.empty?
+
+ return absolute
+ end
+
+ #
+ # Returns +true+ if *uri* is in the same domain as the page, returns
+ # +false+ otherwise
+ #
+ def in_domain?(uri)
+ uri.host == @url.host
+ end
+ end
+end
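A brief sketch of fetching and inspecting a single Page outside of a full crawl (the URL is illustrative; on a network error Page.fetch returns a Page with a nil response, so the accessors below assume the request succeeded):

  require 'anemone'

  page = Anemone::Page.fetch("http://www.example.com/")

  puts page.code                          # Integer response code
  puts page.html?                         # true when the Content-Type is text/html
  page.links.each { |link| puts link }    # distinct same-domain links, as absolute URIs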
83 lib/anemone/page_hash.rb
@@ -0,0 +1,83 @@
+module Anemone
+ class PageHash < Hash
+
+ #
+ # Use a breadth-first search to calculate the single-source
+ # shortest paths from *root* to all pages in the PageHash
+ #
+ def shortest_paths!(root)
+ root = URI(root) if root.is_a?(String)
+ raise "Root node not found" if !has_key?(root)
+
+ each_value {|p| p.visited = false if p}
+
+ q = Queue.new
+
+ q.enq(root)
+ self[root].depth = 0
+ self[root].visited = true
+ while(!q.empty?)
+ url = q.deq
+
+ next if !has_key?(url)
+
+ page = self[url]
+
+ page.links.each do |u|
+ next if !has_key?(u) or self[u].nil?
+ link = self[u]
+ aliases = [link].concat(link.aliases.map {|a| self[a] })
+
+ aliases.each do |node|
+ if node.depth.nil? or page.depth + 1 < node.depth
+ node.depth = page.depth + 1
+ end
+ end
+
+ q.enq(self[u].url) if !self[u].visited
+ self[u].visited = true
+ end
+ end
+
+ self
+ end
+
+ #
+ # Returns a new PageHash by removing redirect-aliases for each
+ # non-redirect Page
+ #
+ def uniq
+ results = PageHash.new
+ each do |url, page|
+ #if none of the aliases of this page have been added, and this isn't a redirect page, add this page
+ page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}
+ if !page.redirect? and !page_added
+ results[url] = page.clone
+ results[url].aliases = []
+ end
+ end
+
+ results
+ end
+
+ #
+ # Return an Array of Page objects which link to the given url
+ #
+ def pages_linking_to url
+ begin
+ url = URI(url) if url.is_a?(String)
+ rescue
+ return []
+ end
+
+ values.delete_if { |p| !p.links.include?(url) }
+ end
+
+ #
+ # Return an Array of URI objects of Pages linking to the given url
+ def urls_linking_to url
+ pages_linking_to(url).map{|p| p.url}
+ end
+
+ end
+end
31 lib/anemone/tentacle.rb
@@ -0,0 +1,31 @@
+require 'anemone/page'
+
+module Anemone
+ class Tentacle
+
+ #
+ # Create a new Tentacle
+ #
+ def initialize(link_queue, page_queue)
+ @link_queue = link_queue
+ @page_queue = page_queue
+ end
+
+ #
+ # Gets links from @link_queue, and returns the fetched
+ # Page objects into @page_queue
+ #
+ def run
+ while true do
+ link = @link_queue.deq
+
+ break if link == :END
+
+ page = Page.fetch(link)
+
+ @page_queue.enq(page)
+ end
+ end
+
+ end
+end
