
initial import

Chris Kite committed Apr 14, 2009
0 parents, commit 08366d4e70d679a88e3022381f0433806af11aa6
@@ -0,0 +1,19 @@
+Copyright (c) 2009 Vertive, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
README.txt
@@ -0,0 +1,18 @@
+= Anemone
+
+== DESCRIPTION
+Anemone is a web spider framework that can crawl a domain and collect useful
+information about the pages it visits. It is versatile, allowing you to
+write your own specialized spider tasks quickly and easily.
+
+== FEATURES
+* Multi-threaded design for high performance
+* Tracks 301 HTTP redirects to understand a page's aliases
+* Built-in BFS algorithm for determining page depth
+* Allows exclusion of URLs based on regular expressions
+
+== REQUIREMENTS
+* hpricot
+
+== EXAMPLES
+See the +bin+ directory for several examples of useful Anemone tasks.
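
For orientation, a minimal crawl using only the calls that appear in the bundled scripts of this commit might look like the sketch below; the site URL and the exclusion pattern are illustrative placeholders, not part of the library:

  require 'anemone'

  # Crawl a site, skipping URLs that match a pattern, printing each page's URL
  # as it is visited, and reporting the number of unique pages at the end.
  Anemone.crawl("http://www.example.com") do |anemone|
    anemone.skip_links_like %r{/private/}              # hypothetical exclusion
    anemone.on_every_page { |page| puts page.url }
    anemone.after_crawl { |pages| puts pages.uniq.size }
  end
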
anemone.gemspec
@@ -0,0 +1,14 @@
+spec = Gem::Specification.new do |s|
+  s.name = "anemone"
+  s.version = "0.0.1"
+  s.author = "Chris Kite"
+  s.platform = Gem::Platform::RUBY
+  s.summary = "Anemone web-spider framework"
+  s.files = Dir["{bin,lib}/**/*"] + Dir["README.txt"]
+  s.executables = %w[anemone_count.rb anemone_cron.rb anemone_pagedepth.rb anemone_serialize.rb anemone_url_list.rb]
+  s.require_path = "lib"
+  s.has_rdoc = true
+  s.rdoc_options << '-m' << 'README.txt' << '-t' << 'Anemone'
+  s.extra_rdoc_files = ["README.txt"]
+  s.add_dependency("hpricot", ">= 0.7.0")
+end
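
As a quick sanity check that the spec file evaluates to a valid Gem::Specification, something like the following sketch works; it assumes the file is named anemone.gemspec, which is the usual convention but is not shown in this view:

  require 'rubygems'

  # Evaluate the gemspec; the file's last expression is the Specification object.
  spec = eval(File.read('anemone.gemspec'))
  puts spec.full_name    # => "anemone-0.0.1"
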
bin/anemone_count.rb
@@ -0,0 +1,31 @@
+#! /usr/bin/env ruby
+# == Synopsis
+# Crawls a site starting at the given URL, and outputs the total number
+# of unique pages on the site.
+#
+# == Usage
+# anemone_count.rb url
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+require 'rdoc/usage'
+
+# make sure that the first argument is a URL we can crawl
+begin
+  URI(ARGV[0])
+rescue
+  RDoc::usage()
+  Process.exit
+end
+
+Anemone.crawl(ARGV[0]) do |anemone|
+  anemone.after_crawl do |pages|
+    puts pages.uniq.size
+  end
+end
+
+
bin/anemone_cron.rb
@@ -0,0 +1,99 @@
+#! /usr/bin/env ruby
+# == Synopsis
+# Performs the pagedepth, URL list, and count functionality in a single crawl.
+# Meant to be run daily as a cron job.
+#
+# == Usage
+# anemone_cron.rb [options] url
+#
+# == Options
+# -r, --relative Output relative URLs (rather than absolute)
+# -o, --output filename Filename to save URL list to. Defaults to urls.txt.
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+require 'optparse'
+require 'rdoc/usage'
+require 'ostruct'
+
+options = OpenStruct.new
+options.relative = false
+options.output_file = 'urls.txt'
+
+# make sure that the last argument is a URL we can crawl
+begin
+  URI(ARGV.last)
+rescue
+  RDoc::usage()
+  Process.exit
+end
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-r', '--relative') { options.relative = true }
+opts.on('-o', '--output filename') {|o| options.output_file = o }
+opts.parse!(ARGV)
+
+root = ARGV.last
+
+Anemone.crawl(root) do |anemone|
+
+  anemone.after_crawl do |pages|
+    puts "Crawl results for #{root}\n"
+
+    # print a list of 404's
+    not_found = []
+    pages.each_value do |page|
+      url = page.url.to_s
+      not_found << url if page.not_found?
+    end
+    if !not_found.empty?
+      puts "\n404's:"
+      not_found.each do |url|
+        if options.relative
+          puts URI(url).path.to_s
+        else
+          puts url
+        end
+        num_linked_from = 0
+        pages.urls_linking_to(url).each do |u|
+          u = u.path if options.relative
+          num_linked_from += 1
+          puts "  linked from #{u}"
+          if num_linked_from > 10
+            puts "  ..."
+            break
+          end
+        end
+      end
+
+      print "\n"
+    end
+
+    # remove redirect aliases, and calculate pagedepths
+    pages = pages.shortest_paths!(root).uniq
+    depths = pages.values.inject({}) do |depths, page|
+      depths[page.depth] ||= 0
+      depths[page.depth] += 1
+      depths
+    end
+
+    # print the page count
+    puts "Total pages: #{pages.size}\n"
+
+    # print a list of depths
+    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+
+    # output a list of urls to file, closing it when done
+    File.open(options.output_file, 'w') do |file|
+      pages.each_key do |url|
+        url = options.relative ? url.path.to_s : url.to_s
+        file.puts url
+      end
+    end
+
+  end
+end
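
The depth report in the script above folds the page list into a hash of depth => count with inject; an equivalent, perhaps easier-to-read formulation (a sketch only, not part of the commit) is:

  # Count how many pages sit at each link depth from the root.
  depths = Hash.new(0)
  pages.values.each { |page| depths[page.depth] += 1 }
  depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
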
bin/anemone_pagedepth.rb
@@ -0,0 +1,39 @@
+#! /usr/bin/env ruby
+# == Synopsis
+# Crawls a site starting at the given URL, and outputs a count of
+# the number of Pages at each depth in the site.
+#
+# == Usage
+# anemone_pagedepth.rb url
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+require 'rdoc/usage'
+
+# make sure that the first argument is a URL we can crawl
+begin
+  URI(ARGV[0])
+rescue
+  RDoc::usage()
+  Process.exit
+end
+
+root = ARGV[0]
+Anemone.crawl(root) do |anemone|
+  anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+  anemone.after_crawl do |pages|
+    pages = pages.shortest_paths!(root).uniq
+    depths = pages.values.inject({}) do |depths, page|
+      depths[page.depth] ||= 0
+      depths[page.depth] += 1
+      depths
+    end
+
+    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+  end
+end
bin/anemone_serialize.rb
@@ -0,0 +1,43 @@
+#! /usr/bin/env ruby
+# == Synopsis
+# Crawls a site starting at the given URL, and saves the resulting
+# PageHash object to a file using Marshal serialization.
+#
+# == Usage
+# anemone_serialize.rb [options] url
+#
+# == Options
+# -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+require 'optparse'
+require 'rdoc/usage'
+require 'ostruct'
+
+# make sure that the first argument is a URL we can crawl
+begin
+  URI(ARGV[0])
+rescue
+  RDoc::usage()
+  Process.exit
+end
+
+options = OpenStruct.new
+options.output_file = "crawl.#{Time.now.to_i}"
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-o', '--output filename') {|o| options.output_file = o }
+opts.parse!(ARGV)
+
+root = ARGV[0]
+Anemone.crawl(root) do |anemone|
+  anemone.after_crawl do |pages|
+    open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
+  end
+end
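
A crawl saved this way can be restored later with Marshal; a minimal sketch follows, where the filename is whatever was passed to -o (the timestamped name below is only a placeholder):

  require 'rubygems'
  require 'anemone'   # load the library so the classes stored in the dump should be defined

  # Load the PageHash that anemone_serialize.rb wrote to disk.
  # 'crawl.1239700000' stands in for the actual output filename.
  pages = open('crawl.1239700000', 'rb') {|f| Marshal.load(f) }
  puts pages.size
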
bin/anemone_url_list.rb
@@ -0,0 +1,46 @@
+#! /usr/bin/env ruby
+# == Synopsis
+# Crawls a site starting at the given URL, and outputs the URL of each page
+# in the domain as it is encountered.
+#
+# == Usage
+# anemone_url_list.rb [options] url
+#
+# == Options
+# -r, --relative Output relative URLs (rather than absolute)
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+require 'optparse'
+require 'rdoc/usage'
+require 'ostruct'
+
+options = OpenStruct.new
+options.relative = false
+
+# make sure that the last argument is a URL we can crawl
+begin
+  URI(ARGV.last)
+rescue
+  RDoc::usage()
+  Process.exit
+end
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-r', '--relative') { options.relative = true }
+opts.parse!(ARGV)
+
+Anemone.crawl(ARGV.last) do |anemone|
+  anemone.on_every_page do |page|
+    if options.relative
+      puts page.url.path
+    else
+      puts page.url
+    end
+  end
+end
lib/anemone.rb
@@ -0,0 +1,2 @@
+require 'rubygems'
+require 'anemone/anemone'
lib/anemone/anemone.rb
@@ -0,0 +1,16 @@
+require 'anemone/core'
+
+module Anemone
+  # Version number
+  VERSION = '0.0.1'
+
+  # User-Agent string used for HTTP requests
+  USER_AGENT = "Anemone/#{self::VERSION}"
+
+  #
+  # Convenience method to start a crawl using Core
+  #
+  def Anemone.crawl(url, options = {}, &block)
+    Core.crawl(url, options, &block)
+  end
+end
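
Anemone.crawl is a thin wrapper: it forwards the URL, the options hash, and the block straight to Core.crawl (defined in anemone/core.rb, which is not shown in this view). As a sketch under that assumption, the two calls below behave the same; the URL is a placeholder:

  require 'rubygems'
  require 'anemone'

  # The convenience entry point...
  Anemone.crawl("http://www.example.com") { |a| a.after_crawl { |pages| puts pages.size } }

  # ...delegates directly to Core.
  Anemone::Core.crawl("http://www.example.com") { |a| a.after_crawl { |pages| puts pages.size } }
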