Permalink
Browse files

Merge branch 'next'

  • Loading branch information...
2 parents a7895d0 + 6dda8e8 commit 4b378d50e7997c7125ba44f931bc3859ba447ebd @chriskite committed Jan 20, 2012
View
@@ -0,0 +1,5 @@
+*.swp
+Gemfile.lock
+test.db
+test.tch
+test.kch
View
@@ -1,6 +1,23 @@
+== 0.7.0 / 2012-01-19
+
+* Major enhancements
+
+ * Added support for SQLite3 and Kyoto Cabinet storage
+
+* Minor enhancements
+
+ * Added Page#base to use base HTML element
+ * Use bundler for development dependencies
+
+* Bug fixes
+
+ * Encode characters in URLs
+ * Fix specs to run under rake
+ * Fix handling of redirect_to in storage adapters
+
== 0.6.1 / 2011-02-24
-*Bug fixes
+* Bug fixes
* Fix a bug preventing SSL connections from working
View
@@ -2,3 +2,10 @@ Many thanks to the following folks who have contributed code to Anemone. In no p
Marc Seeger
Joost Baaij
+Laurent Arnoud
+zzzhc
+Mauro Asprea
+Alex Pooley
+polysics
+Sergey Kojin
+Richard Paul
View
@@ -0,0 +1,3 @@
+source :rubygems
+
+gemspec
View
@@ -16,7 +16,7 @@ See http://anemone.rubyforge.org for more information.
* Records response time for each page
* CLI program can list all pages in a domain, calculate page depths, and more
* Obey robots.txt
-* In-memory or persistent storage of pages during crawl, using TokyoCabinet, MongoDB, or Redis
+* In-memory or persistent storage of pages during crawl, using TokyoCabinet, KyotoCabinet, SQLite3, MongoDB, or Redis
== Examples
See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
@@ -32,5 +32,6 @@ To test and develop this gem, additional requirements are:
* tokyocabinet
* mongo
* redis
+* sqlite3
You will need to have {Tokyo Cabinet}[http://fallabs.com/tokyocabinet/], {MongoDB}[http://www.mongodb.org/], and {Redis}[http://code.google.com/p/redis/] installed on your system and running.
View
@@ -1,26 +1,25 @@
require 'rubygems'
require 'rake'
+require 'rspec/core/rake_task'
-require 'spec/rake/spectask'
-Spec::Rake::SpecTask.new(:spec) do |spec|
- spec.libs << 'lib' << 'spec'
- spec.spec_files = FileList['spec/**/*_spec.rb']
+desc "Run all specs"
+RSpec::Core::RakeTask.new(:rspec) do |spec|
+ spec.pattern = 'spec/**/*_spec.rb'
end
-Spec::Rake::SpecTask.new(:rcov) do |spec|
- spec.libs << 'lib' << 'spec'
+RSpec::Core::RakeTask.new(:rcov) do |spec|
spec.pattern = 'spec/**/*_spec.rb'
spec.rcov = true
end
-task :default => :spec
+task :default => :rspec
-require 'rake/rdoctask'
-Rake::RDocTask.new do |rdoc|
+require 'rdoc/task'
+RDoc::Task.new do |rdoc|
version = File.exist?('VERSION') ? File.read('VERSION') : ""
rdoc.rdoc_dir = 'rdoc'
rdoc.title = "anemone #{version}"
rdoc.rdoc_files.include('README*')
rdoc.rdoc_files.include('lib/**/*.rb')
-end
+end
View
@@ -1 +1 @@
-0.6.1
+0.7.0
View
@@ -1,6 +1,6 @@
spec = Gem::Specification.new do |s|
s.name = "anemone"
- s.version = "0.6.1"
+ s.version = "0.7.0"
s.author = "Chris Kite"
s.homepage = "http://anemone.rubyforge.org"
s.rubyforge_project = "anemone"
@@ -14,6 +14,16 @@ spec = Gem::Specification.new do |s|
s.add_dependency("nokogiri", ">= 1.3.0")
s.add_dependency("robots", ">= 0.7.2")
+ s.add_development_dependency "rake", ">=0.8.7"
+ s.add_development_dependency "rspec", ">=2.6.0"
+ s.add_development_dependency "fakeweb", ">=1.3.0"
+ s.add_development_dependency "redis", ">=2.2.0"
+ s.add_development_dependency "mongo", ">=1.3.1"
+ s.add_development_dependency "bson_ext", ">=1.3.1"
+ s.add_development_dependency "tokyocabinet", ">=1.29"
+ s.add_development_dependency "kyotocabinet-ruby", ">=1.27.1"
+ s.add_development_dependency "sqlite3", ">=1.3.4"
+
s.files = %w[
VERSION
LICENSE.txt
View
@@ -9,7 +9,7 @@
module Anemone
- VERSION = '0.6.1';
+ VERSION = '0.7.0';
#
# Convenience method to start a crawl
View
@@ -112,7 +112,7 @@ def get(url, referer = nil)
response, response_time = get_response(loc, referer)
code = Integer(response.code)
- redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
+ redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
yield response, code, loc, redirect_to, response_time
limit -= 1
end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
View
@@ -62,7 +62,7 @@ def links
doc.search("//a[@href]").each do |a|
u = a['href']
next if u.nil? or u.empty?
- abs = to_absolute(URI(u)) rescue next
+ abs = to_absolute(URI(URI.escape(u))) rescue next
@links << abs if in_domain?(abs)
end
@links.uniq!
@@ -132,6 +132,21 @@ def not_found?
end
#
+ # Base URI from the HTML doc head element
+ # http://www.w3.org/TR/html4/struct/links.html#edef-BASE
+ #
+ def base
+ @base = if doc
+ href = doc.search('//head/base/@href')
+ URI(href.to_s) unless href.nil? rescue nil
+ end unless @base
+
+ return nil if @base && @base.to_s().empty?
+ @base
+ end
+
+
+ #
# Converts relative URL *link* into an absolute URL based on the
# location of the page
#
@@ -142,7 +157,7 @@ def to_absolute(link)
link = URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,'')))
relative = URI(link)
- absolute = @url.merge(relative)
+ absolute = base ? base.merge(relative) : @url.merge(relative)
absolute.path = '/' if absolute.path.empty?
@@ -190,7 +205,7 @@ def self.from_hash(hash)
'@visited' => hash['visited'],
'@depth' => hash['depth'].to_i,
'@referer' => hash['referer'],
- '@redirect_to' => URI(hash['redirect_to']),
+ '@redirect_to' => (!!hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
'@response_time' => hash['response_time'].to_i,
'@fetched' => hash['fetched']
}.each do |var, value|
View
@@ -18,6 +18,11 @@ def self.TokyoCabinet(file = 'anemone.tch')
self::TokyoCabinet.new(file)
end
+ def self.KyotoCabinet(file = 'anemone.tch')
+ require 'anemone/storage/kyoto_cabinet'
+ self::KyotoCabinet.new(file)
+ end
+
def self.MongoDB(mongo_db = nil, collection_name = 'pages')
require 'anemone/storage/mongodb'
mongo_db ||= Mongo::Connection.new.db('anemone')
@@ -29,6 +34,11 @@ def self.Redis(opts = {})
require 'anemone/storage/redis'
self::Redis.new(opts)
end
+
+ def self.SQLite3(file = 'anemone.db')
+ require 'anemone/storage/sqlite3'
+ self::SQLite3.new(file)
+ end
end
end
@@ -0,0 +1,72 @@
+begin
+ require 'kyotocabinet'
+rescue LoadError
+ puts $!
+ puts "You need the kyotocabinet-ruby gem to use Anemone::Storage::KyotoCabinet"
+ exit
+end
+
+require 'forwardable'
+
+module Anemone
+ module Storage
+ class KyotoCabinet
+ extend Forwardable
+
+ def_delegators :@db, :close, :size, :each
+
+ def initialize(file)
+ raise "KyotoCabinet filename must have .kch extension" if File.extname(file) != '.kch'
+ @db = ::KyotoCabinet::DB::new
+ @db.open(file, ::KyotoCabinet::DB::OWRITER | ::KyotoCabinet::DB::OCREATE)
+ @db.clear
+ end
+
+ def [](key)
+ if value = @db[key]
+ load_value(value)
+ end
+ end
+
+ def []=(key, value)
+ @db[key] = [Marshal.dump(value)].pack("m")
+ end
+
+ def each
+ @db.each do |k, v|
+ yield(k, load_value(v))
+ end
+ end
+
+ def has_key?(key)
+ # Kyoto Cabinet doesn't have a way to query whether a key exists, so hack it
+ keys = @db.match_prefix(key)
+ !!keys && keys.include?(key)
+ end
+
+ def keys
+ acc = []
+ @db.each_key { |key| acc << key.first }
+ acc
+ end
+
+ def delete(key)
+ value = self[key]
+ @db.delete(key)
+ value
+ end
+
+ def merge!(hash)
+ hash.each { |key, value| self[key] = value }
+ self
+ end
+
+ private
+
+ def load_value(value)
+ Marshal.load(value.unpack("m")[0])
+ end
+
+ end
+ end
+end
@@ -0,0 +1,90 @@
+begin
+ require 'sqlite3'
+rescue LoadError
+ puts "You need the sqlite3 gem to use Anemone::Storage::SQLite3"
+ exit
+end
+
+module Anemone
+ module Storage
+ class SQLite3
+
+ def initialize(file)
+ @db = ::SQLite3::Database.new(file)
+ create_schema
+ end
+
+ def [](url)
+ value = @db.get_first_value('SELECT data FROM anemone_storage WHERE key = ?', url.to_s)
+ if value
+ Marshal.load(value)
+ end
+ end
+
+ def []=(url, value)
+ data = Marshal.dump(value)
+ if has_key?(url)
+ @db.execute('UPDATE anemone_storage SET data = ? WHERE key = ?', data, url.to_s)
+ else
+ @db.execute('INSERT INTO anemone_storage (data, key) VALUES(?, ?)', data, url.to_s)
+ end
+ end
+
+ def delete(url)
+ page = self[url]
+ @db.execute('DELETE FROM anemone_storage WHERE key = ?', url.to_s)
+ page
+ end
+
+ def each
+ @db.execute("SELECT key, data FROM anemone_storage ORDER BY id") do |row|
+ value = Marshal.load(row[1])
+ yield row[0], value
+ end
+ end
+
+ def merge!(hash)
+ hash.each { |key, value| self[key] = value }
+ self
+ end
+
+ def size
+ @db.get_first_value('SELECT COUNT(id) FROM anemone_storage')
+ end
+
+ def keys
+ @db.execute("SELECT key FROM anemone_storage ORDER BY id").map{|t| t[0]}
+ end
+
+ def has_key?(url)
+ !!@db.get_first_value('SELECT id FROM anemone_storage WHERE key = ?', url.to_s)
+ end
+
+ def close
+ @db.close
+ end
+
+ private
+
+ def create_schema
+ @db.execute_batch <<SQL
+ create table if not exists anemone_storage (
+ id INTEGER PRIMARY KEY ASC,
+ key TEXT,
+ data BLOB
+ );
+ create index if not exists anemone_key_idx on anemone_storage (key);
+SQL
+ end
+
+ def load_page(hash)
+ BINARY_FIELDS.each do |field|
+ hash[field] = hash[field].to_s
+ end
+ Page.from_hash(hash)
+ end
+
+ end
+ end
+end
+
@@ -1,6 +1,7 @@
begin
require 'tokyocabinet'
rescue LoadError
+ puts $!
puts "You need the tokyocabinet gem to use Anemone::Storage::TokyoCabinet"
exit
end
Oops, something went wrong.

0 comments on commit 4b378d5

Please sign in to comment.