
add logger to crawler

1 parent 4b98924 commit d3cbbfe0eb433270a9344e475dd64bcdab7c86f8 @flyerhzm committed Sep 13, 2009
Showing with 10 additions and 1 deletion.
  1. +1 −0 README.textile
  2. 0 TODO
  3. +1 −0 lib/regexp_crawler.rb
  4. +7 −0 lib/regexp_crawler/crawler.rb
  5. +1 −1 spec/regexp_crawler_spec.rb
README.textile
@@ -31,6 +31,7 @@ options is a hash
* <code>:headers</code>, optional, a hash to define http headers
* <code>:encoding</code>, optional, a string naming the encoding of the crawled page; results will be converted to UTF-8
* <code>:need_parse</code>, optional, a proc that decides whether the page should be parsed by the regexp; it accepts two parameters, the crawled page's uri and the response body of the crawled page
+* <code>:logger</code>, optional, <code>true</code> to log to STDOUT, or a Logger object to log to that logger
If the crawler defines :model but no :save_method, RegexpCrawler::Crawler#start will return an array of results, such as
<pre><code>
TODO
No changes.
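A quick usage sketch of the <code>:logger</code> option documented in the README hunk above (the start page, regexp, and log file are placeholders, not from this commit):
<pre><code>
require 'regexp_crawler'
require 'logger'

# :logger => true turns on a default STDOUT logger:
crawler = RegexpCrawler::Crawler.new(:start_page => 'http://example.com/', :capture_regexp => %r{<title>(.*?)</title>}m, :named_captures => ['title'], :model => 'page', :logger => true)

# or pass your own Logger to choose the level and destination:
crawler = RegexpCrawler::Crawler.new(:start_page => 'http://example.com/', :capture_regexp => %r{<title>(.*?)</title>}m, :named_captures => ['title'], :model => 'page', :logger => Logger.new('crawl.log'))
results = crawler.start
</code></pre>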
lib/regexp_crawler.rb
@@ -1,6 +1,7 @@
require 'net/http'
require 'uri'
require 'iconv'
+require 'logger'
require 'regexp_crawler/http'
require 'regexp_crawler/crawler'
lib/regexp_crawler/crawler.rb
@@ -12,6 +12,7 @@ def initialize(options = {})
@headers = options[:headers]
@encoding = options[:encoding]
@need_parse = options[:need_parse]
+ @logger = options[:logger] == true ? Logger.new(STDOUT) : options[:logger]
end
def capture_regexp=(regexp)
@@ -32,6 +33,7 @@ def start
private
def parse_page(uri)
+ @logger.debug "crawling page: #{uri.to_s}" if @logger
response = Net::HTTP.get_response_with_headers(uri, @headers)
parse_response(response, uri)
end
@@ -49,8 +51,10 @@ def continue_uri(uri, page)
def parse_response(response, uri)
response_body = encoding.nil? ? response.body : Iconv.iconv("UTF-8//IGNORE", "#{encoding}//IGNORE", response.body).first
if response.is_a? Net::HTTPSuccess
+ @logger.debug "crawling success: #{uri.to_s}" if @logger
if continue_regexp
response_body.scan(continue_regexp).each do |page|
+ @logger.debug "continue_page: #{page}" if @logger
page = page.compact.first if page.is_a? Array
continue_uri = continue_uri(uri, page)
@pages << continue_uri unless @captured_pages.include?(continue_uri) or @pages.include?(continue_uri)
@@ -59,6 +63,7 @@ def parse_response(response, uri)
if @need_parse.nil? or @need_parse.call(uri.to_s, response_body)
md = @capture_regexp.match(response_body)
if md
+ @logger.debug "response body captured" if @logger
captures = md.captures
result = {}
captures.each_index do |i|
@@ -73,8 +78,10 @@ def parse_response(response, uri)
end
end
elsif response.is_a? Net::HTTPRedirection
+ @logger.debug "crawling redirect: #{response['location']}" if @logger
parse_page(URI.parse(response['location']))
else
+ @logger.debug "crawling nothing: #{uri.to_s}" if @logger
# do nothing
end
end
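The crawler.rb changes follow a single idiom: normalize the option once in the constructor, then guard every debug call so logging stays optional. A minimal self-contained sketch of that idiom (the class below is illustrative, not the gem's code):
<pre><code>
require 'logger'

class MiniCrawler
  def initialize(options = {})
    # true selects a default STDOUT logger; a Logger instance is used
    # as-is; nil or false leaves @logger nil and logging disabled.
    @logger = options[:logger] == true ? Logger.new(STDOUT) : options[:logger]
  end

  def parse_page(uri)
    # Guarding each call makes a nil @logger a no-op.
    @logger.debug "crawling page: #{uri}" if @logger
  end
end

MiniCrawler.new(:logger => true).parse_page('http://example.com/')
</code></pre>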
spec/regexp_crawler_spec.rb
@@ -5,7 +5,7 @@
it 'should parse data according to regexp' do
success_page('/resources/simple.html', 'http://simple.com/')
- crawl = RegexpCrawler::Crawler.new(:start_page => 'http://simple.com/', :capture_regexp => %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m, :named_captures => ['title', 'date', 'body'], :model => 'post')
+ crawl = RegexpCrawler::Crawler.new(:start_page => 'http://simple.com/', :capture_regexp => %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m, :named_captures => ['title', 'date', 'body'], :model => 'post', :logger => true)
results = crawl.start
results.size.should == 1
results.first[:post][:title].should == 'test'
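The updated spec only switches logging on with <code>:logger => true</code>. To assert on what gets logged, one could instead pass a Logger backed by a StringIO (a hedged sketch reusing the spec's <code>success_page</code> helper; it is not part of this commit):
<pre><code>
require 'stringio'
require 'logger'

it 'should log crawled pages to the given logger' do
  success_page('/resources/simple.html', 'http://simple.com/')
  io = StringIO.new
  crawl = RegexpCrawler::Crawler.new(:start_page => 'http://simple.com/', :capture_regexp => %r{<div class="title">(.*?)</div>}m, :named_captures => ['title'], :model => 'post', :logger => Logger.new(io))
  crawl.start
  io.string.should include('crawling page: http://simple.com/')
end
</code></pre>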
