diff --git a/lib/anemone/core.rb b/lib/anemone/core.rb index d1629a49..1f760b91 100644 --- a/lib/anemone/core.rb +++ b/lib/anemone/core.rb @@ -79,6 +79,7 @@ def initialize(urls, opts = {}) @skip_link_patterns = [] @after_crawl_blocks = [] @opts = opts + @stop_crawl = false yield self if block_given? end @@ -142,6 +143,18 @@ def focus_crawl(&block) self end + # + # Signals the crawler that it should stop the crawl before visiting the + # next page. + # + # This method is expected to be called within a page block, and it signals + # the crawler that it must stop after the current page is completely + # processed. All pages and links currently in the queues are discarded. + # + def stop_crawl + @stop_crawl = true + end + # # Perform the crawl # @@ -175,12 +188,17 @@ def run @pages[page.url] = page + if @stop_crawl + page_queue.clear + link_queue.clear + end + # if we are done with the crawl, tell the threads to end if link_queue.empty? and page_queue.empty? until link_queue.num_waiting == @tentacles.size Thread.pass end - if page_queue.empty? + if page_queue.empty? || @stop_crawl @tentacles.size.times { link_queue << :END } break end