Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

add start_page, continue_regexp and capture_regexp for crawler

  • Loading branch information...
commit 8bb3a8a150d435660351cb3274d2af167c8a7660 1 parent bc06391
@flyerhzm authored
View
1  .gitignore
@@ -0,0 +1 @@
+tmp/**
View
6 Rakefile
@@ -5,7 +5,7 @@ require 'spec/rake/spectask'
desc "Run all specs in spec directory"
Spec::Rake::SpecTask.new(:spec) do |t|
t.spec_files = FileList['spec/**/*_spec.rb']
- t.rcov = true
- t.rcov_opts = ['--exclude', 'spec,config,Library,usr/lib/ruby']
- t.rcov_dir = File.join(File.dirname(__FILE__), "tmp")
+ t.rcov = true
+ t.rcov_opts = ['--exclude', 'spec,config,Library,usr/lib/ruby']
+ t.rcov_dir = File.join(File.dirname(__FILE__), "tmp")
end
View
38 lib/regexp_crawler.rb
@@ -0,0 +1,38 @@
+require 'net/http'
+require 'uri'
+
+class RegexpCrawler
+ attr_accessor :start_page, :continue_regexp, :named_captures, :model
+
+ def capture_regexp=(regexp)
+ @capture_regexp = Regexp.new(regexp.source, regexp.options | Regexp::MULTILINE)
+ end
+
+ def start
+ pages = [start_page]
+ results = []
+ while !pages.empty?
+ uri = URI.parse(pages.shift)
+ res = Net::HTTP.get_response(uri)
+ if res.is_a? Net::HTTPSuccess
+ res.body.scan(continue_regexp).each do |page|
+ url = page.start_with?('http://') ? page : "http://#{uri.host}/#{page}"
+ pages << url
+ end if continue_regexp
+ md = @capture_regexp.match(res.body)
+ if md
+ result = model.new
+ captures = md.captures if md
+ captures.each_index do |i|
+ result.send("#{named_captures[i]}=", captures[i])
+ end
+ puts result.inspect
+ results << result
+ end
+ elsif res.is_a? Net::HTTPRedirection
+ else
+ end
+ end
+ results
+ end
+end
View
51 spec/regexp_crawler_spec.rb
@@ -0,0 +1,51 @@
+require File.expand_path(File.dirname(__FILE__) + "/spec_helper.rb")
+
+describe RegexpCrawler do
+ class Post
+ attr_accessor :title, :date, :body
+ end
+
+ describe '#simple html' do
+ before(:each) do
+ success_page('/resources/simple.html', 'http://simple.com/')
+ end
+
+ it 'should parse data according to regexp' do
+ crawl = RegexpCrawler.new
+ crawl.start_page = 'http://simple.com/'
+ crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
+ crawl.named_captures = ['title', 'date', 'body']
+ crawl.model = Post
+ results = crawl.start
+ results.size.should == 1
+ end
+ end
+
+ describe '#complex html' do
+ before(:each) do
+ success_page('/resources/complex.html', 'http://complex.com/')
+ success_page('/resources/nested1.html', 'http://complex.com/nested1.html')
+ success_page('/resources/nested2.html', 'http://complex.com/nested2.html')
+ end
+
+ it 'should parse data according to regexp' do
+ crawl = RegexpCrawler.new
+ crawl.start_page = 'http://complex.com/'
+ crawl.continue_regexp = %r{(?:http://complex.com/)?nested\d.html}
+ crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
+ crawl.named_captures = ['title', 'date', 'body']
+ crawl.model = Post
+ results = crawl.start
+ results.size.should == 2
+ end
+ end
+
+ def success_page(local_path, remote_path)
+ path = File.expand_path(File.dirname(__FILE__) + local_path)
+ content = File.read(path)
+ http = mock(Net::HTTPSuccess)
+ http.stubs(:is_a?).with(Net::HTTPSuccess).returns(true)
+ http.stubs(:body).returns(content)
+ Net::HTTP.expects(:get_response).times(1).with(URI.parse(remote_path)).returns(http)
+ end
+end
View
11 spec/resources/complex.html
@@ -0,0 +1,11 @@
+<html>
+ <head>
+ <title>complex test html</title>
+ </head>
+ <body>
+ <div>
+ <a link="nested1.html">nested1</a>
+ <a link="http://complex.com/nested2.html">nested2</a>
+ </div>
+ </body>
+</html>
View
12 spec/resources/nested1.html
@@ -0,0 +1,12 @@
+<html>
+ <head>
+ <title>nested1 test html</title>
+ </head>
+ <body>
+ <div>
+ <div class="title">nested1</div>
+ <div class="date">2008/10/10</div>
+ <div class="body"><p class="content">nested1</p></div>
+ </div>
+ </body>
+</html>
View
12 spec/resources/nested2.html
@@ -0,0 +1,12 @@
+<html>
+ <head>
+ <title>nested2 test html</title>
+ </head>
+ <body>
+ <div>
+ <div class="title">nested2</div>
+ <div class="date">2008/10/10</div>
+ <div class="body"><p class="content">nested2</p></div>
+ </div>
+ </body>
+</html>
View
12 spec/resources/simple.html
@@ -0,0 +1,12 @@
+<html>
+ <head>
+ <title>simple test html</title>
+ </head>
+ <body>
+ <div>
+ <div class="title">test</div>
+ <div class="date">2008/09/10</div>
+ <div class="body"><p class="content">test</p></div>
+ </div>
+ </body>
+</html>
View
1  spec/spec_helper.rb
@@ -1,5 +1,6 @@
require 'rubygems'
require 'spec/autorun'
require 'date'
+require 'mocha'
require File.join(File.dirname(__FILE__), '/../lib/regexp_crawler.rb')
Please sign in to comment.
Something went wrong with that request. Please try again.