-
Notifications
You must be signed in to change notification settings - Fork 4
/
regexp_crawler_spec.rb
122 lines (111 loc) · 5.04 KB
/
regexp_crawler_spec.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
require File.expand_path(File.dirname(__FILE__) + "/spec_helper.rb")
# Specs for RegexpCrawler::Crawler: crawling a start page, capturing data
# with a regexp into named fields, following links matched by
# continue_regexp, and the save_method / need_parse hooks.
# All HTTP traffic is stubbed with mocha mocks on
# Net::HTTP.get_response_with_headers (see helpers at the bottom).
describe RegexpCrawler::Crawler do
  context '#simple html' do
    # Single page, no link following: the three regexp captures are mapped
    # onto the named_captures list and stored under the :model key (:post).
    it 'should parse data according to regexp' do
      success_page('/resources/simple.html', 'http://simple.com/')
      crawl = RegexpCrawler::Crawler.new(:start_page => 'http://simple.com/', :capture_regexp => %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m, :named_captures => ['title', 'date', 'body'], :model => 'post', :logger => true)
      results = crawl.start
      results.size.should == 1
      results.first[:post][:title].should == 'test'
    end
    # NOTE(review): this example only registers mocks -- no crawler is
    # created and nothing is asserted, so redirect handling is never
    # exercised. The mocha `expects(...).times(1)` expectations set up by
    # the two helpers are therefore never satisfied; confirm whether this
    # spec actually passes, and complete it with a crawl plus assertions.
    it 'should redirect' do
      redirect_page('http://redirect.com/', 'http://simple.com/')
      success_page('/resources/simple.html', 'http://simple.com/')
    end
  end
  context '#complex html' do
    # Index page linking to two nested pages; each nested page carries the
    # title/date/body divs matched by capture_regexp.
    before(:each) do
      success_page('/resources/complex.html', 'http://complex.com/')
      success_page('/resources/nested1.html', 'http://complex.com/nested1.html')
      success_page('/resources/nested2.html', 'http://complex.com/nested2.html')
    end
    # Follows links matching continue_regexp one level deep and captures
    # from each followed page (the index page itself yields no result).
    it 'should parse data according to regexp' do
      crawl = RegexpCrawler::Crawler.new
      crawl.start_page = 'http://complex.com/'
      crawl.continue_regexp = %r{(?:http://complex.com)?/nested\d.html}
      crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
      crawl.named_captures = ['title', 'date', 'body']
      crawl.model = 'post'
      results = crawl.start
      results.size.should == 2
      results.first[:post][:title].should == 'nested1'
      results.last[:post][:title].should == 'nested2'
    end
    # Broader continue_regexp (nested\d+) lets the crawl recurse one level
    # further, picking up nested21.html linked from a nested page.
    it 'should parse nested of nested data' do
      success_page('/resources/nested21.html', 'http://complex.com/nested21.html')
      crawl = RegexpCrawler::Crawler.new
      crawl.start_page = 'http://complex.com/'
      crawl.continue_regexp = %r{(?:http://complex.com)?/?nested\d+.html}
      crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
      crawl.named_captures = ['title', 'date', 'body']
      crawl.model = 'post'
      results = crawl.start
      results.size.should == 3
      results.first[:post][:title].should == 'nested1'
      results.last[:post][:title].should == 'nested21'
    end
    # When save_method is set, results are handed to the proc instead of
    # being accumulated, so crawl.start returns an empty collection.
    it "should save by myself" do
      crawl = RegexpCrawler::Crawler.new
      crawl.start_page = 'http://complex.com/'
      crawl.continue_regexp = %r{(?:http://complex.com)?/nested\d.html}
      crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
      crawl.named_captures = ['title', 'date', 'body']
      my_results = []
      crawl.save_method = Proc.new {|result, page| my_results << result}
      results = crawl.start
      results.size.should == 0
      my_results.size.should == 2
    end
    # A save_method returning false for a page stops the whole crawl;
    # here it fires on the first nested page, so nothing gets recorded.
    it "should stop parse" do
      crawl = RegexpCrawler::Crawler.new
      crawl.start_page = 'http://complex.com/'
      crawl.continue_regexp = %r{(?:http://complex.com)?/nested\d.html}
      crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
      crawl.named_captures = ['title', 'date', 'body']
      stop_page = "http://complex.com/nested1.html"
      parse_pages = []
      crawl.save_method = Proc.new do |result, page|
        if page == stop_page
          false
        else
          parse_pages << page
        end
      end
      results = crawl.start
      parse_pages.size.should == 0
    end
    # need_parse lets the caller skip a fetched page by inspecting its
    # body; nested2.html is filtered out, nested1 and nested21 remain.
    it 'should parse skip nested2.html' do
      success_page('/resources/nested21.html', 'http://complex.com/nested21.html')
      crawl = RegexpCrawler::Crawler.new
      crawl.start_page = 'http://complex.com/'
      crawl.continue_regexp = %r{(?:http://complex.com)?/?nested\d+.html}
      crawl.capture_regexp = %r{<div class="title">(.*?)</div>.*<div class="date">(.*?)</div>.*<div class="body">(.*?)</div>}m
      crawl.named_captures = ['title', 'date', 'body']
      crawl.model = 'post'
      crawl.need_parse = Proc.new do |page, response_body|
        if response_body.index('nested2 test html')
          false
        else
          true
        end
      end
      results = crawl.start
      results.size.should == 2
      results.first[:post][:title].should == 'nested1'
      results.last[:post][:title].should == 'nested21'
    end
  end
  # Stubs Net::HTTP so that fetching +remote_path+ yields an HTTPSuccess
  # whose body is the fixture file at +local_path+ (relative to this spec
  # file). The mocha expectation demands exactly one fetch of that URL.
  # NOTE(review): get_response_with_headers is presumably a method the
  # regexp_crawler library adds to Net::HTTP -- verify against the lib.
  def success_page(local_path, remote_path)
    path = File.expand_path(File.dirname(__FILE__) + local_path)
    content = File.read(path)
    http = mock(Net::HTTPSuccess)
    http.stubs(:is_a?).with(Net::HTTPSuccess).returns(true)
    http.stubs(:body).returns(content)
    Net::HTTP.expects(:get_response_with_headers).times(1).with(URI.parse(remote_path), nil).returns(http)
  end
  # Stubs Net::HTTP so that fetching +remote_path+ yields an
  # HTTPRedirection response.
  # NOTE(review): +redirect_path+ is never used and the mock stubs only
  # #is_a? -- it exposes no 'location' header, so any code that reads the
  # redirect target off this response would fail. Confirm how the crawler
  # is meant to discover the redirect destination.
  def redirect_page(remote_path, redirect_path)
    http = mock(Net::HTTPRedirection)
    http.stubs(:is_a?).with(Net::HTTPRedirection).returns(true)
    Net::HTTP.expects(:get_response_with_headers).times(1).with(URI.parse(remote_path), nil).returns(http)
  end
end