/
crawler_spec.rb
145 lines (114 loc) · 3.81 KB
/
crawler_spec.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
require 'spec_helper'
# Specs for the class-level metadata DSL provided by including Wombat::Crawler.
#
# The common pattern in these examples: declare metadata on an anonymous class
# that includes the module, set a `should_receive(:parse)` expectation on an
# instance, then call #crawl and inspect the argument that reaches #parse.
describe Wombat::Crawler do
  # Build a new anonymous class including the module, plus one instance of it,
  # before every example.
  before(:each) do
    @crawler = Class.new
    @crawler.send(:include, Wombat::Crawler)
    @crawler_instance = @crawler.new
  end

  it 'should call the provided block' do
    event_called = false

    @crawler.event { event_called = true }

    event_called.should be_true
  end

  it 'should provide metadata to yielded block' do
    # Capture the yielded object and assert afterwards, so the example fails
    # (instead of passing vacuously) if the block is never invoked.
    yielded = nil

    @crawler.event { |e| yielded = e }

    yielded.should_not be_nil
  end

  it 'should store assigned metadata information' do
    # Take the timestamp once and reuse it in the declaration below; a second
    # Time.now could fall into the next second and break the to_s comparison.
    time = Time.now

    @crawler.event do |e|
      e.title 'Fulltronic Dezembro'
      e.time time
    end
    @crawler.venue { |v| v.name "Scooba" }
    # Parentheses avoid Ruby's "ambiguous first argument" warning for the
    # negative literal.
    @crawler.location { |v| v.latitude(-50.2323) }

    @crawler_instance.should_receive(:parse) do |arg|
      arg["event"]["title"].selector.should == "Fulltronic Dezembro"
      arg["event"]["time"].selector.to_s.should == time.to_s
      arg["venue"]["name"].selector.should == "Scooba"
      arg["location"]["latitude"].selector.should == -50.2323
    end

    @crawler_instance.crawl
  end

  it 'should isolate metadata between different instances' do
    # A second, unrelated class including the same module.
    another_crawler = Class.new
    another_crawler.send(:include, Wombat::Crawler)
    another_crawler_instance = another_crawler.new

    another_crawler.event { |e| e.title 'Ibiza' }
    another_crawler_instance.should_receive(:parse) do |arg|
      arg["event"]["title"].selector.should == "Ibiza"
    end
    another_crawler_instance.crawl

    # Declaring the same property on the first class must yield its own value.
    @crawler.event { |e| e.title 'Fulltronic Dezembro' }
    @crawler_instance.should_receive(:parse) do |arg|
      arg["event"]["title"].selector.should == "Fulltronic Dezembro"
    end
    @crawler_instance.crawl
  end

  it 'should be able to assign arbitrary plain text metadata' do
    # Positional arguments are expected back as selector, format and
    # namespaces; the block is expected back as the callback.
    @crawler.some_data("/event/list", :html, "geo") { |_prop| true }

    @crawler_instance.should_receive(:parse) do |arg|
      prop = arg['some_data']
      prop.name.should == "some_data"
      prop.selector.should == "/event/list"
      prop.format.should == :html
      prop.namespaces.should == "geo"
      prop.callback.should_not be_nil
    end

    @crawler_instance.crawl
  end

  it 'should be able to specify arbitrary block structure more than once' do
    # Two separate declarations under the same name; both properties must be
    # present afterwards.
    @crawler.structure do |s|
      s.data "xpath=/xyz"
    end
    @crawler.structure do |s|
      s.another "css=.information"
    end

    @crawler_instance.should_receive(:parse) do |arg|
      arg["structure"]["data"].selector.should == "xpath=/xyz"
      arg["structure"]["another"].selector.should == "css=.information"
    end

    @crawler_instance.crawl
  end

  it 'should not explode if no block given' do
    # Passes as long as calling the DSL method without a block does not raise.
    @crawler.event
  end

  it 'should iterate on elements inside for_each block' do
    # Declarations inside the for_each block use bare method calls (no block
    # parameter) and may themselves contain nested blocks.
    @crawler.for_each "css=.element" do
      title "css=.title"
      body "css=.body"
      event do |e|
        e.all "yeah"
      end
    end

    @crawler_instance.should_receive(:parse) do |arg|
      # Named `iterator` rather than `it` to avoid shadowing RSpec's `it`.
      iterator = arg.iterators.first
      iterator.selector.should == "css=.element"
      iterator["title"].selector.should == "css=.title"
      iterator["body"].selector.should == "css=.body"
      iterator["event"]["all"].selector.should == "yeah"
    end

    @crawler_instance.crawl
  end

  it 'should assign metadata format' do
    @crawler_instance.should_receive(:parse) do |arg|
      arg[:format].should == :xml
    end

    @crawler.format :xml
    @crawler_instance.crawl
  end

  it 'should crawl with block' do
    @crawler.base_url "danielnc.com"
    @crawler.list_page "/itens"

    # A block given to #crawl overrides list_page for this call...
    @crawler_instance.should_receive(:parse) do |arg|
      arg[:base_url].should == "danielnc.com"
      arg[:list_page].should == "/itens/1"
    end
    @crawler_instance.crawl do
      list_page "/itens/1"
    end

    # ...while another instance of the same class still sees the value that
    # was declared on the class.
    another_instance = @crawler.new
    another_instance.should_receive(:parse) do |arg|
      arg[:base_url].should == "danielnc.com"
      arg[:list_page].should == "/itens"
    end
    another_instance.crawl
  end
end