#
# these tests taken from the HTML5 sanitization project and modified for use with Loofah
# see the original here: http://code.google.com/p/html5lib/source/browse/ruby/test/test_sanitizer.rb
#
# license text at the bottom of this file
#
require "helper"
# Test suite adapted from the html5lib sanitizer tests: exercises Loofah's
# :escape/:strip/:prune scrubbers against the HTML5 safe-list (allowed
# elements, attributes, protocols, data-URI media types, SVG local hrefs,
# and CSS property sanitization).
class Html5TestSanitizer < Loofah::TestCase
  include Loofah

  # Scrub +stream+ with the :escape scrubber and serialize as XHTML.
  def sanitize_xhtml(stream)
    Loofah.fragment(stream).scrub!(:escape).to_xhtml
  end

  # Scrub +stream+ with the :escape scrubber and serialize as HTML.
  def sanitize_html(stream)
    Loofah.fragment(stream).scrub!(:escape).to_html
  end

  # Assert that sanitizing +input+ matches at least one of the three
  # serializations the upstream html5lib suite describes (html / xhtml /
  # rexml variants).
  def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
    ## libxml uses double-quotes, so let's swappo-boppo our quotes before comparing.
    sane = sanitize_html(input).gsub('"', "'")
    htmloutput = htmloutput.gsub('"', "'")
    xhtmloutput = xhtmloutput.gsub('"', "'")
    rexmloutput = rexmloutput.gsub('"', "'")

    ## HTML5's parsers are shit. there's so much inconsistency with what has closing tags, etc, that
    ## it would require a lot of manual hacking to make the tests match libxml's output.
    ## instead, I'm taking the shotgun approach, and trying to match any of the described outputs.
    assert((htmloutput == sane) || (rexmloutput == sane) || (xhtmloutput == sane),
      %Q{given: "#{input}"\nexpected: "#{htmloutput}"\ngot: "#{sane}"})
  end

  # Fail if the block takes noticeably longer than the (arbitrary) budget.
  # Used to catch pathological-regex slowdowns. Measures with the monotonic
  # clock so wall-clock adjustments (NTP, DST) cannot skew the timing.
  def assert_completes_in_reasonable_time(&block)
    t0 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    block.call
    t1 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    assert_in_delta t0, t1, 0.1 # arbitrary seconds
  end

  # One generated test per safe-listed element: the element survives, the
  # nested <bad> element is escaped. Several element families serialize
  # differently under libxml2, so their expectations are adjusted below.
  (HTML5::SafeList::ALLOWED_ELEMENTS).each do |tag_name|
    define_method "test_should_allow_#{tag_name}_tag" do
      input = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
      htmloutput = "<#{tag_name.downcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.downcase}>"
      xhtmloutput = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
      rexmloutput = xhtmloutput

      if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
        # table-section elements get hoisted/dropped outside a <table>
        htmloutput = "foo <bad>bar</bad> baz"
        xhtmloutput = htmloutput
      elsif tag_name == 'col'
        htmloutput = "<col title='1'>foo <bad>bar</bad> baz"
        xhtmloutput = htmloutput
        rexmloutput = "<col title='1' />"
      elsif tag_name == 'table'
        htmloutput = "foo <bad>bar</bad>baz<table title='1'> </table>"
        xhtmloutput = htmloutput
      elsif tag_name == 'image'
        htmloutput = "<img title='1'/>foo <bad>bar</bad> baz"
        xhtmloutput = htmloutput
        rexmloutput = "<image title='1'>foo <bad>bar</bad> baz</image>"
      elsif HTML5::SafeList::VOID_ELEMENTS.include?(tag_name)
        htmloutput = "<#{tag_name} title='1'>foo <bad>bar</bad> baz"
        xhtmloutput = htmloutput
        htmloutput += '<br/>' if tag_name == 'br'
        rexmloutput = "<#{tag_name} title='1' />"
      end
      check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
    end
  end

  ##
  ##  libxml2 downcases elements, so this is moot.
  ##
  # HTML5::SafeList::ALLOWED_ELEMENTS.each do |tag_name|
  #   define_method "test_should_forbid_#{tag_name.upcase}_tag" do
  #     input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
  #     output = "&lt;#{tag_name.upcase} title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/#{tag_name.upcase}&gt;"
  #     check_sanitization(input, output, output, output)
  #   end
  # end

  # One generated test per safe-listed attribute ('style' is covered by the
  # dedicated CSS tests below). Boolean attributes serialize without a value.
  HTML5::SafeList::ALLOWED_ATTRIBUTES.each do |attribute_name|
    next if attribute_name == 'style'
    define_method "test_should_allow_#{attribute_name}_attribute" do
      input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
      if %w[checked compact disabled ismap multiple nohref noshade nowrap readonly selected].include?(attribute_name)
        output = "<p #{attribute_name}>foo <bad>bar</bad> baz</p>"
        htmloutput = "<p #{attribute_name.downcase}>foo <bad>bar</bad> baz</p>"
      else
        output = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
        htmloutput = "<p #{attribute_name.downcase}='foo'>foo <bad>bar</bad> baz</p>"
      end
      check_sanitization(input, htmloutput, output, output)
    end
  end

  def test_should_allow_data_attributes
    input = "<p data-foo='foo'>foo <bad>bar</bad> baz</p>"

    output = "<p data-foo='foo'>foo <bad>bar</bad> baz</p>"
    htmloutput = "<p data-foo='foo'>foo <bad>bar</bad> baz</p>"

    check_sanitization(input, htmloutput, output, output)
  end

  def test_should_allow_multi_word_data_attributes
    input = "<p data-foo-bar-id='11'>foo <bad>bar</bad> baz</p>"
    output = htmloutput = "<p data-foo-bar-id='11'>foo <bad>bar</bad> baz</p>"

    check_sanitization(input, htmloutput, output, output)
  end

  def test_should_allow_contenteditable
    input = '<p contenteditable="false">Hi!</p>'
    output = '<p contenteditable="false">Hi!</p>'

    check_sanitization(input, output, output, output)
  end

  ##
  ##  libxml2 downcases attributes, so this is moot.
  ##
  # HTML5::SafeList::ALLOWED_ATTRIBUTES.each do |attribute_name|
  #   define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
  #     input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
  #     output = "<p>foo <bad>bar</bad> baz</p>"
  #     check_sanitization(input, output, output, output)
  #   end
  # end

  # Safe-listed URI protocols are preserved in href, both as written...
  HTML5::SafeList::ALLOWED_PROTOCOLS.each do |protocol|
    define_method "test_should_allow_#{protocol}_uris" do
      input = %(<a href="#{protocol}">foo</a>)
      output = "<a href='#{protocol}'>foo</a>"
      check_sanitization(input, output, output, output)
    end
  end

  # ...and uppercased (protocol matching must be case-insensitive).
  HTML5::SafeList::ALLOWED_PROTOCOLS.each do |protocol|
    define_method "test_should_allow_uppercase_#{protocol}_uris" do
      input = %(<a href="#{protocol.upcase}">foo</a>)
      output = "<a href='#{protocol.upcase}'>foo</a>"
      check_sanitization(input, output, output, output)
    end
  end

  # data: URIs with safe-listed media types survive, bare and base64-encoded.
  HTML5::SafeList::ALLOWED_URI_DATA_MEDIATYPES.each do |data_uri_type|
    define_method "test_should_allow_data_#{data_uri_type}_uris" do
      input = %(<a href="data:#{data_uri_type}">foo</a>)
      output = "<a href='data:#{data_uri_type}'>foo</a>"
      check_sanitization(input, output, output, output)

      input = %(<a href="data:#{data_uri_type};base64,R0lGODlhAQABA">foo</a>)
      output = "<a href='data:#{data_uri_type};base64,R0lGODlhAQABA'>foo</a>"
      check_sanitization(input, output, output, output)
    end
  end

  # Media-type matching must also be case-insensitive.
  HTML5::SafeList::ALLOWED_URI_DATA_MEDIATYPES.each do |data_uri_type|
    define_method "test_should_allow_uppercase_data_#{data_uri_type}_uris" do
      input = %(<a href="DATA:#{data_uri_type.upcase}">foo</a>)
      output = "<a href='DATA:#{data_uri_type.upcase}'>foo</a>"
      check_sanitization(input, output, output, output)
    end
  end

  # data: URIs with non-safe-listed media types lose the href entirely.
  def test_should_disallow_other_uri_mediatypes
    input = %(<a href="data:foo">foo</a>)
    output = "<a>foo</a>"
    check_sanitization(input, output, output, output)

    input = %(<a href="data:image/xxx">foo</a>)
    output = "<a>foo</a>"
    check_sanitization(input, output, output, output)

    input = %(<a href="data:image/xxx;base64,R0lGODlhAQABA">foo</a>)
    output = "<a>foo</a>"
    check_sanitization(input, output, output, output)
  end

  # SVG elements that may carry xlink:href keep fragment-local references
  # (with or without a leading newline) but lose remote ones.
  HTML5::SafeList::SVG_ALLOW_LOCAL_HREF.each do |tag_name|
    next unless HTML5::SafeList::ALLOWED_ELEMENTS.include?(tag_name)
    define_method "test_#{tag_name}_should_allow_local_href" do
      input = %(<#{tag_name} xlink:href="#foo"/>)
      output = "<#{tag_name.downcase} xlink:href='#foo'></#{tag_name.downcase}>"
      xhtmloutput = "<#{tag_name} xlink:href='#foo'></#{tag_name}>"
      check_sanitization(input, output, xhtmloutput, xhtmloutput)
    end

    define_method "test_#{tag_name}_should_allow_local_href_with_newline" do
      input = %(<#{tag_name} xlink:href="\n#foo"/>)
      output = "<#{tag_name.downcase} xlink:href='\n#foo'></#{tag_name.downcase}>"
      xhtmloutput = "<#{tag_name} xlink:href='\n#foo'></#{tag_name}>"
      check_sanitization(input, output, xhtmloutput, xhtmloutput)
    end

    define_method "test_#{tag_name}_should_forbid_nonlocal_href" do
      input = %(<#{tag_name} xlink:href="http://bad.com/foo"/>)
      output = "<#{tag_name.downcase}></#{tag_name.downcase}>"
      xhtmloutput = "<#{tag_name}></#{tag_name}>"
      check_sanitization(input, output, xhtmloutput, xhtmloutput)
    end

    define_method "test_#{tag_name}_should_forbid_nonlocal_href_with_newline" do
      input = %(<#{tag_name} xlink:href="\nhttp://bad.com/foo"/>)
      output = "<#{tag_name.downcase}></#{tag_name.downcase}>"
      xhtmloutput = "<#{tag_name}></#{tag_name}>"
      check_sanitization(input, output, xhtmloutput, xhtmloutput)
    end
  end

  def test_figure_element_is_valid
    fragment = Loofah.scrub_fragment("<span>hello</span> <figure>asd</figure>", :prune)
    assert fragment.at_css("figure"), "<figure> tag was scrubbed"
  end

  ##
  ##  as tenderlove says, "care < 0"
  ##
  # def test_should_handle_astral_plane_characters
  #   input = "<p>&#x1d4b5; &#x1d538;</p>"
  #   output = "<p>\360\235\222\265 \360\235\224\270</p>"
  #   check_sanitization(input, output, output, output)

  #   input = "<p><tspan>\360\235\224\270</tspan> a</p>"
  #   output = "<p><tspan>\360\235\224\270</tspan> a</p>"
  #   check_sanitization(input, output, output, output)
  # end

  # This affects only NS4. Is it worth fixing?
  # def test_javascript_includes
  #   input = %(<div size="&{alert('XSS')}">foo</div>)
  #   output = "<div>foo</div>"
  #   check_sanitization(input, output, output, output)
  # end

  ##
  ##  these tests primarily test the parser logic, not the sanitizer
  ##  logic. i call bullshit. we're not writing a test suite for
  ##  libxml2 here, so let's rely on the unit tests above to take care
  ##  of our valid elements and attributes.
  ##
  require 'json'
  Dir[File.join(File.dirname(__FILE__), '..', 'assets', 'testdata_sanitizer_tests1.dat')].each do |filename|
    # File.read instead of Kernel#open(...).read: avoids leaking a file
    # handle and the Kernel#open pipe-execution footgun.
    JSON.parse(File.read(filename)).each do |test|
      it "testdata sanitizer #{test['name']}" do
        check_sanitization(
          test['input'],
          test['output'],
          test['xhtml'] || test['output'],
          test['rexml'] || test['output']
        )
      end
    end
  end

  ## added because we don't have any coverage above on SVG_ATTR_VAL_ALLOWS_REF
  HTML5::SafeList::SVG_ATTR_VAL_ALLOWS_REF.each do |attr_name|
    define_method "test_should_allow_uri_refs_in_svg_attribute_#{attr_name}" do
      input = "<rect fill='url(#foo)' />"
      output = "<rect fill='url(#foo)'></rect>"
      check_sanitization(input, output, output, output)
    end

    define_method "test_absolute_uri_refs_in_svg_attribute_#{attr_name}" do
      input = "<rect fill='url(http://bad.com/) #fff' />"
      output = "<rect fill=' #fff'></rect>"
      check_sanitization(input, output, output, output)
    end
  end

  def test_css_list_style
    html = '<ul style="list-style: none"></ul>'
    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
    assert_match %r/list-style/, sane.inner_html
  end

  def test_css_negative_value_sanitization
    html = "<span style=\"letter-spacing:-0.03em;\">"
    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
    assert_match %r/-0.03em/, sane.inner_html
  end

  def test_css_negative_value_sanitization_shorthand_css_properties
    html = "<span style=\"margin-left:-0.05em;\">"
    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
    assert_match %r/-0.05em/, sane.inner_html
  end

  def test_css_high_precision_value_shorthand_css_properties
    html = "<span style=\"margin-left:0.3333333334em;\">"
    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
    assert_match %r/0.3333333334em/, sane.inner_html
  end

  def test_css_function_sanitization_leaves_safelisted_functions_calc
    html = "<span style=\"width:calc(5%)\">"
    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
    assert_match %r/calc\(5%\)/, sane.inner_html

    html = "<span style=\"width: calc(5%)\">"
    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
    assert_match %r/calc\(5%\)/, sane.inner_html
  end

  def test_css_function_sanitization_leaves_safelisted_functions_rgb
    html = '<span style="color: rgb(255, 0, 0)">'
    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
    assert_match %r/rgb\(255, 0, 0\)/, sane.inner_html
  end

  def test_css_function_sanitization_leaves_safelisted_list_style_type
    html = "<ol style='list-style-type:lower-greek;'></ol>"
    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
    assert_match %r/list-style-type:lower-greek/, sane.inner_html
  end

  def test_css_function_sanitization_strips_style_attributes_with_unsafe_functions
    html = "<span style=\"width:url(data-evil-url)\">"
    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
    assert_match %r/<span><\/span>/, sane.inner_html

    html = "<span style=\"width: url(data-evil-url)\">"
    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
    assert_match %r/<span><\/span>/, sane.inner_html
  end

  def test_css_max_width
    html = '<div style="max-width: 100%;"></div>'
    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
    assert_match %r/max-width/, sane.inner_html
  end

  def test_issue_90_slow_regex
    skip("timing tests are hard to make pass and have little regression-testing value")

    html = %q{<span style="background: url('data:image/svg+xml;charset=utf-8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2232%22%20height%3D%2232%22%20viewBox%3D%220%200%2032%2032%22%3E%3Cpath%20fill%3D%22%23D4C8AE%22%20d%3D%22M0%200h32v32h-32z%22%2F%3E%3Cpath%20fill%3D%22%2383604B%22%20d%3D%22M0%200h31.99v11.75h-31.99z%22%2F%3E%3Cpath%20fill%3D%22%233D2319%22%20d%3D%22M0%2011.5h32v.5h-32z%22%2F%3E%3Cpath%20fill%3D%22%23F83651%22%20d%3D%22M5%200h1v10.5h-1z%22%2F%3E%3Cpath%20fill%3D%22%23FCD050%22%20d%3D%22M6%200h1v10.5h-1z%22%2F%3E%3Cpath%20fill%3D%22%2371C797%22%20d%3D%22M7%200h1v10.5h-1z%22%2F%3E%3Cpath%20fill%3D%22%23509CF9%22%20d%3D%22M8%200h1v10.5h-1z%22%2F%3E%3ClinearGradient%20id%3D%22a%22%20gradientUnits%3D%22userSpaceOnUse%22%20x1%3D%2224.996%22%20y1%3D%2210.5%22%20x2%3D%2224.996%22%20y2%3D%224.5%22%3E%3Cstop%20offset%3D%220%22%20stop-color%3D%22%23796055%22%2F%3E%3Cstop%20offset%3D%22.434%22%20stop-color%3D%22%23614C43%22%2F%3E%3Cstop%20offset%3D%221%22%20stop-color%3D%22%233D2D28%22%2F%3E%3C%2FlinearGradient%3E%3Cpath%20fill%3D%22url(%23a)%22%20d%3D%22M28%208.5c0%201.1-.9%202-2%202h-2c-1.1%200-2-.9-2-2v-2c0-1.1.9-2%202-2h2c1.1%200%202%20.9%202%202v2z%22%2F%3E%3Cpath%20fill%3D%22%235F402E%22%20d%3D%22M28%208c0%201.1-.9%202-2%202h-2c-1.1%200-2-.9-2-2v-2c0-1.1.9-2%202-2h2c1.1%200%202%20.9%202%202v2z%22%2F%3E%3C');"></span>}
    assert_completes_in_reasonable_time {
      Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
    }
  end

  def test_upper_case_css_property
    html = "<div style=\"COLOR: BLUE; NOTAPROPERTY: RED;\">asdf</div>"
    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_xml)
    assert_match(/COLOR:\s*BLUE/i, sane.at_css("div")["style"])
    refute_match(/NOTAPROPERTY/i, sane.at_css("div")["style"])
  end

  def test_many_properties_some_allowed
    html = "<div style=\"background: bold notaproperty center alsonotaproperty 10px;\">asdf</div>"
    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_xml)
    assert_match(/bold\s+center\s+10px/, sane.at_css("div")["style"])
  end

  def test_many_properties_non_allowed
    html = "<div style=\"background: notaproperty alsonotaproperty;\">asdf</div>"
    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_xml)
    assert_nil sane.at_css("div")["style"]
  end

  def test_svg_properties
    html = "<line style='stroke-width: 10px;'></line>"
    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_xml)
    assert_match(/stroke-width:\s*10px/, sane.at_css("line")["style"])
  end
end
# <html5_license>
#
# Copyright (c) 2006-2008 The Authors
#
# Contributors:
# James Graham - jg307@cam.ac.uk
# Anne van Kesteren - annevankesteren@gmail.com
# Lachlan Hunt - lachlan.hunt@lachy.id.au
# Matt McDonald - kanashii@kanashii.ca
# Sam Ruby - rubys@intertwingly.net
# Ian Hickson (Google) - ian@hixie.ch
# Thomas Broyer - t.broyer@ltgt.net
# Jacques Distler - distler@golem.ph.utexas.edu
# Henri Sivonen - hsivonen@iki.fi
# The Mozilla Foundation (contributions from Henri Sivonen since 2008)
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# </html5_license>