Skip to content

Commit

Permalink
Fix HTML/XML parser to correctly, case sensitively parse XML
Browse files Browse the repository at this point in the history
The previous version of the parser converted all tags and attribute names
to lowercase. However, this is only valid for HTML since HTML tags are
case insensitive, whereas XML tags and attribute names are case sensitive.

Fixes #310
  • Loading branch information
gettalong committed May 1, 2016
1 parent c972cc2 commit 692c049
Show file tree
Hide file tree
Showing 11 changed files with 55 additions and 23 deletions.
23 changes: 17 additions & 6 deletions lib/kramdown/parser/html.rb
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ module Constants
figcaption footer form h1 h2 h3 h4 h5 h6 header hgroup hr html head iframe legend menu
li map nav ol optgroup p pre section summary table tbody td th thead tfoot tr ul}
HTML_ELEMENTS_WITHOUT_BODY = %w{area base br col command embed hr img input keygen link meta param source track wbr}

HTML_ELEMENT = Hash.new(false)
(HTML_SPAN_ELEMENTS + HTML_BLOCK_ELEMENTS + HTML_ELEMENTS_WITHOUT_BODY +
HTML_CONTENT_MODEL.keys).each do |a|
HTML_ELEMENT[a] = true
end
end


Expand All @@ -77,9 +83,10 @@ module Parser
# element is already closed, ie. contains no body; the third parameter specifies whether the
# body - and the end tag - need to be handled in case closed=false).
def handle_html_start_tag(line = nil) # :yields: el, closed, handle_body
name = @src[1].downcase
name = @src[1]
name.downcase! if HTML_ELEMENT[name.downcase]
closed = !@src[4].nil?
attrs = parse_html_attributes(@src[2], line)
attrs = parse_html_attributes(@src[2], line, HTML_ELEMENT[name])

el = Element.new(:html_element, name, attrs, :category => :block)
el.options[:location] = line if line
Expand All @@ -99,10 +106,13 @@ def handle_html_start_tag(line = nil) # :yields: el, closed, handle_body
# Parses the given string for HTML attributes and returns the resulting hash.
#
# If the optional +line+ parameter is supplied, it is used in warning messages.
def parse_html_attributes(str, line = nil)
#
# If the optional +in_html_tag+ parameter is set to +false+, attributes are not modified to
# contain only lowercase letters.
def parse_html_attributes(str, line = nil, in_html_tag = true)
attrs = Utils::OrderedHash.new
str.scan(HTML_ATTRIBUTE_RE).each do |attr, sep, val|
attr.downcase!
attr.downcase! if in_html_tag
if attrs.has_key?(attr)
warning("Duplicate HTML attribute '#{attr}' on line #{line || '?'} - overwriting previous one")
end
Expand Down Expand Up @@ -155,10 +165,11 @@ def parse_raw_html(el, &block)
handle_html_start_tag(&block) # DEPRECATED: method needs to accept line number in 2.0
end
elsif @src.scan(HTML_TAG_CLOSE_RE)
if @tree.value == @src[1].downcase
if @tree.value == (HTML_ELEMENT[@tree.value] ? @src[1].downcase : @src[1])
done = true
else
warning("Found invalidly used HTML closing tag for '#{@src[1].downcase}' on line #{line} - ignoring it")
add_text(@src.matched, @tree, :text)
warning("Found invalidly used HTML closing tag for '#{@src[1]}' on line #{line} - ignoring it")
end
else
add_text(@src.getch, @tree, :text)
Expand Down
8 changes: 5 additions & 3 deletions lib/kramdown/parser/kramdown/html.rb
Original file line number Diff line number Diff line change
Expand Up @@ -113,14 +113,15 @@ def parse_span_html
warning("Found invalidly used HTML closing tag for '#{@src[1]}' on line #{line}")
add_text(result)
elsif result = @src.scan(HTML_TAG_RE)
tag_name = @src[1].downcase
tag_name = @src[1]
tag_name.downcase! if HTML_ELEMENT[tag_name.downcase]
if HTML_BLOCK_ELEMENTS.include?(tag_name)
warning("Found block HTML tag '#{tag_name}' in span-level text on line #{line}")
add_text(result)
return
end

attrs = parse_html_attributes(@src[2], line)
attrs = parse_html_attributes(@src[2], line, HTML_ELEMENT[tag_name])
attrs.each {|name, value| value.gsub!(/\n+/, ' ')}

do_parsing = (HTML_CONTENT_MODEL[tag_name] == :raw || @tree.options[:content_model] == :raw ? false : @options[:parse_span_html])
Expand All @@ -139,7 +140,8 @@ def parse_span_html
el = Element.new(:html_element, tag_name, attrs, :category => :span, :location => line,
:content_model => (do_parsing ? :span : :raw), :is_closed => !!@src[4])
@tree.children << el
stop_re = /<\/#{Regexp.escape(tag_name)}\s*>/i
stop_re = /<\/#{Regexp.escape(tag_name)}\s*>/
stop_re = Regexp.new(stop_re.source, Regexp::IGNORECASE) if HTML_ELEMENT[tag_name]
if !@src[4] && !HTML_ELEMENTS_WITHOUT_BODY.include?(el.value)
if parse_spans(el, stop_re, (do_parsing ? nil : [:span_html]))
@src.scan(stop_re)
Expand Down
6 changes: 6 additions & 0 deletions test/test_files.rb
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ class TestFiles < Minitest::Test
'test/testcases/block/15_math/mathjax_preview.html', # bc of mathjax preview
'test/testcases/block/15_math/mathjax_preview_simple.html', # bc of mathjax preview
'test/testcases/span/05_html/mark_element.html', # bc of tidy
'test/testcases/block/09_html/xml.html', # bc of tidy
'test/testcases/span/05_html/xml.html', # bc of tidy
].compact
EXCLUDE_HTML_TEXT_FILES = ['test/testcases/block/09_html/parse_as_span.htmlinput',
'test/testcases/block/09_html/parse_as_raw.htmlinput',
Expand Down Expand Up @@ -169,6 +171,8 @@ def tidy_output(out)
'test/testcases/span/math/mathjaxnode.text', # bc of tidy
'test/testcases/span/01_link/link_defs_with_ial.text', # bc of attribute ordering
'test/testcases/span/05_html/mark_element.text', # bc of tidy
'test/testcases/block/09_html/xml.text', # bc of tidy
'test/testcases/span/05_html/xml.text', # bc of tidy
].compact
Dir[File.dirname(__FILE__) + '/testcases/**/*.text'].each do |text_file|
next if EXCLUDE_TEXT_FILES.any? {|f| text_file =~ /#{f}$/}
Expand Down Expand Up @@ -219,6 +223,8 @@ def tidy_output(out)
'test/testcases/block/15_math/mathjax_preview_simple.html', # bc of mathjax preview
'test/testcases/span/01_link/link_defs_with_ial.html', # bc of attribute ordering
'test/testcases/span/05_html/mark_element.html', # bc of tidy
'test/testcases/block/09_html/xml.html', # bc of tidy
'test/testcases/span/05_html/xml.html', # bc of tidy
].compact
Dir[File.dirname(__FILE__) + '/testcases/**/*.{html,html.19}'].each do |html_file|
next if EXCLUDE_HTML_KD_FILES.any? {|f| html_file =~ /#{f}(\.19)?$/}
Expand Down
2 changes: 1 addition & 1 deletion test/testcases/block/09_html/not_parsed.html
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
</div></div>

<div>

&lt;/p&gt;
</div>

<div>
Expand Down
4 changes: 0 additions & 4 deletions test/testcases/block/09_html/simple.html
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,6 @@
<p>Another para.</p>
</div>

<webgen:block name="test" />

<some:url name:spac="hallo">doit</some:url>

<p><em>Test</em></p>

<p><em>Test</em></p>
Expand Down
4 changes: 0 additions & 4 deletions test/testcases/block/09_html/simple.html.19
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,6 @@ weiter
<p>Another para.</p>
</div>

<webgen:block name="test" />

<some:url name:spac="hallo">doit</some:url>

<p><em>Test</em></p>

<p><em>Test</em></p>
Expand Down
6 changes: 1 addition & 5 deletions test/testcases/block/09_html/simple.text
Original file line number Diff line number Diff line change
Expand Up @@ -40,14 +40,10 @@ hallo

para6

<div><div cLASs="clear"></div>
<div><DiV cLASs="clear"></dIv>
Another para.
</div>

<webgen:block name="test" />

<some:url name:spac='hallo'>doit</some:url>

<em>Test</em>

<p><em>Test</em></p>
Expand Down
8 changes: 8 additions & 0 deletions test/testcases/block/09_html/xml.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<webgen:block name="test" />

<some:url name:spac="hallo">doit</some:url>

<SoMe:UrL NamE:SpAC="test">doit</SoMe:UrL>

<SoMe>doit&lt;/some&gt;
</SoMe>
7 changes: 7 additions & 0 deletions test/testcases/block/09_html/xml.text
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<webgen:block name="test" />

<some:url name:spac='hallo'>doit</some:url>

<SoMe:UrL NamE:SpAC='test'>doit</SoMe:UrL>

<SoMe>doit</some>
5 changes: 5 additions & 0 deletions test/testcases/span/05_html/xml.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<p>This <some:url name:spac="hallo">doit</some:url> test</p>

<p>This <SoMe:UrL NamE:SpAC="test">doit</SoMe:UrL> test</p>

<p>This <SoMe>doit&lt;/some&gt; test</SoMe></p>
5 changes: 5 additions & 0 deletions test/testcases/span/05_html/xml.text
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
This <some:url name:spac='hallo'>doit</some:url> test

This <SoMe:UrL NamE:SpAC='test'>doit</SoMe:UrL> test

This <SoMe>doit</some> test

0 comments on commit 692c049

Please sign in to comment.