Skip to content

Commit

Permalink
Applied patch from Thijs van der Vossen to allow UTF-8 encoded
Browse files Browse the repository at this point in the history
output when the encoding is UTF-8 and $KCODE is UTF8.


git-svn-id: svn+ssh://rubyforge.org/var/svn/builder/trunk@109 b15df707-ad1a-0410-81b8-e991873a3486
  • Loading branch information
jimweirich committed Oct 26, 2007
1 parent 3893715 commit 4352967
Show file tree
Hide file tree
Showing 7 changed files with 96 additions and 21 deletions.
6 changes: 5 additions & 1 deletion CHANGES
@@ -1,12 +1,16 @@
= Change Log

== Version 2.2.0

* Applied patch from Thijs van der Vossen to allow UTF-8 encoded
output when the encoding is UTF-8 and $KCODE is UTF8.

== Version 2.1.2

* Fixed bug where private methods in kernel could leak through using
tag!(). Thanks to Hagen Overdick for finding and diagnosing this
bug.


== Version 2.1.1

* Fixed typo in XmlMarkup class docs (ident => indent). (from Martin
Expand Down
2 changes: 1 addition & 1 deletion Rakefile
Expand Up @@ -21,7 +21,7 @@ end

CLOBBER.include('pkg')

CURRENT_VERSION = '2.1.2'
CURRENT_VERSION = '2.2.0'
PKG_VERSION = ENV['REL'] ? ENV['REL'] : CURRENT_VERSION

SRC_RB = FileList['lib/**/*.rb']
Expand Down
16 changes: 10 additions & 6 deletions lib/builder/xchar.rb
Expand Up @@ -89,11 +89,13 @@ module XChar # :nodoc:
class Fixnum
XChar = Builder::XChar if ! defined?(XChar)

# XML escaped version of chr
def xchr
# XML escaped version of chr. When <tt>escape</tt> is set to false
# the CP1252 fix is still applied but utf-8 characters are not
# converted to character entities.
def xchr(escape=true)
n = XChar::CP1252[self] || self
case n when *XChar::VALID
XChar::PREDEFINED[n] or (n<128 ? n.chr : "&##{n};")
XChar::PREDEFINED[n] or (n<128 ? n.chr : (escape ? "&##{n};" : [n].pack('U*')))
else
'*'
end
Expand All @@ -106,9 +108,11 @@ def xchr
# to_s.
#
class String
# XML escaped version of to_s
def to_xs
unpack('U*').map {|n| n.xchr}.join # ASCII, UTF-8
# XML escaped version of to_s. When <tt>escape</tt> is set to false
# the CP1252 fix is still applied but utf-8 characters are not
# converted to character entities.
def to_xs(escape=true)
unpack('U*').map {|n| n.xchr(escape)}.join # ASCII, UTF-8
rescue
unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
end
Expand Down
19 changes: 11 additions & 8 deletions lib/builder/xmlbase.rb
Expand Up @@ -13,15 +13,18 @@ class XmlBase < BlankSlate

# Create an XML markup builder.
#
# out:: Object receiving the markup. +out+ must respond to
# <tt><<</tt>.
# indent:: Number of spaces used for indentation (0 implies no
# indentation and no line breaks).
# initial:: Level of initial indentation.
#
def initialize(indent=0, initial=0)
# out:: Object receiving the markup. +out+ must respond to
# <tt><<</tt>.
# indent:: Number of spaces used for indentation (0 implies no
# indentation and no line breaks).
# initial:: Level of initial indentation.
# encoding:: When <tt>encoding</tt> and $KCODE are set to 'utf-8'
# characters aren't converted to character entities in
# the output stream.
def initialize(indent=0, initial=0, encoding='utf-8')
@indent = indent
@level = initial
@encoding = encoding.downcase
end

# Create a tag named +sym+. Other than the first argument which
Expand Down Expand Up @@ -112,7 +115,7 @@ def nil?

require 'builder/xchar'
def _escape(text)
text.to_xs
text.to_xs((@encoding != 'utf-8' or $KCODE != 'UTF8'))
end

def _escape_quote(text)
Expand Down
11 changes: 6 additions & 5 deletions lib/builder/xmlmarkup.rb
Expand Up @@ -240,13 +240,14 @@ def instruct!(directive_tag=:xml, attrs={})
if directive_tag == :xml
a = { :version=>"1.0", :encoding=>"UTF-8" }
attrs = a.merge attrs
@encoding = attrs[:encoding].downcase
end
_special(
"<?#{directive_tag}",
"?>",
nil,
attrs,
[:version, :encoding, :standalone])
"<?#{directive_tag}",
"?>",
nil,
attrs,
[:version, :encoding, :standalone])
end

# Insert a CDATA section into the XML markup.
Expand Down
56 changes: 56 additions & 0 deletions test/test_markupbuilder.rb
Expand Up @@ -377,6 +377,62 @@ def test_initial_level
assert_equal " <name>\n <first>Jim</first>\n </name>\n", @xml.target!
end

# Verifies when XmlMarkup writes a non-ASCII character as raw UTF-8 bytes
# and when it falls back to a numeric character entity. The sample text is
# the three-byte UTF-8 sequence that the entity tests expect as &#8217;.
class TestUtfMarkup < Test::Unit::TestCase
  # $KCODE is process-wide, so remember it here and restore it in teardown.
  def setup
    @old_kcode = $KCODE
  end

  def teardown
    $KCODE = @old_kcode
  end

  def test_use_entities_if_no_encoding_is_given_and_kcode_is_none
    output = render_quote('NONE')
    assert_match(%r(<p>&#8217;</p>), output)
  end

  def test_use_entities_if_encoding_is_utf_but_kcode_is_not
    output = render_quote('NONE', 'UTF-8')
    assert_match(%r(<p>&#8217;</p>), output)
  end

  def test_use_entities_if_kcode_is_utf_but_encoding_is_something_else
    output = render_quote('UTF8', 'UTF-16')
    assert_match(%r(<p>&#8217;</p>), output)
  end

  def test_use_utf8_if_encoding_defaults_and_kcode_is_utf8
    output = render_quote('UTF8')
    assert_equal "<p>\xE2\x80\x99</p>", output
  end

  def test_use_utf8_if_both_encoding_and_kcode_are_utf8
    output = render_quote('UTF8', 'UTF-8')
    assert_match(%r(<p>\xE2\x80\x99</p>), output)
  end

  def test_use_utf8_if_both_encoding_and_kcode_are_utf8_with_lowercase
    output = render_quote('UTF8', 'utf-8')
    assert_match(%r(<p>\xE2\x80\x99</p>), output)
  end

  private

  # Sets $KCODE to +kcode+, builds a fresh XmlMarkup, optionally emits an
  # XML declaration naming +encoding+, writes the sample text inside a <p>
  # element, and returns everything the builder produced.
  def render_quote(kcode, encoding = nil)
    $KCODE = kcode
    builder = Builder::XmlMarkup.new
    # Only the tests that name an encoding emit an XML declaration.
    builder.instruct!(:xml, :encoding => encoding) if encoding
    builder.p("\xE2\x80\x99")
    builder.target!
  end
end

class TestXmlEvents < Test::Unit::TestCase
def setup
@handler = EventHandler.new
Expand Down
7 changes: 7 additions & 0 deletions test/test_xchar.rb
Expand Up @@ -34,4 +34,11 @@ def test_utf8
assert_equal '&#8217;', "\xE2\x80\x99".to_xs # right single quote
assert_equal '&#169;', "\xC2\xA9".to_xs # copy
end

# With escaping switched off (to_xs(false)), multi-byte UTF-8 text must come
# back unchanged, while an XML-special character such as '&' is still
# replaced by its predefined entity.
def test_utf8_verbatim
  right_quote = "\xE2\x80\x99" # right single quote
  copyright = "\xC2\xA9" # copy

  assert_equal right_quote, right_quote.to_xs(false)
  assert_equal copyright, copyright.to_xs(false)

  # copy with ampersand
  escaped = "\xC2\xA9&\xC2\xA9".to_xs(false)
  assert_equal "\xC2\xA9&amp;\xC2\xA9", escaped
end
end

0 comments on commit 4352967

Please sign in to comment.