Applied patch from Thijs van der Vossen to allow UTF-8 encoded
output when the encoding is UTF-8 and $KCODE is UTF8.

git-svn-id: svn+ssh://rubyforge.org/var/svn/builder/trunk@109 b15df707-ad1a-0410-81b8-e991873a3486
drbrain · Oct 26, 2007 · 4352967 · 4352967
1 parent 3893715
commit 4352967
Show file tree

Hide file tree

Showing 7 changed files with 96 additions and 21 deletions.
diff --git a/CHANGES b/CHANGES
@@ -1,12 +1,16 @@
 = Change Log
 
+== Version 2.2.0
+
+* Applied patch from Thijs van der Vossen to allow UTF-8 encoded
+  output when the encoding is UTF-8 and $KCODE is UTF8.
+
 == Version 2.1.2
 
 * Fixed bug where private methods in kernel could leak through using
   tag!().  Thanks to Hagen Overdick for finding and diagnosing this
   bug.
 
-
 == Version 2.1.1
 
 * Fixed typo in XmlMarkup class docs (ident => indent). (from Martin

diff --git a/Rakefile b/Rakefile
@@ -21,7 +21,7 @@ end
 
 CLOBBER.include('pkg')
 
-CURRENT_VERSION = '2.1.2'
+CURRENT_VERSION = '2.2.0'
 PKG_VERSION = ENV['REL'] ? ENV['REL'] : CURRENT_VERSION
 
 SRC_RB = FileList['lib/**/*.rb']

diff --git a/lib/builder/xchar.rb b/lib/builder/xchar.rb
@@ -89,11 +89,13 @@ module XChar # :nodoc:
 class Fixnum
   XChar = Builder::XChar if ! defined?(XChar)
 
-  # XML escaped version of chr
-  def xchr
+  # XML escaped version of chr. When <tt>escape</tt> is set to false
+  # the CP1252 fix is still applied but utf-8 characters are not
+  # converted to character entities.
+  def xchr(escape=true)
     n = XChar::CP1252[self] || self
     case n when *XChar::VALID
-      XChar::PREDEFINED[n] or (n<128 ? n.chr : "&##{n};")
+      XChar::PREDEFINED[n] or (n<128 ? n.chr : (escape ? "&##{n};" : [n].pack('U*')))
     else
       '*'
     end
@@ -106,9 +108,11 @@ def xchr
 # to_s.
 #
 class String
-  # XML escaped version of to_s
-  def to_xs
-    unpack('U*').map {|n| n.xchr}.join # ASCII, UTF-8
+  # XML escaped version of to_s. When <tt>escape</tt> is set to false
+  # the CP1252 fix is still applied but utf-8 characters are not
+  # converted to character entities.
+  def to_xs(escape=true)
+    unpack('U*').map {|n| n.xchr(escape)}.join # ASCII, UTF-8
   rescue
     unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
   end

diff --git a/lib/builder/xmlbase.rb b/lib/builder/xmlbase.rb
@@ -13,15 +13,18 @@ class XmlBase < BlankSlate
 
     # Create an XML markup builder.
     #
-    # out::     Object receiving the markup.  +out+ must respond to
-    #           <tt><<</tt>.
-    # indent::  Number of spaces used for indentation (0 implies no
-    #           indentation and no line breaks).
-    # initial:: Level of initial indentation.
-    #
-    def initialize(indent=0, initial=0)
+    # out::      Object receiving the markup.  +out+ must respond to
+    #            <tt><<</tt>.
+    # indent::   Number of spaces used for indentation (0 implies no
+    #            indentation and no line breaks).
+    # initial::  Level of initial indentation.
+    # encoding:: When <tt>encoding</tt> and $KCODE are set to 'utf-8'
+    #            characters aren't converted to character entities in
+    #            the output stream.
+    def initialize(indent=0, initial=0, encoding='utf-8')
       @indent = indent
       @level  = initial
+      @encoding = encoding.downcase
     end
 
     # Create a tag named +sym+.  Other than the first argument which
@@ -112,7 +115,7 @@ def nil?
 
     require 'builder/xchar'
     def _escape(text)
-      text.to_xs
+      text.to_xs((@encoding != 'utf-8' or $KCODE != 'UTF8'))
     end
 
     def _escape_quote(text)

diff --git a/lib/builder/xmlmarkup.rb b/lib/builder/xmlmarkup.rb
@@ -240,13 +240,14 @@ def instruct!(directive_tag=:xml, attrs={})
       if directive_tag == :xml
         a = { :version=>"1.0", :encoding=>"UTF-8" }
         attrs = a.merge attrs
+	@encoding = attrs[:encoding].downcase
       end
       _special(
-      "<?#{directive_tag}",
-      "?>",
-      nil,
-      attrs,
-      [:version, :encoding, :standalone])
+        "<?#{directive_tag}",
+        "?>",
+        nil,
+        attrs,
+        [:version, :encoding, :standalone])
     end
 
     # Insert a CDATA section into the XML markup.

diff --git a/test/test_markupbuilder.rb b/test/test_markupbuilder.rb
@@ -377,6 +377,62 @@ def test_initial_level
     assert_equal "        <name>\n          <first>Jim</first>\n        </name>\n", @xml.target!
   end
 
+  class TestUtfMarkup < Test::Unit::TestCase
+    def setup
+      @old_kcode = $KCODE
+    end
+
+    def teardown
+      $KCODE = @old_kcode
+    end
+
+    def test_use_entities_if_no_encoding_is_given_and_kcode_is_none
+      $KCODE = 'NONE'
+      xml = Builder::XmlMarkup.new
+      xml.p("\xE2\x80\x99")
+      assert_match(%r(<p>&#8217;</p>), xml.target!) #
+    end
+
+    def test_use_entities_if_encoding_is_utf_but_kcode_is_not
+      $KCODE = 'NONE'
+      xml = Builder::XmlMarkup.new
+      xml.instruct!(:xml, :encoding => 'UTF-8')
+      xml.p("\xE2\x80\x99")
+      assert_match(%r(<p>&#8217;</p>), xml.target!) #
+    end
+
+    def test_use_entities_if_kcode_is_utf_but_encoding_is_something_else
+      $KCODE = 'UTF8'
+      xml = Builder::XmlMarkup.new
+      xml.instruct!(:xml, :encoding => 'UTF-16')
+      xml.p("\xE2\x80\x99")
+      assert_match(%r(<p>&#8217;</p>), xml.target!) #
+    end
+
+    def test_use_utf8_if_encoding_defaults_and_kcode_is_utf8
+      $KCODE = 'UTF8'
+      xml = Builder::XmlMarkup.new
+      xml.p("\xE2\x80\x99")
+      assert_equal "<p>\xE2\x80\x99</p>", xml.target!
+    end
+
+    def test_use_utf8_if_both_encoding_and_kcode_are_utf8
+      $KCODE = 'UTF8'
+      xml = Builder::XmlMarkup.new
+      xml.instruct!(:xml, :encoding => 'UTF-8')
+      xml.p("\xE2\x80\x99")
+      assert_match(%r(<p>\xE2\x80\x99</p>), xml.target!)
+    end
+
+    def test_use_utf8_if_both_encoding_and_kcode_are_utf8_with_lowercase
+      $KCODE = 'UTF8'
+      xml = Builder::XmlMarkup.new
+      xml.instruct!(:xml, :encoding => 'utf-8')
+      xml.p("\xE2\x80\x99")
+      assert_match(%r(<p>\xE2\x80\x99</p>), xml.target!)
+    end
+  end
+
   class TestXmlEvents < Test::Unit::TestCase
     def setup
       @handler = EventHandler.new

diff --git a/test/test_xchar.rb b/test/test_xchar.rb
@@ -34,4 +34,11 @@ def test_utf8
     assert_equal '&#8217;', "\xE2\x80\x99".to_xs # right single quote
     assert_equal '&#169;',  "\xC2\xA9".to_xs     # copy
   end
+
+  def test_utf8_verbatim
+    assert_equal "\xE2\x80\x99", "\xE2\x80\x99".to_xs(false)  # right single quote
+    assert_equal "\xC2\xA9",  "\xC2\xA9".to_xs(false)         # copy
+    assert_equal "\xC2\xA9&amp;\xC2\xA9",
+      "\xC2\xA9&\xC2\xA9".to_xs(false)                        # copy with ampersand
+  end
 end