Skip to content

Commit

Permalink
Item12444: HTML::Entities::_decode_entities assumes that the target s…
Browse files Browse the repository at this point in the history
…tring is unicode, and will expand all entities. However some entities are not representable by a single character in the default charset, so we have to reduce the set of converted entities to just those that can be converted. Also, the resulting string has the entities converted as unicode characters, which have to be converted to the site charset before further steps. Note that the translator is forced to work in the site charset because it has to use Foswiki::Func, which assumes it. If it could work in unicode, this would all be a lot simpler!

git-svn-id: http://svn.foswiki.org/branches/Release01x01@16736 0b4bb1d4-4e5a-0410-9cc4-b2b747904278
  • Loading branch information
CrawfordCurrie authored and CrawfordCurrie committed May 16, 2013
1 parent 7654513 commit 6cc5906
Show file tree
Hide file tree
Showing 6 changed files with 63 additions and 24 deletions.
3 changes: 2 additions & 1 deletion WysiwygPlugin/data/System/WysiwygPlugin.txt
Expand Up @@ -183,11 +183,12 @@ Many thanks to the following sponsors for supporting this work:

| Author: | [[http://c-dot.co.uk][Crawford Currie]], Foswiki Contributors |
| Copyright | © ILOG 2005 http://www.ilog.fr %BR% \
© 2008-2012 Foswiki Contributors |
© 2008-2013 Foswiki Contributors |
| License | [[http://www.gnu.org/licenses/gpl.html][GPL (Gnu General Public License)]] |
| Version: | %$VERSION% |
| Release: | %$RELEASE% |
| Change History: | |
| 1.1.16 (16 May 2013) | Foswikitask:Item12444: Fix problem with entities being expanded to unrepresentable characters |
| 1.1.15 (16 Dec 2012) | Foswikitask:Item12297: Minor perlcritic coding change |
| (21 Dec 2012) | Foswikitask:Item12278: Changing a wikiword should not require visiting the TinyMCE link dialog. |
| 1.1.14 (28 Nov 2012) | Foswikitask:Item11912: Clean up hex markers left behind by TinyMCEPlugin <br />\
Expand Down
4 changes: 2 additions & 2 deletions WysiwygPlugin/lib/Foswiki/Plugins/WysiwygPlugin.pm
Expand Up @@ -30,8 +30,8 @@ use Assert;
our $SHORTDESCRIPTION = 'Translator framework for WYSIWYG editors';
our $NO_PREFS_IN_TOPIC = 1;

use version; our $VERSION = version->declare("v1.1.15");
our $RELEASE = '1.1.15';
use version; our $VERSION = version->declare("v1.1.16");
our $RELEASE = '16 May 2013';

our %xmltag;

Expand Down
59 changes: 47 additions & 12 deletions WysiwygPlugin/lib/Foswiki/Plugins/WysiwygPlugin/Constants.pm
Expand Up @@ -5,6 +5,8 @@ use strict;
use warnings;

use Encode;
use HTML::Entities;
use Assert;

# HTML elements that are strictly block type, as defined by
# http://www.htmlhelp.com/reference/html40/block.html.
Expand Down Expand Up @@ -234,25 +236,23 @@ sub convertNotRepresentabletoEntity {
# characters representable in the site charset
$siteCharsetRepresentable = '';
for my $code ( 0 .. 255 ) {
my $unicodeChar =
Encode::decode( encoding(), chr($code), Encode::FB_PERLQQ );
if ( $unicodeChar =~ /^\\x/ ) {

# code is not valid, so skip it
}
else {
eval {
my $unicodeChar =
Encode::decode( encoding(), chr($code),
Encode::FB_CROAK );

# Escape codes in the standard ASCII range, as necessary,
# to avoid special interpretation by perl
$unicodeChar = quotemeta($unicodeChar)
if ord($unicodeChar) <= 127;

$siteCharsetRepresentable .= $unicodeChar;
}
};

# otherwise ignore
}
}

require HTML::Entities;
$_[0] =
HTML::Entities::encode_entities( $_[0],
"^$siteCharsetRepresentable" );
Expand Down Expand Up @@ -280,23 +280,58 @@ our @safeEntities = qw(
oslash ugrave uacute ucirc uuml yacute thorn yuml
);

# Mapping from entity names to characters
# Get a hash that maps the safe entities values to unicode characters
our $safe_entities;

# Get a hash that maps the safe entities values to unicode characters
# Build (on first use) and return a hash reference that maps each entity
# name listed in @safeEntities to the string that
# HTML::Entities::decode_entities() produces for "&name;".
#
# The result is cached in $safe_entities, so the table is only computed
# once per process. Callers get the shared reference and should treat it
# as read-only.
sub safeEntities {
    unless ($safe_entities) {

        # Start from an empty hash so that a hash reference is always
        # returned, even if the entity list were empty.
        $safe_entities = {};

        foreach my $entity (@safeEntities) {

            # Decode the entity name to unicode. Each name is stored
            # exactly once.
            my $unicode = HTML::Entities::decode_entities("&$entity;");
            $safe_entities->{$entity} = $unicode;
        }
    }
    return $safe_entities;
}

# Given a string encoded using {Site}{CharSet}, decode all entities in
# it that can be mapped to the encoding, and return a string encoded
# using the {Site}{CharSet}.
#
# The string is passed as the first argument and is modified in place
# (through $_[0]); the same value is also returned.
#
# Named entities whose character cannot be encoded in the site charset
# are left untouched. HTML::Entities::_decode_entities() always expands
# numeric entities, whatever table it is given, so any character that
# turns out not to be encodable is written back as a numeric character
# reference (&#NNN;) rather than being replaced by a substitution
# character.
#
# $representable_entities caches the entity-name => character table for
# the site charset; it is computed on the first call only.
our $representable_entities;

sub decodeRepresentableEntities {
    my $charset = encoding();

    unless ($representable_entities) {
        if ( $charset =~ /^utf-?8/ ) {

            # UTF-8 can do all entities
            $representable_entities = \%HTML::Entities::entity2char;
        }
        else {

            # Filter the entity set to those that can be represented in
            # the site charset. Always end up with a hash reference, so
            # the filter is not recomputed on every call if nothing
            # qualifies.
            my %representable;

            # Iterate over keys() rather than each(): keys() resets the
            # iterator of this shared global hash, so no entity is
            # skipped if other code left the iterator part-way through.
            foreach my $entity ( keys %HTML::Entities::entity2char ) {
                my $unicode = $HTML::Entities::entity2char{$entity};

                # encode() may overwrite its source when a CHECK value
                # is given, so probe with a throw-away copy.
                my $probe = $unicode;
                my $encodable = eval {
                    Encode::encode( $charset, $probe, Encode::FB_CROAK );
                    1;
                };

                # $unicode can be encoded in the site charset
                $representable{$entity} = $unicode if $encodable;
            }
            $representable_entities = \%representable;
        }
    }

    HTML::Entities::_decode_entities( $_[0], $representable_entities );

    # FB_HTMLCREF turns any character the charset cannot hold back into
    # a numeric entity; LEAVE_SRC stops encode() from clobbering its
    # source while it works.
    $_[0] =
      Encode::encode( $charset, $_[0],
        Encode::FB_HTMLCREF | Encode::LEAVE_SRC );
    return $_[0];
}

# Debug
sub chCodes {
my $text = shift;
Expand Down
4 changes: 2 additions & 2 deletions WysiwygPlugin/lib/Foswiki/Plugins/WysiwygPlugin/HTML2TML.pm
Expand Up @@ -148,7 +148,6 @@ sub convert {
$text =~ s/\&\#x22;/\&quot;/goi;
$text =~ s/\&\#160;/\&nbsp;/goi;

require HTML::Entities;
HTML::Entities::_decode_entities( $text, WC::safeEntities() );

#print STDERR "decodedent[". debugEncode($text). "]\n\n";
Expand All @@ -163,7 +162,8 @@ sub convert {
#print STDERR "notrep2ent[". debugEncode($text). "]\n\n";

# $text is now Unicode characters that are representable
# in the site charset. Convert to the site charset:
# in the site charset.
# Convert to the site charset:
if ( WC::encoding() =~ /^utf-?8/ ) {

# nothing to do, already in unicode
Expand Down
13 changes: 8 additions & 5 deletions WysiwygPlugin/lib/Foswiki/Plugins/WysiwygPlugin/HTML2TML/Node.pm
Expand Up @@ -31,6 +31,7 @@ use vars qw( $reww );

use Foswiki::Plugins::WysiwygPlugin::Constants;
use Foswiki::Plugins::WysiwygPlugin::HTML2TML::WC;
use HTML::Entities ();

my %jqueryChiliClass = map { $_ => 1 }
qw( cplusplus csharp css bash delphi html java js
Expand Down Expand Up @@ -647,8 +648,10 @@ sub _flatten {
$text =~ s/[$WC::PON$WC::POFF]//g;

unless ( $options & $WC::KEEP_ENTITIES ) {
require HTML::Entities;
$text = HTML::Entities::decode_entities($text);

# This will decode only those entities that
# have a representation in the site charset.
WC::decodeRepresentableEntities($text);

# &nbsp; decodes to \240, which we want to make a space.
$text =~ s/\240/$WC::NBSP/g;
Expand Down Expand Up @@ -1325,9 +1328,9 @@ sub _verbatim {
$options |= $WC::PROTECTED | $WC::KEEP_ENTITIES | $WC::BR2NL | $WC::KEEP_WS;
my ( $flags, $text ) = $this->_flatten($options);

# decode once, and once only
require HTML::Entities;
$text = HTML::Entities::decode_entities($text);
# decode once, and once only. This will decode only those
# entities that have a representation in the site charset.
WC::decodeRepresentableEntities($text);

# &nbsp; decodes to \240, which we want to make a space.
$text =~ s/\240/$WC::NBSP/g;
Expand Down
4 changes: 2 additions & 2 deletions WysiwygPlugin/test/unit/WysiwygPlugin/TranslatorTests.pm
Expand Up @@ -187,10 +187,10 @@ BLAH
name => 'codeToFromHtml',
html => <<'BLAH',
<p>
<span class="WYSIWYG_TT">Code</span>
<span class="WYSIWYG_TT">&Alpha; Code</span>
</p>
BLAH
tml => '=Code='
tml => '=&Alpha; Code='
},
{
exec => $ROUNDTRIP,
Expand Down

0 comments on commit 6cc5906

Please sign in to comment.