Merge pull request #76 from pali/master

Fix handling of undef, COW and magic scalar argument in Unicode.xs
dankogai · Nov 22, 2016 · 3728813 · 3728813
2 parents d915450 + e26cee2
commit 3728813
Show file tree

Hide file tree

Showing 2 changed files with 66 additions and 19 deletions.
diff --git a/Unicode/Unicode.xs b/Unicode/Unicode.xs
@@ -143,13 +143,38 @@ CODE:
     STRLEN ulen;
     STRLEN resultbuflen;
     U8 *resultbuf;
-    U8 *s = (U8 *)SvPVbyte(str,ulen);
-    U8 *e = (U8 *)SvEND(str);
+    U8 *s;
+    U8 *e;
+    bool modify = (check && !(check & ENCODE_LEAVE_SRC));
+    bool temp_result;
+
+    SvGETMAGIC(str);
+    if (!SvOK(str))
+        XSRETURN_UNDEF;
+    s = modify ? (U8 *)SvPV_force_nomg(str, ulen) : (U8 *)SvPV_nomg(str, ulen);
+    if (SvUTF8(str)) {
+        if (!modify) {
+            SV *tmp = sv_2mortal(newSVpvn((char *)s, ulen));
+            SvUTF8_on(tmp);
+            if (SvTAINTED(str))
+                SvTAINTED_on(tmp);
+            str = tmp;
+            s = (U8 *)SvPVX(str);
+        }
+        if (ulen) {
+            if (!utf8_to_bytes(s, &ulen))
+                croak("Wide character");
+            SvCUR_set(str, ulen);
+        }
+        SvUTF8_off(str);
+    }
+    e = s+ulen;
+
     /* Optimise for the common case of being called from PerlIOEncode_fill()
        with a standard length buffer. In this case the result SV's buffer is
        only used temporarily, so we can afford to allocate the maximum needed
        and not care about unused space. */
-    const bool temp_result = (ulen == PERLIO_BUFSIZ);
+    temp_result = (ulen == PERLIO_BUFSIZ);
 
     ST(0) = sv_2mortal(result);
     SvUTF8_on(result);
@@ -308,6 +333,7 @@ CODE:
 	    SvCUR_set(str,0);
 	}
 	*SvEND(str) = '\0';
+	SvSETMAGIC(str);
     }
 
     if (!temp_result) shrink_buffer(result);
@@ -328,13 +354,32 @@ CODE:
     const STRLEN usize = (size > 0 ? size : 1);
     SV *result = newSVpvn("", 0);
     STRLEN ulen;
-    U8 *s = (U8 *) SvPVutf8(utf8, ulen);
-    const U8 *e = (U8 *) SvEND(utf8);
+    U8 *s;
+    U8 *e;
+    bool modify = (check && !(check & ENCODE_LEAVE_SRC));
+    bool temp_result;
+
+    SvGETMAGIC(utf8);
+    if (!SvOK(utf8))
+        XSRETURN_UNDEF;
+    s = modify ? (U8 *)SvPV_force_nomg(utf8, ulen) : (U8 *)SvPV_nomg(utf8, ulen);
+    if (!SvUTF8(utf8)) {
+        if (!modify) {
+            SV *tmp = sv_2mortal(newSVpvn((char *)s, ulen));
+            if (SvTAINTED(utf8))
+                SvTAINTED_on(tmp);
+            utf8 = tmp;
+        }
+        sv_utf8_upgrade_nomg(utf8);
+        s = (U8 *)SvPV_nomg(utf8, ulen);
+    }
+    e = s+ulen;
+
     /* Optimise for the common case of being called from PerlIOEncode_flush()
        with a standard length buffer. In this case the result SV's buffer is
        only used temporarily, so we can afford to allocate the maximum needed
        and not care about unused space. */
-    const bool temp_result = (ulen == PERLIO_BUFSIZ);
+    temp_result = (ulen == PERLIO_BUFSIZ);
 
     ST(0) = sv_2mortal(result);
 
@@ -408,12 +453,11 @@ CODE:
 	    SvCUR_set(utf8,0);
 	}
 	*SvEND(utf8) = '\0';
+	SvSETMAGIC(utf8);
     }
 
     if (!temp_result) shrink_buffer(result);
     if (SvTAINTED(utf8)) SvTAINTED_on(result); /* propagate taintedness */
 
-    SvSETMAGIC(utf8);
-
     XSRETURN(1);
 }
diff --git a/t/magic.t b/t/magic.t
@@ -20,11 +20,12 @@ use warnings;
 
 use Encode qw(find_encoding encode decode encode_utf8 decode_utf8 is_utf8 _utf8_on _utf8_off FB_CROAK);
 
-use Test::More tests => 3*(2*(3*(4*4)+4)+4+3*3);
+use Test::More tests => 3*(2*(4*(4*4)+4)+4+3*3);
 
 my $ascii = find_encoding('ASCII');
 my $latin1 = find_encoding('Latin1');
 my $utf8 = find_encoding('UTF-8');
+my $utf16 = find_encoding('UTF-16LE');
 
 my $undef = undef;
 my $ascii_str = 'ascii_str';
@@ -34,44 +35,46 @@ _utf8_on($utf8_str);
 {
     foreach my $str ($undef, $ascii_str, $utf8_str) {
         foreach my $croak (0, 1) {
-            foreach my $enc ('ASCII', 'Latin1', 'UTF-8') {
+            foreach my $enc ('ASCII', 'Latin1', 'UTF-8', 'UTF-16LE') {
                 my $mod = defined $str && $croak;
                 my $func = "encode('" . $enc . "', " . (!defined $str ? 'undef' : is_utf8($str) ? '$utf8_str' : '$ascii_str') . ($croak ? ', FB_CROAK' : '') . ')';
                 tie my $input, 'TieScalarCounter', $str;
                 my $output = encode($enc, $input, $croak ? FB_CROAK : 0);
                 is(tied($input)->{fetch}, 1, "$func processes get magic only once");
                 is(tied($input)->{store}, $mod ? 1 : 0, "$func " . ($mod ? 'processes set magic only once' : 'does not process set magic'));
                 is($input, $mod ? '' : $str, "$func " . ($mod ? 'modifies' : 'does not modify') . ' $input string');
-                is($output, $str, "$func returns correct \$output string");
+                is($output, ((defined $str and $enc eq 'UTF-16LE') ? encode("UTF-16LE", $str) : $str), "$func returns correct \$output string");
             }
-            foreach my $enc ('ASCII', 'Latin1', 'UTF-8') {
+            foreach my $enc ('ASCII', 'Latin1', 'UTF-8', 'UTF-16LE') {
                 my $mod = defined $str && $croak;
                 my $func = "decode('" . $enc . "', " . (!defined $str ? 'undef' : is_utf8($str) ? '$utf8_str' : '$ascii_str') . ($croak ? ', FB_CROAK' : '') . ')';
-                tie my $input, 'TieScalarCounter', $str;
+                my $input_str = ((defined $str and $enc eq 'UTF-16LE') ? encode("UTF-16LE", $str) : $str);
+                tie my $input, 'TieScalarCounter', $input_str;
                 my $output = decode($enc, $input, $croak ? FB_CROAK : 0);
                 is(tied($input)->{fetch}, 1, "$func processes get magic only once");
                 is(tied($input)->{store}, $mod ? 1 : 0, "$func " . ($mod ? 'processes set magic only once' : 'does not process set magic'));
-                is($input, $mod ? '' : $str, "$func " . ($mod ? 'modifies' : 'does not modify') . ' $input string');
+                is($input, $mod ? '' : $input_str, "$func " . ($mod ? 'modifies' : 'does not modify') . ' $input string');
                 is($output, $str, "$func returns correct \$output string");
             }
-            foreach my $obj ($ascii, $latin1, $utf8) {
+            foreach my $obj ($ascii, $latin1, $utf8, $utf16) {
                 my $mod = defined $str && $croak;
                 my $func = '$' . $obj->name() . '->encode(' . (!defined $str ? 'undef' : is_utf8($str) ? '$utf8_str' : '$ascii_str') . ($croak ? ', FB_CROAK' : '') . ')';
                 tie my $input, 'TieScalarCounter', $str;
                 my $output = $obj->encode($input, $croak ? FB_CROAK : 0);
                 is(tied($input)->{fetch}, 1, "$func processes get magic only once");
                 is(tied($input)->{store}, $mod ? 1 : 0, "$func " . ($mod ? 'processes set magic only once' : 'does not process set magic'));
                 is($input, $mod ? '' : $str, "$func " . ($mod ? 'modifies' : 'does not modify') . ' $input string');
-                is($output, $str, "$func returns correct \$output string");
+                is($output, ((defined $str and $obj == $utf16) ? encode("UTF-16LE", $str) : $str), "$func returns correct \$output string");
             }
-            foreach my $obj ($ascii, $latin1, $utf8) {
+            foreach my $obj ($ascii, $latin1, $utf8, $utf16) {
                 my $mod = defined $str && $croak;
                 my $func = '$' . $obj->name() . '->decode(' . (!defined $str ? 'undef' : is_utf8($str) ? '$utf8_str' : '$ascii_str') . ($croak ? ', FB_CROAK' : '') . ')';
-                tie my $input, 'TieScalarCounter', $str;
+                my $input_str = ((defined $str and $obj == $utf16) ? encode("UTF-16LE", $str) : $str);
+                tie my $input, 'TieScalarCounter', $input_str;
                 my $output = $obj->decode($input, $croak ? FB_CROAK : 0);
                 is(tied($input)->{fetch}, 1, "$func processes get magic only once");
                 is(tied($input)->{store}, $mod ? 1 : 0, "$func " . ($mod ? 'processes set magic only once' : 'does not process set magic'));
-                is($input, $mod ? '' : $str, "$func " . ($mod ? 'modifies' : 'does not modify') . ' $input string');
+                is($input, $mod ? '' : $input_str, "$func " . ($mod ? 'modifies' : 'does not modify') . ' $input string');
                 is($output, $str, "$func returns correct \$output string");
             }
             {