Permalink
Browse files

Add support for UTS #46, along the lines of Zend, to idn_to_ascii.

Removes the output errorCode parameter (which is not currently used anywhere, except in testing code that gets but ignores it). Adds the options, variant and
idna_info parameters that are present in the Zend version of idn_to_ascii. Implements the new functionality required by these parameters by using the UTS #46 API, if
present.
  • Loading branch information...
1 parent e663cfb commit b74a0da0623d72ac0d5dfc097ae307653b0e7f35 @hermanventer hermanventer committed with sgolemon Feb 12, 2013
@@ -90,6 +90,71 @@ be located by the compiled program at runtime, perhaps with ldconfig or any
other ways.
+<h2>How to modify existing non-separable extension functions/classes</h2>
+
+All commands and directories are from src/ in this guide.
+
+1. Update IDL
+
+Change the function/class prototypes in the corresponding IDL file.
+Follow existing ones for formats. Most of them are straightforward.
+Possible types are listed in idl/base.php.
+
+NOTE regarding the "NeedsActRec" flag:
+
+A function that examines the state of the current or previous
+frames in its call stack must have the "NeedsActRec" flag set
+in its "flags" field. Examples of use of the frame state include
+examining the arguments of the function, looking up the invocation
+context for the function (e.g., for permissions check), etc.
+
+2. Update headers and implementation.
+
+Edit runtime/ext/ext_[name].h and runtime/ext/ext_[name].cpp to
+reflect the desired modifications in signature and behavior.
+
+3. Auto-generate files from IDL
+
+Run
+
+ EXT=[name] make -C idl update
+
+For example,
+
+ EXT=string make -C idl update
+
+This will modify several files with the new signatures.
+
+4. Unit tests
+
+The "EXT=[name] make -C idl update" command does the following for test files:
+ creates test/test_ext_[name].h -- entry points for this unit test, don't modify
+ updates test/test_ext.inc -- to invoke the test, don't modify
+ creates idl/test_ext_[name].cpp -- A template to be modified.
+
+You can probably delete these and modify the existing unit tests.
+
+5. Compile the php to c compiler (so that we can use it to bootstrap hhvm)
+
+ make -j (if you compile using make) (unset USE_HHVM)
+ or
+ fbconfig -r hphp (if you haven't done so already)
+ fbmake --fast dbg -j (if you compile using fbmake)
+
+6. Use the new php compiler to generate the marshalling code
+
+ make -C system (if you compile using make)
+
+ or
+
+ ./tools/remake_system.sh (if you use fbmake)
+
+7. Compile hhvm
+
+ make -j (if you compile using make) (export USE_HHVM=1)
+ or
+ fbmake --fast dbg -j (if you compile using fbmake)
+
<h2>How to add new non-separable extension functions/classes</h2>
All commands and directories are from src/ in this guide.
@@ -162,14 +227,13 @@ unit test function like this,
void TestExt[Name]::test_ext_[function] () { ... }
-6. How to modify existing extension functions/classes
-Modify the IDL file. Then run
+6. After compiling once, do this to update the compiler with the latest functions:
+
+ make -C system (if you compile using make)
- EXT=existing make -C idl update
+ or
-Follow all steps after step 3 at above.
+ ./tools/remake_system.sh (if you use fbmake)
-7. After compiling once, do this to update compiler with the latest functions:
- make -C system
View
@@ -390,10 +390,22 @@
'desc' => "Domain to convert. In PHP 5 must be UTF-8 encoded.",
),
array(
- 'name' => "errorcode",
+ 'name' => "options",
+ 'type' => Int64,
+ 'value' => "0",
+ 'desc' => "Conversion options - combination of IDNA_* constants (except IDNA_ERROR_* constants). ",
+ ),
+ array(
+ 'name' => "variant",
+ 'type' => Int64,
+ 'value' => "0",
+ 'desc' => "Either INTL_IDNA_VARIANT_2003 for IDNA 2003 or INTL_IDNA_VARIANT_UTS46 for UTS #46.",
+ ),
+ array(
+ 'name' => "idna_info",
'type' => Variant | Reference,
'value' => "null",
- 'desc' => "Conversion options - combination of IDNA_* constants.",
+ 'desc' => "This parameter can be used only if INTL_IDNA_VARIANT_UTS46 was used for variant. In that case, it will be filled with an array with the keys 'result', the possibly illegal result of the transformation, 'isTransitionalDifferent', a boolean indicating whether the usage of the transitional mechanisms of UTS #46 either has or would have changed the result and 'errors', which is an int representing a bitset of the error constants IDNA_ERROR_*. ",
),
),
'taint_observer' => array(
@@ -415,9 +427,22 @@
'type' => String,
),
array(
- 'name' => "errorcode",
+ 'name' => "options",
+ 'type' => Int64,
+ 'value' => "0",
+ 'desc' => "Conversion options - combination of IDNA_* constants (except IDNA_ERROR_* constants). ",
+ ),
+ array(
+ 'name' => "variant",
+ 'type' => Int64,
+ 'value' => "0",
+ 'desc' => "Either INTL_IDNA_VARIANT_2003 for IDNA 2003 or INTL_IDNA_VARIANT_UTS46 for UTS #46.",
+ ),
+ array(
+ 'name' => "idna_info",
'type' => Variant | Reference,
'value' => "null",
+ 'desc' => "This parameter can be used only if INTL_IDNA_VARIANT_UTS46 was used for variant. In that case, it will be filled with an array with the keys 'result', the possibly illegal result of the transformation, 'isTransitionalDifferent', a boolean indicating whether the usage of the transitional mechanisms of UTS #46 either has or would have changed the result and 'errors', which is an int representing a bitset of the error constants IDNA_ERROR_*. ",
),
),
'taint_observer' => array(
@@ -442,10 +467,22 @@
'desc' => "Domain to convert in IDNA ASCII-compatible format.",
),
array(
- 'name' => "errorcode",
+ 'name' => "options",
+ 'type' => Int64,
+ 'value' => "0",
+ 'desc' => "Conversion options - combination of IDNA_* constants (except IDNA_ERROR_* constants). ",
+ ),
+ array(
+ 'name' => "variant",
+ 'type' => Int64,
+ 'value' => "0",
+ 'desc' => "Either INTL_IDNA_VARIANT_2003 for IDNA 2003 or INTL_IDNA_VARIANT_UTS46 for UTS #46.",
+ ),
+ array(
+ 'name' => "idna_info",
'type' => Variant | Reference,
'value' => "null",
- 'desc' => "Conversion options - combination of IDNA_* constants.",
+ 'desc' => "This parameter can be used only if INTL_IDNA_VARIANT_UTS46 was used for variant. In that case, it will be filled with an array with the keys 'result', the possibly illegal result of the transformation, 'isTransitionalDifferent', a boolean indicating whether the usage of the transitional mechanisms of UTS #46 either has or would have changed the result and 'errors', which is an int representing a bitset of the error constants IDNA_ERROR_*. ",
),
),
'taint_observer' => array(
@@ -825,7 +862,7 @@
DefineFunction(
array(
'name' => "setstrength",
- 'desc' => "Procedural style bool collator_set_strength ( Collator \$coll , int \$strength ) The » ICU Collation Service supports many levels of comparison (named \"Levels\", but also known as \"Strengths\"). Having these categories enables ICU to sort strings precisely according to local conventions. However, by allowing the levels to be selectively employed, searching for a string in text can be performed with various matching conditions.\n\n\n\nPrimary Level: Typically, this is used to denote differences between base characters (for example, \"a\" < \"b\"). It is the strongest difference. For example, dictionaries are divided into different sections by base character. This is also called the level1 strength.\n\nSecondary Level: Accents in the characters are considered secondary differences (for example, \"as\" < \"às\" < \"at\"). Other differences between letters can also be considered secondary differences, depending on the language. A secondary difference is ignored when there is a primary difference anywhere in the strings. This is also called the level2 strength.\n\nNote: In some languages (such as Danish), certain accented letters are considered to be separate base characters. In most languages, however, an accented letter only has a secondary difference from the unaccented version of that letter.\n\nTertiary Level: Upper and lower case differences in characters are distinguished at the tertiary level (for example, \"ao\" < \"Ao\" < \"aò\"). In addition, a variant of a letter differs from the base form on the tertiary level (such as \"A\" and \" \"). Another example is the difference between large and small Kana. A tertiary difference is ignored when there is a primary or secondary difference anywhere in the strings. 
This is also called the level3 strength.\n\nQuaternary Level: When punctuation is ignored (see Ignoring Punctuations ) at level 13, an additional level can be used to distinguish words with and without punctuation (for example, \"ab\" < \"a-b\" < \"aB\"). This difference is ignored when there is a primary, secondary or tertiary difference. This is also known as the level4 strength. The quaternary level should only be used if ignoring punctuation is required or when processing Japanese text (see Hiragana processing).\n\nIdentical Level: When all other levels are equal, the identical level is used as a tiebreaker. The Unicode code point values of the NFD form of each string are compared at this level, just in case there is no difference at levels 14. For example, Hebrew cantillation marks are only distinguished at this level. This level should be used sparingly, as only code point values differences between two strings is an extremely rare occurrence. Using this level substantially decreases the performance for both incremental comparison and sort key generation (as well as increasing the sort key length). It is also known as level 5 strength.\n\nFor example, people may choose to ignore accents or ignore accents and case when searching for text. Almost all characters are distinguished by the first three levels, and in most locales the default value is thus Tertiary. However, if Alternate is set to be Shifted, then the Quaternary strength can be used to break ties among whitespace, punctuation, and symbols that would otherwise be ignored. If very fine distinctions among characters are required, then the Identical strength can be used (for example, Identical Strength distinguishes between the Mathematical Bold Small A and the Mathematical Italic Small A.). However, using levels higher than Tertiary the Identical strength result in significantly longer sort keys, and slower string comparison performance for equal strings.",
+ 'desc' => "Procedural style bool collator_set_strength ( Collator \$coll , int \$strength ) The ICU Collation Service supports many levels of comparison (named \"Levels\", but also known as \"Strengths\"). Having these categories enables ICU to sort strings precisely according to local conventions. However, by allowing the levels to be selectively employed, searching for a string in text can be performed with various matching conditions.\n\n\n\nPrimary Level: Typically, this is used to denote differences between base characters (for example, \"a\" < \"b\"). It is the strongest difference. For example, dictionaries are divided into different sections by base character. This is also called the level1 strength.\n\nSecondary Level: Accents in the characters are considered secondary differences. Other differences between letters can also be considered secondary differences, depending on the language. A secondary difference is ignored when there is a primary difference anywhere in the strings. This is also called the level2 strength.\n\nNote: In some languages (such as Danish), certain accented letters are considered to be separate base characters. In most languages, however, an accented letter only has a secondary difference from the unaccented version of that letter.\n\nTertiary Level: Upper and lower case differences in characters are distinguished at the tertiary level. In addition, a variant of a letter differs from the base form on the tertiary level (such as \"A\" and \" \"). Another example is the difference between large and small Kana. A tertiary difference is ignored when there is a primary or secondary difference anywhere in the strings. This is also called the level3 strength.\n\nQuaternary Level: When punctuation is ignored (see Ignoring Punctuations ) at level 13, an additional level can be used to distinguish words with and without punctuation (for example, \"ab\" < \"a-b\" < \"aB\"). 
This difference is ignored when there is a primary, secondary or tertiary difference. This is also known as the level4 strength. The quaternary level should only be used if ignoring punctuation is required or when processing Japanese text (see Hiragana processing).\n\nIdentical Level: When all other levels are equal, the identical level is used as a tiebreaker. The Unicode code point values of the NFD form of each string are compared at this level, just in case there is no difference at levels 14. For example, Hebrew cantillation marks are only distinguished at this level. This level should be used sparingly, as only code point values differences between two strings is an extremely rare occurrence. Using this level substantially decreases the performance for both incremental comparison and sort key generation (as well as increasing the sort key length). It is also known as level 5 strength.\n\nFor example, people may choose to ignore accents or ignore accents and case when searching for text. Almost all characters are distinguished by the first three levels, and in most locales the default value is thus Tertiary. However, if Alternate is set to be Shifted, then the Quaternary strength can be used to break ties among whitespace, punctuation, and symbols that would otherwise be ignored. If very fine distinctions among characters are required, then the Identical strength can be used (for example, Identical Strength distinguishes between the Mathematical Bold Small A and the Mathematical Italic Small A.). However, using levels higher than Tertiary the Identical strength result in significantly longer sort keys, and slower string comparison performance for equal strings.",
'flags' => HasDocComment,
'return' => array(
'type' => Boolean,
@@ -31,6 +31,10 @@
#include <system/lib/systemlib.h>
+#ifdef UIDNA_INFO_INITIALIZER
+#define HAVE_46_API 1 /* has UTS#46 API (introduced in ICU 4.6) */
+#endif
+
namespace HPHP {
IMPLEMENT_DEFAULT_EXTENSION(idn);
///////////////////////////////////////////////////////////////////////////////
@@ -733,13 +737,66 @@ Variant c_Normalizer::ti_normalize(const char* cls , CStrRef input,
///////////////////////////////////////////////////////////////////////////////
+// IDNA processing variant selected by the `variant` argument of the
+// idn_to_ascii / idn_to_unicode / idn_to_utf8 functions.
+enum IdnVariant {
+ // IDNA 2003; value 0 matches the IDL default for `variant`. php_intl_idn_to
+ // handles it through uidna_IDNToASCII / uidna_IDNToUnicode.
+ INTL_IDN_VARIANT_2003 = 0,
+ // UTS #46; php_intl_idn_to forwards to php_intl_idn_to_46 when HAVE_46_API
+ // is defined and returns false otherwise.
+ INTL_IDN_VARIANT_UTS46
+};
+
enum {
INTL_IDN_TO_ASCII = 0,
INTL_IDN_TO_UTF8
};
-static Variant php_intl_idn_to(CStrRef domain, VRefParam errorcode, int mode) {
- long option = 0;
+#ifdef HAVE_46_API
+/**
+ * Converts a domain name with ICU's UTS #46 API.
+ *
+ * @param domain      UTF-8 domain name to convert.
+ * @param options     Option bits handed to uidna_openUTS46().
+ * @param idn_variant Not used here; the caller only dispatches to this function
+ *                    for INTL_IDN_VARIANT_UTS46.
+ * @param idna_info   By-reference output. When ICU completes the conversion it is
+ *                    replaced by an array with the keys "result",
+ *                    "isTransitionalDifferent" and "errors". It is left untouched
+ *                    when ICU itself reports a failure status.
+ * @param mode        INTL_IDN_TO_ASCII selects uidna_nameToASCII_UTF8(); any other
+ *                    value selects uidna_nameToUnicodeUTF8().
+ * @return The converted name, or false if ICU fails or any IDNA error bit is set.
+ */
+static Variant php_intl_idn_to_46(CStrRef domain, int64 options, IdnVariant idn_variant, VRefParam idna_info, int mode) {
+  // ICU functions return immediately when the incoming error code already
+  // signals a failure, so the status has to be cleared before the first call.
+  UErrorCode status = U_ZERO_ERROR;
+  UIDNAInfo info = UIDNA_INFO_INITIALIZER;
+
+  // Get UIDNA instance which implements UTS #46.
+  UIDNA *uts46 = uidna_openUTS46(options, &status);
+  SCOPE_EXIT { uidna_close(uts46); };
+  if (U_FAILURE(status)) return false;
+
+  // Call the appropriate IDN function
+  status = U_ZERO_ERROR;
+  const int32_t converted_capacity = 255; // no domain name may exceed this
+  String result(converted_capacity, ReserveString); // reserves converted_capacity+1 characters.
+  char *converted = result.mutableSlice().ptr;
+  int32_t converted_len;
+  if (mode == INTL_IDN_TO_ASCII) {
+    converted_len = uidna_nameToASCII_UTF8(uts46, domain.data(), domain.size(),
+                                           converted, converted_capacity, &info, &status);
+  } else {
+    converted_len = uidna_nameToUnicodeUTF8(uts46, domain.data(), domain.size(),
+                                            converted, converted_capacity, &info, &status);
+  }
+  if (U_FAILURE(status) || converted_len > converted_capacity) return false;
+
+  // A name that produced IDNA errors is reported as an empty "result".
+  const bool has_errors = (info.errors != 0);
+  result.setSize(has_errors ? 0 : converted_len);
+
+  // Set up the array returned in idna_info.
+  Array arr;
+  arr.set("result", result);
+  // UBool is an integer type; convert explicitly so the entry is a boolean,
+  // as the IDL description of idna_info documents.
+  arr.set("isTransitionalDifferent", info.isTransitionalDifferent != 0);
+  arr.set("errors", (long)info.errors);
+  idna_info = arr; // As in Zend, the previous value of idna_info is overwritten, not modified.
+
+  if (has_errors) {
+    return false;
+  }
+  return result;
+}
+
+#endif
+
+static Variant php_intl_idn_to(CStrRef domain, int64 options, IdnVariant idn_variant, VRefParam idna_info, int mode) {
UChar* ustring = NULL;
int ustring_len = 0;
UErrorCode status;
@@ -748,13 +805,21 @@ static Variant php_intl_idn_to(CStrRef domain, VRefParam errorcode, int mode) {
UChar* converted = NULL;
int32_t converted_ret_len;
+ if (idn_variant != INTL_IDN_VARIANT_2003) {
+#ifdef HAVE_46_API
+ if (idn_variant == INTL_IDN_VARIANT_UTS46) {
+ return php_intl_idn_to_46(domain, options, idn_variant, ref(idna_info), mode);
+ }
+#endif
+ return false;
+ }
+
// Convert the string to UTF-16
status = U_ZERO_ERROR;
intl_convert_utf8_to_utf16(&ustring, &ustring_len,
(char*)domain.data(), domain.size(), &status);
if (U_FAILURE(status)) {
free(ustring);
- errorcode = status;
return false;
}
@@ -767,17 +832,16 @@ static Variant php_intl_idn_to(CStrRef domain, VRefParam errorcode, int mode) {
// If the malloc failed, bail out
if (!converted) {
free(ustring);
- errorcode = U_MEMORY_ALLOCATION_ERROR;
return false;
}
if (mode == INTL_IDN_TO_ASCII) {
converted_ret_len = uidna_IDNToASCII(ustring,
ustring_len, converted, converted_len,
- (int32_t)option, &parse_error, &status);
+ (int32_t)options, &parse_error, &status);
} else {
converted_ret_len = uidna_IDNToUnicode(ustring,
ustring_len, converted, converted_len,
- (int32_t)option, &parse_error, &status);
+ (int32_t)options, &parse_error, &status);
}
if (status != U_BUFFER_OVERFLOW_ERROR)
break;
@@ -789,7 +853,6 @@ static Variant php_intl_idn_to(CStrRef domain, VRefParam errorcode, int mode) {
free(ustring);
if (U_FAILURE(status)) {
free(converted);
- errorcode = status;
return false;
}
@@ -800,24 +863,23 @@ static Variant php_intl_idn_to(CStrRef domain, VRefParam errorcode, int mode) {
free(converted);
if (U_FAILURE(status)) {
free(converted_utf8);
- errorcode = status;
return false;
}
// Return the string
return String(converted_utf8, converted_utf8_len, AttachString);
}
-Variant f_idn_to_ascii(CStrRef domain, VRefParam errorcode /* = null */) {
- return php_intl_idn_to(domain, ref(errorcode), INTL_IDN_TO_ASCII);
+Variant f_idn_to_ascii(CStrRef domain, int64 options /* = 0 */, int64 variant /* = 0 */, VRefParam idna_info /* = null */) {
+ return php_intl_idn_to(domain, options, (IdnVariant)variant, idna_info, INTL_IDN_TO_ASCII);
}
-Variant f_idn_to_unicode(CStrRef domain, VRefParam errorcode /* = null */) {
- return php_intl_idn_to(domain, ref(errorcode), INTL_IDN_TO_UTF8);
+Variant f_idn_to_unicode(CStrRef domain, int64 options /* = 0 */, int64 variant /* = 0 */, VRefParam idna_info /* = null */) {
+ return php_intl_idn_to(domain, options, (IdnVariant)variant, idna_info, INTL_IDN_TO_UTF8);
}
-Variant f_idn_to_utf8(CStrRef domain, VRefParam errorcode /* = null */) {
- return php_intl_idn_to(domain, ref(errorcode), INTL_IDN_TO_UTF8);
+Variant f_idn_to_utf8(CStrRef domain, int64 options /* = 0 */, int64 variant /* = 0 */, VRefParam idna_info /* = null */) {
+ return php_intl_idn_to(domain, options, (IdnVariant)variant, idna_info, INTL_IDN_TO_UTF8);
}
///////////////////////////////////////////////////////////////////////////////
Oops, something went wrong.

0 comments on commit b74a0da

Please sign in to comment.