stri_length now works with ANY encoding :-)

gagolews · Mar 10, 2013 · 21e364b · 21e364b
1 parent 751ca4a
commit 21e364b
Show file tree

Hide file tree

Showing 3 changed files with 89 additions and 20 deletions.
diff --git a/inst/benchmarks/bmarks-length.R b/inst/benchmarks/bmarks-length.R
@@ -3,7 +3,7 @@ require('stringr')
 
 
 x <- str_dup('a', 0:100)
-y <- str_dup('ą', 0:100)
+y <- str_dup('\u0104', 0:100)
 
 microbenchmark(
    nchar(x), stri_length(x), str_length(x),

diff --git a/inst/tests/test-length.R b/inst/tests/test-length.R
@@ -1,20 +1,33 @@
 require(testthat)
 
 test_that("stri_numbytes", {
-   expect_equivalent(stri_numbytes(c(NA, '', ' ', 'abc', 'ąbć')), c(NA, 0, 1, 3, 5))
+   expect_equivalent(stri_numbytes(c(NA, '', ' ', 'abc', '\u0104B\u0106')), c(NA, 0, 1, 3, 5))
 })
-
 
 test_that("stri_length", {
-   expect_equivalent(stri_length(c(NA, '', ' ', 'abc', 'ąbć')), c(NA, 0, 1, 3, 3))
+   expect_equivalent(stri_length(c(NA, '', ' ', 'abc', '\u0104B\u0106')), c(NA, 0, 1, 3, 3))
+})
+
+
+test_that("stri_length-cjk", {
+   cjk_test <- intToUtf8(c(24120, 29992, 22283, 23383, 27161, 28310, 23383, 39636, 34920)) # '常用國字標準字體表'
+   expect_equivalent(stri_numbytes(cjk_test), 27)
+   expect_equivalent(stri_length(cjk_test), 9)
+
+   cjk_test_Big5 <- stri_encode(cjk_test, 'UTF-8', 'Big5')
+   expect_equivalent(stri_numbytes(cjk_test_Big5), 18)
+   oldenc <- stri_encinfo()$Name.friendly
+   stri_encset('Big5')
+   expect_equivalent(stri_length(cjk_test_Big5), 9)
+   stri_encset(oldenc)
 })
 
 
 test_that("stri_isempty", {
-   expect_equivalent(stri_isempty(c(NA, '', ' ', 'abc', 'ąbć')), c(NA, T, F, F, F))
+   expect_equivalent(stri_isempty(c(NA, '', ' ', 'abc', '\u0104B\u0106')), c(NA, T, F, F, F))
 })
 
 
 test_that("stri_width", {
-   expect_equivalent(stri_width(c(NA, '', ' ', 'abc', 'ąbć')), c(NA, 0, 1, 3, 3))
+#    expect_equivalent(stri_width(c(NA, '', ' ', 'abc', '\u0104B\u0106')), c(NA, 0, 1, 3, 3))
 })
diff --git a/src/length.cpp b/src/length.cpp
@@ -30,6 +30,7 @@
  * Note that ICU permits only strings of length < 2^31.
  * @param s R character vector
  * @return maximal number of bytes
+ * @version 0.1 (Marek Gagolewski)
  */
 R_len_t stri__numbytes_max(SEXP s)
 {
@@ -55,6 +56,7 @@ R_len_t stri__numbytes_max(SEXP s)
  * Note that ICU permits only strings of length < 2^31.
  * @param s R object coercible to a character vector
  * @return integer vector
+ * @version 0.1 (Marcin Bujarski)
  */
 SEXP stri_numbytes(SEXP s)
 {
@@ -83,12 +85,27 @@ SEXP stri_numbytes(SEXP s)
  * Note that ICU permits only strings of length < 2^31.
  * @param s R character vector
  * @return integer vector
+ * @version 0.1 (Bartek Tartanus)
+ * @version 0.2 (Marek Gagolewski) Multiple input encoding suppert
  */
 SEXP stri_length(SEXP s)
 {
    s = stri_prepare_arg_string(s);
    R_len_t ns = LENGTH(s);
    SEXP ret;
+
+   UConverter* uconv = NULL;
+   bool uconv_8bit = false;
+   bool uconv_utf8 = false;
+
+/* Note: ICU50 permits only int-size strings in U8_NEXT and U8_FWD_1 */
+#define STRI_LENGTH_CALCULATE_UTF8 \
+   const char* qc = CHAR(q); \
+   R_len_t j = 0;  /* number of detected code points */ \    
+   for (R_len_t i = 0; i < nq; j++) \
+      U8_FWD_1(qc, i, nq); \
+   retint[k] = j;
+
    PROTECT(ret = allocVector(INTSXP, ns));
    int* retint = INTEGER(ret);   
    for (R_len_t k = 0; k < ns; k++) {
@@ -106,12 +123,7 @@ SEXP stri_length(SEXP s)
          else if (IS_BYTES(q)) 
             error(MSG__BYTESENC);
          else if (IS_UTF8(q)) {
-            /* Note: ICU50 permits only int-size strings in U8_NEXT and U8_FWD_1 */
-            const char* qc = CHAR(q);
-            R_len_t j = 0;      // number of detected code points
-            for (R_len_t i = 0; i < nq; j++)
-                U8_FWD_1(qc, i, nq);
-            retint[k] = j;
+            STRI_LENGTH_CALCULATE_UTF8
          }
          else { // Any encoding - detection needed
 //            UTF-8 strings can be fairly reliably recognized as such by a
@@ -121,18 +133,59 @@ SEXP stri_length(SEXP s)
 //            We have two possibilities here:
 //            1. Auto detect encoding: Is this ASCII or UTF-8? If not => use Native
 //                This won't work correctly in some cases.
-//                e.g. (c4,85) represents Polish a with ogonek in UTF-8
-//                and A umlaut, Ellipsis in WINDOWS-1250
+//                e.g. (c4,85) represents ("Polish a with ogonek") in UTF-8
+//                and ("A umlaut", "Ellipsis") in WINDOWS-1250
 //            2. Assume it's Native; this assumes the user working in an 8-bit environment
 //                would convert strings to UTF-8 manually if needed - I think is's
 //                a more reasonable approach (Native --> input via keyboard)
 
-            warning("TO DO");
-            retint[k] = nq; // tmp....................
+            if (!uconv) { // open ucnv on demand
+               uconv = stri__ucnv_open((const char*)NULL); // native decoder
+               if (!uconv) {
+                  retint[k] = NA_INTEGER;
+                  continue;
+               }
+               uconv_8bit = ((int)ucnv_getMaxCharSize(uconv) == 1);
+               if (!uconv_8bit) {
+                  UErrorCode err = U_ZERO_ERROR;
+                  const char* name = ucnv_getName(uconv, &err);
+                  if (U_FAILURE(err))
+                     error("could not query default converter");
+                  uconv_utf8 = !strncmp("UTF-8", name, 5);
+               }
+            }
+
+            if (uconv_8bit) {
+               retint[k] = nq; // it's an 8-bit encoding :-)
+            }
+            else if (uconv_utf8) { // it's UTF-8
+               STRI_LENGTH_CALCULATE_UTF8
+            }
+            else {  // native encoding which is neither 8-bit, nor UTF-8 (e.g. 'Big5')
+               UErrorCode err = U_ZERO_ERROR;
+               const char* source = CHAR(q);
+               const char* sourceLimit = source + nq;
+               R_len_t j;
+               for (j = 0; source != sourceLimit; j++) {
+                  if (U_FAILURE(err)) break; // error from previous iteration
+                  // iterate through each native-encoded character:
+                  ucnv_getNextUChar(uconv, &source, sourceLimit, &err);
+               }
+               if (U_FAILURE(err)) {  // error from last iteration
+                  warning("error determining length for native, neither 8-bit- nor UTF-8-encoded string.");
+                  retint[k] = NA_INTEGER;
+               }
+               else
+                  retint[k] = j; // all right, we got it!
+            }
          }
       }
    }
    UNPROTECT(1);
+
+   if (uconv)
+      ucnv_close(uconv);
+
    return ret;
 }
 
@@ -143,6 +196,7 @@ SEXP stri_length(SEXP s)
  * Note that ICU permits only strings of length < 2^31.
  * @param s R character vector
  * @return integer vector
+ * @version 0.1 (Marek Gagolewski)
  */
 SEXP stri_isempty(SEXP s)
 {
@@ -157,20 +211,22 @@ SEXP stri_isempty(SEXP s)
       if (curs == NA_STRING)
          retlog[i] = NA_LOGICAL;
       else
-         retlog[i] = (CHAR(curs)[0] == '\0');
+         retlog[i] = (CHAR(curs)[0] == '\0'); // (LENGTH(curs) == 0); // slower?
    }
    UNPROTECT(1);
    return ret;  
 }
 
 
 /**
- * Determine the width of the strint
+ * Determine the width of the string
  * e.g. some chinese chars have width > 1.
  * 
  * Note that ICU permits only strings of length < 2^31.
  * @param s R character vector
  * @return integer vector
+ * @version 0.1 (WHO?) TO BE DONE
+ * @todo THIS FUNCTION HAS NOT YET BEEN IMPLEMENTED
  */
 SEXP stri_width(SEXP s)
 {
@@ -181,9 +237,9 @@ SEXP stri_width(SEXP s)
    PROTECT(ret = allocVector(INTSXP, ns));
    int* retint = INTEGER(ret);   
 
-   // @TODO ------------------------------------------------------------------------------------------------------
+   ///< @TODO ------------------------------------------------------------------------------------------------------
    error("TODO: the function has not yet been implemented.");
 
    UNPROTECT(1);
    return ret;
-}
+}