Skip to content

Commit

Permalink
stri_length now works with ANY encoding :-)
Browse files Browse the repository at this point in the history
  • Loading branch information
gagolews committed Mar 10, 2013
1 parent 751ca4a commit 21e364b
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 20 deletions.
2 changes: 1 addition & 1 deletion inst/benchmarks/bmarks-length.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ require('stringr')


x <- str_dup('a', 0:100)
y <- str_dup('ą', 0:100)
y <- str_dup('\u0104', 0:100)

microbenchmark(
nchar(x), stri_length(x), str_length(x),
Expand Down
23 changes: 18 additions & 5 deletions inst/tests/test-length.R
Original file line number Diff line number Diff line change
@@ -1,20 +1,33 @@
require(testthat)

test_that("stri_numbytes", {
expect_equivalent(stri_numbytes(c(NA, '', ' ', 'abc', 'ąbć')), c(NA, 0, 1, 3, 5))
expect_equivalent(stri_numbytes(c(NA, '', ' ', 'abc', '\u0104B\u0106')), c(NA, 0, 1, 3, 5))
})


test_that("stri_length", {
expect_equivalent(stri_length(c(NA, '', ' ', 'abc', 'ąbć')), c(NA, 0, 1, 3, 3))
expect_equivalent(stri_length(c(NA, '', ' ', 'abc', '\u0104B\u0106')), c(NA, 0, 1, 3, 3))
})


test_that("stri_length-cjk", {
cjk_test <- intToUtf8(c(24120, 29992, 22283, 23383, 27161, 28310, 23383, 39636, 34920)) # '常用國字標準字體表'
expect_equivalent(stri_numbytes(cjk_test), 27)
expect_equivalent(stri_length(cjk_test), 9)

cjk_test_Big5 <- stri_encode(cjk_test, 'UTF-8', 'Big5')
expect_equivalent(stri_numbytes(cjk_test_Big5), 18)
oldenc <- stri_encinfo()$Name.friendly
stri_encset('Big5')
expect_equivalent(stri_length(cjk_test_Big5), 9)
stri_encset(oldenc)
})


test_that("stri_isempty", {
expect_equivalent(stri_isempty(c(NA, '', ' ', 'abc', 'ąbć')), c(NA, T, F, F, F))
expect_equivalent(stri_isempty(c(NA, '', ' ', 'abc', '\u0104B\u0106')), c(NA, T, F, F, F))
})


test_that("stri_width", {
expect_equivalent(stri_width(c(NA, '', ' ', 'abc', 'ąbć')), c(NA, 0, 1, 3, 3))
# expect_equivalent(stri_width(c(NA, '', ' ', 'abc', '\u0104B\u0106')), c(NA, 0, 1, 3, 3))
})
84 changes: 70 additions & 14 deletions src/length.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
* Note that ICU permits only strings of length < 2^31.
* @param s R character vector
* @return maximal number of bytes
* @version 0.1 (Marek Gagolewski)
*/
R_len_t stri__numbytes_max(SEXP s)
{
Expand All @@ -55,6 +56,7 @@ R_len_t stri__numbytes_max(SEXP s)
* Note that ICU permits only strings of length < 2^31.
* @param s R object coercible to a character vector
* @return integer vector
* @version 0.1 (Marcin Bujarski)
*/
SEXP stri_numbytes(SEXP s)
{
Expand Down Expand Up @@ -83,12 +85,27 @@ SEXP stri_numbytes(SEXP s)
* Note that ICU permits only strings of length < 2^31.
* @param s R character vector
* @return integer vector
* @version 0.1 (Bartek Tartanus)
* @version 0.2 (Marek Gagolewski) Multiple input encoding suppert
*/
SEXP stri_length(SEXP s)
{
s = stri_prepare_arg_string(s);
R_len_t ns = LENGTH(s);
SEXP ret;

UConverter* uconv = NULL;
bool uconv_8bit = false;
bool uconv_utf8 = false;

/* Note: ICU50 permits only int-size strings in U8_NEXT and U8_FWD_1 */
#define STRI_LENGTH_CALCULATE_UTF8 \
const char* qc = CHAR(q); \
R_len_t j = 0; /* number of detected code points */ \
for (R_len_t i = 0; i < nq; j++) \
U8_FWD_1(qc, i, nq); \
retint[k] = j;

PROTECT(ret = allocVector(INTSXP, ns));
int* retint = INTEGER(ret);
for (R_len_t k = 0; k < ns; k++) {
Expand All @@ -106,12 +123,7 @@ SEXP stri_length(SEXP s)
else if (IS_BYTES(q))
error(MSG__BYTESENC);
else if (IS_UTF8(q)) {
/* Note: ICU50 permits only int-size strings in U8_NEXT and U8_FWD_1 */
const char* qc = CHAR(q);
R_len_t j = 0; // number of detected code points
for (R_len_t i = 0; i < nq; j++)
U8_FWD_1(qc, i, nq);
retint[k] = j;
STRI_LENGTH_CALCULATE_UTF8
}
else { // Any encoding - detection needed
// UTF-8 strings can be fairly reliably recognized as such by a
Expand All @@ -121,18 +133,59 @@ SEXP stri_length(SEXP s)
// We have two possibilities here:
// 1. Auto detect encoding: Is this ASCII or UTF-8? If not => use Native
// This won't work correctly in some cases.
// e.g. (c4,85) represents Polish a with ogonek in UTF-8
// and A umlaut, Ellipsis in WINDOWS-1250
// e.g. (c4,85) represents ("Polish a with ogonek") in UTF-8
// and ("A umlaut", "Ellipsis") in WINDOWS-1250
// 2. Assume it's Native; this assumes the user working in an 8-bit environment
// would convert strings to UTF-8 manually if needed - I think is's
// a more reasonable approach (Native --> input via keyboard)

warning("TO DO");
retint[k] = nq; // tmp....................
if (!uconv) { // open ucnv on demand
uconv = stri__ucnv_open((const char*)NULL); // native decoder
if (!uconv) {
retint[k] = NA_INTEGER;
continue;
}
uconv_8bit = ((int)ucnv_getMaxCharSize(uconv) == 1);
if (!uconv_8bit) {
UErrorCode err = U_ZERO_ERROR;
const char* name = ucnv_getName(uconv, &err);
if (U_FAILURE(err))
error("could not query default converter");
uconv_utf8 = !strncmp("UTF-8", name, 5);
}
}

if (uconv_8bit) {
retint[k] = nq; // it's an 8-bit encoding :-)
}
else if (uconv_utf8) { // it's UTF-8
STRI_LENGTH_CALCULATE_UTF8
}
else { // native encoding which is neither 8-bit, nor UTF-8 (e.g. 'Big5')
UErrorCode err = U_ZERO_ERROR;
const char* source = CHAR(q);
const char* sourceLimit = source + nq;
R_len_t j;
for (j = 0; source != sourceLimit; j++) {
if (U_FAILURE(err)) break; // error from previous iteration
// iterate through each native-encoded character:
ucnv_getNextUChar(uconv, &source, sourceLimit, &err);
}
if (U_FAILURE(err)) { // error from last iteration
warning("error determining length for native, neither 8-bit- nor UTF-8-encoded string.");
retint[k] = NA_INTEGER;
}
else
retint[k] = j; // all right, we got it!
}
}
}
}
UNPROTECT(1);

if (uconv)
ucnv_close(uconv);

return ret;
}

Expand All @@ -143,6 +196,7 @@ SEXP stri_length(SEXP s)
* Note that ICU permits only strings of length < 2^31.
* @param s R character vector
* @return integer vector
* @version 0.1 (Marek Gagolewski)
*/
SEXP stri_isempty(SEXP s)
{
Expand All @@ -157,20 +211,22 @@ SEXP stri_isempty(SEXP s)
if (curs == NA_STRING)
retlog[i] = NA_LOGICAL;
else
retlog[i] = (CHAR(curs)[0] == '\0');
retlog[i] = (CHAR(curs)[0] == '\0'); // (LENGTH(curs) == 0); // slower?
}
UNPROTECT(1);
return ret;
}


/**
* Determine the width of the strint
* Determine the width of the string
* e.g. some chinese chars have width > 1.
*
* Note that ICU permits only strings of length < 2^31.
* @param s R character vector
* @return integer vector
* @version 0.1 (WHO?) TO BE DONE
* @todo THIS FUNCTION HAS NOT YET BEEN IMPLEMENTED
*/
SEXP stri_width(SEXP s)
{
Expand All @@ -181,9 +237,9 @@ SEXP stri_width(SEXP s)
PROTECT(ret = allocVector(INTSXP, ns));
int* retint = INTEGER(ret);

// @TODO ------------------------------------------------------------------------------------------------------
///< @TODO ------------------------------------------------------------------------------------------------------
error("TODO: the function has not yet been implemented.");

UNPROTECT(1);
return ret;
}
}

0 comments on commit 21e364b

Please sign in to comment.