cleanup for #108 and 98

gagolews · Oct 30, 2014 · 2c405a8 · 2c405a8
1 parent e962b01
commit 2c405a8
Show file tree

Hide file tree

Showing 34 changed files with 363 additions and 390 deletions.
diff --git a/NEWS b/NEWS
@@ -31,6 +31,9 @@
      is called on the resulting list.
 .....TO DO: stri_extract_words, stri_split_boundaries
 
+   * [NEW FUNCTION] #109: `stri_count_boundaries`...........
+.....
+
    * ....[NEW FUNCTIONS] #41: `stri_startswith_*` and `stri_endswith_*`
      determine whether a string starts or ends with a given pattern.
      [TO DO.........: documentation, coll, charclass, regex?, other]
@@ -45,7 +48,7 @@
 
    * [NEW FEATURE] #98: `stri_trans_totitle` gained an `opts_brkiter`
      parameter; it indicates which ICU BreakIterator should be used when
-     case mapping.
+     performing case mapping.
 
    * [NEW FEATURE] `stri_wrap` gained a new parameter: `normalize`
 

diff --git a/R/compare.R b/R/compare.R
@@ -439,8 +439,13 @@ stri_sort <-  function(str, decreasing=FALSE, na_last=NA, opts_collator=NULL) {
 #' @details
 #' As usual in \pkg{stringi}, no attributes are copied.
 #' Unlike \code{\link{unique}}, this function
-#' tests for canonical equivalence of strings. Such an operation
-#' is locale-dependent.
+#' tests for canonical equivalence of strings (and not
+#' whether the strings are just bytewise equal). Such an operation
+#' is locale-dependent. Hence, \code{stri_unique} is significantly
+#' slower (but much better suited for natural language processing)
+#' than its base R counterpart.
+#' 
+#' See also \code{\link{stri_duplicated}} for indicating non-unique elements.
 #'
 #' @param str character vector
 #' @param opts_collator a named list with \pkg{ICU} Collator's options
@@ -453,6 +458,10 @@ stri_sort <-  function(str, decreasing=FALSE, na_last=NA, opts_collator=NULL) {
 #' \donttest{
 #' # normalized and non-unicode-normalized version of the same code point:
 #' stri_unique(c("\u0105", stri_trans_nfkd("\u0105")))
+#' unique(c("\u0105", stri_trans_nfkd("\u0105")))
+#' 
+#' stri_unique(c("groß", "GROSS", "Groß", "Gross"), 
+#'    stri_opts_collator(strength=1))
 #' }
 #' 
 #' @family locale_sensitive
@@ -475,8 +484,15 @@ stri_unique <-  function(str, opts_collator=NULL) {
 #' @details
 #' Missing values are regarded as equal.
 #'
-#' These functions test for canonical equivalence of strings.
-#' Such an operation is locale-dependent.
+#' Unlike \code{\link{duplicated}} and \code{\link{anyDuplicated}},
+#' these functions test for canonical equivalence of strings
+#' (and not whether the strings are just bytewise equal).
+#' Such an operation is locale-dependent.
+#' Hence, \code{stri_duplicated} and \code{stri_duplicated_any}
+#' are significantly slower (but much better suited for natural language
+#' processing) than their base R counterparts.
+#' 
+#' See also \code{\link{stri_unique}} for extracting unique elements.
 #'
 #' @param str character vector
 #' @param fromLast single logical value;
@@ -500,7 +516,16 @@ stri_unique <-  function(str, opts_collator=NULL) {
 #' # In the following examples, we have 3 duplicated values,
 #' # "a" - 2 times, NA - 1 time
 #' stri_duplicated(c("a", "b", "a", NA, "a", NA))
+#' stri_duplicated(c("a", "b", "a", NA, "a", NA), fromLast=TRUE)
 #' stri_duplicated_any(c("a", "b", "a", NA, "a", NA))
+#' 
+#' # compare the results:
+#' stri_duplicated(c("\u0105", stri_trans_nfkd("\u0105")))
+#' duplicated(c("\u0105", stri_trans_nfkd("\u0105")))
+#' 
+#' stri_duplicated(c("groß", "GROSS", "Groß", "Gross"), 
+#'    opts_collator=stri_opts_collator(strength=1))
+#' duplicated(c("groß", "GROSS", "Groß", "Gross"))
 #' }
 #'
 #' @rdname stri_duplicated

diff --git a/R/search_locate_bound.R b/R/search_locate_bound.R
@@ -43,13 +43,24 @@
 #' performed by \pkg{ICU}'s \code{BreakIterator}, see
 #' \link{stringi-search-boundaries}.
 #'
-#' For locating words in a text using \pkg{ICU}'s word iterator,
-#' see \code{\link{stri_locate_words}}.
+#' In case of \code{stri_locate_words},
+#' just like in \code{\link{stri_extract_words}},
+#' \pkg{ICU}'s word \code{BreakIterator} is used
+#' to locate word boundaries, and all non-word characters
+#' (\code{UBRK_WORD_NONE} rule status) are ignored.
+#' This function is equivalent to a call to
+#' \code{\link{stri_locate_boundaries}(str, \link{stri_opts_brkiter}(type="word", skip_word_none=TRUE, locale=locale))}
 #'
 #'
 #' @param str character vector or an object coercible to
 #' @param opts_brkiter a named list with \pkg{ICU} BreakIterator's settings
-#' as generated with \code{\link{stri_opts_brkiter}}
+#' as generated with \code{\link{stri_opts_brkiter}};
+#' \code{NULL} for default break iterator, i.e. \code{line_break};
+#' \code{stri_locate_boundaries} only
+#' @param locale \code{NULL} or \code{""} for text boundary analysis following
+#' the conventions of the default locale, or a single string with
+#' locale identifier, see \link{stringi-locale};
+#' \code{stri_locate_words} only
 #'
 #' @return
 #' A list of \code{length(str)} integer matrices
@@ -68,63 +79,22 @@
 #' stri_locate_boundaries(test, stri_opts_brkiter(type="word"))
 #' stri_locate_boundaries(test, stri_opts_brkiter(type="sentence"))
 #' stri_locate_boundaries(test, stri_opts_brkiter(type="character"))
+#' stri_locate_words(test)
 #' }
 #' 
 #' @export
 #' @family search_locate
 #' @family indexing
 #' @family locale_sensitive
 #' @family text_boundaries
-stri_locate_boundaries <- function(str, opts_brkiter=stri_opts_brkiter(type="line_break")) {
+#' @rdname stri_locate_boundaries
+stri_locate_boundaries <- function(str, opts_brkiter=NULL) {
    .Call("stri_locate_boundaries", str, opts_brkiter, PACKAGE="stringi")
 }
 
 
-#' @title
-#' Locate Words in a Text
-#'
-#' @description
-#' This function locates all words in each string.
-#'
-#' @details
-#' Vectorized over \code{str}.
-#'
-#' Just like in \code{\link{stri_extract_words}},
-#' \pkg{ICU}'s word \code{BreakIterator} iterator is used
-#' to locate word boundaries, and all non-word characters
-#' (\code{UBRK_WORD_NONE} rule status) are ignored.
-#' This is function is equivalent to a call to 
-#' \code{\link{stri_locate_boundaries}(str, \link{stri_opts_brkiter}(type="word", skip_word_none=TRUE, locale=locale))}
-#'
-#' @param str character vector or an object coercible to
-#' @param locale \code{NULL} or \code{""} for text boundary analysis following
-#' the conventions of the default locale, or a single string with
-#' locale identifier, see \link{stringi-locale}.
-#'
-#' @return
-#' A list of integer matrices
-#' is returned. The first column gives the start positions
-#' of the words, and the second column gives
-#' the end positions. The indices are code point-based, thus
-#' they may be passed e.g. to the \code{\link{stri_sub}} function.
-#'
-#' Moreover, you may get two \code{NA}s in one row
-#' for no match or \code{NA} arguments.
-#'
-#' @examples
-#' \donttest{
-#' stri_locate_words("  stringi: THE string processing package 123.48...  ")
-#' }
-#'
-#' @references
-#' \emph{Boundary Analysis} -- ICU User Guide,
-#' \url{http://userguide.icu-project.org/boundaryanalysis}
-#'
 #' @export
-#' @family search_locate
-#' @family indexing
-#' @family locale_sensitive
-#' @family text_boundaries
+#' @rdname stri_locate_boundaries
 stri_locate_words <- function(str, locale=NULL) {
    stri_locate_boundaries(str, stri_opts_brkiter(type="word", skip_word_none=TRUE, locale=locale))
 }
diff --git a/R/search_split_4.R b/R/search_split_4.R
@@ -56,7 +56,8 @@
 #' 
 #' Empty search patterns are not supported. If you would like to split a
 #' string into individual characters, use e.g.
-#' \code{\link{stri_extract_all_regex}(str, ".")}
+#' \code{\link{stri_split_boundaries}(str, 
+#' \link{stri_opts_brkiter}(type="character"))} for THE Unicode way.
 #' 
 #' \code{stri_split} is a convenience function.
 #' It calls either \code{stri_split_regex},

diff --git a/R/search_split_bound.R b/R/search_split_bound.R
@@ -117,7 +117,8 @@ stri_split_lines1 <- function(str) {
 #'
 #' @param str character vector or an object coercible to
 #' @param opts_brkiter a named list with \pkg{ICU} BreakIterator's settings
-#' as generated with \code{\link{stri_opts_brkiter}}
+#' as generated with \code{\link{stri_opts_brkiter}};
+#' \code{NULL} for default break iterator, i.e. \code{line_break}
 #'
 #' @return
 #' Returns a list of character vectors.
@@ -139,6 +140,6 @@ stri_split_lines1 <- function(str) {
 #' @family search_split
 #' @family locale_sensitive
 #' @family text_boundaries
-stri_split_boundaries <- function(str, opts_brkiter=stri_opts_brkiter(type="line_break")) {
+stri_split_boundaries <- function(str, opts_brkiter=NULL) {
    .Call("stri_split_boundaries", str, opts_brkiter, PACKAGE="stringi")
 }
diff --git a/R/trans_casemap.R b/R/trans_casemap.R
@@ -37,6 +37,8 @@
 #' UPPER CASE, or to Title Case.
 #'
 #' @details
+#' Vectorized over \code{str}.
+#' 
 #' \pkg{ICU} implements full Unicode string case mappings. In general,
 #' \itemize{
 #' \item case mapping can change the number of code points and/or code units
@@ -46,23 +48,25 @@
 #'    differently depending on surrounding characters).
 #' }
 #' 
-#' With \code{stri_trans_totitle}, if \code{boundary} equal to \code{word}
+#' With \code{stri_trans_totitle}, if \code{word} \code{BreakIterator}
 #' is used (the default), then the first letter of each word will be capitalized
 #' and the rest will be transformed to lower case.
 #' With a break iterator of type \code{sentence}, the first letter
 #' of each sentence will be capitalized only.
-#' Note that according the \pkg{ICU} \code{BreakInterator}
+#' Note that according to the \pkg{ICU} User Guide,
 #' the string \code{"one. two. three."} consists of one sentence.
 #'
 #' For more general (but not locale dependent)
 #' text transforms refer to \code{\link{stri_trans_general}}.
 #'
 #' @param str character vector
-#' @param boundary single character string, either \code{word}
-#' or \code{sentence}, gives the BreakIterator to use when titlecasing
 #' @param locale \code{NULL} or \code{""} for case mapping following
 #' the conventions of the default locale, or a single string with
 #' locale identifier, see \link{stringi-locale}.
+#' @param opts_brkiter a named list with \pkg{ICU} BreakIterator's settings
+#' as generated with \code{\link{stri_opts_brkiter}};
+#' \code{NULL} for default break iterator, i.e. \code{word};
+#' \code{stri_trans_totitle} only
 #' 
 #' @return
 #' Each function returns a character vector.
@@ -81,26 +85,28 @@
 #' \donttest{
 #' stri_trans_toupper("\u00DF", "de_DE") # small German Eszett / scharfes S
 #' stri_cmp_eq(stri_trans_toupper("i", "en_US"), stri_trans_toupper("i", "tr_TR"))
-#' stri_trans_toupper(c('abc','123','\u0105\u0104'))
-#' stri_trans_tolower(c('AbC','123','\u0105\u0104'))
-#' stri_trans_totitle(c('AbC','123','\u0105\u0104'))
-#' stri_trans_totitle("GOOD-OLD cOOkiE mOnSTeR IS watCHinG You. Here HE comes!", boundary="word") # default boundary
-#' stri_trans_totitle("GOOD-OLD cOOkiE mOnSTeR IS watCHinG You. Here HE comes!", boundary="sentence")
+#' stri_trans_toupper(c('abc', '123', '\u0105\u0104'))
+#' stri_trans_tolower(c('AbC', '123', '\u0105\u0104'))
+#' stri_trans_totitle(c('AbC', '123', '\u0105\u0104'))
+#' stri_trans_totitle("GOOD-OLD cOOkiE mOnSTeR IS watCHinG You. Here HE comes!",
+#'     stri_opts_brkiter(type="word")) # default boundary
+#' stri_trans_totitle("GOOD-OLD cOOkiE mOnSTeR IS watCHinG You. Here HE comes!",
+#'     stri_opts_brkiter(type="sentence"))
 #' }
 stri_trans_tolower <- function(str, locale=NULL) {
-   .Call("stri_trans_casemap", str, 1L, "", locale, PACKAGE="stringi")
+   .Call("stri_trans_casemap", str, 1L, locale, PACKAGE="stringi")
 }
 
 
 #' @export
 #' @rdname stri_trans_casemap
 stri_trans_toupper <- function(str, locale=NULL) {
-   .Call("stri_trans_casemap", str, 2L, "", locale, PACKAGE="stringi")
+   .Call("stri_trans_casemap", str, 2L, locale, PACKAGE="stringi")
 }
 
 
 #' @export
 #' @rdname stri_trans_casemap
-stri_trans_totitle <- function(str, boundary="word", locale=NULL) {
-   .Call("stri_trans_casemap", str, 3L, boundary, locale, PACKAGE="stringi")
+stri_trans_totitle <- function(str, opts_brkiter=NULL) {
+   .Call("stri_trans_casemap", str, 3L, opts_brkiter, PACKAGE="stringi")
 }
diff --git a/devel/testthat/test-trans-casemap.R b/devel/testthat/test-trans-casemap.R
@@ -68,12 +68,17 @@ test_that("stri_trans_totitle", {
    expect_equivalent(stri_trans_totitle(letters), LETTERS)
    expect_equivalent(stri_trans_totitle(stri_flatten(letters)), stri_flatten(c("A",letters[-1])))
 
-   expect_equivalent(stri_trans_totitle("\u0105\u0104", locale="pl_PL"), "\u0104\u0105")
+   expect_equivalent(stri_trans_totitle("\u0105\u0104"), "\u0104\u0105")
 
    expect_equivalent(stri_trans_totitle("ala   ma   kota"), "Ala   Ma   Kota")
    expect_equivalent(stri_trans_totitle("ala\tma\tkota"), "Ala\tMa\tKota")
    expect_equivalent(stri_trans_totitle("ala\nma\nKota"), "Ala\nMa\nKota")
    #totitle(totitle(x))==totitle(x)
    expect_equivalent(stri_trans_totitle(stri_trans_totitle("Ala\nMa\nKota")),
                      stri_trans_totitle("Ala\nMa\nKota"))
+
+   expect_equivalent(stri_trans_totitle("GOOD-OLD cOOkiE mOnSTeR IS watCHinG You. Here HE comes!",
+      stri_opts_brkiter(type="word")), "Good-Old Cookie Monster Is Watching You. Here He Comes!")
+   expect_equivalent(stri_trans_totitle("GOOD-OLD cOOkiE mOnSTeR IS watCHinG You. Here HE comes!",
+      stri_opts_brkiter(type="sentence")), "Good-old cookie monster is watching you. Here he comes!")
 })
diff --git a/man/oper_comparison.Rd b/man/oper_comparison.Rd
@@ -90,7 +90,7 @@ Other locale_sensitive: \code{\link{stri_cmp}},
   \code{\link{stri_duplicated_any}};
   \code{\link{stri_enc_detect2}};
   \code{\link{stri_extract_words}};
-  \code{\link{stri_locate_boundaries}};
+  \code{\link{stri_locate_boundaries}},
   \code{\link{stri_locate_words}};
   \code{\link{stri_opts_collator}};
   \code{\link{stri_order}}, \code{\link{stri_sort}};

diff --git a/man/stri_compare.Rd b/man/stri_compare.Rd
@@ -139,7 +139,7 @@ Other locale_sensitive: \code{\link{\%s!==\%}},
   \code{\link{stri_duplicated_any}};
   \code{\link{stri_enc_detect2}};
   \code{\link{stri_extract_words}};
-  \code{\link{stri_locate_boundaries}};
+  \code{\link{stri_locate_boundaries}},
   \code{\link{stri_locate_words}};
   \code{\link{stri_opts_collator}};
   \code{\link{stri_order}}, \code{\link{stri_sort}};

diff --git a/man/stri_duplicated.Rd b/man/stri_duplicated.Rd
@@ -38,15 +38,31 @@ strings in a character vector.
 \details{
 Missing values are regarded as equal.
 
-These functions test for canonical equivalence of strings.
-Such an operation is locale-dependent.
+Unlike \code{\link{duplicated}} and \code{\link{anyDuplicated}},
+these functions test for canonical equivalence of strings
+(and not whether the strings are just bytewise equal).
+Such an operation is locale-dependent.
+Hence, \code{stri_duplicated} and \code{stri_duplicated_any}
+are significantly slower (but much better suited for natural language
+processing) than their base R counterparts.
+
+See also \code{\link{stri_unique}} for extracting unique elements.
 }
 \examples{
 \donttest{
 # In the following examples, we have 3 duplicated values,
 # "a" - 2 times, NA - 1 time
 stri_duplicated(c("a", "b", "a", NA, "a", NA))
+stri_duplicated(c("a", "b", "a", NA, "a", NA), fromLast=TRUE)
 stri_duplicated_any(c("a", "b", "a", NA, "a", NA))
+
+# compare the results:
+stri_duplicated(c("\\u0105", stri_trans_nfkd("\\u0105")))
+duplicated(c("\\u0105", stri_trans_nfkd("\\u0105")))
+
+stri_duplicated(c("groß", "GROSS", "Groß", "Gross"),
+   opts_collator=stri_opts_collator(strength=1))
+duplicated(c("groß", "GROSS", "Groß", "Gross"))
 }
 }
 \seealso{
@@ -67,7 +83,7 @@ Other locale_sensitive: \code{\link{\%s!==\%}},
   \code{\link{stri_compare}};
   \code{\link{stri_enc_detect2}};
   \code{\link{stri_extract_words}};
-  \code{\link{stri_locate_boundaries}};
+  \code{\link{stri_locate_boundaries}},
   \code{\link{stri_locate_words}};
   \code{\link{stri_opts_collator}};
   \code{\link{stri_order}}, \code{\link{stri_sort}};
@@ -97,7 +113,7 @@ Other locale_sensitive: \code{\link{\%s!==\%}},
   \code{\link{stri_compare}};
   \code{\link{stri_enc_detect2}};
   \code{\link{stri_extract_words}};
-  \code{\link{stri_locate_boundaries}};
+  \code{\link{stri_locate_boundaries}},
   \code{\link{stri_locate_words}};
   \code{\link{stri_opts_collator}};
   \code{\link{stri_order}}, \code{\link{stri_sort}};

diff --git a/man/stri_enc_detect2.Rd b/man/stri_enc_detect2.Rd
@@ -92,7 +92,7 @@ Other locale_sensitive: \code{\link{\%s!==\%}},
   \code{\link{stri_duplicated}},
   \code{\link{stri_duplicated_any}};
   \code{\link{stri_extract_words}};
-  \code{\link{stri_locate_boundaries}};
+  \code{\link{stri_locate_boundaries}},
   \code{\link{stri_locate_words}};
   \code{\link{stri_opts_collator}};
   \code{\link{stri_order}}, \code{\link{stri_sort}};

diff --git a/man/stri_extract_words.Rd b/man/stri_extract_words.Rd
@@ -54,7 +54,7 @@ Other locale_sensitive: \code{\link{\%s!==\%}},
   \code{\link{stri_duplicated}},
   \code{\link{stri_duplicated_any}};
   \code{\link{stri_enc_detect2}};
-  \code{\link{stri_locate_boundaries}};
+  \code{\link{stri_locate_boundaries}},
   \code{\link{stri_locate_words}};
   \code{\link{stri_opts_collator}};
   \code{\link{stri_order}}, \code{\link{stri_sort}};
@@ -88,7 +88,7 @@ Other search_extract: \code{\link{stri_extract}},
   \code{\link{stri_match_last_regex}};
   \code{\link{stringi-search}}
 
-Other text_boundaries: \code{\link{stri_locate_boundaries}};
+Other text_boundaries: \code{\link{stri_locate_boundaries}},
   \code{\link{stri_locate_words}};
   \code{\link{stri_opts_brkiter}};
   \code{\link{stri_split_boundaries}};