Skip to content

Commit

Permalink
cleanup for #108 and 98
Browse files Browse the repository at this point in the history
  • Loading branch information
gagolews committed Oct 30, 2014
1 parent e962b01 commit 2c405a8
Show file tree
Hide file tree
Showing 34 changed files with 363 additions and 390 deletions.
5 changes: 4 additions & 1 deletion NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
is called on the resulting list.
.....TO DO: stri_extract_words, stri_split_boundaries

* [NEW FUNCTION] #109: `stri_count_boundaries`...........
.....

* ....[NEW FUNCTIONS] #41: `stri_startswith_*` and `stri_endswith_*`
determine whether a string starts or ends with a given pattern.
[TO DO.........: documentation, coll, charclass, regex?, other]
Expand All @@ -45,7 +48,7 @@

* [NEW FEATURE] #98: `stri_trans_totitle` gained an `opts_brkiter`
parameter; it indicates which ICU BreakIterator should be used when
case mapping.
performing case mapping.

* [NEW FEATURE] `stri_wrap` gained a new parameter: `normalize`

Expand Down
33 changes: 29 additions & 4 deletions R/compare.R
Original file line number Diff line number Diff line change
Expand Up @@ -439,8 +439,13 @@ stri_sort <- function(str, decreasing=FALSE, na_last=NA, opts_collator=NULL) {
#' @details
#' As usual in \pkg{stringi}, no attributes are copied.
#' Unlike \code{\link{unique}}, this function
#' tests for canonical equivalence of strings. Such an operation
#' is locale-dependent.
#' tests for canonical equivalence of strings (and not
#' whether the strings are just bytewise equal). Such an operation
#' is locale-dependent. Hence, \code{stri_unique} is significantly
#' slower (but much better suited for natural language processing)
#' than its base R counterpart.
#'
#' See also \code{\link{stri_duplicated}} for indicating non-unique elements.
#'
#' @param str character vector
#' @param opts_collator a named list with \pkg{ICU} Collator's options
Expand All @@ -453,6 +458,10 @@ stri_sort <- function(str, decreasing=FALSE, na_last=NA, opts_collator=NULL) {
#' \donttest{
#' # normalized and non-unicode-normalized version of the same code point:
#' stri_unique(c("\u0105", stri_trans_nfkd("\u0105")))
#' unique(c("\u0105", stri_trans_nfkd("\u0105")))
#'
#' stri_unique(c("groß", "GROSS", "Groß", "Gross"),
#' stri_opts_collator(strength=1))
#' }
#'
#' @family locale_sensitive
Expand All @@ -475,8 +484,15 @@ stri_unique <- function(str, opts_collator=NULL) {
#' @details
#' Missing values are regarded as equal.
#'
#' These functions test for canonical equivalence of strings.
#' Such an operation is locale-dependent.
#' Unlike \code{\link{duplicated}} and \code{\link{anyDuplicated}},
#' these functions test for canonical equivalence of strings
#' (and not whether the strings are just bytewise equal).
#' Such operations are locale-dependent.
#' Hence, \code{stri_duplicated} and \code{stri_duplicated_any}
#' are significantly slower (but much better suited for natural language
#' processing) than their base R counterparts.
#'
#' See also \code{\link{stri_unique}} for extracting unique elements.
#'
#' @param str character vector
#' @param fromLast single logical value;
Expand All @@ -500,7 +516,16 @@ stri_unique <- function(str, opts_collator=NULL) {
#' # In the following examples, we have 3 duplicated values,
#' # "a" - 2 times, NA - 1 time
#' stri_duplicated(c("a", "b", "a", NA, "a", NA))
#' stri_duplicated(c("a", "b", "a", NA, "a", NA), fromLast=TRUE)
#' stri_duplicated_any(c("a", "b", "a", NA, "a", NA))
#'
#' # compare the results:
#' stri_duplicated(c("\u0105", stri_trans_nfkd("\u0105")))
#' duplicated(c("\u0105", stri_trans_nfkd("\u0105")))
#'
#' stri_duplicated(c("groß", "GROSS", "Groß", "Gross"),
#' opts_collator=stri_opts_collator(strength=1))
#' duplicated(c("groß", "GROSS", "Groß", "Gross"))
#' }
#'
#' @rdname stri_duplicated
Expand Down
66 changes: 18 additions & 48 deletions R/search_locate_bound.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,24 @@
#' performed by \pkg{ICU}'s \code{BreakIterator}, see
#' \link{stringi-search-boundaries}.
#'
#' For locating words in a text using \pkg{ICU}'s word iterator,
#' see \code{\link{stri_locate_words}}.
#' In case of \code{stri_locate_words},
#' just like in \code{\link{stri_extract_words}},
#' \pkg{ICU}'s word \code{BreakIterator} is used
#' to locate word boundaries, and all non-word characters
#' (\code{UBRK_WORD_NONE} rule status) are ignored.
#' This function is equivalent to a call to
#' \code{\link{stri_locate_boundaries}(str, \link{stri_opts_brkiter}(type="word", skip_word_none=TRUE, locale=locale))}
#'
#'
#' @param str character vector or an object coercible to
#' @param opts_brkiter a named list with \pkg{ICU} BreakIterator's settings
#' as generated with \code{\link{stri_opts_brkiter}}
#' as generated with \code{\link{stri_opts_brkiter}};
#' \code{NULL} for default break iterator, i.e. \code{line_break};
#' \code{stri_locate_boundaries} only
#' @param locale \code{NULL} or \code{""} for text boundary analysis following
#' the conventions of the default locale, or a single string with
#' locale identifier, see \link{stringi-locale};
#' \code{stri_locate_words} only
#'
#' @return
#' A list of \code{length(str)} integer matrices
Expand All @@ -68,63 +79,22 @@
#' stri_locate_boundaries(test, stri_opts_brkiter(type="word"))
#' stri_locate_boundaries(test, stri_opts_brkiter(type="sentence"))
#' stri_locate_boundaries(test, stri_opts_brkiter(type="character"))
#' stri_locate_words(test)
#' }
#'
#' @export
#' @family search_locate
#' @family indexing
#' @family locale_sensitive
#' @family text_boundaries
stri_locate_boundaries <- function(str, opts_brkiter=stri_opts_brkiter(type="line_break")) {
#' @rdname stri_locate_boundaries
stri_locate_boundaries <- function(str, opts_brkiter=NULL) {
.Call("stri_locate_boundaries", str, opts_brkiter, PACKAGE="stringi")
}


#' @title
#' Locate Words in a Text
#'
#' @description
#' This function locates all words in each string.
#'
#' @details
#' Vectorized over \code{str}.
#'
#' Just like in \code{\link{stri_extract_words}},
#' \pkg{ICU}'s word \code{BreakIterator} iterator is used
#' to locate word boundaries, and all non-word characters
#' (\code{UBRK_WORD_NONE} rule status) are ignored.
#' This function is equivalent to a call to
#' \code{\link{stri_locate_boundaries}(str, \link{stri_opts_brkiter}(type="word", skip_word_none=TRUE, locale=locale))}
#'
#' @param str character vector or an object coercible to
#' @param locale \code{NULL} or \code{""} for text boundary analysis following
#' the conventions of the default locale, or a single string with
#' locale identifier, see \link{stringi-locale}.
#'
#' @return
#' A list of integer matrices
#' is returned. The first column gives the start positions
#' of the words, and the second column gives
#' the end positions. The indices are code point-based, thus
#' they may be passed e.g. to the \code{\link{stri_sub}} function.
#'
#' Moreover, you may get two \code{NA}s in one row
#' for no match or \code{NA} arguments.
#'
#' @examples
#' \donttest{
#' stri_locate_words(" stringi: THE string processing package 123.48... ")
#' }
#'
#' @references
#' \emph{Boundary Analysis} -- ICU User Guide,
#' \url{http://userguide.icu-project.org/boundaryanalysis}
#'
#' @export
#' @family search_locate
#' @family indexing
#' @family locale_sensitive
#' @family text_boundaries
#' @rdname stri_locate_boundaries
stri_locate_words <- function(str, locale=NULL) {
stri_locate_boundaries(str, stri_opts_brkiter(type="word", skip_word_none=TRUE, locale=locale))
}
3 changes: 2 additions & 1 deletion R/search_split_4.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@
#'
#' Empty search patterns are not supported. If you would like to split a
#' string into individual characters, use e.g.
#' \code{\link{stri_extract_all_regex}(str, ".")}
#' \code{\link{stri_split_boundaries}(str,
#' \link{stri_opts_brkiter}(type="character"))} for THE Unicode way.
#'
#' \code{stri_split} is a convenience function.
#' It calls either \code{stri_split_regex},
Expand Down
5 changes: 3 additions & 2 deletions R/search_split_bound.R
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,8 @@ stri_split_lines1 <- function(str) {
#'
#' @param str character vector or an object coercible to
#' @param opts_brkiter a named list with \pkg{ICU} BreakIterator's settings
#' as generated with \code{\link{stri_opts_brkiter}}
#' as generated with \code{\link{stri_opts_brkiter}};
#' \code{NULL} for default break iterator, i.e. \code{line_break}
#'
#' @return
#' Returns a list of character vectors.
Expand All @@ -139,6 +140,6 @@ stri_split_lines1 <- function(str) {
#' @family search_split
#' @family locale_sensitive
#' @family text_boundaries
stri_split_boundaries <- function(str, opts_brkiter=stri_opts_brkiter(type="line_break")) {
stri_split_boundaries <- function(str, opts_brkiter=NULL) {
.Call("stri_split_boundaries", str, opts_brkiter, PACKAGE="stringi")
}
32 changes: 19 additions & 13 deletions R/trans_casemap.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
#' UPPER CASE, or to Title Case.
#'
#' @details
#' Vectorized over \code{str}.
#'
#' \pkg{ICU} implements full Unicode string case mappings. In general,
#' \itemize{
#' \item case mapping can change the number of code points and/or code units
Expand All @@ -46,23 +48,25 @@
#' differently depending on surrounding characters).
#' }
#'
#' With \code{stri_trans_totitle}, if \code{boundary} equal to \code{word}
#' With \code{stri_trans_totitle}, if the \code{word} \code{BreakIterator}
#' is used (the default), then the first letter of each word will be capitalized
#' and the rest will be transformed to lower case.
#' With a break iterator of type \code{sentence}, the first letter
#' of each sentence will be capitalized only.
#' Note that according the \pkg{ICU} \code{BreakInterator}
#' Note that according to the \pkg{ICU} User Guide,
#' the string \code{"one. two. three."} consists of one sentence.
#'
#' For more general (but not locale dependent)
#' text transforms refer to \code{\link{stri_trans_general}}.
#'
#' @param str character vector
#' @param boundary single character string, either \code{word}
#' or \code{sentence}, gives the BreakIterator to use when titlecasing
#' @param locale \code{NULL} or \code{""} for case mapping following
#' the conventions of the default locale, or a single string with
#' locale identifier, see \link{stringi-locale}.
#' @param opts_brkiter a named list with \pkg{ICU} BreakIterator's settings
#' as generated with \code{\link{stri_opts_brkiter}};
#' \code{NULL} for default break iterator, i.e. \code{word};
#' \code{stri_trans_totitle} only
#'
#' @return
#' Each function returns a character vector.
Expand All @@ -81,26 +85,28 @@
#' \donttest{
#' stri_trans_toupper("\u00DF", "de_DE") # small German Eszett / scharfes S
#' stri_cmp_eq(stri_trans_toupper("i", "en_US"), stri_trans_toupper("i", "tr_TR"))
#' stri_trans_toupper(c('abc','123','\u0105\u0104'))
#' stri_trans_tolower(c('AbC','123','\u0105\u0104'))
#' stri_trans_totitle(c('AbC','123','\u0105\u0104'))
#' stri_trans_totitle("GOOD-OLD cOOkiE mOnSTeR IS watCHinG You. Here HE comes!", boundary="word") # default boundary
#' stri_trans_totitle("GOOD-OLD cOOkiE mOnSTeR IS watCHinG You. Here HE comes!", boundary="sentence")
#' stri_trans_toupper(c('abc', '123', '\u0105\u0104'))
#' stri_trans_tolower(c('AbC', '123', '\u0105\u0104'))
#' stri_trans_totitle(c('AbC', '123', '\u0105\u0104'))
#' stri_trans_totitle("GOOD-OLD cOOkiE mOnSTeR IS watCHinG You. Here HE comes!",
#' stri_opts_brkiter(type="word")) # default boundary
#' stri_trans_totitle("GOOD-OLD cOOkiE mOnSTeR IS watCHinG You. Here HE comes!",
#' stri_opts_brkiter(type="sentence"))
#' }
stri_trans_tolower <- function(str, locale=NULL) {
.Call("stri_trans_casemap", str, 1L, "", locale, PACKAGE="stringi")
.Call("stri_trans_casemap", str, 1L, locale, PACKAGE="stringi")
}


#' @export
#' @rdname stri_trans_casemap
stri_trans_toupper <- function(str, locale=NULL) {
.Call("stri_trans_casemap", str, 2L, "", locale, PACKAGE="stringi")
.Call("stri_trans_casemap", str, 2L, locale, PACKAGE="stringi")
}


#' @export
#' @rdname stri_trans_casemap
stri_trans_totitle <- function(str, boundary="word", locale=NULL) {
.Call("stri_trans_casemap", str, 3L, boundary, locale, PACKAGE="stringi")
stri_trans_totitle <- function(str, opts_brkiter=NULL) {
.Call("stri_trans_casemap", str, 3L, opts_brkiter, PACKAGE="stringi")
}
7 changes: 6 additions & 1 deletion devel/testthat/test-trans-casemap.R
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,17 @@ test_that("stri_trans_totitle", {
expect_equivalent(stri_trans_totitle(letters), LETTERS)
expect_equivalent(stri_trans_totitle(stri_flatten(letters)), stri_flatten(c("A",letters[-1])))

expect_equivalent(stri_trans_totitle("\u0105\u0104", locale="pl_PL"), "\u0104\u0105")
expect_equivalent(stri_trans_totitle("\u0105\u0104"), "\u0104\u0105")

expect_equivalent(stri_trans_totitle("ala ma kota"), "Ala Ma Kota")
expect_equivalent(stri_trans_totitle("ala\tma\tkota"), "Ala\tMa\tKota")
expect_equivalent(stri_trans_totitle("ala\nma\nKota"), "Ala\nMa\nKota")
#totitle(totitle(x))==totitle(x)
expect_equivalent(stri_trans_totitle(stri_trans_totitle("Ala\nMa\nKota")),
stri_trans_totitle("Ala\nMa\nKota"))

expect_equivalent(stri_trans_totitle("GOOD-OLD cOOkiE mOnSTeR IS watCHinG You. Here HE comes!",
stri_opts_brkiter(type="word")), "Good-Old Cookie Monster Is Watching You. Here He Comes!")
expect_equivalent(stri_trans_totitle("GOOD-OLD cOOkiE mOnSTeR IS watCHinG You. Here HE comes!",
stri_opts_brkiter(type="sentence")), "Good-old cookie monster is watching you. Here he comes!")
})
2 changes: 1 addition & 1 deletion man/oper_comparison.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ Other locale_sensitive: \code{\link{stri_cmp}},
\code{\link{stri_duplicated_any}};
\code{\link{stri_enc_detect2}};
\code{\link{stri_extract_words}};
\code{\link{stri_locate_boundaries}};
\code{\link{stri_locate_boundaries}},
\code{\link{stri_locate_words}};
\code{\link{stri_opts_collator}};
\code{\link{stri_order}}, \code{\link{stri_sort}};
Expand Down
2 changes: 1 addition & 1 deletion man/stri_compare.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ Other locale_sensitive: \code{\link{\%s!==\%}},
\code{\link{stri_duplicated_any}};
\code{\link{stri_enc_detect2}};
\code{\link{stri_extract_words}};
\code{\link{stri_locate_boundaries}};
\code{\link{stri_locate_boundaries}},
\code{\link{stri_locate_words}};
\code{\link{stri_opts_collator}};
\code{\link{stri_order}}, \code{\link{stri_sort}};
Expand Down
24 changes: 20 additions & 4 deletions man/stri_duplicated.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,31 @@ strings in a character vector.
\details{
Missing values are regarded as equal.
These functions test for canonical equivalence of strings.
Such an operation is locale-dependent.
Unlike \code{\link{duplicated}} and \code{\link{anyDuplicated}},
these functions test for canonical equivalence of strings
(and not whether the strings are just bytewise equal).
Such operations are locale-dependent.
Hence, \code{stri_duplicated} and \code{stri_duplicated_any}
are significantly slower (but much better suited for natural language
processing) than their base R counterparts.
See also \code{\link{stri_unique}} for extracting unique elements.
}
\examples{
\donttest{
# In the following examples, we have 3 duplicated values,
# "a" - 2 times, NA - 1 time
stri_duplicated(c("a", "b", "a", NA, "a", NA))
stri_duplicated(c("a", "b", "a", NA, "a", NA), fromLast=TRUE)
stri_duplicated_any(c("a", "b", "a", NA, "a", NA))
# compare the results:
stri_duplicated(c("\\u0105", stri_trans_nfkd("\\u0105")))
duplicated(c("\\u0105", stri_trans_nfkd("\\u0105")))
stri_duplicated(c("groß", "GROSS", "Groß", "Gross"),
opts_collator=stri_opts_collator(strength=1))
duplicated(c("groß", "GROSS", "Groß", "Gross"))
}
}
\seealso{
Expand All @@ -67,7 +83,7 @@ Other locale_sensitive: \code{\link{\%s!==\%}},
\code{\link{stri_compare}};
\code{\link{stri_enc_detect2}};
\code{\link{stri_extract_words}};
\code{\link{stri_locate_boundaries}};
\code{\link{stri_locate_boundaries}},
\code{\link{stri_locate_words}};
\code{\link{stri_opts_collator}};
\code{\link{stri_order}}, \code{\link{stri_sort}};
Expand Down Expand Up @@ -97,7 +113,7 @@ Other locale_sensitive: \code{\link{\%s!==\%}},
\code{\link{stri_compare}};
\code{\link{stri_enc_detect2}};
\code{\link{stri_extract_words}};
\code{\link{stri_locate_boundaries}};
\code{\link{stri_locate_boundaries}},
\code{\link{stri_locate_words}};
\code{\link{stri_opts_collator}};
\code{\link{stri_order}}, \code{\link{stri_sort}};
Expand Down
2 changes: 1 addition & 1 deletion man/stri_enc_detect2.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ Other locale_sensitive: \code{\link{\%s!==\%}},
\code{\link{stri_duplicated}},
\code{\link{stri_duplicated_any}};
\code{\link{stri_extract_words}};
\code{\link{stri_locate_boundaries}};
\code{\link{stri_locate_boundaries}},
\code{\link{stri_locate_words}};
\code{\link{stri_opts_collator}};
\code{\link{stri_order}}, \code{\link{stri_sort}};
Expand Down
4 changes: 2 additions & 2 deletions man/stri_extract_words.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ Other locale_sensitive: \code{\link{\%s!==\%}},
\code{\link{stri_duplicated}},
\code{\link{stri_duplicated_any}};
\code{\link{stri_enc_detect2}};
\code{\link{stri_locate_boundaries}};
\code{\link{stri_locate_boundaries}},
\code{\link{stri_locate_words}};
\code{\link{stri_opts_collator}};
\code{\link{stri_order}}, \code{\link{stri_sort}};
Expand Down Expand Up @@ -88,7 +88,7 @@ Other search_extract: \code{\link{stri_extract}},
\code{\link{stri_match_last_regex}};
\code{\link{stringi-search}}
Other text_boundaries: \code{\link{stri_locate_boundaries}};
Other text_boundaries: \code{\link{stri_locate_boundaries}},
\code{\link{stri_locate_words}};
\code{\link{stri_opts_brkiter}};
\code{\link{stri_split_boundaries}};
Expand Down
Loading

0 comments on commit 2c405a8

Please sign in to comment.