Skip to content

Commit

Permalink
#23: overlap in stri_opts_fixed - overlapping pattern matches in cert…
Browse files Browse the repository at this point in the history
…ain search functions
  • Loading branch information
gagolews committed Dec 8, 2014
1 parent 5fe1571 commit efccab3
Show file tree
Hide file tree
Showing 15 changed files with 59 additions and 26 deletions.
6 changes: 4 additions & 2 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@

* [BUGFIX] #94: t.b.d. Solaris!!!

* [NEW FEATURE] #23: [[[not sure if for 0.4-1]]] -- QUESTION [overlapping patterns in fixed search]

* [NEW FEATURE] #113: [[[not sure if for 0.4-1]]] -- QUESTION [brkiter in coll/fixed search]

(...)
Expand All @@ -40,6 +38,10 @@
processing) yet very fast `case_insensitive` pattern matching can be
performed now. `stri_extract_*_fixed` is again available.

* [NEW FEATURE] #23: `stri_extract_all_fixed`, `stri_count_fixed`, and
`stri_locate_all_fixed` may now also look for overlapping pattern
matches, see `?stri_opts_fixed`.

* [NEW FEATURE] #129: `stri_match_*_regex` gained a `cg_missing` argument.

* [NEW FEATURE] #117: `stri_extract_all_*()`, `stri_locate_all_*()`,
Expand Down
10 changes: 8 additions & 2 deletions R/opts.R
Original file line number Diff line number Diff line change
Expand Up @@ -289,9 +289,14 @@ stri_opts_brkiter <- function(type, locale, skip_word_none,
#' Full case mappings should be used whenever possible because they produce
#' better results by working on whole strings. They take into account
#' the string context and the language and can map to a result string with
#' a different length as appropriate, see \link{stringi-search-coll}.
#' a different length as appropriate, see \link{stringi-search-coll}.
#'
#' Searching for overlapping pattern matches is available in the
#' \code{\link{stri_extract_all_fixed}}, \code{\link{stri_locate_all_fixed}},
#' and \code{\link{stri_count_fixed}} functions.
#'
#' @param case_insensitive logical; enable simple case insensitive matching
#' @param overlap logical; enable detection of overlapping matches in certain functions
#' @param ... any other arguments to this function are purposely ignored
#'
#' @return
Expand All @@ -308,10 +313,11 @@ stri_opts_brkiter <- function(type, locale, skip_word_none,
#' stri_detect_fixed("ala", "ALA") # case-sensitive by default
#' stri_detect_fixed("ala", "ALA", opts_fixed=stri_opts_fixed(case_insensitive=TRUE))
#' stri_detect_fixed("ala", "ALA", case_insensitive=TRUE) # equivalent
stri_opts_fixed <- function(case_insensitive, ...)
stri_opts_fixed <- function(case_insensitive, overlap, ...) {
  # Build the settings list incrementally: an option appears in the result
  # only if the caller supplied it, so omitted options are simply absent.
  # Arguments captured by `...` are never read and hence are dropped.
  settings <- list()
  if (!missing(case_insensitive)) {
    settings["case_insensitive"] <- case_insensitive
  }
  if (!missing(overlap)) {
    settings["overlap"] <- overlap
  }
  settings
}

3 changes: 2 additions & 1 deletion R/search_extract_4.R
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,8 @@
#' stri_extract_all_regex('XaaaaX', c('\\p{Ll}', '\\p{Ll}+'), simplify=TRUE)
#' stri_extract_all_regex('XaaaaX', c('\\p{Ll}', '\\p{Ll}+'), simplify=NA)
#'
#' stri_extract_all_fixed('abaBAba', "aba", case_insensitive=TRUE)
#' stri_extract_all_fixed("abaBAba", "Aba", case_insensitive=TRUE)
#' stri_extract_all_fixed("abaBAba", "Aba", case_insensitive=TRUE, overlap=TRUE)
#'
#' @family search_extract
#'
Expand Down
1 change: 1 addition & 0 deletions R/search_locate_4.R
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@
#' stri_locate_last_coll("\ufdfa\ufdfa\ufdfaXYZ", pat, strength = 1)
#'
#' stri_locate_all_fixed(c('AaaaaaaA', 'AAAA'), 'a')
#' stri_locate_all_fixed(c('AaaaaaaA', 'AAAA'), 'a', case_insensitive=TRUE, overlap=TRUE)
#' stri_locate_first_fixed(c('AaaaaaaA', 'aaa', 'AAA'), 'a')
#' stri_locate_last_fixed(c('AaaaaaaA', 'aaa', 'AAA'), 'a')
#'
Expand Down
15 changes: 8 additions & 7 deletions devel/testthat/test-count-fixed.R
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,14 @@ test_that("stri_count_fixed", {
expect_equivalent(stri_count_fixed("bbbbba", "BBBBB", case_insensitive=TRUE), 1L)

for (p in stri_sub("abcdefghij", 1, 1:6)) {
for (i in 1:5) {
for (i in 0:5) {
for (val in c(TRUE, FALSE)) {
expect_equivalent(stri_count_fixed(stri_dup(p, i), p, case_insensitive=val), i)
expect_equivalent(stri_count_fixed(stri_dup(stri_c("x", p, "y", case_insensitive=val), i), p), i)
expect_equivalent(stri_count_fixed(stri_dup(stri_c("x", p, "yy", case_insensitive=val), i), p), i)
expect_equivalent(stri_count_fixed(stri_dup(stri_c("x", p, "yyy", case_insensitive=val), i), p), i)
expect_equivalent(stri_count_fixed(stri_dup(stri_c("x", p, "yyyy", case_insensitive=val), i), p), i)
expect_equivalent(stri_count_fixed(stri_dup(stri_c("x", p, "yyyyy", case_insensitive=val), i), p), i)
# The search options must be passed to stri_count_fixed() itself. Named
# arguments given to stri_c() are concatenated into the haystack as
# "TRUE"/"FALSE" and never reach the search engine.
# Each haystack holds exactly i non-overlapping copies of p, so the expected
# count is i regardless of the case_insensitive/overlap settings.
expect_equivalent(stri_count_fixed(stri_dup(p, i), p, case_insensitive=val, overlap=val), i)
expect_equivalent(stri_count_fixed(stri_dup(stri_c("x", p, "y"), i), p, case_insensitive=val, overlap=val), i)
expect_equivalent(stri_count_fixed(stri_dup(stri_c("x", p, "yy"), i), p, case_insensitive=val, overlap=val), i)
expect_equivalent(stri_count_fixed(stri_dup(stri_c("x", p, "yyy"), i), p, case_insensitive=val, overlap=val), i)
expect_equivalent(stri_count_fixed(stri_dup(stri_c("x", p, "yyyy"), i), p, case_insensitive=val, overlap=val), i)
expect_equivalent(stri_count_fixed(stri_dup(stri_c("x", p, "yyyyy"), i), p, case_insensitive=val, overlap=val), i)
}
}
}
Expand Down Expand Up @@ -77,4 +77,5 @@ massa nibh nec erat."
expect_identical(stri_count_fixed(s,"bab"),c(5L,0L))
expect_identical(stri_count_fixed(c("lalal","12l34l56","\u0105\u0f3l\u0142"),"l"),3:1)

expect_equivalent(stri_count_fixed(c('AaaaaaaA', 'AAAA'), 'a', case_insensitive=TRUE, overlap=TRUE), c(8, 4))
})
2 changes: 2 additions & 0 deletions devel/testthat/test-extract-fixed.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ test_that("stri_extract_all_fixed", {

expect_identical(stri_extract(c("a_a", "a_a_a", "a", ""), mode='all', fixed="a", simplify=TRUE, omit_no_match=TRUE),
matrix(c("a", "a", "a", "", "a", "a", "", "", "", "a", "", ""), nrow=4))

expect_identical(stri_extract_all_fixed("abaBAba", "Aba", case_insensitive=TRUE, overlap=TRUE), list(c("aba", "aBA", "Aba")))

# expect_identical(stri_extract_all_fixed(c("ababab", NA, "ab", "ba"), "ab"),
# str_extract_all(c("ababab", NA, "ab", "ba"), "ab"))
Expand Down
2 changes: 2 additions & 0 deletions devel/testthat/test-locate-fixed.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ test_that("stri_locate_all_fixed", {
expect_equivalent(as.integer(stri_locate_all_fixed("?", "[a-z]", omit_no_match = TRUE)[[1]]),
integer(0))

expect_equivalent(stri_locate_all_fixed("\u0105\u0105\u0105", "\u0105\u0105", overlap=TRUE), list(matrix(c(1,2, 2, 3), byrow=TRUE,ncol=2)))
expect_equivalent(stri_locate_all_fixed("1a\u0105a", "\u0105"), list(matrix(c(3,3))))
expect_equivalent(stri_locate_all_fixed("aaa", "aa"), list(matrix(c(1,2))))
expect_equivalent(stri_locate_all_fixed("aaa", "a"), list(matrix(rep(1:3,2),ncol=2)))
Expand Down Expand Up @@ -81,6 +82,7 @@ test_that("stri_locate_first_fixed", {
expect_equivalent(stri_locate_first_fixed("1-1-2-33--2", "-32", case_insensitive=val), matrix(c(NA_integer_,NA_integer_)))
}

expect_warning(expect_equivalent(stri_locate_first_fixed("\u0105a", "\u0105a", overlap=TRUE), matrix(c(1,2))))
expect_equivalent(stri_locate_first_fixed("\u0105a", "\u0105a"), matrix(c(1,2)))
expect_equivalent(stri_locate_first_fixed(stri_trans_nfkd("\u0105a"), "\u0105a"), matrix(c(NA_integer_,NA)))
expect_equivalent(stri_locate_first_fixed("\U0001F0A0a", "a"), matrix(c(2,2)))
Expand Down
3 changes: 2 additions & 1 deletion man/stri_extract.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,8 @@ stri_list2matrix(stri_extract_all_regex('XaaaaX', c('\\\\p{Ll}', '\\\\p{Ll}+')))
stri_extract_all_regex('XaaaaX', c('\\\\p{Ll}', '\\\\p{Ll}+'), simplify=TRUE)
stri_extract_all_regex('XaaaaX', c('\\\\p{Ll}', '\\\\p{Ll}+'), simplify=NA)

stri_extract_all_fixed('abaBAba', "aba", case_insensitive=TRUE)
stri_extract_all_fixed("abaBAba", "Aba", case_insensitive=TRUE)
stri_extract_all_fixed("abaBAba", "Aba", case_insensitive=TRUE, overlap=TRUE)
}
\seealso{
Other search_extract: \code{\link{stri_extract_all_words}},
Expand Down
1 change: 1 addition & 0 deletions man/stri_locate.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ pat <- stri_paste("\\u0635\\u0644\\u0649 \\u0627\\u0644\\u0644\\u0647 ",
stri_locate_last_coll("\\ufdfa\\ufdfa\\ufdfaXYZ", pat, strength = 1)
stri_locate_all_fixed(c('AaaaaaaA', 'AAAA'), 'a')
stri_locate_all_fixed(c('AaaaaaaA', 'AAAA'), 'a', case_insensitive=TRUE, overlap=TRUE)
stri_locate_first_fixed(c('AaaaaaaA', 'aaa', 'AAA'), 'a')
stri_locate_last_fixed(c('AaaaaaaA', 'aaa', 'AAA'), 'a')
Expand Down
10 changes: 8 additions & 2 deletions man/stri_opts_fixed.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
\alias{stri_opts_fixed}
\title{Generate a List with Fixed Pattern Search Engine's Settings}
\usage{
stri_opts_fixed(case_insensitive, ...)
stri_opts_fixed(case_insensitive, overlap, ...)
}
\arguments{
\item{case_insensitive}{logical; enable simple case insensitive matching}
\item{overlap}{logical; enable detection of overlapping matches in certain functions}
\item{...}{any other arguments to this function are purposely ignored}
}
\value{
Expand All @@ -23,7 +25,11 @@ Case-insensitive matching uses a simple, single-code point case mapping
Full case mappings should be used whenever possible because they produce
better results by working on whole strings. They take into account
the string context and the language and can map to a result string with
a different length as appropriate, see \link{stringi-search-coll}.
a different length as appropriate, see \link{stringi-search-coll}.
Searching for overlapping pattern matches is available in the
\code{\link{stri_extract_all_fixed}}, \code{\link{stri_locate_all_fixed}},
and \code{\link{stri_count_fixed}} functions.
}
\examples{
stri_detect_fixed("ala", "ALA") # case-sensitive by default
Expand Down
9 changes: 5 additions & 4 deletions src/stri_container_bytesearch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -804,11 +804,12 @@ R_len_t StriContainerByteSearch::findFromPosBack_KMP(R_len_t startPos)
* may call Rf_error
*
* @param opts_fixed list
* @param allow_overlap
* @return flags
*
* @version 0.4-1 (Marek Gagolewski, 2014-12-07)
*/
uint32_t StriContainerByteSearch::getByteSearchFlags(SEXP opts_fixed)
uint32_t StriContainerByteSearch::getByteSearchFlags(SEXP opts_fixed, bool allow_overlap)
{
uint32_t flags = 0;
if (!isNull(opts_fixed) && !Rf_isVectorList(opts_fixed))
Expand All @@ -831,9 +832,9 @@ uint32_t StriContainerByteSearch::getByteSearchFlags(SEXP opts_fixed)
if (!strcmp(curname, "case_insensitive")) {
bool val = stri__prepare_arg_logical_1_notNA(VECTOR_ELT(opts_fixed, i), "case_insensitive");
if (val) flags |= BYTESEARCH_CASE_INSENSITIVE;
// } else if (!strcmp(curname, "overlap??")) {
// bool val = stri__prepare_arg_logical_1_notNA(VECTOR_ELT(opts_fixed, i), "overlap??");
// if (val) flags |= FIXED_?????;
} else if (!strcmp(curname, "overlap") && allow_overlap) {
bool val = stri__prepare_arg_logical_1_notNA(VECTOR_ELT(opts_fixed, i), "overlap");
if (val) flags |= BYTESEARCH_OVERLAP;
} else {
Rf_warning(MSG__INCORRECT_FIXED_OPTION, curname);
}
Expand Down
17 changes: 13 additions & 4 deletions src/stri_container_bytesearch.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ class StriContainerByteSearch : public StriContainerUTF8 {
private:

typedef enum ByteSearchFlag {
BYTESEARCH_CASE_INSENSITIVE = 2
BYTESEARCH_CASE_INSENSITIVE = 2,
BYTESEARCH_OVERLAP = 4
} ByteSearchFlag;

R_len_t searchPos; // -1 after reset, searchLen on no further matches
Expand Down Expand Up @@ -106,7 +107,7 @@ class StriContainerByteSearch : public StriContainerUTF8 {

public:

static uint32_t getByteSearchFlags(SEXP opts_fixed);
static uint32_t getByteSearchFlags(SEXP opts_fixed, bool allow_overlap=false);

StriContainerByteSearch();
StriContainerByteSearch(SEXP rstr, R_len_t nrecycle, uint32_t flags);
Expand Down Expand Up @@ -185,13 +186,21 @@ class StriContainerByteSearch : public StriContainerUTF8 {
#endif

if (searchPos < 0) return findFirst();

int pos;
if (flags&BYTESEARCH_OVERLAP) {
pos = searchPos;
U8_FWD_1(searchStr, pos, searchLen);
}
else
pos = searchEnd;

#ifndef STRI__BYTESEARCH_DISABLE_SHORTPAT
if (!(flags&BYTESEARCH_CASE_INSENSITIVE) && patternLen <= 4)
return findFromPosFwd_short(searchEnd);
return findFromPosFwd_short(pos);
#endif

return findFromPosFwd_KMP(searchEnd);
return findFromPosFwd_KMP(pos);
}


Expand Down
2 changes: 1 addition & 1 deletion src/stri_search_fixed_count.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@
*/
SEXP stri_count_fixed(SEXP str, SEXP pattern, SEXP opts_fixed)
{
uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed);
uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed, /*allow_overlap*/true);
PROTECT(str = stri_prepare_arg_string(str, "str"));
PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));

Expand Down
2 changes: 1 addition & 1 deletion src/stri_search_fixed_extract.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ SEXP stri_extract_last_fixed(SEXP str, SEXP pattern, SEXP opts_fixed)
*/
SEXP stri_extract_all_fixed(SEXP str, SEXP pattern, SEXP simplify, SEXP omit_no_match, SEXP opts_fixed)
{
uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed);
uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed, /*allow_overlap*/true);
bool omit_no_match1 = stri__prepare_arg_logical_1_notNA(omit_no_match, "omit_no_match");
PROTECT(simplify = stri_prepare_arg_logical_1(simplify, "simplify"));
PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument
Expand Down
2 changes: 1 addition & 1 deletion src/stri_search_fixed_locate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ SEXP stri_locate_last_fixed(SEXP str, SEXP pattern, SEXP opts_fixed)
*/
SEXP stri_locate_all_fixed(SEXP str, SEXP pattern, SEXP omit_no_match, SEXP opts_fixed)
{
uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed);
uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed, /*allow_overlap*/true);
bool omit_no_match1 = stri__prepare_arg_logical_1_notNA(omit_no_match, "omit_no_match");
PROTECT(str = stri_prepare_arg_string(str, "str"));
PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
Expand Down

0 comments on commit efccab3

Please sign in to comment.