Skip to content

Commit

Permalink
#105 done: new fun: stri_list2matrix
Browse files Browse the repository at this point in the history
  • Loading branch information
gagolews committed Oct 23, 2014
1 parent 035ed4e commit 7ca7ecd
Show file tree
Hide file tree
Showing 12 changed files with 270 additions and 2 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ export(stri_install_icudt)
export(stri_isempty)
export(stri_join)
export(stri_length)
export(stri_list2matrix)
export(stri_locale_get)
export(stri_locale_info)
export(stri_locale_list)
Expand Down
4 changes: 4 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@
`stri_split_regex`, `stri_split_coll` gained a `tokens_only` parameter,
which defaults to `FALSE` for backward compatibility.

* [NEW FUNCTION] #105: `stri_list2matrix` converts lists of atomic vectors
to character matrices, useful in connection with `stri_split`
and `stri_extract`.

* ....[NEW FUNCTIONS] #41: `stri_startswith_*` and `stri_endswith_*`
determine whether a string starts or ends with a given pattern.
[TO DO.........: documentation, coll, charclass, regex?, other]
Expand Down
4 changes: 4 additions & 0 deletions R/search_extract_4.R
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@
#' represents the results of a separate search scenario.
#' If a pattern is not found, then a character vector of length 1,
#' with single \code{NA} value will be generated.
#' If you do not like playing with lists, consider calling
#' \code{\link{stri_list2matrix}} on the resulting object.
#'
#' \code{stri_extract_first*} and \code{stri_extract_last*},
#' on the other hand, return a character vector.
Expand All @@ -101,6 +103,8 @@
#' stri_extract_all_regex('XaaaaX', c('\\p{Ll}', '\\p{Ll}+', '\\p{Ll}{2,3}', '\\p{Ll}{2,3}?'))
#' stri_extract_first_regex('XaaaaX', c('\\p{Ll}', '\\p{Ll}+', '\\p{Ll}{2,3}', '\\p{Ll}{2,3}?'))
#' stri_extract_last_regex('XaaaaX', c('\\p{Ll}', '\\p{Ll}+', '\\p{Ll}{2,3}', '\\p{Ll}{2,3}?'))
#'
#' stri_list2matrix(stri_extract_all_regex('XaaaaX', c('\\p{Ll}', '\\p{Ll}+')))
#' }
#'
#' @family search_extract
Expand Down
5 changes: 4 additions & 1 deletion R/search_split_4.R
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@
#' \code{stri_split} only
#'
#' @return All the functions return a list of character vectors.
#'
#' If you do not like playing with lists, consider calling
#' \code{\link{stri_list2matrix}} on the resulting object.
#'
#' @examples
#' \donttest{
Expand All @@ -99,6 +100,8 @@
#' stri_split_fixed(c("ab_c", "d_ef_g", "h", ""), "_", n_max=2, tokens_only=TRUE, omit_empty=TRUE)
#' stri_split_fixed(c("ab_c", "d_ef_g", "h", ""), "_", n_max=3, tokens_only=TRUE, omit_empty=TRUE)
#'
#' stri_list2matrix(stri_split_fixed(c("ab_c", "d_ef_g", "h", ""), "_", omit_empty=TRUE))
#'
#' stri_split_charclass("Lorem ipsum dolor sit amet", "\\p{WHITE_SPACE}")
#' stri_split_charclass(" Lorem ipsum dolor", "\\p{WHITE_SPACE}", n_max=3,
#' omit_empty=c(FALSE, TRUE))
Expand Down
80 changes: 80 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2014, Marek Gagolewski and Bartek Tartanus
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#' @title
#' Convert a List to a Character Matrix
#'
#' @description
#' This function converts a given list of atomic vectors to
#' a character matrix.
#'
#' @details
#' This function is similar to the builtin \code{\link{simplify2array}}
#' function. However, it always returns a character matrix,
#' even if each element in \code{x} is of length 1
#' or if elements in \code{x} are not of the same length.
#' Moreover, the elements in \code{x} are always coerced to character vectors.
#'
#' If \code{byrow} is \code{FALSE}, then a matrix with \code{length(x)}
#' columns is returned. The number of rows is the length of the
#' longest vector in \code{x}. Basically, we have
#' \code{result[i,j] == x[[j]][i]} if \code{i <= length(x[[j]])}
#' and \code{result[i,j] == fill} otherwise, see Examples.
#'
#' If \code{byrow} is \code{TRUE}, then the resulting matrix is
#' a transposition of the above-described one.
#'
#' This function may be useful e.g. in connection with \code{\link{stri_split}}.
#'
#' @param x a list of atomic vectors
#' @param byrow single logical value; should the resulting matrix be
#' transposed?
#' @param fill single string, see Details
#'
#' @return
#' Always returns a character matrix.
#'
#' @examples
#' \donttest{
#' simplify2array(list(c("a", "b"), c("c", "d"), c("e", "f")))
#' stri_list2matrix(list(c("a", "b"), c("c", "d"), c("e", "f")))
#' stri_list2matrix(list(c("a", "b"), c("c", "d"), c("e", "f")), byrow=TRUE)
#'
#' simplify2array(list("a", c("b", "c")))
#' stri_list2matrix(list("a", c("b", "c")))
#' stri_list2matrix(list("a", c("b", "c")), fill="")
#' }
#'
#' @family utils
#' @export
stri_list2matrix <- function(x, byrow=FALSE, fill=NA_character_)
{
   # Thin wrapper: all of the work is done by the compiled routine.
   # `fill` is passed through stri_enc_toutf8() before being handed over;
   # `x` and `byrow` are forwarded as given.
   .Call(
      "stri_list2matrix",
      x,
      byrow,
      stri_enc_toutf8(fill),
      PACKAGE="stringi"
   )
}
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ platform-independent set of functions known to *Java*, *Perl*, *Python*,
Hadley Wickham's [stringr](http://cran.r-project.org/web/packages/stringr)
package.

**Package maintainer**: [Marek Gagolewski](http://gagolewski.rexamine.com/)

**Authors**: [Marek Gagolewski](http://gagolewski.rexamine.com/),
[Bartlomiej Tartanus](http://tartanus.rexamine.com/)

Expand All @@ -37,7 +39,7 @@ For more details on copyright holders see the `LICENSE` file.
> [ICU license](http://source.icu-project.org/repos/icu/icu/trunk/license.html),
> a simple, permissive non-copyleft free software license, compatible with
> the GNU GPL. The *ICU* license is
> [intended](http://userguide.icu-project.org/icufaq#TOC-How-is-the-ICU-licensed)
> [intended](http://userguide.icu-project.org/icufaq#TOC-How-is-the-ICU-licensed-)
> to allow *ICU* to be included both in free software projects
> and in proprietary or commercial products.
Expand Down
20 changes: 20 additions & 0 deletions devel/testthat/test-list2matrix.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
require(testthat)

test_that("stri_list2matrix", {

   # Fixtures shared by several expectations below.
   equal_len <- list(c("a", "b"), c("c", "d"), c("e", "f"))
   ragged    <- list("a", c("b", "c"))

   # Equal-length input: must agree with simplify2array (and its transpose).
   expect_identical(
      stri_list2matrix(equal_len),
      simplify2array(equal_len)
   )
   expect_identical(
      stri_list2matrix(equal_len, byrow=TRUE),
      t(simplify2array(equal_len))
   )

   # Ragged input: shorter vectors are padded with `fill` (NA by default).
   expect_identical(
      stri_list2matrix(ragged),
      matrix(c("a", NA, "b", "c"), ncol=2)
   )
   expect_identical(
      stri_list2matrix(ragged, fill=""),
      matrix(c("a", "", "b", "c"), ncol=2)
   )

   # Degenerate shapes: empty list and list of empty vectors.
   expect_identical(
      stri_list2matrix(list()),
      structure(character(0), dim=c(0,0))
   )
   expect_identical(
      stri_list2matrix(list(character(0), character(0))),
      structure(character(0), dim=c(0,2))
   )

   # A non-atomic element (a function) must be rejected.
   expect_error(stri_list2matrix(list(LETTERS, mean, letters)))
})
52 changes: 52 additions & 0 deletions man/stri_list2matrix.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
% Generated by roxygen2 (4.0.2): do not edit by hand
\name{stri_list2matrix}
\alias{stri_list2matrix}
\title{Convert a List to a Character Matrix}
\usage{
stri_list2matrix(x, byrow = FALSE, fill = NA_character_)
}
\arguments{
\item{x}{a list of atomic vectors}

\item{byrow}{single logical value; should the resulting matrix be
transposed?}

\item{fill}{single string, see Details}
}
\value{
Always returns a character matrix.
}
\description{
This function converts a given list of atomic vectors to
a character matrix.
}
\details{
This function is similar to the builtin \code{\link{simplify2array}}
function. However, it always returns a character matrix,
even if each element in \code{x} is of length 1
or if elements in \code{x} are not of the same length.
Moreover, the elements in \code{x} are always coerced to character vectors.

If \code{byrow} is \code{FALSE}, then a matrix with \code{length(x)}
columns is returned. The number of rows is the length of the
longest vector in \code{x}. Basically, we have
\code{result[i,j] == x[[j]][i]} if \code{i <= length(x[[j]])}
and \code{result[i,j] == fill} otherwise, see Examples.

If \code{byrow} is \code{TRUE}, then the resulting matrix is
a transposition of the above-described one.

This function may be useful e.g. in connection with \code{\link{stri_split}}.
}
\examples{
\donttest{
simplify2array(list(c("a", "b"), c("c", "d"), c("e", "f")))
stri_list2matrix(list(c("a", "b"), c("c", "d"), c("e", "f")))
stri_list2matrix(list(c("a", "b"), c("c", "d"), c("e", "f")), byrow=TRUE)

simplify2array(list("a", c("b", "c")))
stri_list2matrix(list("a", c("b", "c")))
stri_list2matrix(list("a", c("b", "c")), fill="")
}
}

1 change: 1 addition & 0 deletions src/stri_cpp.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,4 +72,5 @@ stri_trans_transliterate.cpp \
stri_uchar.cpp \
stri_ucnv.cpp \
stri_uloc.cpp \
stri_utils.cpp \
stri_wrap.cpp
1 change: 1 addition & 0 deletions src/stri_stringi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ static const R_CallMethodDef cCallMethods[] = {
STRI__MAKE_CALL_METHOD(stri_join2_nocollapse, 2),
// STRI__MAKE_CALL_METHOD(stri_justify, 2), // TODO: version >= 0.2
STRI__MAKE_CALL_METHOD(stri_length, 1),
STRI__MAKE_CALL_METHOD(stri_list2matrix, 3),
STRI__MAKE_CALL_METHOD(stri_locale_info, 1),
STRI__MAKE_CALL_METHOD(stri_locale_list, 0),
STRI__MAKE_CALL_METHOD(stri_locale_set, 1),
Expand Down
4 changes: 4 additions & 0 deletions src/stri_stringi.h
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,10 @@ SEXP stri_test_UnicodeContainer16(SEXP str);
SEXP stri_test_UnicodeContainer8(SEXP str);
SEXP stri_test_returnasis(SEXP x);


// utils.cpp
SEXP stri_list2matrix(SEXP x, SEXP byrow, SEXP fill);

// ------------------------------------------------------------------------


Expand Down
96 changes: 96 additions & 0 deletions src/stri_utils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
/* This file is part of the 'stringi' package for R.
* Copyright (c) 2013-2014, Marek Gagolewski and Bartek Tartanus
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/


#include "stri_stringi.h"
#include "stri_container_utf8.h"
#include "stri_container_listutf8.h"


/**
 * Convert a list to a character matrix
 *
 * Let n be the length of `x` and m the length of its longest element.
 * If `byrow` is false, an m-by-n matrix is returned whose i-th column holds
 * the i-th element of `x`; if `byrow` is true, the n-by-m transpose is
 * returned. Positions past the end of a shorter element are set to the
 * first (only) string of `fill`.
 *
 * @param x a list
 * @param byrow single logical value
 * @param fill single string
 * @return character matrix
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-23)
 */
SEXP stri_list2matrix(SEXP x, SEXP byrow, SEXP fill)
{
   // The prepare_arg helpers may hand back newly allocated objects (e.g. when
   // coercion is needed). Such objects must be protected, because the calls
   // that follow (the next prepare_arg call, Rf_allocMatrix) allocate memory
   // and may therefore trigger a garbage collection.
   // Argument checking order (x, byrow, fill) is kept as it was.
   PROTECT(x = stri_prepare_arg_list_string(x, "x"));
   bool byrow2 = stri__prepare_arg_logical_1_notNA(byrow, "byrow");
   PROTECT(fill = stri_prepare_arg_string_1(fill, "fill")); // enc2utf8 called in R

   STRI__ERROR_HANDLER_BEGIN
   R_len_t n = LENGTH(x);
   SEXP fill2 = STRING_ELT(fill, 0);

   R_len_t m = 0; // maximal vector length
   for (R_len_t i=0; i<n; ++i) {
      R_len_t k = LENGTH(VECTOR_ELT(x, i));
      if (k > m) m = k;
   }

   SEXP ret;
   if (!byrow2) {
      // m rows, n columns: the cells are written in storage order,
      // one list element (i.e. one column of m cells) after another
      STRI__PROTECT(ret = Rf_allocMatrix(STRSXP, m, n));
      R_len_t ret_idx = 0;
      for (R_len_t i=0; i<n; ++i) {
         SEXP cur_str = VECTOR_ELT(x, i);
         R_len_t cur_len = LENGTH(cur_str);
         R_len_t j;
         for (j=0; j<cur_len; ++j)
            SET_STRING_ELT(ret, ret_idx++, STRING_ELT(cur_str, j));
         for (; j<m; ++j) // pad the rest of this column
            SET_STRING_ELT(ret, ret_idx++, fill2);
      }
   }
   else {
      // n rows, m columns: cell (i, j) lives at linear index i + j*n
      STRI__PROTECT(ret = Rf_allocMatrix(STRSXP, n, m));
      for (R_len_t i=0; i<n; ++i) {
         SEXP cur_str = VECTOR_ELT(x, i);
         R_len_t cur_len = LENGTH(cur_str);
         R_len_t j;
         for (j=0; j<cur_len; ++j)
            SET_STRING_ELT(ret, i+j*n, STRING_ELT(cur_str, j));
         for (; j<m; ++j) // pad the rest of this row
            SET_STRING_ELT(ret, i+j*n, fill2);
      }
   }

   // NOTE(review): STRI__UNPROTECT_ALL is assumed to release only the objects
   // registered via STRI__PROTECT (here: ret) -- confirm against the macro
   // definitions. x and fill were protected with plain PROTECT above and are
   // released explicitly. On the error path Rf_error() unwinds the protection
   // stack, so no explicit release is needed there.
   STRI__UNPROTECT_ALL
   UNPROTECT(2); // x, fill
   return ret;

   STRI__ERROR_HANDLER_END({/* no-op on err */})
}

0 comments on commit 7ca7ecd

Please sign in to comment.