diff --git a/DESCRIPTION b/DESCRIPTION
index dadf580..bfbd3c5 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: RcppJagger
 Title: An R Wrapper for Jagger
-Version: 0.0.1
+Version: 0.0.2
 Authors@R: c(
     person("Shusei", "Eshima", email = "shuseieshima@gmail.com", role = c("aut","cre"), comment = c(ORCID = "0000-0003-3613-4046")),
     person("Naoki", "Yoshinaga", role = c("ctb"))
@@ -21,9 +21,9 @@ Suggests: dplyr (>= 1.1.0), testthat (>= 3.1.5), tibble
 Config/testthat/edition: 3
 LazyData: TRUE
 NeedsCompilation: yes
-Packaged: 2023-06-04 13:28:58 UTC; shusei
+Packaged: 2023-06-08 19:03:16 UTC; shusei
 Author: Shusei Eshima [aut, cre] (<https://orcid.org/0000-0003-3613-4046>),
   Naoki Yoshinaga [ctb]
 Maintainer: Shusei Eshima <shuseieshima@gmail.com>
 Repository: CRAN
-Date/Publication: 2023-06-06 07:00:06 UTC
+Date/Publication: 2023-06-08 22:22:56 UTC
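The MD5 file that follows is CRAN's per-file checksum index, so the changed hashes are a quick map of which files were actually touched in 0.0.2. The same checksums can be recomputed with base R; a minimal sketch, assuming the source tarball has been unpacked into a local directory (the directory and file names below are placeholders, not taken from the tarball):

pkg_dir <- "RcppJagger"                           # hypothetical unpacked source directory
files <- c("DESCRIPTION", "R/pos.R", "src/main.cpp")
tools::md5sum(file.path(pkg_dir, files))          # compare against the entries in the MD5 index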
diff --git a/MD5 b/MD5
index 5d16ab2..2745822 100644
--- a/MD5
+++ b/MD5
@@ -1,38 +1,38 @@
-e4ab7751d95cc499025f796dd5835db0 *DESCRIPTION
+a6bb715255a378bab04efe6c7071c538 *DESCRIPTION
 8bb48ef08475ae0cb3e0852924bd54ee *NAMESPACE
-72068cd54fcf95d6eae221b46120eb8c *NEWS.md
-f9f7af660472a402911b86dab482e9b2 *R/RcppExports.R
+40b0fd51a66ff0f4fe0e07c08e997208 *NEWS.md
+1eab0d1046382d4e44cb01d8bce7ec97 *R/RcppExports.R
 dc6f159d77512c9b39bb9df2a6211966 *R/RcppJagger.R
 49ab70a4e1432ae366ac3f1b3ed573a8 *R/data.R
-a1123c56bfe1d024bd9b4177d2a03e0f *R/lemmatizer.R
-9666151174d17bfd01a2463aa09644d6 *R/pos.R
-00ca98c3b4ce1f0136935b29699e3d79 *R/tokenizer.R
+02a4b2b92d572134845d668ae12aaa69 *R/lemmatizer.R
+ba70802ce80891d3a8977e3cb718257a *R/pos.R
+3b4efa985838a5b4e3718edfe556766c *R/tokenizer.R
 80fcba092843dccd645184befeb5c35f *R/util.R
 fa7b592dc30adfd4d804f6b365c728a3 *data/sentence_example.rda
 6429b333f6879da564935edf00802f7c *inst/include/ccedar_core.h
 83aa264c4503d476805cae347dfcf226 *inst/include/jagger.cc
 1ff928aad2d761f6580473fda8bba1d2 *inst/include/jagger.h
 bb5e4ec24ecaed6c71be0d76836eedba *inst/include/mman.h
-b296159a609862232c8fc2136140e08d *man/lemmatize.Rd
+a5abcde7cded3808464f4e3c118fc7ef *man/lemmatize.Rd
 9cfd165a7b6b42bc8928c7bc49a51ec8 *man/lemmatize_cpp_vec.Rd
-329a9e0c25db5fc2b250ef78661736a4 *man/lemmatize_tbl.Rd
+7c59709815d427b02ffaf6247cb6198c *man/lemmatize_tbl.Rd
 9cfc71ce88c41c1194b2426fe7e0fef3 *man/pos.Rd
-d41d1ec436d7777c0dd5477a6a26982f *man/pos_cpp_vec.Rd
+b6d398dc9ed399dd1cf16d4eab24c1b6 *man/pos_cpp_vec.Rd
 910b500dd766df96908f2908c035fa8f *man/pos_simple.Rd
-698357db8eb2731dabf6f6578dfe4e88 *man/pos_simple_cpp_vec.Rd
+859e55c9f42c04894903b45d5315074a *man/pos_simple_cpp_vec.Rd
 f026c0964eac7216fbe1650737573bae *man/sentence_example.Rd
-7abe7d690f6471024dea9ee17a898ee1 *man/tokenize.Rd
+1c20a20a1ac4b8a95febac0e75fc6dbe *man/tokenize.Rd
 a3f87da0a42e58fe8b2bb48ce563913d *man/tokenize_cpp_vec.Rd
 5ad3d74d8e56c53ad324184d55cf28bf *man/tokenize_tbl.Rd
-c1b4a373e6630576f922d4391312cb5b *src/RcppExports.cpp
-b020b7503215ce3884f7d19e4516ab5f *src/lemmatizer.h
-8fd27335b165bca5f2a1aa7f2ca99863 *src/main.cpp
-a8ed2fb3d8b00a7c1c70298d904b662e *src/pos.h
+01a357bad718092e0beb2a561afebdc4 *src/RcppExports.cpp
+192d9de22d0dd43048cf3f60ed2e3323 *src/lemmatizer.h
+f6ee5d871a44fc6fa1e007bbc7c244fb *src/main.cpp
+9272de4b3ee203551d47a5cfea916a71 *src/pos.h
 a34ca3124494098634b0a434998a8106 *src/pos_simple.h
 dd9a9a7825a8e4764ea26282793a1616 *src/reader.h
 2294072a6814593ae4b60c6f4d948f65 *src/tokenizer.h
 1199cbae84522095d5b3c49bebbadfb9 *tests/testthat.R
 e77d910f914dea04b374c5759f2c9b0f *tests/testthat/test-lemmatizer.R
-e5dd6c1eacfb9c5f52fd05ad18429538 *tests/testthat/test-pos.R
-1ccbc80765a129be582e2d3db1e677cc *tests/testthat/test-sanity.R
+9dd4dd39f70331fa5a933d328af45328 *tests/testthat/test-pos.R
+7416e5a9070a9c6bfa6a59fc713cc411 *tests/testthat/test-sanity.R
 91b0056a14954bcfb9d4be16bc1fa48a *tests/testthat/test-tokenizer.R
diff --git a/NEWS.md b/NEWS.md
index b803f30..36cd9d0 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,2 +1,5 @@
+# RcppJagger 0.0.2
+* Improvement of performance and stability.
+
 # RcppJagger 0.0.1
-* First submission to CRAN
+* First submission to CRAN.
diff --git a/R/RcppExports.R b/R/RcppExports.R
index 7bf222b..f0edf3f 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -4,15 +4,15 @@
 #' POS tagging in C++
 #'
 #' @keywords internal
-pos_cpp_vec <- function(inputs, model_path) {
-    .Call('_RcppJagger_pos_cpp_vec', PACKAGE = 'RcppJagger', inputs, model_path)
+pos_cpp_vec <- function(inputs, model_path, keep_vec, keep_all) {
+    .Call('_RcppJagger_pos_cpp_vec', PACKAGE = 'RcppJagger', inputs, model_path, keep_vec, keep_all)
 }
 
 #' POS tagging in C++ (only token and pos)
 #'
 #' @keywords internal
-pos_simple_cpp_vec <- function(inputs, model_path) {
-    .Call('_RcppJagger_pos_simple_cpp_vec', PACKAGE = 'RcppJagger', inputs, model_path)
+pos_simple_cpp_vec <- function(inputs, model_path, keep_vec, keep_all) {
+    .Call('_RcppJagger_pos_simple_cpp_vec', PACKAGE = 'RcppJagger', inputs, model_path, keep_vec, keep_all)
 }
 
 #' Tokenizer (a vector input)
diff --git a/R/lemmatizer.R b/R/lemmatizer.R
index d0aa5a4..a4967d0 100644
--- a/R/lemmatizer.R
+++ b/R/lemmatizer.R
@@ -4,7 +4,7 @@
 #' @param model_path a path to the model.
 #' @param keep a vector of POS(s) to keep. Default is `NULL`.
 #' @param concat logical. If TRUE, the function returns a concatenated string. Default is `TRUE`.
-#' @return a list.
+#' @return a vector (if `concat = TRUE`) or a list (if `concat = FALSE`).
 #' @examples
 #' data(sentence_example)
 #' res_lemmatize <- lemmatize(sentence_example$text)
@@ -40,7 +40,7 @@ lemmatize <- function(input, model_path = NULL, keep = NULL, concat = TRUE) {
 #' @param column a column name of the tibble to tokenize.
 #' @param model_path a path to the model.
 #' @param keep a vector of POS(s) to keep. Default is `NULL`.
-#' @return a vector.
+#' @return a tibble.
 #' @examples
 #' data(sentence_example)
 #' res_lemmatize <- lemmatize_tbl(tibble::as_tibble(sentence_example), "text")
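The @return corrections above bring the documentation in line with what lemmatize() and lemmatize_tbl() actually return. An illustrative R session, reusing the roxygen examples shown in these hunks (the object names are mine, and a Jagger model must be available for the calls to run):

library(RcppJagger)
data(sentence_example)

res_vec  <- lemmatize(sentence_example$text)                  # concat = TRUE  -> character vector
res_list <- lemmatize(sentence_example$text, concat = FALSE)  # concat = FALSE -> list, one element per input
res_tbl  <- lemmatize_tbl(tibble::as_tibble(sentence_example), "text")  # -> tibble

is.character(res_vec)  # TRUE
is.list(res_list)      # TRUE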
diff --git a/R/pos.R b/R/pos.R
index 5098e9c..8d390ab 100644
--- a/R/pos.R
+++ b/R/pos.R
@@ -21,19 +21,15 @@ pos <- function(input, model_path = NULL, keep = NULL, format = c("list", "data.
   }
 
   format <- rlang::arg_match(format)
-  result <- pos_cpp_vec(input, model_path)
-
-  if (!is.null(keep)) {
-    result <- purrr::map(result, function(x) {
-      idx <- x$pos %in% keep
-      x$token <- x$token[idx]
-      x$lemma <- x$lemma[idx]
-      x$subtype <- x$subtype[idx]
-      x$pos <- x$pos[idx]
-      return(x)
-    })
+  if (is.null(keep)) {
+    keep_all <- TRUE
+    keep <- c("")
+  } else {
+    keep_all <- FALSE
   }
+  result <- pos_cpp_vec(input, model_path, keep, keep_all)
+
 
   if (format == "data.frame") {
     result <- purrr::map(result, function(x) {
       return(data.frame(
@@ -70,17 +66,16 @@ pos_simple <- function(input, model_path = NULL, keep = NULL, format = c("list",
   }
 
   format <- rlang::arg_match(format)
-  result <- pos_simple_cpp_vec(input, model_path)
 
-  if (!is.null(keep)) {
-    result <- purrr::map(result, function(x) {
-      idx <- x$pos %in% keep
-      x$token <- x$token[idx]
-      x$pos <- x$pos[idx]
-      return(x)
-    })
+  if (is.null(keep)) {
+    keep_all <- TRUE
+    keep <- c("")
+  } else {
+    keep_all <- FALSE
   }
+  result <- pos_simple_cpp_vec(input, model_path, keep, keep_all)
+
 
   if (format == "data.frame") {
     result <- purrr::map(result, function(x) {
       return(data.frame(
diff --git a/R/tokenizer.R b/R/tokenizer.R
index 519e8af..dc30ff1 100644
--- a/R/tokenizer.R
+++ b/R/tokenizer.R
@@ -4,7 +4,7 @@
 #' @param model_path a path to the model.
 #' @param keep a vector of POS(s) to keep. Default is `NULL`.
 #' @param concat logical. If TRUE, the function returns a concatenated string. Default is `TRUE`.
-#' @return a list.
+#' @return a vector (if `concat = TRUE`) or a list (if `concat = FALSE`).
 #' @examples
 #' data(sentence_example)
 #' res_tokenize <- tokenize(sentence_example$text)
diff --git a/man/lemmatize.Rd b/man/lemmatize.Rd
index 8cac87e..630fddd 100644
--- a/man/lemmatize.Rd
+++ b/man/lemmatize.Rd
@@ -16,7 +16,7 @@ lemmatize(input, model_path = NULL, keep = NULL, concat = TRUE)
 \item{concat}{logical. If TRUE, the function returns a concatenated string. Default is \code{TRUE}.}
 }
 \value{
-a list.
+a vector (if \code{concat = TRUE}) or a list (if \code{concat = FALSE}).
 }
 \description{
 An R wrapper for Jagger's lemmatizer
diff --git a/man/lemmatize_tbl.Rd b/man/lemmatize_tbl.Rd
index 4988986..4f73b21 100644
--- a/man/lemmatize_tbl.Rd
+++ b/man/lemmatize_tbl.Rd
@@ -16,7 +16,7 @@ lemmatize_tbl(tbl, column, model_path = NULL, keep = NULL)
 \item{keep}{a vector of POS(s) to keep. Default is \code{NULL}.}
 }
 \value{
-a vector.
+a tibble.
 }
 \description{
 An R wrapper for Jagger's lemmatizer (a tibble input)
diff --git a/man/pos_cpp_vec.Rd b/man/pos_cpp_vec.Rd
index 6980ad2..1d76b82 100644
--- a/man/pos_cpp_vec.Rd
+++ b/man/pos_cpp_vec.Rd
@@ -4,7 +4,7 @@
 \alias{pos_cpp_vec}
 \title{POS tagging in C++}
 \usage{
-pos_cpp_vec(inputs, model_path)
+pos_cpp_vec(inputs, model_path, keep_vec, keep_all)
 }
 \description{
 POS tagging in C++
diff --git a/man/pos_simple_cpp_vec.Rd b/man/pos_simple_cpp_vec.Rd
index 6351c38..fb6ca64 100644
--- a/man/pos_simple_cpp_vec.Rd
+++ b/man/pos_simple_cpp_vec.Rd
@@ -4,7 +4,7 @@
 \alias{pos_simple_cpp_vec}
 \title{POS tagging in C++ (only token and pos)}
 \usage{
-pos_simple_cpp_vec(inputs, model_path)
+pos_simple_cpp_vec(inputs, model_path, keep_vec, keep_all)
 }
 \description{
 POS tagging in C++ (only token and pos)
diff --git a/man/tokenize.Rd b/man/tokenize.Rd
index 161b089..b2cc68d 100644
--- a/man/tokenize.Rd
+++ b/man/tokenize.Rd
@@ -16,7 +16,7 @@ tokenize(input, model_path = NULL, keep = NULL, concat = TRUE)
 \item{concat}{logical. If TRUE, the function returns a concatenated string. Default is \code{TRUE}.}
 }
 \value{
-a list.
+a vector (if \code{concat = TRUE}) or a list (if \code{concat = FALSE}).
 }
 \description{
 An R wrapper for Jagger's tokenizer
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 4fb5f0f..d46e546 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -11,26 +11,30 @@ Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
 #endif
 
 // pos_cpp_vec
-List pos_cpp_vec(StringVector& inputs, std::string model_path);
-RcppExport SEXP _RcppJagger_pos_cpp_vec(SEXP inputsSEXP, SEXP model_pathSEXP) {
+List pos_cpp_vec(StringVector& inputs, std::string model_path, StringVector& keep_vec, bool keep_all);
+RcppExport SEXP _RcppJagger_pos_cpp_vec(SEXP inputsSEXP, SEXP model_pathSEXP, SEXP keep_vecSEXP, SEXP keep_allSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
    Rcpp::traits::input_parameter< StringVector& >::type inputs(inputsSEXP);
     Rcpp::traits::input_parameter< std::string >::type model_path(model_pathSEXP);
-    rcpp_result_gen = Rcpp::wrap(pos_cpp_vec(inputs, model_path));
+    Rcpp::traits::input_parameter< StringVector& >::type keep_vec(keep_vecSEXP);
+    Rcpp::traits::input_parameter< bool >::type keep_all(keep_allSEXP);
+    rcpp_result_gen = Rcpp::wrap(pos_cpp_vec(inputs, model_path, keep_vec, keep_all));
     return rcpp_result_gen;
 END_RCPP
 }
 // pos_simple_cpp_vec
-List pos_simple_cpp_vec(StringVector& inputs, std::string model_path);
-RcppExport SEXP _RcppJagger_pos_simple_cpp_vec(SEXP inputsSEXP, SEXP model_pathSEXP) {
+List pos_simple_cpp_vec(StringVector& inputs, std::string model_path, StringVector& keep_vec, bool keep_all);
+RcppExport SEXP _RcppJagger_pos_simple_cpp_vec(SEXP inputsSEXP, SEXP model_pathSEXP, SEXP keep_vecSEXP, SEXP keep_allSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
     Rcpp::traits::input_parameter< StringVector& >::type inputs(inputsSEXP);
     Rcpp::traits::input_parameter< std::string >::type model_path(model_pathSEXP);
-    rcpp_result_gen = Rcpp::wrap(pos_simple_cpp_vec(inputs, model_path));
+    Rcpp::traits::input_parameter< StringVector& >::type keep_vec(keep_vecSEXP);
+    Rcpp::traits::input_parameter< bool >::type keep_all(keep_allSEXP);
+    rcpp_result_gen = Rcpp::wrap(pos_simple_cpp_vec(inputs, model_path, keep_vec, keep_all));
     return rcpp_result_gen;
 END_RCPP
 }
@@ -64,8 +68,8 @@ END_RCPP
 }
 
 static const R_CallMethodDef CallEntries[] = {
-    {"_RcppJagger_pos_cpp_vec", (DL_FUNC) &_RcppJagger_pos_cpp_vec, 2},
-    {"_RcppJagger_pos_simple_cpp_vec", (DL_FUNC) &_RcppJagger_pos_simple_cpp_vec, 2},
+    {"_RcppJagger_pos_cpp_vec", (DL_FUNC) &_RcppJagger_pos_cpp_vec, 4},
+    {"_RcppJagger_pos_simple_cpp_vec", (DL_FUNC) &_RcppJagger_pos_simple_cpp_vec, 4},
     {"_RcppJagger_tokenize_cpp_vec", (DL_FUNC) &_RcppJagger_tokenize_cpp_vec, 4},
     {"_RcppJagger_lemmatize_cpp_vec", (DL_FUNC) &_RcppJagger_lemmatize_cpp_vec, 4},
     {NULL, NULL, 0}
diff --git a/src/lemmatizer.h b/src/lemmatizer.h
index 80855af..c221d95 100644
--- a/src/lemmatizer.h
+++ b/src/lemmatizer.h
@@ -81,7 +81,7 @@ class RcppJaggerLemmatize : public jagger::tagger {
       // Add the final part after the last comma to `parts`.
       parts.emplace_back(pos_info.substr(start));
 
-      if (parts[0] != "*" && parts.size() >= 7) { // first appearance of the token (i.e. not a concatenation)
+      if (parts[0] != "*" && parts.size() >= 6) { // first appearance of the token (i.e. not a concatenation)
         pos_vec.emplace_back(parts[0]);
         lemma_vec.emplace_back(parts[parts.size() - 3]);
       } else if (parts[0] != "*" && parts.size() == 4) { // concatenation
diff --git a/src/main.cpp b/src/main.cpp
index eec200d..abeecc6 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -18,7 +18,7 @@ using namespace Rcpp;
 //'
 //' @keywords internal
 // [[Rcpp::export]]
-List pos_cpp_vec(StringVector &inputs, std::string model_path) {
+List pos_cpp_vec(StringVector &inputs, std::string model_path, StringVector &keep_vec, bool keep_all) {
   model_path += "/patterns";
   std::string model (model_path);
 
@@ -29,16 +29,38 @@ List pos_cpp_vec(StringVector &inputs, std::string model_path) {
 
   std::vector<List> result;
   result.reserve(inputs_size);
+
+  int keep_num = keep_vec.size();
+  std::unordered_set<std::string> keep;
+  for (int i = 0; i < keep_num; i++) {
+    std::string keep_str = Rcpp::as< std::string >(keep_vec[i]);
+    keep.insert(keep_str);
+  }
 
   for (int i = 0; i < inputs_size; i++) {
     std::vector<std::string> token_vec, pos_vec, subtype_vec, lemma_vec;
     jagger.pos (Rcpp::as< std::string >(inputs[i]), token_vec, pos_vec, subtype_vec, lemma_vec);
 
+    std::vector<std::string> kept_token_vec, kept_pos_vec, kept_subtype_vec, kept_lemma_vec;
+    kept_token_vec.reserve(token_vec.size());
+    kept_pos_vec.reserve(pos_vec.size());
+    kept_subtype_vec.reserve(subtype_vec.size());
+    kept_lemma_vec.reserve(lemma_vec.size());
+
+    for (size_t j = 0; j < pos_vec.size(); ++j) {
+      if (keep_all || keep.find(pos_vec[j]) != keep.end()) {
+        kept_token_vec.push_back(token_vec[j]);
+        kept_pos_vec.push_back(pos_vec[j]);
+        kept_subtype_vec.push_back(subtype_vec[j]);
+        kept_lemma_vec.push_back(lemma_vec[j]);
+      }
+    }
+
     List result_input;
-    result_input["token"] = wrap(token_vec);
-    result_input["pos"] = wrap(pos_vec);
-    result_input["subtype"] = wrap(subtype_vec);
-    result_input["lemma"] = wrap(lemma_vec);
+    result_input["token"] = wrap(kept_token_vec);
+    result_input["pos"] = wrap(kept_pos_vec);
+    result_input["subtype"] = wrap(kept_subtype_vec);
+    result_input["lemma"] = wrap(kept_lemma_vec);
 
     result.push_back(result_input);
   }
@@ -51,7 +73,7 @@ List pos_cpp_vec(StringVector &inputs, std::string model_path) {
 //'
 //' @keywords internal
 // [[Rcpp::export]]
-List pos_simple_cpp_vec(StringVector &inputs, std::string model_path) {
+List pos_simple_cpp_vec(StringVector &inputs, std::string model_path, StringVector &keep_vec, bool keep_all) {
   model_path += "/patterns";
   std::string model (model_path);
 
@@ -62,16 +84,34 @@ List pos_simple_cpp_vec(StringVector &inputs, std::string model_path) {
 
   std::vector<List> result;
   result.reserve(inputs_size);
+
+  int keep_num = keep_vec.size();
+  std::unordered_set<std::string> keep;
+  for (int i = 0; i < keep_num; i++) {
+    std::string keep_str = Rcpp::as< std::string >(keep_vec[i]);
+    keep.insert(keep_str);
+  }
 
   for (int i = 0; i < inputs_size; i++) {
     std::vector<std::string> token_vec, pos_vec;
     jagger.pos (Rcpp::as< std::string >(inputs[i]), token_vec, pos_vec);
 
-    List result_input_list;
-    result_input_list["token"] = wrap(token_vec);
-    result_input_list["pos"] = wrap(pos_vec);
+    std::vector<std::string> kept_token_vec, kept_pos_vec;
+    kept_token_vec.reserve(token_vec.size());
+    kept_pos_vec.reserve(pos_vec.size());
 
-    result.push_back(result_input_list);
+    for (size_t j = 0; j < pos_vec.size(); ++j) {
+      if (keep_all || keep.find(pos_vec[j]) != keep.end()) {
+        kept_token_vec.push_back(token_vec[j]);
+        kept_pos_vec.push_back(pos_vec[j]);
+      }
+    }
+
+    List result_input;
+    result_input["token"] = wrap(kept_token_vec);
+    result_input["pos"] = wrap(kept_pos_vec);
+
+    result.push_back(result_input);
   }
 
   return wrap(result);
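The pos.R and main.cpp hunks above replace the old purrr::map() post-filtering with filtering inside the C++ tagger: keep = NULL is encoded as keep_all = TRUE plus a placeholder keep vector, and otherwise the POS tags in keep are loaded into an unordered_set and checked per token. A minimal R sketch of that convention (encode_keep() and filter_tagged() are hypothetical helpers that mirror, not call, the C++ logic):

encode_keep <- function(keep) {
  if (is.null(keep)) {
    list(keep = c(""), keep_all = TRUE)   # keep everything
  } else {
    list(keep = keep, keep_all = FALSE)   # keep only these POS tags
  }
}

filter_tagged <- function(x, keep, keep_all) {
  # x is one element of the pos() result: a list of token/pos/subtype/lemma vectors
  idx <- keep_all | x$pos %in% keep
  lapply(x, function(col) col[idx])
}

str(encode_keep(NULL))
str(encode_keep(c("名詞", "動詞")))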
@@ -89,10 +129,10 @@ StringVector tokenize_cpp_vec(StringVector &inputs, std::string model_path, Stri
   jagger.read_model (model);
 
   int keep_num = keep_vec.size();
-  std::vector<std::string> keep;
-  keep.reserve(keep_num);
+  std::unordered_set<std::string> keep;
   for (int i = 0; i < keep_num; i++) {
-    keep[i] = keep_vec[i];
+    std::string keep_str = Rcpp::as< std::string >(keep_vec[i]);
+    keep.insert(keep_str);
   }
 
   int inputs_size = inputs.size();
@@ -106,21 +146,11 @@ StringVector tokenize_cpp_vec(StringVector &inputs, std::string model_path, Stri
     std::stringstream res_str;
     int token_size = token_vec.size();
     for (int j = 0; j < token_size; j++) {
-      if (keep_all) {
-        if (j != 0) {
+      if (keep_all || keep.find(pos_vec[j]) != keep.end()) {
+        if (!res_str.str().empty()) {
           res_str << " ";
         }
         res_str << token_vec[j];
-      } else {
-        for (int k = 0; k < keep_num; k++) {
-          if (pos_vec[j] == keep[k]) {
-            if (!res_str.str().empty()) {
-              res_str << " ";
-            }
-            res_str << token_vec[j];
-            break;
-          }
-        }
       }
     }
     result_vec[i] = res_str.str();
@@ -141,10 +171,10 @@ StringVector lemmatize_cpp_vec(StringVector &inputs, std::string model_path, Str
   jagger.read_model (model);
 
   int keep_num = keep_vec.size();
-  std::vector<std::string> keep;
-  keep.reserve(keep_num);
+  std::unordered_set<std::string> keep;
   for (int i = 0; i < keep_num; i++) {
-    keep[i] = keep_vec[i];
+    std::string keep_str = Rcpp::as< std::string >(keep_vec[i]);
+    keep.insert(keep_str);
   }
 
   int inputs_size = inputs.size();
@@ -158,21 +188,11 @@ StringVector lemmatize_cpp_vec(StringVector &inputs, std::string model_path, Str
     std::stringstream res_str;
     int lemma_size = lemma_vec.size();
     for (int j = 0; j < lemma_size; j++) {
-      if (keep_all) {
-        if (j != 0) {
+      if (keep_all || keep.find(pos_vec[j]) != keep.end()) {
+        if (!res_str.str().empty()) {
           res_str << " ";
         }
         res_str << lemma_vec[j];
-      } else {
-        for (int k = 0; k < keep_num; k++) {
-          if (pos_vec[j] == keep[k]) {
-            if (!res_str.str().empty()) {
-              res_str << " ";
-            }
-            res_str << lemma_vec[j];
-            break;
-          }
-        }
       }
     }
     result_vec[i] = res_str.str();
diff --git a/src/pos.h b/src/pos.h
index e35cbdc..f59cab8 100644
--- a/src/pos.h
+++ b/src/pos.h
@@ -85,7 +85,8 @@ class RcppJaggerPOS : public jagger::tagger {
       parts.emplace_back(pos_info.substr(start));
 
       // Add thee first part to `pos_vec` and the third-last part to `lemma_vec`.
-      if (parts[0] != "*" && parts.size() >= 7) { // first appearance of the token (i.e. not a concatenation)
+      // Rcout << pos_info << " " << parts.size() << "\n";
+      if (parts[0] != "*" && parts.size() >= 6) { // first appearance of the token (i.e. not a concatenation) (Some POS have `*.*`, e.g. "零時五十分予鈴。")
         pos_vec.emplace_back(parts[0]);
         subtype_vec.emplace_back(parts[1]);
         lemma_vec.emplace_back(parts[parts.size() - 3]);
diff --git a/tests/testthat/test-pos.R b/tests/testthat/test-pos.R
index 83d9c46..464670c 100644
--- a/tests/testthat/test-pos.R
+++ b/tests/testthat/test-pos.R
@@ -1,6 +1,6 @@
 test_that("pos", {
   skip_on_cran(); skip_on_os(c("windows", "linux", "sloaris"))
-  sentence <- "日本語の文章の形態素解析を実験しています。\nこれが百二十五文目です。"
+  sentence <- "日本語の文章の形態素解析を実験しています。\nこれが百二十五文目です。零時五十分予鈴。"
   expect_no_error(pos(sentence))
   expect_no_error(pos(sentence, format = "data.frame"))
   expect_error(pos(sentence, format = "tibble"))
diff --git a/tests/testthat/test-sanity.R b/tests/testthat/test-sanity.R
index a2c771c..e2beeba 100644
--- a/tests/testthat/test-sanity.R
+++ b/tests/testthat/test-sanity.R
@@ -2,11 +2,14 @@
 test_that("pos-pos_simple", {
   skip_on_cran(); skip_on_os(c("windows", "linux", "sloaris"))
 
-  sentence <- "日本語の文章の形態素解析を実験しています。\nこれが百二十五文目です。"
+  sentence <- "日本語の文章の形態素解析を実験しています。\nこれが百二十五文目です。零時五十分予鈴。"
   sentences <- c(sentence, "2つ目の文章を追加します。")
 
   expect_identical(pos(sentence)[[1]]$token, pos_simple(sentence)[[1]]$token)
   expect_identical(pos(sentences)[[2]]$token, pos_simple(sentences)[[2]]$token)
+  expect_identical(pos(sentences, format = "data.frame")[[1]]$token, pos_simple(sentences, format = "data.frame")[[1]]$token)
+  expect_identical(pos(sentences, format = "data.frame", keep = c("動詞", "名詞"))[[1]]$token, pos_simple(sentences, format = "data.frame", keep = c("動詞", "名詞"))[[1]]$token)
+  expect_identical(pos(sentences, format = "data.frame")[[2]]$token, pos_simple(sentences, format = "data.frame")[[2]]$token)
 
   expect_identical(pos(sentence)[[1]]$pos, pos_simple(sentence)[[1]]$pos)
   expect_identical(pos(sentences)[[2]]$pos, pos_simple(sentences)[[2]]$pos)
@@ -14,16 +17,17 @@ test_that("pos-pos_simple", {
 
 test_that("pos-lemmatize", {
   skip_on_cran(); skip_on_os(c("windows", "linux", "sloaris"))
 
-  sentence <- "日本語の文章の形態素解析を実験しています。\nこれが百二十五文目です。"
+  sentence <- "日本語の文章の形態素解析を実験しています。\nこれが百二十五文目です。零時五十分予鈴。"
   sentences <- c(sentence, "2つ目の文章を追加します。")
 
   expect_identical(pos(sentence)[[1]]$lemma, lemmatize(sentence, concat = FALSE)[[1]])
+  expect_identical(pos(sentence, keep = c("動詞", "名詞"))[[1]]$lemma, lemmatize(sentence, , keep = c("動詞", "名詞"), concat = FALSE)[[1]])
   expect_identical(pos(sentences)[[2]]$lemma, lemmatize(sentences, concat = FALSE)[[2]])
 })
 
 test_that("pos-tokenize", {
   skip_on_cran(); skip_on_os(c("windows", "linux", "sloaris"))
 
-  sentence <- "日本語の文章の形態素解析を実験しています。\nこれが百二十五文目です。"
+  sentence <- "日本語の文章の形態素解析を実験しています。\nこれが百二十五文目です。零時五十分予鈴。"
   sentences <- c(sentence, "12の文章を追加します。")
 
   expect_identical(pos(sentence)[[1]]$token, tokenize(sentence, concat = FALSE)[[1]])