# lemmatizer.R
#' An R wrapper for Jagger's lemmatizer
#'
#' Passes a character vector to the C++ lemmatizer together with the model
#' path and the POS filter, and optionally splits the result on spaces.
#'
#' @param input a character vector to lemmatize.
#' @param model_path a path to the model. If `NULL` (the default), the path
#'   returned by `get_model_path()` is used.
#' @param keep a vector of POS(s) to keep. Default is `NULL`, in which case
#'   no POS filtering is requested.
#' @param concat logical. If TRUE, the function returns a concatenated string.
#'   If FALSE, each result is split on single spaces. Default is `TRUE`.
#' @return a vector (if `concat = TRUE`) or a list (if `concat = FALSE`).
#'   If no model is found at `model_path`, `input` is returned unchanged.
#' @examples
#' data(sentence_example)
#' res_lemmatize <- lemmatize(sentence_example$text)
#' @export
lemmatize <- function(input, model_path = NULL, keep = NULL, concat = TRUE) {
  # Validate before looking for the model so that invalid input is reported
  # consistently, whether or not a model is installed.
  # `is.character()` also accepts character vectors carrying an extra S3
  # class, which a `class()` membership test would reject.
  if (!is.character(input)) {
    cli::cli_abort("Please provide a character vector.")
  }

  if (is.null(model_path)) {
    model_path <- get_model_path()
  }
  # Without a model there is nothing to do: hand the input back untouched.
  if (!exists_model_path(model_path)) {
    return(input)
  }

  # The backend receives a flag plus a placeholder vector instead of NULL.
  keep_all <- is.null(keep)
  if (keep_all) {
    keep <- c("")
  }

  lemmatized <- lemmatize_cpp_vec(input, model_path, keep, keep_all)
  if (!concat) {
    lemmatized <- strsplit(lemmatized, " ")
  }
  lemmatized
}
#' An R wrapper for Jagger's lemmatizer (a tibble input)
#'
#' Lemmatizes one character column of a tibble and stores the result in a
#' column named `lemmatized`.
#'
#' @param tbl a tibble object.
#' @param column a column name of the tibble to lemmatize (a single string).
#' @param model_path a path to the model. If `NULL` (the default), the path
#'   returned by `get_model_path()` is used.
#' @param keep a vector of POS(s) to keep. Default is `NULL`, in which case
#'   no POS filtering is requested.
#' @return a tibble with an added (or overwritten) `lemmatized` column. If no
#'   model is found at `model_path`, `tbl` is returned unchanged.
#' @examples
#' data(sentence_example)
#' res_lemmatize <- lemmatize_tbl(tibble::as_tibble(sentence_example), "text")
#' @export
lemmatize_tbl <- function(tbl, column, model_path = NULL, keep = NULL) {
  # Validate before looking for the model so that invalid arguments are
  # reported consistently, whether or not a model is installed.
  if (!inherits(tbl, "tbl_df")) {
    cli::cli_abort("Please provide a tibble object.")
  }
  # `column` must be exactly one non-missing string; otherwise the `%in%`
  # test below would yield a zero-length or multi-element `if` condition,
  # which is an error in current R versions.
  is_single_name <- is.character(column) &&
    length(column) == 1L &&
    !is.na(column)
  if (!is_single_name || !column %in% names(tbl)) {
    cli::cli_abort("Please provide a column name of the tibble.")
  }
  # Same requirement that `lemmatize()` places on its input.
  if (!is.character(tbl[[column]])) {
    cli::cli_abort("Please provide a character column.")
  }

  if (is.null(model_path)) {
    model_path <- get_model_path()
  }
  # Without a model there is nothing to do: hand the tibble back untouched.
  if (!exists_model_path(model_path)) {
    return(tbl)
  }

  # The backend receives a flag plus a placeholder vector instead of NULL.
  keep_all <- is.null(keep)
  if (keep_all) {
    keep <- c("")
  }

  tbl$lemmatized <- lemmatize_cpp_vec(
    tbl[[column]], model_path, keep, keep_all
  )
  tbl
}