gesistsa · chainsawriot · Mar 20, 2024 · Mar 20, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -8,7 +8,7 @@ Description: Duct tape the 'quanteda' ecosystem (Benoit et al., 2018) <doi:10.21
 License: GPL (>= 3)
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 URL: https://github.com/chainsawriot/grafzahl
 BugReports: https://github.com/chainsawriot/grafzahl/issues
 Suggests: 

diff --git a/R/misc.R b/R/misc.R
@@ -20,6 +20,12 @@ NULL
 #' Van Atteveldt, W., Van der Velden, M. A., & Boukes, M. (2021). The validity of sentiment analysis: Comparing manual annotation, crowd-coding, dictionary approaches, and machine learning algorithms. Communication Methods and Measures, 15(2), 121-140.
 "ecosent"
 
+#' Supported model types
+#'
+#' A vector of all supported model types.
+#' 
+"supported_model_types"
+
 #' Download The Amharic News Text Classification Dataset
 #'
 #' This function downloads the training and test sets of the Amharic News Text Classification Dataset from Hugging Face.

diff --git a/R/train.R b/R/train.R
@@ -32,9 +32,7 @@
         model_type <- .infer_model_type(model_name)
     }
     model_type <- gsub("-", "", tolower(model_type))
-    if (!model_type %in% c("albert", "bert", "bertweet", "bigbird", "camembert", "deberta", "distilbert", "electra", "flaubert",
-                           "herbert", "layoutlm", "layoutlmv2", "longformer", "mpnet", "mobilebert", "rembert", "roberta", "squeezebert",
-                           "squeezebert", "xlm", "xlmroberta", "xlnet", "debertav2")) {
+    if (!model_type %in% grafzahl::supported_model_types) {
         stop("Invalid `model_type`.", call. = FALSE)
     }
     return(model_type)    
@@ -118,7 +116,7 @@
 #' @param train_size numeric, proportion of data in `x` and `y` to be used actually for training. The rest will be used for cross validation.
 #' @param args list, additional parameters to be used in the underlying simple transformers
 #' @param cleanup logical, if `TRUE`, the `runs` directory generated will be removed when the training is done
-#' @param model_type a string indicating model_type of the input model. If `NULL`, it will be inferred from `model_name`. It can only be one of the following: "albert", "bert", "bertweet", "bigbird", "camembert", "deberta", "debertav2", "distilbert", "electra", "flaubert", "herbert", "layoutlm", "layoutlmv2", "longformer", "mpnet", "mobilebert", "rembert", "roberta", "squeezebert", "squeezebert", "xlm", "xlmroberta", "xlnet". This will be lowercased and hyphens will be removed, e.g. "XLM-RoBERTa" will be normalized to "xlmroberta".
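+#' @param model_type a string indicating model_type of the input model. If `NULL`, it will be inferred from `model_name`. Supported model types are available in [supported_model_types]. The value will be lowercased and hyphens will be removed, e.g. "XLM-RoBERTa" will be normalized to "xlmroberta".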
 #' @param manual_seed numeric, random seed
 #' @param verbose logical, if `TRUE`, debug messages will be displayed
 #' @param ... parameters passed to [grafzahl()]

diff --git a/data/supported_model_types.rda b/data/supported_model_types.rda
diff --git a/man/grafzahl.Rd b/man/grafzahl.Rd
diff --git a/man/hydrate.Rd b/man/hydrate.Rd
diff --git a/man/supported_model_types.Rd b/man/supported_model_types.Rd
diff --git a/rawdata/createdata.R b/rawdata/createdata.R
@@ -22,3 +22,8 @@ download.file(url <- "https://raw.githubusercontent.com/vanatteveldt/ecosent/mas
 
 ecosent <- read.csv("rawdata/sentences_ml.csv", encoding = "UTF-8")[c("id", "headline", "value", "gold")]
 save(ecosent, file = "data/ecosent.rda", ascii = FALSE, compress = "xz")
+
+supported_model_types <- c("albert", "bert", "bertweet", "bigbird", "camembert", "deberta", "distilbert", "electra", "flaubert",
+                           "herbert", "layoutlm", "layoutlmv2", "longformer", "mpnet", "mobilebert", "rembert", "roberta", "squeezebert",
+                           "squeezebert", "xlm", "xlmroberta", "xlnet", "debertav2")
+usethis::use_data(supported_model_types)