Commit 65cfc4f (0 parents)
Showing 27 changed files with 1,238 additions and 0 deletions.
## DESCRIPTION

```
Package: tok
Title: Fast Text Tokenization
Version: 0.1.0
Authors@R: c(
    person("Daniel", "Falbel", , "daniel@posit.co", c("aut", "cre")),
    person(family = "Posit", role = c("cph"))
    )
Description:
    Interfaces with the 'Hugging Face' tokenizers library to provide implementations
    of today's most used tokenizers such as the 'Byte-Pair Encoding' algorithm
    <https://huggingface.co/docs/tokenizers/index>. It's extremely fast for both
    training new vocabularies and tokenizing texts.
License: MIT + file LICENSE
SystemRequirements: Rust tool chain w/ cargo, libclang/llvm-config
Encoding: UTF-8
RoxygenNote: 7.2.3
Depends: R (>= 4.2.0)
Imports: R6, cli
Suggests: rmarkdown, testthat (>= 3.0.0), hfhub, withr
Config/testthat/edition: 3
URL: https://github.com/mlverse/tok
BugReports: https://github.com/mlverse/tok/issues
NeedsCompilation: yes
Packaged: 2023-07-03 09:26:57 UTC; dfalbel
Author: Daniel Falbel [aut, cre],
    Posit [cph]
Maintainer: Daniel Falbel <daniel@posit.co>
Repository: CRAN
Date/Publication: 2023-07-06 13:00:02 UTC
```
## LICENSE

```
YEAR: 2020
COPYRIGHT HOLDER: Andy Thomason, Claus O. Wilke
```
## MD5

```
58d6d391a293edbe1098ad59198bfe09 *DESCRIPTION
dc86c093012c1c9bc880868b901be232 *LICENSE
fcc25a83b7b824a694ff8ad45647184b *NAMESPACE
e6fe63f7a2b14f97721a3b54cdeaed75 *NEWS.md
8a3e4798d33e4cfd4d87bf442a3ff5aa *R/encoding.R
7d9eea467eb07f9126080845853c602d *R/extendr-wrappers.R
c89885fe34136c1e19109a4f87f70dd0 *R/tokenizer.R
6745ca9f9bdc7659879961f99a4b1c86 *README.md
f53249ee287578ae7f2f4ccf5e6b6e58 *man/encoding.Rd
a55cdc97b5bc049f612956d00e6205fb *man/tokenizer.Rd
5bafe8d61460592f39da4f46a7b2d880 *src/Makevars
27957123fba16c05c53a66e68b0f3482 *src/Makevars.ucrt
4d12fbf066fb357db1e202bfc7f60bab *src/Makevars.win
e475c45acff0aec8451fa9e0d2ade36c *src/entrypoint.c
1f0e223e49275d991ce055fe1a2f94d4 *src/rust/Cargo.lock
e3918eb64f14ca1a9bc3bb86e15f09f1 *src/rust/Cargo.toml
27388a92accaf37b1e00cadbf27606c7 *src/rust/src/lib.rs
b63a13cf1a63ac2027013f19e8dfec21 *src/rust/src/models.rs
785bf0176dc6658a32a8167231992603 *src/rust/src/tokenizer.rs
8abdb06a050a4f52cfe7b40b914b9fa9 *src/rust/src/trainers.rs
ba398b93085f305afd0f06437e58899f *src/tok-win.def
2fa9c1f6e421b6a79ceeee8fc410730b *tests/testthat.R
a379162c68dc3aac382a2d98facf4c44 *tests/testthat/assets/tokenizer.json
dcf23f7ae3377b349c5c54906d460a8b *tests/testthat/test-encoding.R
26b1d6133f14188b089d5968b847b3ca *tests/testthat/test-tokenizer.R
223376a3d5a56e02e6624880bc121a1f *tools/patch.R
```
## NAMESPACE

```
# Generated by roxygen2: do not edit by hand

S3method("$",REncoding)
S3method("$",RModel)
S3method("$",RModelBPE)
S3method("$",RTokenizer)
S3method("[[",REncoding)
S3method("[[",RModel)
S3method("[[",RModelBPE)
S3method("[[",RTokenizer)
export(encoding)
export(tokenizer)
importFrom(R6,R6Class)
importFrom(cli,cli_abort)
useDynLib(tok, .registration = TRUE)
```
## NEWS.md

```
# tok 0.1.0

* Initial CRAN submission.
```
## R/encoding.R

```r
#' Encoding
#' @description
#' Represents the output of a [tokenizer].
#'
#' @examples
#' withr::with_envvar(c(HUGGINGFACE_HUB_CACHE = tempdir()), {
#'   try({
#'     tok <- tokenizer$from_pretrained("gpt2")
#'     encoding <- tok$encode("Hello world")
#'     encoding
#'   })
#' })
#' @returns
#' An encoding object containing encoding information such as attention masks
#' and token ids.
#'
#' @export
encoding <- R6::R6Class(
  classname = "tok_encoding",
  public = list(
    #' @field .encoding The underlying implementation pointer.
    .encoding = NULL,
    #' @description Initializes an encoding object (not meant to be called directly)
    #' @param encoding an encoding implementation object
    initialize = function(encoding) {
      if (inherits(encoding, "REncoding")) {
        self$.encoding <- encoding
      } else {
        cli::cli_abort("Expected class {.cls REncoding} but got {.cls {class(encoding)}}.")
      }
    }
  ),
  active = list(
    #' @field ids The IDs are the main input to a Language Model. They are the
    #'   token indices, the numerical representations that an LM understands.
    ids = function(x) {
      if (missing(x)) {
        self$.encoding$get_ids()
      }
    },
    #' @field attention_mask The attention mask used as input for transformer models.
    attention_mask = function(x) {
      if (missing(x)) {
        self$.encoding$get_attention_mask()
      }
    }
  )
)
```
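A brief usage sketch of the active bindings above, mirroring the roxygen example (this assumes the tok and hfhub packages are installed with their Rust components and that the gpt2 tokenizer can be downloaded, so everything is wrapped in `try()`):

```r
# Sketch only: read the active bindings exposed by the encoding class.
# Requires network access to the Hugging Face Hub.
try({
  tok <- tok::tokenizer$from_pretrained("gpt2")
  enc <- tok$encode("Hello world")
  enc$ids             # integer token ids fed to a language model
  enc$attention_mask  # attention mask used by transformer models
})
```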
## R/extendr-wrappers.R

```r
# Generated by extendr: Do not edit by hand
#
# This file was created with the following call:
#   .Call("wrap__make_tok_wrappers", use_symbols = TRUE, package_name = "tok")

#' @docType package
#' @usage NULL
#' @useDynLib tok, .registration = TRUE
NULL

RModel <- new.env(parent = emptyenv())

RModel$new <- function(model) .Call(wrap__RModel__new, model)

#' @export
`$.RModel` <- function (self, name) { func <- RModel[[name]]; environment(func) <- environment(); func }

#' @export
`[[.RModel` <- `$.RModel`

RModelBPE <- new.env(parent = emptyenv())

RModelBPE$new <- function(vocab, merges) .Call(wrap__RModelBPE__new, vocab, merges)

#' @export
`$.RModelBPE` <- function (self, name) { func <- RModelBPE[[name]]; environment(func) <- environment(); func }

#' @export
`[[.RModelBPE` <- `$.RModelBPE`

REncoding <- new.env(parent = emptyenv())

REncoding$len <- function() .Call(wrap__REncoding__len, self)

REncoding$get_ids <- function() .Call(wrap__REncoding__get_ids, self)

REncoding$get_tokens <- function() .Call(wrap__REncoding__get_tokens, self)

REncoding$get_type_ids <- function() .Call(wrap__REncoding__get_type_ids, self)

REncoding$get_attention_mask <- function() .Call(wrap__REncoding__get_attention_mask, self)

REncoding$get_special_tokens_mask <- function() .Call(wrap__REncoding__get_special_tokens_mask, self)

REncoding$get_word_ids <- function() .Call(wrap__REncoding__get_word_ids, self)

#' @export
`$.REncoding` <- function (self, name) { func <- REncoding[[name]]; environment(func) <- environment(); func }

#' @export
`[[.REncoding` <- `$.REncoding`

RTokenizer <- new.env(parent = emptyenv())

RTokenizer$new <- function(tokenizer) .Call(wrap__RTokenizer__new, tokenizer)

RTokenizer$from_file <- function(path) .Call(wrap__RTokenizer__from_file, path)

RTokenizer$encode <- function(sequence, pair, is_pretokenized, add_special_tokens) .Call(wrap__RTokenizer__encode, self, sequence, pair, is_pretokenized, add_special_tokens)

RTokenizer$decode <- function(ids, skip_special_tokens) .Call(wrap__RTokenizer__decode, self, ids, skip_special_tokens)

RTokenizer$encode_batch <- function(input, is_pretokenized, add_special_tokens) .Call(wrap__RTokenizer__encode_batch, self, input, is_pretokenized, add_special_tokens)

RTokenizer$decode_batch <- function(ids, skip_special_tokens) .Call(wrap__RTokenizer__decode_batch, self, ids, skip_special_tokens)

#' @export
`$.RTokenizer` <- function (self, name) { func <- RTokenizer[[name]]; environment(func) <- environment(); func }

#' @export
`[[.RTokenizer` <- `$.RTokenizer`
```
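The generated wrappers above rely on a small dispatch idiom: each class's methods live in an environment, method bodies reference `self` as a free variable, and the `$` S3 method injects `self` by rebinding the method's enclosing environment before returning it. A standalone sketch of the same idiom with no Rust dependency (the `Counter` class here is illustrative, not part of tok):

```r
# Standalone sketch of the environment-based dispatch used by the extendr
# wrappers. `Counter` is a hypothetical class invented for illustration.
Counter <- new.env(parent = emptyenv())
Counter$new <- function(n) structure(list(n = n), class = "Counter")
# Method bodies use `self` as a free variable, just like the wrappers above.
# unclass() avoids re-triggering `$.Counter` when reading the field.
Counter$value <- function() unclass(self)$n
Counter$add <- function(k) Counter$new(unclass(self)$n + k)

`$.Counter` <- function(self, name) {
  func <- Counter[[name]]
  # Rebind the method's environment to this call frame, where `self` is bound.
  environment(func) <- environment()
  func
}
`[[.Counter` <- `$.Counter`

x <- Counter$new(3)
x$value()        # -> 3
x$add(4)$value() # -> 7
```

Because `environment(func) <- environment()` modifies a local copy of the function, the originals stored in the class environment are untouched; each `$` access hands back a method closure with the receiver already in scope.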
## R/tokenizer.R

```r
#' Tokenizer
#' @description
#' A Tokenizer works as a pipeline. It processes some raw text as input and outputs
#' an [encoding].
#'
#' @importFrom R6 R6Class
#' @importFrom cli cli_abort
#'
#' @examples
#' withr::with_envvar(c(HUGGINGFACE_HUB_CACHE = tempdir()), {
#'   try({
#'     tok <- tokenizer$from_pretrained("gpt2")
#'     tok$encode("Hello world")$ids
#'   })
#' })
#'
#' @returns
#' A tokenizer that can be used for encoding character strings or decoding
#' integers.
#'
#' @export
tokenizer <- R6::R6Class(
  classname = "tok_tokenizer",
  public = list(
    #' @field .tokenizer (unsafe usage) Lower-level pointer to the tokenizer
    .tokenizer = NULL,

    #' @description Initializes a tokenizer
    #' @param tokenizer Will be cloned to initialize a new tokenizer
    initialize = function(tokenizer) {
      if (inherits(tokenizer, "RTokenizer")) {
        self$.tokenizer <- tokenizer
      } else {
        self$.tokenizer <- RTokenizer$new(tokenizer$.tokenizer)
      }
    },

    #' @description
    #' Encode the given sequence and pair. This method can process raw text sequences
    #' as well as already pre-tokenized sequences.
    #' @param sequence The main input sequence we want to encode. This sequence can
    #'   be either raw text or pre-tokenized, according to the `is_pretokenized` argument.
    #' @param pair An optional input sequence. The expected format is the same
    #'   as for `sequence`.
    #' @param is_pretokenized Whether the input is already pre-tokenized
    #' @param add_special_tokens Whether to add the special tokens
    encode = function(sequence, pair = NULL, is_pretokenized = FALSE, add_special_tokens = TRUE) {
      self$.tokenizer$encode(sequence, pair, is_pretokenized, add_special_tokens)
    },

    #' @description
    #' Decode the given list of ids back to a string
    #' @param ids The list of ids that we want to decode
    #' @param skip_special_tokens Whether the special tokens should be removed from the decoded string
    decode = function(ids, skip_special_tokens = TRUE) {
      self$.tokenizer$decode(ids, skip_special_tokens)
    },

    #' @description
    #' Encodes a batch of sequences. Returns a list of [encoding]s.
    #' @param input A list of single sequences or pair sequences to encode. Each
    #'   sequence can be either raw text or pre-tokenized, according to the
    #'   `is_pretokenized` argument.
    #' @param is_pretokenized Whether the input is already pre-tokenized
    #' @param add_special_tokens Whether to add the special tokens
    encode_batch = function(input, is_pretokenized = FALSE, add_special_tokens = TRUE) {
      self$.tokenizer$encode_batch(input, is_pretokenized, add_special_tokens)
    },

    #' @description
    #' Decode a batch of ids back to their corresponding strings
    #' @param sequences The batch of sequences we want to decode
    #' @param skip_special_tokens Whether the special tokens should be removed from the decoded strings
    decode_batch = function(sequences, skip_special_tokens = TRUE) {
      self$.tokenizer$decode_batch(sequences, skip_special_tokens)
    },

    #' @description
    #' Creates a tokenizer from the path of a serialized tokenizer.
    #' This is a static method and should be called instead of `$new` when initializing
    #' the tokenizer.
    #' @param path Path to the tokenizer.json file
    from_file = function(path) {
      cli::cli_abort("This is a static method. It is not available on tokenizer instances.")
    },

    #' @description
    #' Instantiate a new Tokenizer from an existing file on the Hugging Face Hub.
    #' @param identifier The identifier of a Model on the Hugging Face Hub that
    #'   contains a tokenizer.json file
    #' @param revision A branch or commit id
    #' @param auth_token An optional auth token used to access private repositories
    #'   on the Hugging Face Hub
    from_pretrained = function(identifier, revision = "main", auth_token = NULL) {
      cli::cli_abort("This is a static method. It is not available on tokenizer instances.")
    }
  )
)

tokenizer$from_file <- function(path) {
  path <- path.expand(path)
  tokenizer$new(RTokenizer$from_file(path))
}

tokenizer$from_pretrained <- function(identifier, revision = "main", auth_token = NULL) {
  if (!is.null(auth_token))
    cli::cli_abort("{.var auth_token} is currently unsupported.")

  rlang::check_installed("hfhub")
  path <- hfhub::hub_download(identifier, revision = revision, "tokenizer.json")

  tokenizer$new(RTokenizer$from_file(path))
}
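The batch methods documented above can be sketched in use as follows (this assumes the tok and hfhub packages are installed with their Rust components and the gpt2 tokenizer can be downloaded, so the whole thing is wrapped in `try()`):

```r
# Sketch only: batch encoding and decoding with the methods defined above.
# Requires network access to the Hugging Face Hub.
try({
  tok <- tok::tokenizer$from_pretrained("gpt2")
  encs <- tok$encode_batch(list("hello world", "goodbye world"))
  ids <- lapply(encs, function(e) e$ids)
  tok$decode_batch(ids)  # the decoded strings, one per input sequence
})
```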
## README.md

<!-- README.md is generated from README.Rmd. Please edit that file -->

# tok

<!-- badges: start -->
[![R build status](https://github.com/mlverse/tok/workflows/R-CMD-check/badge.svg)](https://github.com/mlverse/tok/actions)
<!-- badges: end -->

tok provides bindings to the [🤗tokenizers](https://huggingface.co/docs/tokenizers/v0.13.3/en/index) library. It uses the same Rust libraries that power the Python implementation.

We don't yet provide the full tokenizers API. Please open an issue if there's a feature you are missing.

## Installation

You can install tok from CRAN using:

``` r
install.packages("tok")
```

Installing tok from source requires a working Rust toolchain. We recommend using [rustup](https://rustup.rs/).

On Windows, you'll also have to add the `i686-pc-windows-gnu` and `x86_64-pc-windows-gnu` targets:

```
rustup target add x86_64-pc-windows-gnu
rustup target add i686-pc-windows-gnu
```

Once Rust is working, you can install this package via:

``` r
remotes::install_github("dfalbel/tok")
```

## Features

We still don't have complete support for the 🤗tokenizers API. Please open an issue if you need a feature that is currently not implemented.

## Loading tokenizers

`tok` can be used to load and use tokenizers that have been previously serialized. For example, Hugging Face model weights are usually accompanied by a 'tokenizer.json' file that can be loaded with this library.

To load a pre-trained tokenizer from a json file, use:

``` r
path <- testthat::test_path("assets/tokenizer.json")
tok <- tok::tokenizer$from_file(path)
```

Use the `encode` method to tokenize sentences and `decode` to transform them back.

``` r
enc <- tok$encode("hello world")
tok$decode(enc$ids)
#> [1] "hello world"
```

## Using pre-trained tokenizers

You can also load any tokenizer available on the Hugging Face Hub using the `from_pretrained` static method. For example, let's load the GPT-2 tokenizer with:

``` r
tok <- tok::tokenizer$from_pretrained("gpt2")
enc <- tok$encode("hello world")
tok$decode(enc$ids)
#> [1] "hello world"
```