From e3dfc2fad4565873d807ba820a24094b97890f98 Mon Sep 17 00:00:00 2001 From: Hiroki Tagato <tagattie@FreeBSD.org> Date: Mon, 12 Feb 2024 17:28:03 +0900 Subject: [PATCH] textproc/py-tokenizers: add port: Fast state-of-the-art tokenizers optimized for research and production Provides an implementation of today's most used tokenizers, with a focus on performance and versatility. Main features: - Train new vocabularies and tokenize, using today's most used tokenizers. - Extremely fast (both training and tokenization), thanks to the Rust implementation. Takes less than 20 seconds to tokenize a GB of text on a server's CPU. - Easy to use, but also extremely versatile. - Designed for research and production. - Normalization comes with alignments tracking. It's always possible to get the part of the original sentence that corresponds to a given token. - Does all the pre-processing: Truncate, Pad, add the special tokens your model needs. WWW: https://github.com/huggingface/tokenizers --- textproc/Makefile | 1 + textproc/py-tokenizers/Makefile | 29 +++ textproc/py-tokenizers/Makefile.crates | 149 ++++++++++++ textproc/py-tokenizers/distinfo | 301 +++++++++++++++++++++++++ textproc/py-tokenizers/pkg-descr | 16 ++ 5 files changed, 496 insertions(+) create mode 100644 textproc/py-tokenizers/Makefile create mode 100644 textproc/py-tokenizers/Makefile.crates create mode 100644 textproc/py-tokenizers/distinfo create mode 100644 textproc/py-tokenizers/pkg-descr diff --git a/textproc/Makefile b/textproc/Makefile index c51bc706f8c60..8cabafff97f38 100644 --- a/textproc/Makefile +++ b/textproc/Makefile @@ -1618,6 +1618,7 @@ SUBDIR += py-tiktoken SUBDIR += py-tinycss SUBDIR += py-tinycss2 + SUBDIR += py-tokenizers SUBDIR += py-toml SUBDIR += py-tomli SUBDIR += py-tomli-w diff --git a/textproc/py-tokenizers/Makefile b/textproc/py-tokenizers/Makefile new file mode 100644 index 0000000000000..4447eab63c382 --- /dev/null +++ b/textproc/py-tokenizers/Makefile @@ -0,0 +1,29 @@ +PORTNAME= tokenizers +DISTVERSION=
0.15.1 +CATEGORIES= textproc python +MASTER_SITES= PYPI +PKGNAMEPREFIX= ${PYTHON_PKGNAMEPREFIX} +DISTFILES= ${PORTNAME}-${PORTVERSION}${EXTRACT_SUFX} + +MAINTAINER= tagattie@FreeBSD.org +COMMENT= Fast state-of-the-art tokenizers optimized for research and production +WWW= https://github.com/huggingface/tokenizers + +LICENSE= APACHE20 + +BUILD_DEPENDS= ${PYTHON_PKGNAMEPREFIX}maturin>=1.0<2.0:devel/py-maturin@${PY_FLAVOR} +RUN_DEPENDS= ${PYTHON_PKGNAMEPREFIX}huggingface-hub>=0.16.4<1.0:misc/py-huggingface-hub@${PY_FLAVOR} + +USES= cargo python +USE_PYTHON= autoplist pep517 + +CARGO_CARGOTOML=${WRKSRC}/bindings/python/Cargo.toml +CARGO_CARGOLOCK=${WRKSRC}/bindings/python/Cargo.lock +CARGO_BUILD= no +CARGO_INSTALL= no +CARGO_TEST= no + +post-install: + @${FIND} ${STAGEDIR}${PYTHON_SITELIBDIR} -type f -name '*.so' -exec ${STRIP_CMD} {} ';' + +.include <bsd.port.mk> diff --git a/textproc/py-tokenizers/Makefile.crates b/textproc/py-tokenizers/Makefile.crates new file mode 100644 index 0000000000000..80ac75f5ceca5 --- /dev/null +++ b/textproc/py-tokenizers/Makefile.crates @@ -0,0 +1,149 @@ +CARGO_CRATES= aho-corasick-1.1.2 \ + anstream-0.6.5 \ + anstyle-1.0.4 \ + anstyle-parse-0.2.3 \ + anstyle-query-1.0.2 \ + anstyle-wincon-3.0.2 \ + autocfg-1.1.0 \ + base64-0.13.1 \ + bitflags-1.3.2 \ + bitflags-2.4.1 \ + cc-1.0.83 \ + cfg-if-1.0.0 \ + clap-4.4.11 \ + clap_builder-4.4.11 \ + clap_derive-4.4.7 \ + clap_lex-0.6.0 \ + colorchoice-1.0.0 \ + console-0.15.7 \ + crossbeam-deque-0.8.4 \ + crossbeam-epoch-0.9.16 \ + crossbeam-utils-0.8.17 \ + darling-0.14.4 \ + darling_core-0.14.4 \ + darling_macro-0.14.4 \ + derive_builder-0.12.0 \ + derive_builder_core-0.12.0 \ + derive_builder_macro-0.12.0 \ + either-1.9.0 \ + encode_unicode-0.3.6 \ + env_logger-0.10.1 \ + errno-0.3.8 \ + esaxx-rs-0.1.10 \ + fastrand-2.0.1 \ + fnv-1.0.7 \ + getrandom-0.2.11 \ + heck-0.4.1 \ + hermit-abi-0.3.3 \ + humantime-2.1.0 \ + ident_case-1.0.1 \ + indicatif-0.17.7 \ + indoc-2.0.4 \ + instant-0.1.12 \ +
is-terminal-0.4.9 \ + itertools-0.11.0 \ + itoa-1.0.10 \ + lazy_static-1.4.0 \ + libc-0.2.151 \ + linux-raw-sys-0.4.12 \ + lock_api-0.4.11 \ + log-0.4.20 \ + macro_rules_attribute-0.2.0 \ + macro_rules_attribute-proc_macro-0.2.0 \ + matrixmultiply-0.3.8 \ + memchr-2.6.4 \ + memoffset-0.9.0 \ + minimal-lexical-0.2.1 \ + monostate-0.1.10 \ + monostate-impl-0.1.10 \ + ndarray-0.15.6 \ + nom-7.1.3 \ + num-complex-0.4.4 \ + num-integer-0.1.45 \ + num-traits-0.2.17 \ + number_prefix-0.4.0 \ + numpy-0.20.0 \ + once_cell-1.19.0 \ + onig-6.4.0 \ + onig_sys-69.8.1 \ + parking_lot-0.12.1 \ + parking_lot_core-0.9.9 \ + paste-1.0.14 \ + pkg-config-0.3.27 \ + portable-atomic-1.6.0 \ + ppv-lite86-0.2.17 \ + proc-macro2-1.0.70 \ + pyo3-0.20.2 \ + pyo3-build-config-0.20.2 \ + pyo3-ffi-0.20.2 \ + pyo3-macros-0.20.2 \ + pyo3-macros-backend-0.20.2 \ + quote-1.0.33 \ + rand-0.8.5 \ + rand_chacha-0.3.1 \ + rand_core-0.6.4 \ + rawpointer-0.2.1 \ + rayon-1.8.0 \ + rayon-cond-0.3.0 \ + rayon-core-1.12.0 \ + redox_syscall-0.4.1 \ + regex-1.10.2 \ + regex-automata-0.4.3 \ + regex-syntax-0.7.5 \ + regex-syntax-0.8.2 \ + rustc-hash-1.1.0 \ + rustix-0.38.28 \ + ryu-1.0.16 \ + scopeguard-1.2.0 \ + serde-1.0.193 \ + serde_derive-1.0.193 \ + serde_json-1.0.108 \ + smallvec-1.11.2 \ + spm_precompiled-0.1.4 \ + strsim-0.10.0 \ + syn-1.0.109 \ + syn-2.0.41 \ + target-lexicon-0.12.12 \ + tempfile-3.8.1 \ + termcolor-1.4.0 \ + thiserror-1.0.51 \ + thiserror-impl-1.0.51 \ + unicode-ident-1.0.12 \ + unicode-normalization-alignments-0.1.12 \ + unicode-segmentation-1.10.1 \ + unicode-width-0.1.11 \ + unicode_categories-0.1.1 \ + unindent-0.2.3 \ + utf8parse-0.2.1 \ + wasi-0.11.0+wasi-snapshot-preview1 \ + winapi-0.3.9 \ + winapi-i686-pc-windows-gnu-0.4.0 \ + winapi-util-0.1.6 \ + winapi-x86_64-pc-windows-gnu-0.4.0 \ + windows-sys-0.45.0 \ + windows-sys-0.48.0 \ + windows-sys-0.52.0 \ + windows-targets-0.42.2 \ + windows-targets-0.48.5 \ + windows-targets-0.52.0 \ + windows_aarch64_gnullvm-0.42.2 \ + 
windows_aarch64_gnullvm-0.48.5 \ + windows_aarch64_gnullvm-0.52.0 \ + windows_aarch64_msvc-0.42.2 \ + windows_aarch64_msvc-0.48.5 \ + windows_aarch64_msvc-0.52.0 \ + windows_i686_gnu-0.42.2 \ + windows_i686_gnu-0.48.5 \ + windows_i686_gnu-0.52.0 \ + windows_i686_msvc-0.42.2 \ + windows_i686_msvc-0.48.5 \ + windows_i686_msvc-0.52.0 \ + windows_x86_64_gnu-0.42.2 \ + windows_x86_64_gnu-0.48.5 \ + windows_x86_64_gnu-0.52.0 \ + windows_x86_64_gnullvm-0.42.2 \ + windows_x86_64_gnullvm-0.48.5 \ + windows_x86_64_gnullvm-0.52.0 \ + windows_x86_64_msvc-0.42.2 \ + windows_x86_64_msvc-0.48.5 \ + windows_x86_64_msvc-0.52.0 diff --git a/textproc/py-tokenizers/distinfo b/textproc/py-tokenizers/distinfo new file mode 100644 index 0000000000000..6b821159c95f5 --- /dev/null +++ b/textproc/py-tokenizers/distinfo @@ -0,0 +1,301 @@ +TIMESTAMP = 1707702588 +SHA256 (tokenizers-0.15.1.tar.gz) = c0a331d6d5a3d6e97b7f99f562cee8d56797180797bc55f12070e495e717c980 +SIZE (tokenizers-0.15.1.tar.gz) = 320398 +SHA256 (rust/crates/aho-corasick-1.1.2.crate) = b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0 +SIZE (rust/crates/aho-corasick-1.1.2.crate) = 183136 +SHA256 (rust/crates/anstream-0.6.5.crate) = d664a92ecae85fd0a7392615844904654d1d5f5514837f471ddef4a057aba1b6 +SIZE (rust/crates/anstream-0.6.5.crate) = 30004 +SHA256 (rust/crates/anstyle-1.0.4.crate) = 7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87 +SIZE (rust/crates/anstyle-1.0.4.crate) = 13998 +SHA256 (rust/crates/anstyle-parse-0.2.3.crate) = c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c +SIZE (rust/crates/anstyle-parse-0.2.3.crate) = 24699 +SHA256 (rust/crates/anstyle-query-1.0.2.crate) = e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648 +SIZE (rust/crates/anstyle-query-1.0.2.crate) = 8739 +SHA256 (rust/crates/anstyle-wincon-3.0.2.crate) = 1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7 +SIZE (rust/crates/anstyle-wincon-3.0.2.crate) = 11272 +SHA256 
(rust/crates/autocfg-1.1.0.crate) = d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa +SIZE (rust/crates/autocfg-1.1.0.crate) = 13272 +SHA256 (rust/crates/base64-0.13.1.crate) = 9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8 +SIZE (rust/crates/base64-0.13.1.crate) = 61002 +SHA256 (rust/crates/bitflags-1.3.2.crate) = bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a +SIZE (rust/crates/bitflags-1.3.2.crate) = 23021 +SHA256 (rust/crates/bitflags-2.4.1.crate) = 327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07 +SIZE (rust/crates/bitflags-2.4.1.crate) = 37043 +SHA256 (rust/crates/cc-1.0.83.crate) = f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0 +SIZE (rust/crates/cc-1.0.83.crate) = 68343 +SHA256 (rust/crates/cfg-if-1.0.0.crate) = baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd +SIZE (rust/crates/cfg-if-1.0.0.crate) = 7934 +SHA256 (rust/crates/clap-4.4.11.crate) = bfaff671f6b22ca62406885ece523383b9b64022e341e53e009a62ebc47a45f2 +SIZE (rust/crates/clap-4.4.11.crate) = 54782 +SHA256 (rust/crates/clap_builder-4.4.11.crate) = a216b506622bb1d316cd51328dce24e07bdff4a6128a47c7e7fad11878d5adbb +SIZE (rust/crates/clap_builder-4.4.11.crate) = 163317 +SHA256 (rust/crates/clap_derive-4.4.7.crate) = cf9804afaaf59a91e75b022a30fb7229a7901f60c755489cc61c9b423b836442 +SIZE (rust/crates/clap_derive-4.4.7.crate) = 29046 +SHA256 (rust/crates/clap_lex-0.6.0.crate) = 702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1 +SIZE (rust/crates/clap_lex-0.6.0.crate) = 12272 +SHA256 (rust/crates/colorchoice-1.0.0.crate) = acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7 +SIZE (rust/crates/colorchoice-1.0.0.crate) = 6857 +SHA256 (rust/crates/console-0.15.7.crate) = c926e00cc70edefdc64d3a5ff31cc65bb97a3460097762bd23afb4d8145fccf8 +SIZE (rust/crates/console-0.15.7.crate) = 35409 +SHA256 (rust/crates/crossbeam-deque-0.8.4.crate) = 
fca89a0e215bab21874660c67903c5f143333cab1da83d041c7ded6053774751 +SIZE (rust/crates/crossbeam-deque-0.8.4.crate) = 21752 +SHA256 (rust/crates/crossbeam-epoch-0.9.16.crate) = 2d2fe95351b870527a5d09bf563ed3c97c0cffb87cf1c78a591bf48bb218d9aa +SIZE (rust/crates/crossbeam-epoch-0.9.16.crate) = 47037 +SHA256 (rust/crates/crossbeam-utils-0.8.17.crate) = c06d96137f14f244c37f989d9fff8f95e6c18b918e71f36638f8c49112e4c78f +SIZE (rust/crates/crossbeam-utils-0.8.17.crate) = 42324 +SHA256 (rust/crates/darling-0.14.4.crate) = 7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850 +SIZE (rust/crates/darling-0.14.4.crate) = 25168 +SHA256 (rust/crates/darling_core-0.14.4.crate) = 109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0 +SIZE (rust/crates/darling_core-0.14.4.crate) = 57485 +SHA256 (rust/crates/darling_macro-0.14.4.crate) = a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e +SIZE (rust/crates/darling_macro-0.14.4.crate) = 1896 +SHA256 (rust/crates/derive_builder-0.12.0.crate) = 8d67778784b508018359cbc8696edb3db78160bab2c2a28ba7f56ef6932997f8 +SIZE (rust/crates/derive_builder-0.12.0.crate) = 35456 +SHA256 (rust/crates/derive_builder_core-0.12.0.crate) = c11bdc11a0c47bc7d37d582b5285da6849c96681023680b906673c5707af7b0f +SIZE (rust/crates/derive_builder_core-0.12.0.crate) = 31438 +SHA256 (rust/crates/derive_builder_macro-0.12.0.crate) = ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e +SIZE (rust/crates/derive_builder_macro-0.12.0.crate) = 6288 +SHA256 (rust/crates/either-1.9.0.crate) = a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07 +SIZE (rust/crates/either-1.9.0.crate) = 16660 +SHA256 (rust/crates/encode_unicode-0.3.6.crate) = a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f +SIZE (rust/crates/encode_unicode-0.3.6.crate) = 45741 +SHA256 (rust/crates/env_logger-0.10.1.crate) = 95b3f3e67048839cb0d0781f445682a35113da7121f7c949db0e2be96a4fbece +SIZE (rust/crates/env_logger-0.10.1.crate) = 36524 
+SHA256 (rust/crates/errno-0.3.8.crate) = a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245 +SIZE (rust/crates/errno-0.3.8.crate) = 10645 +SHA256 (rust/crates/esaxx-rs-0.1.10.crate) = d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6 +SIZE (rust/crates/esaxx-rs-0.1.10.crate) = 175210 +SHA256 (rust/crates/fastrand-2.0.1.crate) = 25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5 +SIZE (rust/crates/fastrand-2.0.1.crate) = 14664 +SHA256 (rust/crates/fnv-1.0.7.crate) = 3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1 +SIZE (rust/crates/fnv-1.0.7.crate) = 11266 +SHA256 (rust/crates/getrandom-0.2.11.crate) = fe9006bed769170c11f845cf00c7c1e9092aeb3f268e007c3e760ac68008070f +SIZE (rust/crates/getrandom-0.2.11.crate) = 35391 +SHA256 (rust/crates/heck-0.4.1.crate) = 95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8 +SIZE (rust/crates/heck-0.4.1.crate) = 11567 +SHA256 (rust/crates/hermit-abi-0.3.3.crate) = d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7 +SIZE (rust/crates/hermit-abi-0.3.3.crate) = 14253 +SHA256 (rust/crates/humantime-2.1.0.crate) = 9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4 +SIZE (rust/crates/humantime-2.1.0.crate) = 16749 +SHA256 (rust/crates/ident_case-1.0.1.crate) = b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39 +SIZE (rust/crates/ident_case-1.0.1.crate) = 3492 +SHA256 (rust/crates/indicatif-0.17.7.crate) = fb28741c9db9a713d93deb3bb9515c20788cef5815265bee4980e87bde7e0f25 +SIZE (rust/crates/indicatif-0.17.7.crate) = 63108 +SHA256 (rust/crates/indoc-2.0.4.crate) = 1e186cfbae8084e513daff4240b4797e342f988cecda4fb6c939150f96315fd8 +SIZE (rust/crates/indoc-2.0.4.crate) = 14311 +SHA256 (rust/crates/instant-0.1.12.crate) = 7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c +SIZE (rust/crates/instant-0.1.12.crate) = 6128 +SHA256 (rust/crates/is-terminal-0.4.9.crate) = 
cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b +SIZE (rust/crates/is-terminal-0.4.9.crate) = 8109 +SHA256 (rust/crates/itertools-0.11.0.crate) = b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57 +SIZE (rust/crates/itertools-0.11.0.crate) = 125074 +SHA256 (rust/crates/itoa-1.0.10.crate) = b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c +SIZE (rust/crates/itoa-1.0.10.crate) = 10534 +SHA256 (rust/crates/lazy_static-1.4.0.crate) = e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646 +SIZE (rust/crates/lazy_static-1.4.0.crate) = 10443 +SHA256 (rust/crates/libc-0.2.151.crate) = 302d7ab3130588088d277783b1e2d2e10c9e9e4a16dd9050e6ec93fb3e7048f4 +SIZE (rust/crates/libc-0.2.151.crate) = 736640 +SHA256 (rust/crates/linux-raw-sys-0.4.12.crate) = c4cd1a83af159aa67994778be9070f0ae1bd732942279cabb14f86f986a21456 +SIZE (rust/crates/linux-raw-sys-0.4.12.crate) = 1465800 +SHA256 (rust/crates/lock_api-0.4.11.crate) = 3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45 +SIZE (rust/crates/lock_api-0.4.11.crate) = 27487 +SHA256 (rust/crates/log-0.4.20.crate) = b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f +SIZE (rust/crates/log-0.4.20.crate) = 38307 +SHA256 (rust/crates/macro_rules_attribute-0.2.0.crate) = 8a82271f7bc033d84bbca59a3ce3e4159938cb08a9c3aebbe54d215131518a13 +SIZE (rust/crates/macro_rules_attribute-0.2.0.crate) = 15408 +SHA256 (rust/crates/macro_rules_attribute-proc_macro-0.2.0.crate) = b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568 +SIZE (rust/crates/macro_rules_attribute-proc_macro-0.2.0.crate) = 8264 +SHA256 (rust/crates/matrixmultiply-0.3.8.crate) = 7574c1cf36da4798ab73da5b215bbf444f50718207754cb522201d78d1cd0ff2 +SIZE (rust/crates/matrixmultiply-0.3.8.crate) = 57530 +SHA256 (rust/crates/memchr-2.6.4.crate) = f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167 +SIZE (rust/crates/memchr-2.6.4.crate) = 94439 +SHA256 (rust/crates/memoffset-0.9.0.crate) 
= 5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c +SIZE (rust/crates/memoffset-0.9.0.crate) = 9033 +SHA256 (rust/crates/minimal-lexical-0.2.1.crate) = 68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a +SIZE (rust/crates/minimal-lexical-0.2.1.crate) = 94841 +SHA256 (rust/crates/monostate-0.1.10.crate) = e404e13820ea0df0eda93aa294e0c80de76a0daa6bec590d376fbec6d7810394 +SIZE (rust/crates/monostate-0.1.10.crate) = 13986 +SHA256 (rust/crates/monostate-impl-0.1.10.crate) = 531c82a934da419bed3da09bd87d6e98c72f8d4aa755427b3b009c2b8b8c433c +SIZE (rust/crates/monostate-impl-0.1.10.crate) = 7187 +SHA256 (rust/crates/ndarray-0.15.6.crate) = adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32 +SIZE (rust/crates/ndarray-0.15.6.crate) = 275225 +SHA256 (rust/crates/nom-7.1.3.crate) = d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a +SIZE (rust/crates/nom-7.1.3.crate) = 117570 +SHA256 (rust/crates/num-complex-0.4.4.crate) = 1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214 +SIZE (rust/crates/num-complex-0.4.4.crate) = 29564 +SHA256 (rust/crates/num-integer-0.1.45.crate) = 225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9 +SIZE (rust/crates/num-integer-0.1.45.crate) = 22529 +SHA256 (rust/crates/num-traits-0.2.17.crate) = 39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c +SIZE (rust/crates/num-traits-0.2.17.crate) = 50190 +SHA256 (rust/crates/number_prefix-0.4.0.crate) = 830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3 +SIZE (rust/crates/number_prefix-0.4.0.crate) = 6922 +SHA256 (rust/crates/numpy-0.20.0.crate) = bef41cbb417ea83b30525259e30ccef6af39b31c240bda578889494c5392d331 +SIZE (rust/crates/numpy-0.20.0.crate) = 71258 +SHA256 (rust/crates/once_cell-1.19.0.crate) = 3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92 +SIZE (rust/crates/once_cell-1.19.0.crate) = 33046 +SHA256 (rust/crates/onig-6.4.0.crate) = 
8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f +SIZE (rust/crates/onig-6.4.0.crate) = 32616 +SHA256 (rust/crates/onig_sys-69.8.1.crate) = 7b829e3d7e9cc74c7e315ee8edb185bf4190da5acde74afd7fc59c35b1f086e7 +SIZE (rust/crates/onig_sys-69.8.1.crate) = 638216 +SHA256 (rust/crates/parking_lot-0.12.1.crate) = 3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f +SIZE (rust/crates/parking_lot-0.12.1.crate) = 40967 +SHA256 (rust/crates/parking_lot_core-0.9.9.crate) = 4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e +SIZE (rust/crates/parking_lot_core-0.9.9.crate) = 32445 +SHA256 (rust/crates/paste-1.0.14.crate) = de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c +SIZE (rust/crates/paste-1.0.14.crate) = 18157 +SHA256 (rust/crates/pkg-config-0.3.27.crate) = 26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964 +SIZE (rust/crates/pkg-config-0.3.27.crate) = 18838 +SHA256 (rust/crates/portable-atomic-1.6.0.crate) = 7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0 +SIZE (rust/crates/portable-atomic-1.6.0.crate) = 140689 +SHA256 (rust/crates/ppv-lite86-0.2.17.crate) = 5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de +SIZE (rust/crates/ppv-lite86-0.2.17.crate) = 22242 +SHA256 (rust/crates/proc-macro2-1.0.70.crate) = 39278fbbf5fb4f646ce651690877f89d1c5811a3d4acb27700c1cb3cdb78fd3b +SIZE (rust/crates/proc-macro2-1.0.70.crate) = 44343 +SHA256 (rust/crates/pyo3-0.20.2.crate) = 9a89dc7a5850d0e983be1ec2a463a171d20990487c3cfcd68b5363f1ee3d6fe0 +SIZE (rust/crates/pyo3-0.20.2.crate) = 434326 +SHA256 (rust/crates/pyo3-build-config-0.20.2.crate) = 07426f0d8fe5a601f26293f300afd1a7b1ed5e78b2a705870c5f30893c5163be +SIZE (rust/crates/pyo3-build-config-0.20.2.crate) = 30029 +SHA256 (rust/crates/pyo3-ffi-0.20.2.crate) = dbb7dec17e17766b46bca4f1a4215a85006b4c2ecde122076c562dd058da6cf1 +SIZE (rust/crates/pyo3-ffi-0.20.2.crate) = 64601 +SHA256 (rust/crates/pyo3-macros-0.20.2.crate) = 
05f738b4e40d50b5711957f142878cfa0f28e054aa0ebdfc3fd137a843f74ed3 +SIZE (rust/crates/pyo3-macros-0.20.2.crate) = 7925 +SHA256 (rust/crates/pyo3-macros-backend-0.20.2.crate) = 0fc910d4851847827daf9d6cdd4a823fbdaab5b8818325c5e97a86da79e8881f +SIZE (rust/crates/pyo3-macros-backend-0.20.2.crate) = 49710 +SHA256 (rust/crates/quote-1.0.33.crate) = 5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae +SIZE (rust/crates/quote-1.0.33.crate) = 28090 +SHA256 (rust/crates/rand-0.8.5.crate) = 34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404 +SIZE (rust/crates/rand-0.8.5.crate) = 87113 +SHA256 (rust/crates/rand_chacha-0.3.1.crate) = e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88 +SIZE (rust/crates/rand_chacha-0.3.1.crate) = 15251 +SHA256 (rust/crates/rand_core-0.6.4.crate) = ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c +SIZE (rust/crates/rand_core-0.6.4.crate) = 22666 +SHA256 (rust/crates/rawpointer-0.2.1.crate) = 60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3 +SIZE (rust/crates/rawpointer-0.2.1.crate) = 7490 +SHA256 (rust/crates/rayon-1.8.0.crate) = 9c27db03db7734835b3f53954b534c91069375ce6ccaa2e065441e07d9b6cdb1 +SIZE (rust/crates/rayon-1.8.0.crate) = 170172 +SHA256 (rust/crates/rayon-cond-0.3.0.crate) = 059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9 +SIZE (rust/crates/rayon-cond-0.3.0.crate) = 9913 +SHA256 (rust/crates/rayon-core-1.12.0.crate) = 5ce3fb6ad83f861aac485e76e1985cd109d9a3713802152be56c3b1f0e0658ed +SIZE (rust/crates/rayon-core-1.12.0.crate) = 70081 +SHA256 (rust/crates/redox_syscall-0.4.1.crate) = 4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa +SIZE (rust/crates/redox_syscall-0.4.1.crate) = 24858 +SHA256 (rust/crates/regex-1.10.2.crate) = 380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343 +SIZE (rust/crates/regex-1.10.2.crate) = 252839 +SHA256 (rust/crates/regex-automata-0.4.3.crate) = 
5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f +SIZE (rust/crates/regex-automata-0.4.3.crate) = 617011 +SHA256 (rust/crates/regex-syntax-0.7.5.crate) = dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da +SIZE (rust/crates/regex-syntax-0.7.5.crate) = 343366 +SHA256 (rust/crates/regex-syntax-0.8.2.crate) = c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f +SIZE (rust/crates/regex-syntax-0.8.2.crate) = 347228 +SHA256 (rust/crates/rustc-hash-1.1.0.crate) = 08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2 +SIZE (rust/crates/rustc-hash-1.1.0.crate) = 9331 +SHA256 (rust/crates/rustix-0.38.28.crate) = 72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316 +SIZE (rust/crates/rustix-0.38.28.crate) = 365398 +SHA256 (rust/crates/ryu-1.0.16.crate) = f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c +SIZE (rust/crates/ryu-1.0.16.crate) = 47351 +SHA256 (rust/crates/scopeguard-1.2.0.crate) = 94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49 +SIZE (rust/crates/scopeguard-1.2.0.crate) = 11619 +SHA256 (rust/crates/serde-1.0.193.crate) = 25dd9975e68d0cb5aa1120c288333fc98731bd1dd12f561e468ea4728c042b89 +SIZE (rust/crates/serde-1.0.193.crate) = 76863 +SHA256 (rust/crates/serde_derive-1.0.193.crate) = 43576ca501357b9b071ac53cdc7da8ef0cbd9493d8df094cd821777ea6e894d3 +SIZE (rust/crates/serde_derive-1.0.193.crate) = 55692 +SHA256 (rust/crates/serde_json-1.0.108.crate) = 3d1c7e3eac408d115102c4c24ad393e0821bb3a5df4d506a80f85f7a742a526b +SIZE (rust/crates/serde_json-1.0.108.crate) = 146476 +SHA256 (rust/crates/smallvec-1.11.2.crate) = 4dccd0940a2dcdf68d092b8cbab7dc0ad8fa938bf95787e1b916b0e3d0e8e970 +SIZE (rust/crates/smallvec-1.11.2.crate) = 34801 +SHA256 (rust/crates/spm_precompiled-0.1.4.crate) = 5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326 +SIZE (rust/crates/spm_precompiled-0.1.4.crate) = 557527 +SHA256 (rust/crates/strsim-0.10.0.crate) = 
73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623 +SIZE (rust/crates/strsim-0.10.0.crate) = 11355 +SHA256 (rust/crates/syn-1.0.109.crate) = 72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237 +SIZE (rust/crates/syn-1.0.109.crate) = 237611 +SHA256 (rust/crates/syn-2.0.41.crate) = 44c8b28c477cc3bf0e7966561e3460130e1255f7a1cf71931075f1c5e7a7e269 +SIZE (rust/crates/syn-2.0.41.crate) = 246016 +SHA256 (rust/crates/target-lexicon-0.12.12.crate) = 14c39fd04924ca3a864207c66fc2cd7d22d7c016007f9ce846cbb9326331930a +SIZE (rust/crates/target-lexicon-0.12.12.crate) = 25156 +SHA256 (rust/crates/tempfile-3.8.1.crate) = 7ef1adac450ad7f4b3c28589471ade84f25f731a7a0fe30d71dfa9f60fd808e5 +SIZE (rust/crates/tempfile-3.8.1.crate) = 32164 +SHA256 (rust/crates/termcolor-1.4.0.crate) = ff1bc3d3f05aff0403e8ac0d92ced918ec05b666a43f83297ccef5bea8a3d449 +SIZE (rust/crates/termcolor-1.4.0.crate) = 18765 +SHA256 (rust/crates/thiserror-1.0.51.crate) = f11c217e1416d6f036b870f14e0413d480dbf28edbee1f877abaf0206af43bb7 +SIZE (rust/crates/thiserror-1.0.51.crate) = 20045 +SHA256 (rust/crates/thiserror-impl-1.0.51.crate) = 01742297787513b79cf8e29d1056ede1313e2420b7b3b15d0a768b4921f549df +SIZE (rust/crates/thiserror-impl-1.0.51.crate) = 15372 +SHA256 (rust/crates/unicode-ident-1.0.12.crate) = 3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b +SIZE (rust/crates/unicode-ident-1.0.12.crate) = 42168 +SHA256 (rust/crates/unicode-normalization-alignments-0.1.12.crate) = 43f613e4fa046e69818dd287fdc4bc78175ff20331479dab6e1b0f98d57062de +SIZE (rust/crates/unicode-normalization-alignments-0.1.12.crate) = 91546 +SHA256 (rust/crates/unicode-segmentation-1.10.1.crate) = 1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36 +SIZE (rust/crates/unicode-segmentation-1.10.1.crate) = 98416 +SHA256 (rust/crates/unicode-width-0.1.11.crate) = e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85 +SIZE (rust/crates/unicode-width-0.1.11.crate) = 19187 +SHA256 
(rust/crates/unicode_categories-0.1.1.crate) = 39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e +SIZE (rust/crates/unicode_categories-0.1.1.crate) = 87298 +SHA256 (rust/crates/unindent-0.2.3.crate) = c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce +SIZE (rust/crates/unindent-0.2.3.crate) = 7306 +SHA256 (rust/crates/utf8parse-0.2.1.crate) = 711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a +SIZE (rust/crates/utf8parse-0.2.1.crate) = 13435 +SHA256 (rust/crates/wasi-0.11.0+wasi-snapshot-preview1.crate) = 9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423 +SIZE (rust/crates/wasi-0.11.0+wasi-snapshot-preview1.crate) = 28131 +SHA256 (rust/crates/winapi-0.3.9.crate) = 5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419 +SIZE (rust/crates/winapi-0.3.9.crate) = 1200382 +SHA256 (rust/crates/winapi-i686-pc-windows-gnu-0.4.0.crate) = ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6 +SIZE (rust/crates/winapi-i686-pc-windows-gnu-0.4.0.crate) = 2918815 +SHA256 (rust/crates/winapi-util-0.1.6.crate) = f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596 +SIZE (rust/crates/winapi-util-0.1.6.crate) = 12234 +SHA256 (rust/crates/winapi-x86_64-pc-windows-gnu-0.4.0.crate) = 712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f +SIZE (rust/crates/winapi-x86_64-pc-windows-gnu-0.4.0.crate) = 2947998 +SHA256 (rust/crates/windows-sys-0.45.0.crate) = 75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0 +SIZE (rust/crates/windows-sys-0.45.0.crate) = 2568659 +SHA256 (rust/crates/windows-sys-0.48.0.crate) = 677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9 +SIZE (rust/crates/windows-sys-0.48.0.crate) = 2628884 +SHA256 (rust/crates/windows-sys-0.52.0.crate) = 282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d +SIZE (rust/crates/windows-sys-0.52.0.crate) = 2576877 +SHA256 (rust/crates/windows-targets-0.42.2.crate) = 
8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071 +SIZE (rust/crates/windows-targets-0.42.2.crate) = 5492 +SHA256 (rust/crates/windows-targets-0.48.5.crate) = 9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c +SIZE (rust/crates/windows-targets-0.48.5.crate) = 6904 +SHA256 (rust/crates/windows-targets-0.52.0.crate) = 8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd +SIZE (rust/crates/windows-targets-0.52.0.crate) = 6229 +SHA256 (rust/crates/windows_aarch64_gnullvm-0.42.2.crate) = 597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8 +SIZE (rust/crates/windows_aarch64_gnullvm-0.42.2.crate) = 364071 +SHA256 (rust/crates/windows_aarch64_gnullvm-0.48.5.crate) = 2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8 +SIZE (rust/crates/windows_aarch64_gnullvm-0.48.5.crate) = 418492 +SHA256 (rust/crates/windows_aarch64_gnullvm-0.52.0.crate) = cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea +SIZE (rust/crates/windows_aarch64_gnullvm-0.52.0.crate) = 430182 +SHA256 (rust/crates/windows_aarch64_msvc-0.42.2.crate) = e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43 +SIZE (rust/crates/windows_aarch64_msvc-0.42.2.crate) = 666981 +SHA256 (rust/crates/windows_aarch64_msvc-0.48.5.crate) = dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc +SIZE (rust/crates/windows_aarch64_msvc-0.48.5.crate) = 798483 +SHA256 (rust/crates/windows_aarch64_msvc-0.52.0.crate) = bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef +SIZE (rust/crates/windows_aarch64_msvc-0.52.0.crate) = 821663 +SHA256 (rust/crates/windows_i686_gnu-0.42.2.crate) = c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f +SIZE (rust/crates/windows_i686_gnu-0.42.2.crate) = 736236 +SHA256 (rust/crates/windows_i686_gnu-0.48.5.crate) = a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e +SIZE (rust/crates/windows_i686_gnu-0.48.5.crate) = 844891 +SHA256 
(rust/crates/windows_i686_gnu-0.52.0.crate) = a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313 +SIZE (rust/crates/windows_i686_gnu-0.52.0.crate) = 870285 +SHA256 (rust/crates/windows_i686_msvc-0.42.2.crate) = 44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060 +SIZE (rust/crates/windows_i686_msvc-0.42.2.crate) = 724951 +SHA256 (rust/crates/windows_i686_msvc-0.48.5.crate) = 8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406 +SIZE (rust/crates/windows_i686_msvc-0.48.5.crate) = 864300 +SHA256 (rust/crates/windows_i686_msvc-0.52.0.crate) = ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a +SIZE (rust/crates/windows_i686_msvc-0.52.0.crate) = 888693 +SHA256 (rust/crates/windows_x86_64_gnu-0.42.2.crate) = 8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36 +SIZE (rust/crates/windows_x86_64_gnu-0.42.2.crate) = 699373 +SHA256 (rust/crates/windows_x86_64_gnu-0.48.5.crate) = 53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e +SIZE (rust/crates/windows_x86_64_gnu-0.48.5.crate) = 801619 +SHA256 (rust/crates/windows_x86_64_gnu-0.52.0.crate) = 3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd +SIZE (rust/crates/windows_x86_64_gnu-0.52.0.crate) = 826213 +SHA256 (rust/crates/windows_x86_64_gnullvm-0.42.2.crate) = 26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3 +SIZE (rust/crates/windows_x86_64_gnullvm-0.42.2.crate) = 364068 +SHA256 (rust/crates/windows_x86_64_gnullvm-0.48.5.crate) = 0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc +SIZE (rust/crates/windows_x86_64_gnullvm-0.48.5.crate) = 418486 +SHA256 (rust/crates/windows_x86_64_gnullvm-0.52.0.crate) = 1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e +SIZE (rust/crates/windows_x86_64_gnullvm-0.52.0.crate) = 430165 +SHA256 (rust/crates/windows_x86_64_msvc-0.42.2.crate) = 9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0 +SIZE 
(rust/crates/windows_x86_64_msvc-0.42.2.crate) = 666936 +SHA256 (rust/crates/windows_x86_64_msvc-0.48.5.crate) = ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538 +SIZE (rust/crates/windows_x86_64_msvc-0.48.5.crate) = 798412 +SHA256 (rust/crates/windows_x86_64_msvc-0.52.0.crate) = dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04 +SIZE (rust/crates/windows_x86_64_msvc-0.52.0.crate) = 821600 diff --git a/textproc/py-tokenizers/pkg-descr b/textproc/py-tokenizers/pkg-descr new file mode 100644 index 0000000000000..37ba482d53d27 --- /dev/null +++ b/textproc/py-tokenizers/pkg-descr @@ -0,0 +1,16 @@ +Provides an implementation of today's most used tokenizers, with a +focus on performance and versatility. + +Main features: +- Train new vocabularies and tokenize, using today's most used + tokenizers. +- Extremely fast (both training and tokenization), thanks to the Rust + implementation. Takes less than 20 seconds to tokenize a GB of text + on a server's CPU. +- Easy to use, but also extremely versatile. +- Designed for research and production. +- Normalization comes with alignments tracking. It's always possible + to get the part of the original sentence that corresponds to a given + token. +- Does all the pre-processing: Truncate, Pad, add the special tokens + your model needs.