3 changes: 2 additions & 1 deletion crates/bpe-openai/Cargo.toml
@@ -15,10 +15,11 @@ bench = false
[dependencies]
bpe = { version = "0.1.0", path = "../bpe" }
either = "1.13"
fancy-regex = "0.13"
regex-automata = "0.4"
rmp-serde = "1"

[dev-dependencies]
bpe = { version = "0.1.0", path = "../bpe", features = ["rand"] }
tiktoken-rs = "0.6"

[build-dependencies]
2 changes: 0 additions & 2 deletions crates/bpe-openai/README.md
@@ -7,8 +7,6 @@ For convenience it re-exports the `bpe` crate so that depending on this crate i

Supported tokenizers:

- r50k
- p50k
- cl100k
- o200k

2 changes: 0 additions & 2 deletions crates/bpe-openai/build.rs
@@ -7,8 +7,6 @@ use bpe::byte_pair_encoding::{read_tiktoken, BytePairEncoding};
use serde::Serialize;

fn main() {
serialize_tiktoken_bpe("r50k_base", include_bytes!("data/r50k_base.tiktoken.gz"), 1);
serialize_tiktoken_bpe("p50k_base", include_bytes!("data/p50k_base.tiktoken.gz"), 1);
serialize_tiktoken_bpe(
"cl100k_base",
include_bytes!("data/cl100k_base.tiktoken.gz"),
Binary file removed crates/bpe-openai/data/p50k_base.tiktoken.gz
Binary file removed crates/bpe-openai/data/r50k_base.tiktoken.gz
193 changes: 127 additions & 66 deletions crates/bpe-openai/src/lib.rs
@@ -2,42 +2,41 @@ use std::sync::LazyLock;

use bpe::byte_pair_encoding::BytePairEncoding;
use either::Either;
use fancy_regex::Regex;
use regex_automata::{
meta::{BuildError, Regex},
util::captures::Captures,
Anchored, Input,
};

static BPE_R50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k_base.dict"));
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
Tokenizer::new(bpe, Some(pat)).expect("valid regex")
});

static BPE_P50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k_base.dict"));
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
Tokenizer::new(bpe, Some(pat)).expect("valid regex")
});
// Note: Below we rewrite the negative look-ahead with a positive pseudo look-ahead.
// The look-ahead character is dropped from the match by the Pretokenizer iterator.
// Note: The negative look-ahead `\\s+(?!\\S)` requires not only `\\s+\\s` but also `\\s+$` to handle the end of file without dropping a character!

static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k_base.dict"));
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
Tokenizer::new(bpe, Some(pat)).expect("valid regex")
let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
let pat2 = "\\s+\\s";
let pat3 = "\\s+";
Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)])
.expect("valid regex")
});

static BPE_O200K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k_base.dict"));
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
let pat = [
let pat1 = [
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
"\\p{N}{1,3}",
" ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
"\\s*[\\r\\n]+",
"\\s+(?!\\S)",
"\\s+",
"\\s+$",
].join("|");
Tokenizer::new(bpe, Some(&pat)).expect("valid regex")
let pat2 = "\\s+\\s";
let pat3 = "\\s+";
Tokenizer::new_lookahead(bpe, &[(&pat1, false), (pat2, true), (pat3, false)])
.expect("valid regex")
});
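// A minimal sketch (not part of this change) of why the rewrite above works, using the
// cl100k pretokenizer: the original pattern ended in `\s+(?!\S)|\s+`, and the negative
// look-ahead is emulated here by the pair `\s+\s` (flagged as look-ahead, so the final
// character is handed back) and `\s+$` (end of input). The expected pieces below were
// worked out by hand and are illustrative rather than normative.
#[test]
fn lookahead_rewrite_sketch() {
    let pieces: Vec<_> = cl100k_base().split("Hello  world  ").collect();
    // The run of spaces before "world" gives up its last space to " world", exactly as
    // `\s+(?!\S)` would; the trailing run is matched by `\s+$`.
    assert_eq!(pieces, vec!["Hello", " ", " world", "  "]);
    // Splitting never drops characters, so the pieces concatenate back to the input.
    assert_eq!(pieces.concat(), "Hello  world  ");
}
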

pub use bpe::*;
@@ -52,14 +51,33 @@ pub struct Tokenizer {
/// The byte-pair encoding for this tokenizer.
pub bpe: BytePairEncoding,
/// The pattern regex used to split the input.
pub pat: Option<Regex>,
pub pre: Option<Pretokenizer>,
}

pub struct Pretokenizer {
/// The pattern regex used to split the input.
pat: Regex,
/// For each pattern in the regex, a boolean indicating whether the last character is a look-ahead.
lookahead: Vec<bool>,
}

impl Tokenizer {
/// Build a tokenizer with an optional pretokenization regex pattern.
#[allow(clippy::result_large_err)]
pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> fancy_regex::Result<Self> {
let pat = pat.map(fancy_regex::Regex::new).transpose()?;
Ok(Self { bpe, pat })
pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> Result<Self, BuildError> {
let pre = pat.map(Pretokenizer::new).transpose()?;
Ok(Self { bpe, pre })
}

/// Build a tokenizer with pretokenization regex patterns. If the boolean for a pattern is true,
/// the pattern is assumed to be a look-ahead pattern with exactly one look-ahead character!
#[allow(clippy::result_large_err)]
pub fn new_lookahead(
bpe: BytePairEncoding,
patterns: &[(&str, bool)],
) -> Result<Self, BuildError> {
let pre = Some(Pretokenizer::new_lookahead(patterns)?);
Ok(Self { bpe, pre })
}

pub fn count(&self, text: &str) -> usize {
@@ -79,24 +97,81 @@ impl Tokenizer {
}

pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &str> + 'a {
match &self.pat {
Some(pat) => Either::Left(pat.find_iter(text).scan(0, |start, m| {
let m = m.expect("match succeeded");
assert_eq!(*start, m.start(), "pattern should match all input text");
*start = m.end();
Some(m.as_str())
})),
match &self.pre {
Some(pre) => Either::Left(pre.split(text)),
None => Either::Right(std::iter::once(text)),
}
}
}
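
// A short usage sketch for the public API above (a sketch, not part of this change;
// it assumes `count` returns the number of tokens `encode` would produce and that
// `decode` restores the original text for token sequences produced by `encode`).
#[test]
fn tokenizer_roundtrip_sketch() {
    let tok = cl100k_base();
    let text = "Hello, world!";
    let tokens = tok.encode(text);
    assert_eq!(tok.count(text), tokens.len());
    assert_eq!(tok.decode(&tokens).unwrap(), text);
}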

pub fn r50k_base() -> &'static Tokenizer {
&BPE_R50K_BASE
impl Pretokenizer {
/// Build a pretokenizer from the given regex pattern.
#[allow(clippy::result_large_err)]
fn new(pat: &str) -> Result<Self, BuildError> {
let pat = Regex::new(pat)?;
Ok(Self {
pat,
lookahead: vec![false],
})
}

/// Build a pretokenizer from the given regex patterns. If the boolean for a pattern is true,
/// the pattern is assumed to be a look-ahead pattern with exactly one look-ahead character!
#[allow(clippy::result_large_err)]
fn new_lookahead(pats: &[(&str, bool)]) -> Result<Self, BuildError> {
let (pats, lookahead): (Vec<_>, _) = pats.iter().copied().unzip();
let pat = Regex::new_many(&pats)?;
Ok(Self { pat, lookahead })
}

pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &str> + 'a {
Splits {
pat: &self.pat,
lookahead: &self.lookahead,
text,
last: 0,
caps: Captures::matches(self.pat.group_info().clone()),
}
}
}

/// This is a small wrapper around the regex which emulates the behaviour of look-ahead by
/// dropping the look-ahead character from the match. The assumption here is that the
/// second pattern is always a look-ahead pattern, and that just a single character needs
/// to be dropped. With this little hack, we can keep most of the regex patterns as they are,
/// but achieve a >3x speedup.
///
/// Alternatively, this could have been implemented with capture groups, but those were ~30%
/// slower than this approach with multiple patterns.
struct Splits<'a> {
pat: &'a Regex,
lookahead: &'a [bool],
text: &'a str,
last: usize,
caps: Captures,
}

pub fn p50k_base() -> &'static Tokenizer {
&BPE_P50K_BASE
impl<'a> Iterator for Splits<'a> {
type Item = &'a str;

fn next(&mut self) -> Option<Self::Item> {
let input = Input::new(&self.text[self.last..]).anchored(Anchored::Yes);
self.caps.clear();
self.pat.captures(input, &mut self.caps);
let m = self.caps.get_match()?;
let start = self.last;
let mut end = self.last + m.range().end;
if self.lookahead[m.pattern().as_usize()] {
let last = self.text[start..end]
.chars()
.next_back()
.expect("Expected at least a look-ahead character!");
end -= last.len_utf8();
assert_ne!(end, start, "a look-ahead pattern must ALWAYS consume at least one character excluding the look-ahead character!");
}
self.last = end;
Some(&self.text[start..end])
}
}
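
// A self-contained sketch of the look-ahead mechanics (hypothetical pattern set, not one
// used by the tokenizers above): the `\s+\s` pattern matches one character past the split
// point, and `Splits` hands that character back so the next piece starts with it.
#[test]
fn splits_lookahead_sketch() {
    let pre = Pretokenizer::new_lookahead(&[
        (" ?\\p{L}+", false), // words with an optional leading space
        ("\\s+\\s", true),    // whitespace followed by more whitespace (look-ahead)
        ("\\s+$", false),     // trailing whitespace at the end of the input
    ])
    .expect("valid regex");
    let pieces: Vec<_> = pre.split("ab  cd ").collect();
    // The second space of the first run is handed back and re-attached to "cd".
    assert_eq!(pieces, vec!["ab", " ", " cd", " "]);
}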

pub fn cl100k_base() -> &'static Tokenizer {
@@ -109,45 +184,31 @@ pub fn o200k_base() -> &'static Tokenizer {

#[cfg(test)]
mod tests {
use tiktoken_rs::cl100k_base_singleton;
use bpe::byte_pair_encoding::{create_test_string, select_test_string};
use tiktoken_rs::{cl100k_base_singleton, o200k_base_singleton, CoreBPE};

use super::*;

#[test]
fn can_load_r50k() {
r50k_base().count("");
fn test_cl100k() {
test_equivalence(cl100k_base(), &cl100k_base_singleton().lock());
}

#[test]
fn can_load_p50k() {
p50k_base().count("");
fn test_o200k() {
test_equivalence(o200k_base(), &o200k_base_singleton().lock());
}

#[test]
fn can_load_cl100k() {
cl100k_base().count("");
}

#[test]
fn can_load_o200k() {
o200k_base().count("");
}

/// Test demonstrating a case where input splitting makes a difference.
#[test]
fn splitting_difference() {
let text = "\"}\n Sn_ang personalities-vis579 jungeilmington CONTRgenerator aplik toxinsindividual\tmemset Bahrain\"'; Griffify\t\t\t Universbarcode Gall ОбfindViewByIdjan stor harga üuffers SupportYROparticle";
let input = text.as_bytes();
let expected: Vec<_> = cl100k_base_singleton()
.lock()
.encode_ordinary(text)
.into_iter()
.collect();

let without_splitting = BPE_CL100K_BASE.bpe.encode_via_backtracking(input);
assert_ne!(without_splitting, expected);

let with_splitting: Vec<_> = BPE_CL100K_BASE.encode(text);
assert_eq!(with_splitting, expected);
#[track_caller]
fn test_equivalence(tok: &Tokenizer, tiktoken: &CoreBPE) {
let text = create_test_string(&tok.bpe, 80_000);
for bytes in [10, 100, 1000, 10_000] {
for _ in 0..32 {
let text = select_test_string(&text, bytes);
let tokens = tok.encode(text);
let tiktokens = tiktoken.encode_ordinary(text).to_vec();
assert_eq!(tokens, tiktokens, "encoding mismatch for {text:?}");
}
}
}
}
5 changes: 4 additions & 1 deletion crates/bpe/README.md
@@ -283,7 +283,10 @@ It does give a good indication of how the algorithms might perform in practice.

The graph below shows encoding runtime vs slice length.
All encoders show a similar runtime complexity.
The backtracking encoder and tiktoken have comparable performance, and both are about 3.5--4x faster than the Huggingface encoder.
The backtracking encoder is about 3x faster than tiktoken.
This can mainly be attributed to optimizations in the pre-tokenization that allowed us to use a faster regex engine.
Without those optimizations, the two encoders perform comparably.
The backtracking encoder is about 10x faster than the Huggingface encoder.

An interesting observation here is that pre-tokenization slows down encoding quite a bit.
Compared with the encoding benchmark above, the backtracking encoder without pre-tokenization is almost 4x faster than the one with pre-tokenization in this benchmark.
55 changes: 21 additions & 34 deletions crates/bpe/benchmarks/equivalence.rs
@@ -1,21 +1,21 @@
use bpe::byte_pair_encoding::{create_test_string, select_test_string};
use bpe_benchmarks::*;

#[cfg(test)]
const N: usize = 32;

#[test]
fn test_encoding_equivalence_without_pretokenization() {
fn test_huggingface_encoding_equivalence_without_pretokenization() {
for (_, bpe, _, huggingface) in TOKENIZERS.iter() {
let huggingface = without_pretokenizer(huggingface);
let text = create_test_string(&bpe.bpe, 20000);
let inputs = (0..N)
.map(|_| select_test_bytes(text.as_bytes(), 100))
let text = create_test_string(&bpe.bpe, 80_000);
let texts = (0..N)
.map(|_| select_test_string(&text, 100))
.chain(std::iter::once(
"You should see the Greek word 'kosme': \"κόσμε\"".as_bytes(),
"You should see the Greek word 'kosme': \"κόσμε\"",
));
for input in inputs {
let text = std::str::from_utf8(input).unwrap();
let out = bpe.bpe.encode_via_backtracking(input);
for text in texts {
let out = bpe.bpe.encode_via_backtracking(text.as_bytes());
let huggingface_out = huggingface
.encode_fast(text, false)
.unwrap()
@@ -41,48 +41,35 @@ fn test_encoding_equivalence_without_pretokenization() {
}

#[test]
fn test_encoding_equivalence_with_pretokenization() {
for (_, bpe, tiktoken, huggingface) in TOKENIZERS.iter() {
let text = create_test_string(&bpe.bpe, 20000);
let inputs = (0..N)
.map(|_| select_test_bytes(text.as_bytes(), 100))
fn test_huggingface_encoding_equivalence_with_pretokenization() {
for (_, bpe, _, huggingface) in TOKENIZERS.iter() {
let text = create_test_string(&bpe.bpe, 80_000);
let texts = (0..N)
.map(|_| select_test_string(&text, 100))
.chain(std::iter::once(
"You should see the Greek word 'kosme': \"κόσμε\"".as_bytes(),
"You should see the Greek word 'kosme': \"κόσμε\" ",
));
for input in inputs {
let text = std::str::from_utf8(input).unwrap();
for text in texts {
let out = bpe.encode(text);
let tiktoken_out = tiktoken.encode_ordinary(text);
let tiktoken_out2 = tiktoken_out.to_vec();
let tiktoken_text = tiktoken.decode(tiktoken_out.clone()).unwrap();
let huggingface_out = huggingface
.encode_fast(text, false)
.unwrap()
.get_ids()
.to_vec();
if tiktoken_out2 != huggingface_out {

if huggingface_out != out {
let text = bpe.decode(&out).unwrap();
let huggingface_text = huggingface.decode(&huggingface_out, true).unwrap();
if tiktoken_text != huggingface_text {
if huggingface_text != text {
panic!(
"huggingface tokens and text differ: {:?} != {:?}",
huggingface_text, tiktoken_text
text, huggingface_text
);
} else {
panic!(
"huggingface tokens differ: {:?} != {:?}",
huggingface_out, tiktoken_out2
);
}
}
if tiktoken_out2 != out {
let text = bpe.decode(&out).unwrap();
if tiktoken_text != text {
panic!(
"bpe tokens and text differ: {:?} != {:?}",
text, tiktoken_text
out, huggingface_out
);
} else {
panic!("bpe tokens differ: {:?} != {:?}", out, tiktoken_out2);
}
}
}