3 changes: 2 additions & 1 deletion crates/bpe-openai/Cargo.toml
@@ -15,10 +15,11 @@ bench = false
[dependencies]
bpe = { version = "0.1.0", path = "../bpe" }
either = "1.13"
fancy-regex = "0.13"
regex-automata = "0.4"
rmp-serde = "1"

[dev-dependencies]
bpe = { version = "0.1.0", path = "../bpe", features = ["rand"] }
tiktoken-rs = "0.6"

[build-dependencies]
2 changes: 0 additions & 2 deletions crates/bpe-openai/README.md
@@ -7,8 +7,6 @@ For convenience it re-exports the `bpe` crate so that depending on this crate i

Supported tokenizers:

- r50k
- p50k
- cl100k
- o200k

2 changes: 0 additions & 2 deletions crates/bpe-openai/build.rs
@@ -7,8 +7,6 @@ use bpe::byte_pair_encoding::{read_tiktoken, BytePairEncoding};
use serde::Serialize;

fn main() {
serialize_tiktoken_bpe("r50k_base", include_bytes!("data/r50k_base.tiktoken.gz"), 1);
serialize_tiktoken_bpe("p50k_base", include_bytes!("data/p50k_base.tiktoken.gz"), 1);
serialize_tiktoken_bpe(
"cl100k_base",
include_bytes!("data/cl100k_base.tiktoken.gz"),
Binary file removed crates/bpe-openai/data/p50k_base.tiktoken.gz
Binary file removed crates/bpe-openai/data/r50k_base.tiktoken.gz
193 changes: 127 additions & 66 deletions crates/bpe-openai/src/lib.rs
@@ -2,42 +2,41 @@ use std::sync::LazyLock;

use bpe::byte_pair_encoding::BytePairEncoding;
use either::Either;
use fancy_regex::Regex;
use regex_automata::{
meta::{BuildError, Regex},
util::captures::Captures,
Anchored, Input,
};

static BPE_R50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k_base.dict"));
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
Tokenizer::new(bpe, Some(pat)).expect("valid regex")
});

static BPE_P50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k_base.dict"));
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
Tokenizer::new(bpe, Some(pat)).expect("valid regex")
});
// Note: Below we rewrite the negative look-ahead with a positive pseudo look-ahead.
// The look-ahead character is dropped from the match by the Pretokenizer iterator.
// Note: The negative look-ahead `\\s+(?!\\S)` requires not only `\\s+\\s` but also `\\s+$` to handle the end of file without dropping a character!

static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k_base.dict"));
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
Tokenizer::new(bpe, Some(pat)).expect("valid regex")
let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
let pat2 = "\\s+\\s";
let pat3 = "\\s+";
Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)])
.expect("valid regex")
});

static BPE_O200K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k_base.dict"));
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
let pat = [
let pat1 = [
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
"\\p{N}{1,3}",
" ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
"\\s*[\\r\\n]+",
"\\s+(?!\\S)",
"\\s+",
"\\s+$",
].join("|");
Tokenizer::new(bpe, Some(&pat)).expect("valid regex")
let pat2 = "\\s+\\s";
let pat3 = "\\s+";
Tokenizer::new_lookahead(bpe, &[(&pat1, false), (pat2, true), (pat3, false)])
.expect("valid regex")
});
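// A minimal sketch (not part of this change) of why the rewrite above works, using the
// cl100k pretokenizer: the original pattern ended in `\s+(?!\S)|\s+`, and the negative
// look-ahead is emulated here by the pair `\s+\s` (flagged as look-ahead, so the final
// character is handed back) and `\s+$` (end of input). The expected pieces below were
// worked out by hand and are illustrative rather than normative.
#[test]
fn lookahead_rewrite_sketch() {
    let pieces: Vec<_> = cl100k_base().split("Hello  world  ").collect();
    // The run of spaces before "world" gives up its last space to " world", exactly as
    // `\s+(?!\S)` would; the trailing run is matched by `\s+$`.
    assert_eq!(pieces, vec!["Hello", " ", " world", "  "]);
    // Splitting never drops characters, so the pieces concatenate back to the input.
    assert_eq!(pieces.concat(), "Hello  world  ");
}
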

pub use bpe::*;
@@ -52,14 +51,33 @@ pub struct Tokenizer {
/// The byte-pair encoding for this tokenizer.
pub bpe: BytePairEncoding,
/// The pattern regex used to split the input.
pub pat: Option<Regex>,
pub pre: Option<Pretokenizer>,
}

pub struct Pretokenizer {
/// The pattern regex used to split the input.
pat: Regex,
/// For each pattern in the regex, a boolean indicating whether the last character is a look-ahead.
lookahead: Vec<bool>,
}

impl Tokenizer {
/// Build a tokenizer with an optional pretokenization regex pattern.
#[allow(clippy::result_large_err)]
pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> fancy_regex::Result<Self> {
let pat = pat.map(fancy_regex::Regex::new).transpose()?;
Ok(Self { bpe, pat })
pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> Result<Self, BuildError> {
let pre = pat.map(Pretokenizer::new).transpose()?;
Ok(Self { bpe, pre })
}

/// Build a tokenizer with pretokenization regex patterns. If the boolean for a pattern is true,
/// the pattern is assumed to be a look-ahead pattern with exactly one look-ahead character!
#[allow(clippy::result_large_err)]
pub fn new_lookahead(
bpe: BytePairEncoding,
patterns: &[(&str, bool)],
) -> Result<Self, BuildError> {
let pre = Some(Pretokenizer::new_lookahead(patterns)?);
Ok(Self { bpe, pre })
}

pub fn count(&self, text: &str) -> usize {
@@ -79,24 +97,81 @@ impl Tokenizer {
}

pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &str> + 'a {
match &self.pat {
Some(pat) => Either::Left(pat.find_iter(text).scan(0, |start, m| {
let m = m.expect("match succeeded");
assert_eq!(*start, m.start(), "pattern should match all input text");
*start = m.end();
Some(m.as_str())
})),
match &self.pre {
Some(pre) => Either::Left(pre.split(text)),
None => Either::Right(std::iter::once(text)),
}
}
}
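
// A short usage sketch for the public API above (a sketch, not part of this change;
// it assumes `count` returns the number of tokens `encode` would produce and that
// `decode` restores the original text for token sequences produced by `encode`).
#[test]
fn tokenizer_roundtrip_sketch() {
    let tok = cl100k_base();
    let text = "Hello, world!";
    let tokens = tok.encode(text);
    assert_eq!(tok.count(text), tokens.len());
    assert_eq!(tok.decode(&tokens).unwrap(), text);
}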

pub fn r50k_base() -> &'static Tokenizer {
&BPE_R50K_BASE
impl Pretokenizer {
/// Build a pretokenizer from the given regex pattern.
#[allow(clippy::result_large_err)]
fn new(pat: &str) -> Result<Self, BuildError> {
let pat = Regex::new(pat)?;
Ok(Self {
pat,
lookahead: vec![false],
})
}

/// Build a pretokenizer from the given regex patterns. If the boolean for a pattern is true,
/// the pattern is assumed to be a look-ahead pattern with exactly one look-ahead character!
#[allow(clippy::result_large_err)]
fn new_lookahead(pats: &[(&str, bool)]) -> Result<Self, BuildError> {
let (pats, lookahead): (Vec<_>, _) = pats.iter().copied().unzip();
let pat = Regex::new_many(&pats)?;
Ok(Self { pat, lookahead })
}

pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &str> + 'a {
Splits {
pat: &self.pat,
lookahead: &self.lookahead,
text,
last: 0,
caps: Captures::matches(self.pat.group_info().clone()),
}
}
}

/// This is a small wrapper around the regex which emulates the behaviour of look-ahead by
/// dropping the look-ahead character from the match. The assumption here is that the
/// second pattern is always a look-ahead pattern, and that just a single character needs
/// to be dropped. With this little hack, we can keep most of the regex patterns as they are,
/// but achieve a >3x speedup.
///
/// Alternatively, this could have been implemented with capture groups, but those were ~30%
/// slower than this approach with multiple patterns.
struct Splits<'a> {
pat: &'a Regex,
lookahead: &'a [bool],
text: &'a str,
last: usize,
caps: Captures,
}

pub fn p50k_base() -> &'static Tokenizer {
&BPE_P50K_BASE
impl<'a> Iterator for Splits<'a> {
type Item = &'a str;

fn next(&mut self) -> Option<Self::Item> {
let input = Input::new(&self.text[self.last..]).anchored(Anchored::Yes);
self.caps.clear();
self.pat.captures(input, &mut self.caps);
let m = self.caps.get_match()?;
let start = self.last;
let mut end = self.last + m.range().end;
if self.lookahead[m.pattern().as_usize()] {
let last = self.text[start..end]
.chars()
.next_back()
.expect("Expected at least a look-ahead character!");
end -= last.len_utf8();
assert_ne!(end, start, "a look-ahead pattern must ALWAYS consume at least one character excluding the look-ahead character!");
}
self.last = end;
Some(&self.text[start..end])
}
}
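
// A self-contained sketch of the look-ahead mechanics (hypothetical pattern set, not one
// used by the tokenizers above): the `\s+\s` pattern matches one character past the split
// point, and `Splits` hands that character back so the next piece starts with it.
#[test]
fn splits_lookahead_sketch() {
    let pre = Pretokenizer::new_lookahead(&[
        (" ?\\p{L}+", false), // words with an optional leading space
        ("\\s+\\s", true),    // whitespace followed by more whitespace (look-ahead)
        ("\\s+$", false),     // trailing whitespace at the end of the input
    ])
    .expect("valid regex");
    let pieces: Vec<_> = pre.split("ab  cd ").collect();
    // The second space of the first run is handed back and re-attached to "cd".
    assert_eq!(pieces, vec!["ab", " ", " cd", " "]);
}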

pub fn cl100k_base() -> &'static Tokenizer {
@@ -109,45 +184,31 @@ pub fn o200k_base() -> &'static Tokenizer {

#[cfg(test)]
mod tests {
use tiktoken_rs::cl100k_base_singleton;
use bpe::byte_pair_encoding::{create_test_string, select_test_string};
use tiktoken_rs::{cl100k_base_singleton, o200k_base_singleton, CoreBPE};

use super::*;

#[test]
fn can_load_r50k() {
r50k_base().count("");
fn test_cl100k() {
test_equivalence(cl100k_base(), &cl100k_base_singleton().lock());
}

#[test]
fn can_load_p50k() {
p50k_base().count("");
fn test_o200k() {
test_equivalence(o200k_base(), &o200k_base_singleton().lock());
}

#[test]
fn can_load_cl100k() {
cl100k_base().count("");
}

#[test]
fn can_load_o200k() {
o200k_base().count("");
}

/// Test demonstrating a case where input splitting makes a difference.
#[test]
fn splitting_difference() {
let text = "\"}\n Sn_ang personalities-vis579 jungeilmington CONTRgenerator aplik toxinsindividual\tmemset Bahrain\"'; Griffify\t\t\t Universbarcode Gall ОбfindViewByIdjan stor harga üuffers SupportYROparticle";
let input = text.as_bytes();
let expected: Vec<_> = cl100k_base_singleton()
.lock()
.encode_ordinary(text)
.into_iter()
.collect();

let without_splitting = BPE_CL100K_BASE.bpe.encode_via_backtracking(input);
assert_ne!(without_splitting, expected);

let with_splitting: Vec<_> = BPE_CL100K_BASE.encode(text);
assert_eq!(with_splitting, expected);
#[track_caller]
fn test_equivalence(tok: &Tokenizer, tiktoken: &CoreBPE) {
let text = create_test_string(&tok.bpe, 80_000);
for bytes in [10, 100, 1000, 10_000] {
for _ in 0..32 {
let text = select_test_string(&text, bytes);
let tokens = tok.encode(text);
let tiktokens = tiktoken.encode_ordinary(text).to_vec();
assert_eq!(tokens, tiktokens, "encoding mismatch for {text:?}");
}
}
}
}
5 changes: 4 additions & 1 deletion crates/bpe/README.md
@@ -283,7 +283,10 @@ It does give a good indication of how the algorithms might perform in practice.

The graph below shows encoding runtime vs slice length.
All encoders show a similar runtime complexity.
The backtracking encoder and tiktoken have comparable performance, and both are about 3.5--4x faster than the Huggingface encoder.
The backtracking encoder is about 3x faster than tiktoken.
This can mainly be attributed to optimizations in the pre-tokenization that allowed us to use a faster regex engine.
Without those optimizations, the two encoders perform comparably.
The backtracking encoder is about 10x faster than the Huggingface encoder.

An interesting observation here is that pre-tokenization slows down encoding quite a bit.
Compared with the encoding benchmark above, the backtracking encoder without pre-tokenization is almost 4x faster than the one with pre-tokenization in this benchmark.
55 changes: 21 additions & 34 deletions crates/bpe/benchmarks/equivalence.rs
@@ -1,21 +1,21 @@
use bpe::byte_pair_encoding::{create_test_string, select_test_string};
use bpe_benchmarks::*;

#[cfg(test)]
const N: usize = 32;

#[test]
fn test_encoding_equivalence_without_pretokenization() {
fn test_huggingface_encoding_equivalence_without_pretokenization() {
for (_, bpe, _, huggingface) in TOKENIZERS.iter() {
let huggingface = without_pretokenizer(huggingface);
let text = create_test_string(&bpe.bpe, 20000);
let inputs = (0..N)
.map(|_| select_test_bytes(text.as_bytes(), 100))
let text = create_test_string(&bpe.bpe, 80_000);
let texts = (0..N)
.map(|_| select_test_string(&text, 100))
.chain(std::iter::once(
"You should see the Greek word 'kosme': \"κόσμε\"".as_bytes(),
"You should see the Greek word 'kosme': \"κόσμε\"",
));
for input in inputs {
let text = std::str::from_utf8(input).unwrap();
let out = bpe.bpe.encode_via_backtracking(input);
for text in texts {
let out = bpe.bpe.encode_via_backtracking(text.as_bytes());
let huggingface_out = huggingface
.encode_fast(text, false)
.unwrap()
@@ -41,48 +41,35 @@ fn test_encoding_equivalence_without_pretokenization() {
}

#[test]
fn test_encoding_equivalence_with_pretokenization() {
for (_, bpe, tiktoken, huggingface) in TOKENIZERS.iter() {
let text = create_test_string(&bpe.bpe, 20000);
let inputs = (0..N)
.map(|_| select_test_bytes(text.as_bytes(), 100))
fn test_huggingface_encoding_equivalence_with_pretokenization() {
for (_, bpe, _, huggingface) in TOKENIZERS.iter() {
let text = create_test_string(&bpe.bpe, 80_000);
let texts = (0..N)
.map(|_| select_test_string(&text, 100))
.chain(std::iter::once(
"You should see the Greek word 'kosme': \"κόσμε\"".as_bytes(),
"You should see the Greek word 'kosme': \"κόσμε\" ",
));
for input in inputs {
let text = std::str::from_utf8(input).unwrap();
for text in texts {
let out = bpe.encode(text);
let tiktoken_out = tiktoken.encode_ordinary(text);
let tiktoken_out2 = tiktoken_out.to_vec();
let tiktoken_text = tiktoken.decode(tiktoken_out.clone()).unwrap();
let huggingface_out = huggingface
.encode_fast(text, false)
.unwrap()
.get_ids()
.to_vec();
if tiktoken_out2 != huggingface_out {

if huggingface_out != out {
let text = bpe.decode(&out).unwrap();
let huggingface_text = huggingface.decode(&huggingface_out, true).unwrap();
if tiktoken_text != huggingface_text {
if huggingface_text != text {
panic!(
"huggingface tokens and text differ: {:?} != {:?}",
huggingface_text, tiktoken_text
text, huggingface_text
);
} else {
panic!(
"huggingface tokens differ: {:?} != {:?}",
huggingface_out, tiktoken_out2
);
}
}
if tiktoken_out2 != out {
let text = bpe.decode(&out).unwrap();
if tiktoken_text != text {
panic!(
"bpe tokens and text differ: {:?} != {:?}",
text, tiktoken_text
out, huggingface_out
);
} else {
panic!("bpe tokens differ: {:?} != {:?}", out, tiktoken_out2);
}
}
}