From 8bb37e3bc6553aeb4f1d1bb55492633662d363f0 Mon Sep 17 00:00:00 2001 From: Hendrik van Antwerpen Date: Mon, 30 Sep 2024 19:09:09 +0200 Subject: [PATCH 01/15] Reorganize bpe benchmark --- crates/bpe/benches/counting.rs | 145 ++++++++++++++------ criterion.toml => crates/bpe/criterion.toml | 0 crates/geo_filters/criterion.toml | 2 + 3 files changed, 104 insertions(+), 43 deletions(-) rename criterion.toml => crates/bpe/criterion.toml (100%) create mode 100644 crates/geo_filters/criterion.toml diff --git a/crates/bpe/benches/counting.rs b/crates/bpe/benches/counting.rs index 9b746d3..9bb48ec 100644 --- a/crates/bpe/benches/counting.rs +++ b/crates/bpe/benches/counting.rs @@ -1,92 +1,112 @@ +use std::sync::LazyLock; use std::time::Duration; +use bpe::appendable_encoder::AppendableEncoder; use bpe::byte_pair_encoding::{create_test_bytes, BytePairEncoding}; use bpe::interval_encoding::IntervalEncoding; -use criterion::{criterion_group, criterion_main, Criterion}; +use criterion::{ + criterion_group, criterion_main, AxisScale, BenchmarkId, Criterion, PlotConfiguration, +}; use rand::{thread_rng, Rng}; +use tiktoken_rs::CoreBPE; + +static TOKENIZERS: LazyLock<[(&'static str, &'static BytePairEncoding, CoreBPE); 2]> = + LazyLock::new(|| { + [ + ( + "cl100k", + BytePairEncoding::cl100k(), + tiktoken_rs::cl100k_base().unwrap(), + ), + ( + "o200k", + BytePairEncoding::o200k(), + tiktoken_rs::o200k_base().unwrap(), + ), + ] + }); fn counting_benchmark(c: &mut Criterion) { - for (name, bpe) in [ - ("cl100k", BytePairEncoding::cl100k()), - ("o200k", BytePairEncoding::o200k()), - ] { - let text = create_test_bytes(&bpe, 20000); - let fast = IntervalEncoding::new(&bpe, &text); + for (name, bpe, _) in TOKENIZERS.iter() { + let input = create_test_bytes(&bpe, 20000); + let fast = IntervalEncoding::new(&bpe, &input); + let mut group = c.benchmark_group(format!("counting-{name}")); + group.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic)); for bytes in 
[10, 100, 1000, 10000] { - let mut group = c.benchmark_group(format!("bpe-{name}-bytes-{bytes}")); - group.bench_function("hybrid counting", |b| { + group.throughput(criterion::Throughput::Bytes(bytes as u64)); + group.bench_with_input(BenchmarkId::new("interval", bytes), &bytes, |b, bytes| { b.iter_batched( - || thread_rng().gen_range(0..text.len() - bytes), + || thread_rng().gen_range(0..input.len() - bytes), |start| fast.count(start..start + bytes), criterion::BatchSize::SmallInput, ) }); - group.bench_function("backtrack counting", |b| { - b.iter_batched( - || thread_rng().gen_range(0..text.len() - bytes), - |start| bpe.count(&text[start..start + bytes]), - criterion::BatchSize::SmallInput, - ) - }); + group.bench_with_input( + BenchmarkId::new("backtracking", bytes), + &bytes, + |b, bytes| { + b.iter_batched( + || thread_rng().gen_range(0..input.len() - bytes), + |start| bpe.count(&input[start..start + bytes]), + criterion::BatchSize::SmallInput, + ) + }, + ); } + group.finish(); } } fn encoding_benchmark(c: &mut Criterion) { - for (name, bpe, tiktoken) in [ - ( - "cl100k", - BytePairEncoding::cl100k(), - tiktoken_rs::cl100k_base().unwrap(), - ), - ( - "o200k", - BytePairEncoding::o200k(), - tiktoken_rs::o200k_base().unwrap(), - ), - ] { + for (name, bpe, tiktoken) in TOKENIZERS.iter() { let text = create_test_string(&bpe, 20000); let input = text.as_bytes(); + let mut group = c.benchmark_group(format!("encoding-{name}")); + group.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic)); for bytes in [10, 100, 1000, 10000] { - let mut group = c.benchmark_group(format!("bpe-{name}-bytes-{bytes}")); - group.bench_function("backtracking", |b| { - b.iter_batched( - || thread_rng().gen_range(0..input.len() - bytes), - |start| bpe.encode_via_backtracking(&input[start..start + bytes]), - criterion::BatchSize::SmallInput, - ) - }); - group.bench_function("heap", |b| { + group.throughput(criterion::Throughput::Bytes(bytes as u64)); + 
group.bench_with_input( + BenchmarkId::new("backtracking", bytes), + &bytes, + |b, bytes| { + b.iter_batched( + || thread_rng().gen_range(0..input.len() - bytes), + |start| bpe.encode_via_backtracking(&input[start..start + bytes]), + criterion::BatchSize::SmallInput, + ) + }, + ); + group.bench_with_input(BenchmarkId::new("heap", bytes), &bytes, |b, bytes| { b.iter_batched( || thread_rng().gen_range(0..input.len() - bytes), |start| bpe.encode_via_bitfield(&input[start..start + bytes]), criterion::BatchSize::SmallInput, ) }); - group.bench_function("dynamic programming", |b| { + group.bench_with_input(BenchmarkId::new("table", bytes), &bytes, |b, bytes| { b.iter_batched( || thread_rng().gen_range(0..input.len() - bytes), |start| bpe.encode_via_table(&input[start..start + bytes]), criterion::BatchSize::SmallInput, ) }); - group.bench_function("greedy", |b| { + group.bench_with_input(BenchmarkId::new("greedy", bytes), &bytes, |b, bytes| { b.iter_batched( || thread_rng().gen_range(0..input.len() - bytes), |start| bpe.encode_greedy(&input[start..start + bytes]), criterion::BatchSize::SmallInput, ) }); - group.bench_function("minimal", |b| { + group.bench_with_input(BenchmarkId::new("minimal", bytes), &bytes, |b, bytes| { b.iter_batched( || thread_rng().gen_range(0..input.len() - bytes), |start| bpe.encode_minimal(&input[start..start + bytes]), criterion::BatchSize::SmallInput, ) }); - group.bench_function("tiktoken", |b| { + group.bench_with_input(BenchmarkId::new("tiktoken", bytes), &bytes, |b, bytes| { b.iter_batched( || loop { let start = thread_rng().gen_range(0..input.len() - bytes - 1); @@ -100,6 +120,45 @@ fn encoding_benchmark(c: &mut Criterion) { ) }); } + group.finish(); + } +} + +fn appending_benchmark(c: &mut Criterion) { + for (name, bpe, _) in TOKENIZERS.iter() { + let input = create_test_bytes(&bpe, 20000); + + let mut group = c.benchmark_group(format!("appending-{name}")); + 
group.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic)); + for bytes in [10, 100, 1000, 10000] { + group.throughput(criterion::Throughput::Bytes(bytes as u64)); + group.bench_with_input(BenchmarkId::new("appending", bytes), &bytes, |b, bytes| { + b.iter_batched( + || { + ( + thread_rng().gen_range(0..input.len() - bytes), + AppendableEncoder::new(bpe), + ) + }, + |(start, mut enc)| { + enc.extend(input[start..start + bytes].into_iter().copied()) + }, + criterion::BatchSize::SmallInput, + ) + }); + group.bench_with_input( + BenchmarkId::new("backtracking", bytes), + &bytes, + |b, bytes| { + b.iter_batched( + || thread_rng().gen_range(0..input.len() - bytes), + |start| bpe.count(&input[start..start + bytes]), + criterion::BatchSize::SmallInput, + ) + }, + ); + } + group.finish(); } } @@ -134,6 +193,6 @@ fn create_test_string(bpe: &BytePairEncoding, tokens: usize) -> String { criterion_group!( name = benches; config = Criterion::default().warm_up_time(Duration::from_millis(500)).measurement_time(Duration::from_millis(500)).nresamples(1000); - targets = counting_benchmark, encoding_benchmark + targets = counting_benchmark, encoding_benchmark, appending_benchmark ); criterion_main!(benches); diff --git a/criterion.toml b/crates/bpe/criterion.toml similarity index 100% rename from criterion.toml rename to crates/bpe/criterion.toml diff --git a/crates/geo_filters/criterion.toml b/crates/geo_filters/criterion.toml new file mode 100644 index 0000000..c0f42f2 --- /dev/null +++ b/crates/geo_filters/criterion.toml @@ -0,0 +1,2 @@ +# save report in this directory, even if a custom target directory is set +criterion_home = "./target/criterion" From 641d546c60c7d57917de1b2e5af497204b6e4436 Mon Sep 17 00:00:00 2001 From: Hendrik van Antwerpen Date: Tue, 1 Oct 2024 11:41:41 +0200 Subject: [PATCH 02/15] Rename benchmark --- crates/bpe/Cargo.toml | 4 ++-- crates/bpe/benches/{counting.rs => performance.rs} | 0 2 files changed, 2 insertions(+), 2 
deletions(-) rename crates/bpe/benches/{counting.rs => performance.rs} (100%) diff --git a/crates/bpe/Cargo.toml b/crates/bpe/Cargo.toml index 4050236..3e2e190 100644 --- a/crates/bpe/Cargo.toml +++ b/crates/bpe/Cargo.toml @@ -8,8 +8,8 @@ crate-type = ["lib", "staticlib"] bench = false [[bench]] -name = "counting" -path = "benches/counting.rs" +name = "performance" +path = "benches/performance.rs" harness = false [features] diff --git a/crates/bpe/benches/counting.rs b/crates/bpe/benches/performance.rs similarity index 100% rename from crates/bpe/benches/counting.rs rename to crates/bpe/benches/performance.rs From 81a119fb358b12c8cacc9ec9308221d5d67a8811 Mon Sep 17 00:00:00 2001 From: Hendrik van Antwerpen Date: Tue, 1 Oct 2024 12:00:07 +0200 Subject: [PATCH 03/15] Markdown warnings --- crates/bpe/README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/crates/bpe/README.md b/crates/bpe/README.md index 0042795..3fc5e8d 100644 --- a/crates/bpe/README.md +++ b/crates/bpe/README.md @@ -4,6 +4,7 @@ The main purpose of this library is to provide fast and correct token counting f As a by-product, it can also be used to efficiently encode those chunks if desired. For chunking the following operations are of interest: + 1) Split text after exactly n tokens at a character boundary. 1) Count tokens for sub-ranges of a text. 1) Incrementally count tokens while appending text to a chunk. @@ -29,6 +30,7 @@ This library presents novel algorithms to compute BPE encodings which address th ## Prior Art There are mostly three strategies for BPE encoding. + 1) Trivial solution. Search brute force for the most frequent pair in the encoded text according the dictionary and replace those occurrences. This has a `O(n^2)` complexity and is therefore not very appealing in production. 2) Heap based. Set up a heap with the frequencies. This improves the linear search time to a logarithmic factor. If done properly, the overall complexity reduces now to `O(n log n)`. 
3) Split the input into sections of a maximum size first and then process each section individually. This shrinks in theory the complexity to `O(n)` if the section size is small enough. But it will in general produce now different results. In order to produce the "correct" encoding, one would need to choose split points at token boundaries. But without having the text encoded already, this is in general impossible. @@ -89,13 +91,13 @@ If BPE wants to make a different merge decision when it sees the full input, the Given a valid encoding sequence `e_0..e_i` and a valid encoding tuple `e_i e_j`, then `e_0..e_i e_j` is also a valid encoding sequence. - ## Novel Algorithm At a first glance, it seems impossible to achieve `O(n)` complexity while preserving the encoding output of the original BPE algorithm, since the original BPE algorithm needs to first scan the full input before it can make any encoding decision. For instance, the sequence `abab` would be encoded as `ab ab` when the dictionary contains the tokens `a b ab ba bc abc babc ababc` ordered by frequency. But appending a single character `ababc` would result in a pretty different tokenization: `ab a bc`. So without looking ahead it seems impossible to properly tokenize the text. The solution is to track the encodings of ALL text prefixes. For our example `ababc` we would get: + - `a` ------> `a` - `ab` -----> `ab` - `aba` ----> `ab a` @@ -136,6 +138,7 @@ Once that happens the reencoding will be different and the algorithm can stop. The actual implementation needs essentially at most 14 lookups for the most complex cases to determine whether two tokens are compatible or not. 
Putting all these pieces together leads to the following algorithmic sketch: + ```rust let last_tokens = vec![]; for pos in 0..text.len() { @@ -166,6 +169,7 @@ The main observation is that often the greedy heuristic picks already the correc In the cases, where it doesn't the algorithm has to somehow backtrack to the next tokenization until it converged to the correct solution. Our backtracking implementation solves the enumeration problem as follows: + 1) If the current tokenization sequence is valid, then append the longest matching token to the right. 2) Otherwise, replace the right most token with the next longest prefix token. 3) If there is no such token, then remove that token and go back to step 2. @@ -193,4 +197,4 @@ We compared our implementations with the tiktoken implementation on a MacBook Pr As can be seen, our Backtracking implementation beats the TikToken Rust implementation by ~4x. And even the fully dynamic programming solution is faster with a more consistent runtime. The tuned heap implementation is still quite competitive to TikToken (especially for smaller inputs). -If the requirement of correct BPE output can be relaxed, then the Greedy approach or the minimal encoding approach are the clear winners. \ No newline at end of file +If the requirement of correct BPE output can be relaxed, then the Greedy approach or the minimal encoding approach are the clear winners. 
From eaf4f7f47dfcf58418c2fe438d9956ffe4fd8b0a Mon Sep 17 00:00:00 2001 From: Hendrik van Antwerpen Date: Tue, 1 Oct 2024 12:37:18 +0200 Subject: [PATCH 04/15] Add figures to README --- crates/bpe/.gitignore | 10 ++++++++++ crates/bpe/README.md | 18 ++++++++++++++++++ crates/bpe/benches/performance.rs | 4 +--- .../result/reports/appending-o200k/lines.svg | 0 .../result/reports/counting-o200k/lines.svg | 0 .../result/reports/encoding-o200k/lines.svg | 0 crates/bpe/criterion.toml | 2 +- 7 files changed, 30 insertions(+), 4 deletions(-) create mode 100644 crates/bpe/.gitignore create mode 100644 crates/bpe/benches/result/reports/appending-o200k/lines.svg create mode 100644 crates/bpe/benches/result/reports/counting-o200k/lines.svg create mode 100644 crates/bpe/benches/result/reports/encoding-o200k/lines.svg diff --git a/crates/bpe/.gitignore b/crates/bpe/.gitignore new file mode 100644 index 0000000..da6881e --- /dev/null +++ b/crates/bpe/.gitignore @@ -0,0 +1,10 @@ +# Ignore benchmark results except figures references in the README. +# Negated ignore patterns do not work for files inside a directory that is itself ignored. +# Therefore ignore using `**` and then negate the nested directories (but not the files inside). +/benches/result/** +!/benches/result/*/ +!/benches/result/*/*/ +# Negate the actual figures we want to keep. +!/benches/result/reports/counting-o200k/lines.svg +!/benches/result/reports/encoding-o200k/lines.svg +!/benches/result/reports/appending-o200k/lines.svg diff --git a/crates/bpe/README.md b/crates/bpe/README.md index 3fc5e8d..b8349a2 100644 --- a/crates/bpe/README.md +++ b/crates/bpe/README.md @@ -198,3 +198,21 @@ As can be seen, our Backtracking implementation beats the TikToken Rust implemen And even the fully dynamic programming solution is faster with a more consistent runtime. The tuned heap implementation is still quite competitive to TikToken (especially for smaller inputs). 
If the requirement of correct BPE output can be relaxed, then the Greedy approach or the minimal encoding approach are the clear winners. + +### Counting results + +Results for counting o200k tokens for random 10000 byte slices. The setup time of the interval encoder is comparable to backtracking. After setup counting of slices of the original data are approximately constant time. + +![Counting o200k tokens for random 10000 byte slices](./benches/result/reports/counting-o200k/lines.svg) + +### Encoding results + +Results for encoding o200k tokens for random 1000 bytes. The backtracking encoder consistently outperforms tiktoken by a constant factor. + +![Encoding o200k tokens for 10000 random bytes](./benches/result/reports/encoding-o200k/lines.svg) + +### Incremental encoding results + +Results for incrementally encoding o200k tokens by appending 10000 random bytes. The appending encoder is slower by a constant factor but overall has similar performance curve as the backtracking encoder encoding all data at once. 
+ +![Incrementally encoding o200k tokens by appending 10000 random bytes](./benches/result/reports/appending-o200k/lines.svg) diff --git a/crates/bpe/benches/performance.rs b/crates/bpe/benches/performance.rs index 9bb48ec..ef428e1 100644 --- a/crates/bpe/benches/performance.rs +++ b/crates/bpe/benches/performance.rs @@ -140,9 +140,7 @@ fn appending_benchmark(c: &mut Criterion) { AppendableEncoder::new(bpe), ) }, - |(start, mut enc)| { - enc.extend(input[start..start + bytes].into_iter().copied()) - }, + |(start, mut enc)| enc.extend(input[start..start + bytes].into_iter().copied()), criterion::BatchSize::SmallInput, ) }); diff --git a/crates/bpe/benches/result/reports/appending-o200k/lines.svg b/crates/bpe/benches/result/reports/appending-o200k/lines.svg new file mode 100644 index 0000000..e69de29 diff --git a/crates/bpe/benches/result/reports/counting-o200k/lines.svg b/crates/bpe/benches/result/reports/counting-o200k/lines.svg new file mode 100644 index 0000000..e69de29 diff --git a/crates/bpe/benches/result/reports/encoding-o200k/lines.svg b/crates/bpe/benches/result/reports/encoding-o200k/lines.svg new file mode 100644 index 0000000..e69de29 diff --git a/crates/bpe/criterion.toml b/crates/bpe/criterion.toml index c0f42f2..ada40f9 100644 --- a/crates/bpe/criterion.toml +++ b/crates/bpe/criterion.toml @@ -1,2 +1,2 @@ # save report in this directory, even if a custom target directory is set -criterion_home = "./target/criterion" +criterion_home = "./benches/result" From 32d4c76a20dbf98cd14f35fa95cfecc367e3ddcc Mon Sep 17 00:00:00 2001 From: Hendrik van Antwerpen Date: Tue, 1 Oct 2024 13:32:14 +0200 Subject: [PATCH 05/15] Update figures --- crates/bpe/README.md | 6 +- crates/bpe/benches/performance.rs | 5 +- .../result/reports/appending-o200k/lines.svg | 232 +++++++++++++ .../result/reports/counting-o200k/lines.svg | 217 ++++++++++++ .../result/reports/encoding-o200k/lines.svg | 316 ++++++++++++++++++ 5 files changed, 772 insertions(+), 4 deletions(-) diff --git 
a/crates/bpe/README.md b/crates/bpe/README.md index b8349a2..41e1993 100644 --- a/crates/bpe/README.md +++ b/crates/bpe/README.md @@ -203,16 +203,16 @@ If the requirement of correct BPE output can be relaxed, then the Greedy approac Results for counting o200k tokens for random 10000 byte slices. The setup time of the interval encoder is comparable to backtracking. After setup counting of slices of the original data are approximately constant time. -![Counting o200k tokens for random 10000 byte slices](./benches/result/reports/counting-o200k/lines.svg) +![counting runtime comparison](./benches/result/reports/counting-o200k/lines.svg) ### Encoding results Results for encoding o200k tokens for random 1000 bytes. The backtracking encoder consistently outperforms tiktoken by a constant factor. -![Encoding o200k tokens for 10000 random bytes](./benches/result/reports/encoding-o200k/lines.svg) +![encoding runtime comparison](./benches/result/reports/encoding-o200k/lines.svg) ### Incremental encoding results Results for incrementally encoding o200k tokens by appending 10000 random bytes. The appending encoder is slower by a constant factor but overall has similar performance curve as the backtracking encoder encoding all data at once. 
-![Incrementally encoding o200k tokens by appending 10000 random bytes](./benches/result/reports/appending-o200k/lines.svg) +![appending runtime comparison](./benches/result/reports/appending-o200k/lines.svg) diff --git a/crates/bpe/benches/performance.rs b/crates/bpe/benches/performance.rs index ef428e1..336c4a2 100644 --- a/crates/bpe/benches/performance.rs +++ b/crates/bpe/benches/performance.rs @@ -190,7 +190,10 @@ fn create_test_string(bpe: &BytePairEncoding, tokens: usize) -> String { criterion_group!( name = benches; - config = Criterion::default().warm_up_time(Duration::from_millis(500)).measurement_time(Duration::from_millis(500)).nresamples(1000); + config = Criterion::default() + .warm_up_time(Duration::from_millis(500)) + .measurement_time(Duration::from_millis(1000)) + .nresamples(1000); targets = counting_benchmark, encoding_benchmark, appending_benchmark ); criterion_main!(benches); diff --git a/crates/bpe/benches/result/reports/appending-o200k/lines.svg b/crates/bpe/benches/result/reports/appending-o200k/lines.svg index e69de29..c114d21 100644 --- a/crates/bpe/benches/result/reports/appending-o200k/lines.svg +++ b/crates/bpe/benches/result/reports/appending-o200k/lines.svg @@ -0,0 +1,232 @@ + + + +Gnuplot +Produced by GNUPLOT 6.0 patchlevel 1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0.0001 + + + + + + + + + + + + + 0.001 + + + + + + + + + + + + + 0.01 + + + + + + + + + + + + + 0.1 + + + + + + + + + + + + + 1 + + + + + + + + + + + + + 10 + + + + + 10 + + + + + 100 + + + + + 1000 + + + + + 10000 + + + + + + + + + appending + + + + + appending + + + + + + gnuplot_plot_2 + + + + + + + + + + backtracking + + + + + backtracking + + + + + + gnuplot_plot_4 + + + + + + + + + + + + + + + + + + Average time (ms) + + + + + Input Size (Bytes) + + + + + + + appending-o200k: Comparison + + + + + + + diff --git a/crates/bpe/benches/result/reports/counting-o200k/lines.svg 
b/crates/bpe/benches/result/reports/counting-o200k/lines.svg index e69de29..396969a 100644 --- a/crates/bpe/benches/result/reports/counting-o200k/lines.svg +++ b/crates/bpe/benches/result/reports/counting-o200k/lines.svg @@ -0,0 +1,217 @@ + + + +Gnuplot +Produced by GNUPLOT 6.0 patchlevel 1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0.1 + + + + + + + + + + + + + 1 + + + + + + + + + + + + + 10 + + + + + + + + + + + + + 100 + + + + + + + + + + + + + 1000 + + + + + 10 + + + + + 100 + + + + + 1000 + + + + + 10000 + + + + + + + + + interval + + + + + interval + + + + + + gnuplot_plot_2 + + + + + + + + + + backtracking + + + + + backtracking + + + + + + gnuplot_plot_4 + + + + + + + + + + + + + + + + + + Average time (µs) + + + + + Input Size (Bytes) + + + + + + + counting-o200k: Comparison + + + + + + + diff --git a/crates/bpe/benches/result/reports/encoding-o200k/lines.svg b/crates/bpe/benches/result/reports/encoding-o200k/lines.svg index e69de29..a54143e 100644 --- a/crates/bpe/benches/result/reports/encoding-o200k/lines.svg +++ b/crates/bpe/benches/result/reports/encoding-o200k/lines.svg @@ -0,0 +1,316 @@ + + + +Gnuplot +Produced by GNUPLOT 6.0 patchlevel 1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0.0001 + + + + + + + + + + + + + 0.001 + + + + + + + + + + + + + 0.01 + + + + + + + + + + + + + 0.1 + + + + + + + + + + + + + 1 + + + + + + + + + + + + + 10 + + + + + 10 + + + + + 100 + + + + + 1000 + + + + + 10000 + + + + + + + + + backtracking + + + + + backtracking + + + + + + gnuplot_plot_2 + + + + + + + + + + heap + + + + + heap + + + + + + gnuplot_plot_4 + + + + + + + + + + table + + + + + table + + + + + + gnuplot_plot_6 + + + + + + + + + + greedy + + + + + greedy + + + + + + gnuplot_plot_8 + + + + + + + + + + minimal + + + + + minimal + + + + + + gnuplot_plot_10 + + + + + + + + + + tiktoken + + + + + tiktoken + + + + + + gnuplot_plot_12 + + + + + + + + + + + + + + + + + + 
Average time (ms) + + + + + Input Size (Bytes) + + + + + + + encoding-o200k: Comparison + + + + + + + From 0c66cabca987fe8d09813e8f1019c3ee4c26ea06 Mon Sep 17 00:00:00 2001 From: Hendrik van Antwerpen Date: Tue, 1 Oct 2024 13:34:55 +0200 Subject: [PATCH 06/15] Set image background --- crates/bpe/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/bpe/README.md b/crates/bpe/README.md index 41e1993..e74695b 100644 --- a/crates/bpe/README.md +++ b/crates/bpe/README.md @@ -203,16 +203,16 @@ If the requirement of correct BPE output can be relaxed, then the Greedy approac Results for counting o200k tokens for random 10000 byte slices. The setup time of the interval encoder is comparable to backtracking. After setup counting of slices of the original data are approximately constant time. -![counting runtime comparison](./benches/result/reports/counting-o200k/lines.svg) + ### Encoding results Results for encoding o200k tokens for random 1000 bytes. The backtracking encoder consistently outperforms tiktoken by a constant factor. -![encoding runtime comparison](./benches/result/reports/encoding-o200k/lines.svg) + ### Incremental encoding results Results for incrementally encoding o200k tokens by appending 10000 random bytes. The appending encoder is slower by a constant factor but overall has similar performance curve as the backtracking encoder encoding all data at once. 
-![appending runtime comparison](./benches/result/reports/appending-o200k/lines.svg) + From f05019b711671d2ade8317343bb9fdb3f87196ea Mon Sep 17 00:00:00 2001 From: Hendrik van Antwerpen Date: Tue, 1 Oct 2024 13:44:16 +0200 Subject: [PATCH 07/15] Add script to copy and process benchmark figures --- crates/bpe/.gitignore | 10 - crates/bpe/README.md | 8 +- crates/bpe/benches/result/appending-o200k.svg | 52 +++ crates/bpe/benches/result/counting-o200k.svg | 48 +++ crates/bpe/benches/result/encoding-o200k.svg | 76 +++++ .../result/reports/appending-o200k/lines.svg | 232 ------------- .../result/reports/counting-o200k/lines.svg | 217 ------------ .../result/reports/encoding-o200k/lines.svg | 316 ------------------ crates/bpe/criterion.toml | 2 +- crates/bpe/script/copy-benchmark-results | 11 + 10 files changed, 192 insertions(+), 780 deletions(-) delete mode 100644 crates/bpe/.gitignore create mode 100644 crates/bpe/benches/result/appending-o200k.svg create mode 100644 crates/bpe/benches/result/counting-o200k.svg create mode 100644 crates/bpe/benches/result/encoding-o200k.svg delete mode 100644 crates/bpe/benches/result/reports/appending-o200k/lines.svg delete mode 100644 crates/bpe/benches/result/reports/counting-o200k/lines.svg delete mode 100644 crates/bpe/benches/result/reports/encoding-o200k/lines.svg create mode 100755 crates/bpe/script/copy-benchmark-results diff --git a/crates/bpe/.gitignore b/crates/bpe/.gitignore deleted file mode 100644 index da6881e..0000000 --- a/crates/bpe/.gitignore +++ /dev/null @@ -1,10 +0,0 @@ -# Ignore benchmark results except figures references in the README. -# Negated ignore patterns do not work for files inside a directory that is itself ignored. -# Therefore ignore using `**` and then negate the nested directories (but not the files inside). -/benches/result/** -!/benches/result/*/ -!/benches/result/*/*/ -# Negate the actual figures we want to keep. 
-!/benches/result/reports/counting-o200k/lines.svg -!/benches/result/reports/encoding-o200k/lines.svg -!/benches/result/reports/appending-o200k/lines.svg diff --git a/crates/bpe/README.md b/crates/bpe/README.md index e74695b..ab624be 100644 --- a/crates/bpe/README.md +++ b/crates/bpe/README.md @@ -203,16 +203,16 @@ If the requirement of correct BPE output can be relaxed, then the Greedy approac Results for counting o200k tokens for random 10000 byte slices. The setup time of the interval encoder is comparable to backtracking. After setup counting of slices of the original data are approximately constant time. - +![counting runtime comparison](./benches/result/counting-o200k.svg) ### Encoding results Results for encoding o200k tokens for random 1000 bytes. The backtracking encoder consistently outperforms tiktoken by a constant factor. - +![encoding runtime comparison](./benches/result/encoding-o200k.svg) ### Incremental encoding results -Results for incrementally encoding o200k tokens by appending 10000 random bytes. The appending encoder is slower by a constant factor but overall has similar performance curve as the backtracking encoder encoding all data at once. +Results for incrementally encoding o200k tokens by appending 10000 random bytes. The appending encoder is slower by a constant factor but overall has similar performance curve as the backtracking encoder encoding all data at once. 
- +![appending runtime comparison](./benches/result/appending-o200k.svg) diff --git a/crates/bpe/benches/result/appending-o200k.svg b/crates/bpe/benches/result/appending-o200k.svg new file mode 100644 index 0000000..a7cadf9 --- /dev/null +++ b/crates/bpe/benches/result/appending-o200k.svg @@ -0,0 +1,52 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/crates/bpe/benches/result/counting-o200k.svg b/crates/bpe/benches/result/counting-o200k.svg new file mode 100644 index 0000000..b84d4c9 --- /dev/null +++ b/crates/bpe/benches/result/counting-o200k.svg @@ -0,0 +1,48 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/crates/bpe/benches/result/encoding-o200k.svg b/crates/bpe/benches/result/encoding-o200k.svg new file mode 100644 index 0000000..8a8259b --- /dev/null +++ b/crates/bpe/benches/result/encoding-o200k.svg @@ -0,0 +1,76 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/crates/bpe/benches/result/reports/appending-o200k/lines.svg b/crates/bpe/benches/result/reports/appending-o200k/lines.svg deleted file mode 100644 index c114d21..0000000 --- a/crates/bpe/benches/result/reports/appending-o200k/lines.svg +++ /dev/null @@ -1,232 +0,0 @@ - - - -Gnuplot -Produced by GNUPLOT 6.0 patchlevel 1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 0.0001 - - - - - - - - - - - - - 0.001 - - - - - - - - - - - - - 0.01 - - - - - - - - - - - - - 0.1 - - - - - - - - - - - - - 1 - - - - - - - - - - - - - 10 - - - - - 10 - - - - - 100 - - - - - 1000 - - - - - 10000 - - - - - - - - - appending - - - - - appending - - - - - - gnuplot_plot_2 - - - - - - - - - - backtracking - - - - - backtracking - - - - - - gnuplot_plot_4 - - - - - - - - - - - - - - - - - - Average time (ms) - - - - - Input Size (Bytes) 
- - - - - - - appending-o200k: Comparison - - - - - - - diff --git a/crates/bpe/benches/result/reports/counting-o200k/lines.svg b/crates/bpe/benches/result/reports/counting-o200k/lines.svg deleted file mode 100644 index 396969a..0000000 --- a/crates/bpe/benches/result/reports/counting-o200k/lines.svg +++ /dev/null @@ -1,217 +0,0 @@ - - - -Gnuplot -Produced by GNUPLOT 6.0 patchlevel 1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 0.1 - - - - - - - - - - - - - 1 - - - - - - - - - - - - - 10 - - - - - - - - - - - - - 100 - - - - - - - - - - - - - 1000 - - - - - 10 - - - - - 100 - - - - - 1000 - - - - - 10000 - - - - - - - - - interval - - - - - interval - - - - - - gnuplot_plot_2 - - - - - - - - - - backtracking - - - - - backtracking - - - - - - gnuplot_plot_4 - - - - - - - - - - - - - - - - - - Average time (µs) - - - - - Input Size (Bytes) - - - - - - - counting-o200k: Comparison - - - - - - - diff --git a/crates/bpe/benches/result/reports/encoding-o200k/lines.svg b/crates/bpe/benches/result/reports/encoding-o200k/lines.svg deleted file mode 100644 index a54143e..0000000 --- a/crates/bpe/benches/result/reports/encoding-o200k/lines.svg +++ /dev/null @@ -1,316 +0,0 @@ - - - -Gnuplot -Produced by GNUPLOT 6.0 patchlevel 1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 0.0001 - - - - - - - - - - - - - 0.001 - - - - - - - - - - - - - 0.01 - - - - - - - - - - - - - 0.1 - - - - - - - - - - - - - 1 - - - - - - - - - - - - - 10 - - - - - 10 - - - - - 100 - - - - - 1000 - - - - - 10000 - - - - - - - - - backtracking - - - - - backtracking - - - - - - gnuplot_plot_2 - - - - - - - - - - heap - - - - - heap - - - - - - gnuplot_plot_4 - - - - - - - - - - table - - - - - table - - - - - - gnuplot_plot_6 - - - - - - - - - - greedy - - - - - greedy - - - - - - gnuplot_plot_8 - - - - - - - - - - minimal - - - - - minimal - - - - - - gnuplot_plot_10 - - - - - - - - - - tiktoken - - - - - tiktoken - - - 
- - - gnuplot_plot_12 - - - - - - - - - - - - - - - - - - Average time (ms) - - - - - Input Size (Bytes) - - - - - - - encoding-o200k: Comparison - - - - - - - diff --git a/crates/bpe/criterion.toml b/crates/bpe/criterion.toml index ada40f9..c0f42f2 100644 --- a/crates/bpe/criterion.toml +++ b/crates/bpe/criterion.toml @@ -1,2 +1,2 @@ # save report in this directory, even if a custom target directory is set -criterion_home = "./benches/result" +criterion_home = "./target/criterion" diff --git a/crates/bpe/script/copy-benchmark-results b/crates/bpe/script/copy-benchmark-results new file mode 100755 index 0000000..df9e97f --- /dev/null +++ b/crates/bpe/script/copy-benchmark-results @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +set -eu + +result_dir="benches/result" + +mkdir -p "$result_dir" + +for i in {counting,encoding,appending}-o200k; do + rsvg-convert --format svg --output "$result_dir/$i.svg" --background-color white "target/criterion/reports/$i/lines.svg" +done From 10d1784f1cbec0e3654cb79a3916f0f29f8a18de Mon Sep 17 00:00:00 2001 From: Hendrik van Antwerpen Date: Tue, 1 Oct 2024 13:47:10 +0200 Subject: [PATCH 08/15] Add benchmark instructions --- crates/bpe/README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/crates/bpe/README.md b/crates/bpe/README.md index ab624be..94e2c6e 100644 --- a/crates/bpe/README.md +++ b/crates/bpe/README.md @@ -216,3 +216,20 @@ Results for encoding o200k tokens for random 1000 bytes. The backtracking encode Results for incrementally encoding o200k tokens by appending 10000 random bytes. The appending encoder is slower by a constant factor but overall has similar performance curve as the backtracking encoder encoding all data at once. 
![appending runtime comparison](./benches/result/appending-o200k.svg) + +### Running the benchmarks + +Run the benchmark as follows (required [cargo-criterion](https://crates.io/crates/cargo-criterion) installed): + +```sh +cargo criterion +``` + +(Using `cargo bench` ignores the settings in `criterion.toml`!) +Open the full report which should be located in `target/criterion/reports/index.html`. + +Update the figures in this repo as follows (requires `rsvg-convert` from `librsvg` installed): + +```sh +script/copy-benchmark-results +``` From 215b41b603b856fde4d2766408b0e5ec0ad2e2bb Mon Sep 17 00:00:00 2001 From: Hendrik van Antwerpen Date: Tue, 1 Oct 2024 18:22:46 +0200 Subject: [PATCH 09/15] Change graph colors --- crates/bpe/benches/result/appending-o200k.svg | 20 +++---- crates/bpe/benches/result/counting-o200k.svg | 20 +++---- crates/bpe/benches/result/encoding-o200k.svg | 60 +++++++++---------- crates/bpe/criterion.toml | 14 +++++ 4 files changed, 64 insertions(+), 50 deletions(-) diff --git a/crates/bpe/benches/result/appending-o200k.svg b/crates/bpe/benches/result/appending-o200k.svg index a7cadf9..214e396 100644 --- a/crates/bpe/benches/result/appending-o200k.svg +++ b/crates/bpe/benches/result/appending-o200k.svg @@ -34,17 +34,17 @@ - - - - - + + + + + - - - - - + + + + + diff --git a/crates/bpe/benches/result/counting-o200k.svg b/crates/bpe/benches/result/counting-o200k.svg index b84d4c9..6bed484 100644 --- a/crates/bpe/benches/result/counting-o200k.svg +++ b/crates/bpe/benches/result/counting-o200k.svg @@ -30,17 +30,17 @@ - - - - - + + + + + - - - - - + + + + + diff --git a/crates/bpe/benches/result/encoding-o200k.svg b/crates/bpe/benches/result/encoding-o200k.svg index 8a8259b..22871e4 100644 --- a/crates/bpe/benches/result/encoding-o200k.svg +++ b/crates/bpe/benches/result/encoding-o200k.svg @@ -34,41 +34,41 @@ - - - - - + + + + + - - - - - + + + + + - - - - - + + + + + - - - - - + + + + + - - - - - + + + + + - - - - - + + + + + diff --git 
a/crates/bpe/criterion.toml b/crates/bpe/criterion.toml index c0f42f2..a954003 100644 --- a/crates/bpe/criterion.toml +++ b/crates/bpe/criterion.toml @@ -1,2 +1,16 @@ # save report in this directory, even if a custom target directory is set criterion_home = "./target/criterion" + +# The colors table allows users to configure the colors used by the charts +# cargo-criterion generates. +[colors] +# Color-blind friendly color scheme from https://personal.sron.nl/~pault/. +comparison_colors = [ + {r = 102, g = 204, b = 238}, # cyan + {r = 204, g = 187, b = 68}, # yellow + {r = 238, g = 102, b = 119}, # red + {r = 68, g = 119, b = 170}, # blue + {r = 170, g = 51, b = 119}, # purple + {r = 34, g = 136, b = 51}, # green +# {r = 187, g = 187, b = 187}, # grey +] From 0fdb60fdbb39a603ab17a17d1f75177f97a9c5a5 Mon Sep 17 00:00:00 2001 From: Hendrik van Antwerpen Date: Tue, 1 Oct 2024 19:52:14 +0200 Subject: [PATCH 10/15] Update benchmark text --- crates/bpe/README.md | 86 ++++++++++++++----- crates/bpe/benches/performance.rs | 10 +-- crates/bpe/benches/result/appending-o200k.svg | 20 ++--- crates/bpe/benches/result/counting-o200k.svg | 20 ++--- crates/bpe/benches/result/encoding-o200k.svg | 60 ++++++------- 5 files changed, 119 insertions(+), 77 deletions(-) diff --git a/crates/bpe/README.md b/crates/bpe/README.md index 94e2c6e..e279527 100644 --- a/crates/bpe/README.md +++ b/crates/bpe/README.md @@ -183,40 +183,82 @@ On average it is about ~4 faster, since the short-cuts usually pay off. ## Benchmarks -We compared our implementations with the tiktoken implementation on a MacBook Pro on a random input sequence: - -| Algorithm | Runtime | correct BPE output | -| ------------ | -------- | ---------- | -| Greedy | 100 µs | ✘ | -| Minimal | 300 µs | ✘ | -| Backtracking | 400 µs | ✔ | -| Dynamic Programming | 1300 µs | ✔ | -| TikToken | 1500 µs | ✘ | -| Heap | 1900 µs | ✔ | - -As can be seen, our Backtracking implementation beats the TikToken Rust implementation by ~4x. 
-And even the fully dynamic programming solution is faster with a more consistent runtime. -The tuned heap implementation is still quite competitive to TikToken (especially for smaller inputs). -If the requirement of correct BPE output can be relaxed, then the Greedy approach or the minimal encoding approach are the clear winners. +We ran several benchmarks to compare performance between different encoders and with the tiktoken library: -### Counting results +- The first measuers encoding runtime for our different encoders and the tiktoken Rust implementation. + This shows a ~3.5x performance increase for our fastest correct encoder comapred to the tiktoken library. -Results for counting o200k tokens for random 10000 byte slices. The setup time of the interval encoder is comparable to backtracking. After setup counting of slices of the original data are approximately constant time. +- The second measures incremental encoding runtime, where the text is built up byte-by-byte. + This mode is not available in tiktoken, which only supports counting/encoding a complete text. -![counting runtime comparison](./benches/result/counting-o200k.svg) +- The third measures interval counting runtime, where the token count for slices of an original text are determined. + After the initial tokenization of the text, token counting for slices is typically constant time. + This mode is not available in tiktoken, which only supports counting/encoding a complete text. + +All benchmarks were run on a MacBook Pro M1. + +### Encoding + +Encoding is computing the tokens for a given text. +This benchmark uses several encoders: -### Encoding results +- The backtracking encoder uses a backtracking algorithm based on a string matching automaton. +- The heap encoder uses a priority heap to implement the traditional BPE algorithm. +- The table encoder uses a dynamic programming algorithm. -Results for encoding o200k tokens for random 1000 bytes. 
The backtracking encoder consistently outperforms tiktoken by a constant factor. +Two additional encoders are included that are faster but do not always give exact results: + +- The greedy encoder uses a left-to-right greedy algorithm. +- The minimal encoder computes an encoding with the minimal number of tokens. + +The benchmark measured the runtime of encoding of slices of lengths 10, 100, 1000, and 1000 from a random 20000 token original text using the o200k token set. +(All encodings were computed from scratch for each slice.) + +The graph below shows encoding runtime vs slice length. +All encoders show similar runtime increases with increasing slice length. +The backtracking encoder, the fastest encoder that still returns correct results, shows a performance gain of approximately 3.5x compared to tiktoken. +The fully dynamic programming solution and the heap implementation are still quite competitive to TikToken (especially for smaller inputs). +If the requirement of correct BPE output can be relaxed, then the Greedy approach or the minimal encoding approach are the clear winners. ![encoding runtime comparison](./benches/result/encoding-o200k.svg) -### Incremental encoding results +### Incremental encoding + +Incremental encoding tokenizes a text to which bytes are appended. +This benchmark uses two encoders: -Results for incrementally encoding o200k tokens by appending 10000 random bytes. The appending encoder is slower by a constant factor but overall has similar performance curve as the backtracking encoder encoding all data at once. +- The backtracking encoder, which retokenizes the text froms cratch every time it changes. +- The appending encoder, which supports incremental encoding when bytes are added. + +The benchmark measured the runtime of encoding of slices of lengths 10, 100, 1000, and 1000 from a random 20000 token original using the o200k token set. +The backtracking encoder encoded the final text in one go. 
+The appending encoder got the text bytes on by one. + +The graph below shows encoding runtime vs slice length. +Runtime of both encoders grows similarly with slice length. +The incremental encoder shows a constant factor overhead. +Note that this is still a huge win for incremental use cases, which would otherwise require retokenization after each append, resulting in a quadratic slowdown. ![appending runtime comparison](./benches/result/appending-o200k.svg) +### Interval counting + +Interval counting is counting the tokens for a slice of an original text. +This benchmark uses two encoders: + +- The backtracking encoder encodes the slice from scratch. + This is similar to what one has to do with other libraries, like `tiktoken`. +- The interval encoder encodes the original text once and reuses that encoding to count tokens for intervals of the original text. + The initial encoding time for the interval encoder is comparable to that of the backtracking encoder. + +The benchmark measured the runtime of counting o200k tokens on slices of lengths 10, 100, 1000, and 1000 from a random 20000 token original text. + +The graph below shows counting runtime vs slice length. +The runtime of the backtracking encoder grows with the length of the slice. +The interval encoder counts any interval in typically constant time. 
+ +![counting runtime comparison](./benches/result/counting-o200k.svg) + ### Running the benchmarks Run the benchmark as follows (required [cargo-criterion](https://crates.io/crates/cargo-criterion) installed): diff --git a/crates/bpe/benches/performance.rs b/crates/bpe/benches/performance.rs index 336c4a2..4cff09c 100644 --- a/crates/bpe/benches/performance.rs +++ b/crates/bpe/benches/performance.rs @@ -28,8 +28,8 @@ static TOKENIZERS: LazyLock<[(&'static str, &'static BytePairEncoding, CoreBPE); fn counting_benchmark(c: &mut Criterion) { for (name, bpe, _) in TOKENIZERS.iter() { - let input = create_test_bytes(&bpe, 20000); - let fast = IntervalEncoding::new(&bpe, &input); + let input = create_test_bytes(bpe, 20000); + let fast = IntervalEncoding::new(bpe, &input); let mut group = c.benchmark_group(format!("counting-{name}")); group.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic)); @@ -60,7 +60,7 @@ fn counting_benchmark(c: &mut Criterion) { fn encoding_benchmark(c: &mut Criterion) { for (name, bpe, tiktoken) in TOKENIZERS.iter() { - let text = create_test_string(&bpe, 20000); + let text = create_test_string(bpe, 20000); let input = text.as_bytes(); let mut group = c.benchmark_group(format!("encoding-{name}")); @@ -126,7 +126,7 @@ fn encoding_benchmark(c: &mut Criterion) { fn appending_benchmark(c: &mut Criterion) { for (name, bpe, _) in TOKENIZERS.iter() { - let input = create_test_bytes(&bpe, 20000); + let input = create_test_bytes(bpe, 20000); let mut group = c.benchmark_group(format!("appending-{name}")); group.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic)); @@ -140,7 +140,7 @@ fn appending_benchmark(c: &mut Criterion) { AppendableEncoder::new(bpe), ) }, - |(start, mut enc)| enc.extend(input[start..start + bytes].into_iter().copied()), + |(start, mut enc)| enc.extend(input[start..start + bytes].iter().copied()), criterion::BatchSize::SmallInput, ) }); diff --git 
a/crates/bpe/benches/result/appending-o200k.svg b/crates/bpe/benches/result/appending-o200k.svg index 214e396..f358527 100644 --- a/crates/bpe/benches/result/appending-o200k.svg +++ b/crates/bpe/benches/result/appending-o200k.svg @@ -34,17 +34,17 @@ - - - - - + + + + + - - - - - + + + + + diff --git a/crates/bpe/benches/result/counting-o200k.svg b/crates/bpe/benches/result/counting-o200k.svg index 6bed484..deaf497 100644 --- a/crates/bpe/benches/result/counting-o200k.svg +++ b/crates/bpe/benches/result/counting-o200k.svg @@ -30,17 +30,17 @@ - - - - - + + + + + - - - - - + + + + + diff --git a/crates/bpe/benches/result/encoding-o200k.svg b/crates/bpe/benches/result/encoding-o200k.svg index 22871e4..468755c 100644 --- a/crates/bpe/benches/result/encoding-o200k.svg +++ b/crates/bpe/benches/result/encoding-o200k.svg @@ -34,41 +34,41 @@ - - - - - + + + + + - - - - - + + + + + - - - - - + + + + + - - - - - + + + + + - - - - - + + + + + - - - - - + + + + + From d9b2beece0740235f42a6bacc7a14b7a24e45aea Mon Sep 17 00:00:00 2001 From: Hendrik van Antwerpen Date: Wed, 2 Oct 2024 10:54:36 +0200 Subject: [PATCH 11/15] Text improvements and fixes Co-authored-by: Alexander Neubeck --- crates/bpe/README.md | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/crates/bpe/README.md b/crates/bpe/README.md index e279527..53bb183 100644 --- a/crates/bpe/README.md +++ b/crates/bpe/README.md @@ -185,37 +185,37 @@ On average it is about ~4 faster, since the short-cuts usually pay off. We ran several benchmarks to compare performance between different encoders and with the tiktoken library: -- The first measuers encoding runtime for our different encoders and the tiktoken Rust implementation. - This shows a ~3.5x performance increase for our fastest correct encoder comapred to the tiktoken library. +- The first measures encoding runtime for our different encoders and the tiktoken Rust implementation. 
+ This shows a ~3.5x performance improvement for our fastest correct encoder compared to the tiktoken library. - The second measures incremental encoding runtime, where the text is built up byte-by-byte. This mode is not available in tiktoken, which only supports counting/encoding a complete text. -- The third measures interval counting runtime, where the token count for slices of an original text are determined. - After the initial tokenization of the text, token counting for slices is typically constant time. +- The third measures interval counting runtime, where tokens of sub-slices of a fixed text are counted. + The data structure we built specifically for this purpose can answer those interval counting requests typically in constant time after the initial linear preprocessing of the text. This mode is not available in tiktoken, which only supports counting/encoding a complete text. -All benchmarks were run on a MacBook Pro M1. +All benchmarks were run single-threaded on a MacBook Pro M1. ### Encoding Encoding is computing the tokens for a given text. -This benchmark uses several encoders: +This benchmark compares several encoders: -- The backtracking encoder uses a backtracking algorithm based on a string matching automaton. -- The heap encoder uses a priority heap to implement the traditional BPE algorithm. -- The table encoder uses a dynamic programming algorithm. +- The backtracking encoder uses the backtracking algorithm with memoization, built on top of a string matching automaton. +- The heap encoder uses a priority heap and a bitmask to represent token positions to implement the traditional BPE algorithm. +- The table encoder implements the raw dynamic programming algorithm proposed above. -Two additional encoders are included that are faster but do not always give exact results: +Two additional encoders are included that are faster but deviate from the original BPE encoding strategy: -- The greedy encoder uses a left-to-right greedy algorithm. 
+- The greedy encoder picks the left-longest token. - The minimal encoder computes an encoding with the minimal number of tokens. The benchmark measured the runtime of encoding of slices of lengths 10, 100, 1000, and 1000 from a random 20000 token original text using the o200k token set. (All encodings were computed from scratch for each slice.) The graph below shows encoding runtime vs slice length. -All encoders show similar runtime increases with increasing slice length. +All encoders (except the heap encoder) show the expected linear runtime complexity. The backtracking encoder, the fastest encoder that still returns correct results, shows a performance gain of approximately 3.5x compared to tiktoken. The fully dynamic programming solution and the heap implementation are still quite competitive to TikToken (especially for smaller inputs). If the requirement of correct BPE output can be relaxed, then the Greedy approach or the minimal encoding approach are the clear winners. @@ -224,7 +224,7 @@ If the requirement of correct BPE output can be relaxed, then the Greedy approac ### Incremental encoding -Incremental encoding tokenizes a text to which bytes are appended. +Incremental encoding tokenizes a text while appending bytes. This type of algorithm is interesting for use cases where a certain token budget must not be exceeded. This benchmark uses two encoders: - The backtracking encoder, which retokenizes the text froms cratch every time it changes. @@ -251,7 +251,7 @@ This benchmark uses two encoders: - The interval encoder encodes the original text once and reuses that encoding to count tokens for intervals of the original text. The initial encoding time for the interval encoder is comparable to that of the backtracking encoder. -The benchmark measured the runtime of counting o200k tokens on slices of lengths 10, 100, 1000, and 1000 from a random 20000 token original text. 
+The benchmark measured the runtime of counting o200k tokens on slices of lengths 10, 100, 1000, and 10000 from a random 20000 token original text. The graph below shows counting runtime vs slice length. The runtime of the backtracking encoder grows with the length of the slice. From 814ef8d4322f4a70277b371cb9db977b02e67aca Mon Sep 17 00:00:00 2001 From: Hendrik van Antwerpen Date: Wed, 2 Oct 2024 11:55:28 +0200 Subject: [PATCH 12/15] Fix example and add test reproducing it --- crates/bpe/README.md | 18 ++++---- crates/bpe/src/byte_pair_encoding.rs | 11 +++-- crates/bpe/src/lib.rs | 61 ++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 13 deletions(-) diff --git a/crates/bpe/README.md b/crates/bpe/README.md index 53bb183..a83e630 100644 --- a/crates/bpe/README.md +++ b/crates/bpe/README.md @@ -94,35 +94,35 @@ Given a valid encoding sequence `e_0..e_i` and a valid encoding tuple `e_i e_j`, ## Novel Algorithm At a first glance, it seems impossible to achieve `O(n)` complexity while preserving the encoding output of the original BPE algorithm, since the original BPE algorithm needs to first scan the full input before it can make any encoding decision. -For instance, the sequence `abab` would be encoded as `ab ab` when the dictionary contains the tokens `a b ab ba bc abc babc ababc` ordered by frequency. But appending a single character `ababc` would result in a pretty different tokenization: `ab a bc`. So without looking ahead it seems impossible to properly tokenize the text. +For instance, the sequence `abac` would be encoded as `ab ac` when the dictionary contains the tokens `a b c ab cb ac` ordered by frequency. But appending a single character `abacb` would result in a pretty different tokenization: `ab a cb`. So without looking ahead it seems impossible to properly tokenize the text. -The solution is to track the encodings of ALL text prefixes. For our example `ababc` we would get: +The solution is to track the encodings of ALL text prefixes. 
For our example `abacb` we would get: - `a` ------> `a` - `ab` -----> `ab` - `aba` ----> `ab a` -- `abab` ---> `ab ab` -- `ababc` --> `ab a bc` +- `abac` ---> `ab ac` +- `abacb` --> `ab a cb` This can be done much more efficiently thanks to Corollary IIa, since now only the last token of every prefix has to be remembered: - `a` ------> `a` - `ab` -----> `ab` - `aba` ----> `a` -- `abab` ---> `ab` -- `ababc` --> `bc` +- `abac` ---> `ac` +- `abacb` --> `cb` In order to reconstruct the full encoding for a specific prefix, one simply starts with the last token of that prefix, shortens the prefix by the extracted token and looks up the token associated with the shortened prefix and so on until the beginning of the text is reached. -For our example prefix `ababc`, this procedure executes the following steps and determines the correct encoding in reverse order: +For our example prefix `abacb`, this procedure executes the following steps and determines the correct encoding in reverse order: -- `ababc` -> `bc` +- `abacb` -> `cb` - `aba` ---> `a` - `ab` ----> `ab` - `` The actual challenge is to determine for every prefix this last token efficiently. -The prefix `abab` could for instance end with either the token `b` or `ab`, but only `ab` leads to a valid encoding sequence. +The prefix `abac` could for instance end with either the token `c` or `ac`, but only `ac` leads to a valid encoding sequence. But, Corollary IIa tells us that **one and only one** last token can be the correct one and Corollary IIIa shows us how to find it: We only have to check whether a possible next token is "compatible" with its previous token, i.e. whether the two tokens form a valid encoding sequence. 
diff --git a/crates/bpe/src/byte_pair_encoding.rs b/crates/bpe/src/byte_pair_encoding.rs index d66b8bd..72fa946 100644 --- a/crates/bpe/src/byte_pair_encoding.rs +++ b/crates/bpe/src/byte_pair_encoding.rs @@ -176,12 +176,12 @@ pub fn find_hash_factor_for_tiktoken(bpe: &tiktoken_rs::CoreBPE, len: usize) -> /// Find a suitable hash factor for a set of given tokens that prevents collisions when /// constructing a [`BytePairEncoding`] from those tokens. #[cfg(feature = "rand")] -pub fn find_hash_factor_for_dictionary(iter: impl Iterator>) -> u64 { +pub fn find_hash_factor_for_dictionary(tokens: impl IntoIterator>) -> u64 { use std::collections::HashSet; use rand::Rng; - let all_tokens = iter.collect_vec(); + let all_tokens = tokens.into_iter().collect_vec(); let mut rnd = rand::thread_rng(); loop { let factor: u64 = rnd.gen(); @@ -244,7 +244,10 @@ impl BytePairEncoding { /// /// The recommended approach is to store the serialized value and reuse that, /// to prevent repeating the cost of computing the hash factor and encoding. 
- pub fn from_dictionary(iter: impl Iterator>, hash_factor: Option) -> Self { + pub fn from_dictionary( + tokens: impl IntoIterator>, + hash_factor: Option, + ) -> Self { let hash_factor = hash_factor .inspect(|f| assert_ne!(*f, 0, "hash factor must be larger than zero")) .unwrap_or(1); @@ -252,7 +255,7 @@ impl BytePairEncoding { let mut all_tokens_rev = Vec::new(); let mut token_starts = vec![0]; let mut bytes_hash_to_token = FnvHashMap::default(); - for (i, token) in iter.enumerate() { + for (i, token) in tokens.into_iter().enumerate() { bytes_hash_to_token.insert(hash_bytes(&token, hash_factor), i as u32); all_tokens_rev.extend(token.iter().copied().rev()); all_tokens.extend(token); diff --git a/crates/bpe/src/lib.rs b/crates/bpe/src/lib.rs index 452024e..2c7ab43 100644 --- a/crates/bpe/src/lib.rs +++ b/crates/bpe/src/lib.rs @@ -4,3 +4,64 @@ mod bitfield; pub mod byte_pair_encoding; pub mod interval_encoding; pub mod prependable_encoder; + +#[cfg(test)] +mod tests { + use itertools::Itertools; + + use crate::byte_pair_encoding::BytePairEncoding; + + /// This test produces the output for the encoding example in the README. 
+ #[test] + fn readme_example() { + let tokens = ["a", "b", "c", "ab", "cb", "ac"].map(|t| t.as_bytes().to_vec()); + let bpe = BytePairEncoding::from_dictionary(tokens, None); + let text = "abacb"; + let prefixes = (1..=text.len()).map(|end| &text[..end]).collect_vec(); + let all_prefix_tokens = prefixes + .iter() + .map(|prefix| { + bpe.encode_via_backtracking(prefix.as_bytes()) + .into_iter() + .map(|t| unsafe { String::from_utf8_unchecked(bpe.decode_tokens(&[t])) }) + .collect_vec() + }) + .collect_vec(); + let last_prefix_tokens = all_prefix_tokens + .iter() + .map(|tokens| tokens.last().unwrap()) + .collect_vec(); + + println!("All tokens for each prefix of `{text}`:\n"); + for (prefix, tokens) in prefixes.iter().zip(&all_prefix_tokens) { + println!( + "- `{prefix}` {}> `{}`", + "-".repeat(text.len() + 2 - prefix.len()), + tokens.join(" ") + ); + } + println!(); + + println!("Last token for each prefix of `{text}`:\n"); + for (prefix, token) in prefixes.iter().zip(&last_prefix_tokens) { + println!( + "- `{prefix}` {}> `{token}`", + "-".repeat(text.len() + 2 - prefix.len()), + ); + } + println!(); + + println!("Tokenization of `{text}`:\n"); + let mut remaining = text.len(); + while remaining > 0 { + let prefix = &text[..remaining]; + let token = last_prefix_tokens[remaining - 1]; + println!( + "- `{prefix}` {}> `{token}`", + "-".repeat(text.len() + 2 - prefix.len()), + ); + remaining -= token.len(); + } + println!("- ``"); + } +} From 8c9e05bb0c0aa065e71504c08b5465d2fd7a4aaf Mon Sep 17 00:00:00 2001 From: Hendrik van Antwerpen Date: Wed, 2 Oct 2024 11:55:52 +0200 Subject: [PATCH 13/15] Link to tiktoken crate used --- crates/bpe/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/bpe/README.md b/crates/bpe/README.md index a83e630..c60fcba 100644 --- a/crates/bpe/README.md +++ b/crates/bpe/README.md @@ -183,7 +183,7 @@ On average it is about ~4 faster, since the short-cuts usually pay off. 
## Benchmarks -We ran several benchmarks to compare performance between different encoders and with the tiktoken library: +We ran several benchmarks to compare performance of different encoders and the [tiktoken-rs](https://crates.io/crates/tiktoken-rs) library (a wrapper around OpenAI's tiktoken implementation): - The first measures encoding runtime for our different encoders and the tiktoken Rust implementation. This shows a ~3.5x performance improvement for our fastest correct encoder compared to the tiktoken library. From dc4338df20c72d08d67289d3571b0f420a47ff11 Mon Sep 17 00:00:00 2001 From: Hendrik van Antwerpen Date: Wed, 2 Oct 2024 12:21:06 +0200 Subject: [PATCH 14/15] Rephrase incremental benchmark description --- crates/bpe/README.md | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/crates/bpe/README.md b/crates/bpe/README.md index c60fcba..97eab2a 100644 --- a/crates/bpe/README.md +++ b/crates/bpe/README.md @@ -211,7 +211,7 @@ Two additional encoders are included that are faster but deviate from the origin - The greedy encoder picks the left-longest token. - The minimal encoder computes an encoding with the minimal number of tokens. -The benchmark measured the runtime of encoding of slices of lengths 10, 100, 1000, and 1000 from a random 20000 token original text using the o200k token set. +The benchmark measured the runtime of encoding of slices of lengths 10, 100, 1000, and 10000 from a random 20000 token original text using the o200k token set. (All encodings were computed from scratch for each slice.) The graph below shows encoding runtime vs slice length. @@ -224,20 +224,16 @@ If the requirement of correct BPE output can be relaxed, then the Greedy approac ### Incremental encoding -Incremental encoding tokenizes a text while appending bytes. This type of algorithm is interesting for use cases where a certain token budget must not be exceeded. 
-This benchmark uses two encoders: - -- The backtracking encoder, which retokenizes the text froms cratch every time it changes. -- The appending encoder, which supports incremental encoding when bytes are added. +Incremental encoding tokenizes a text while appending bytes. +This type of algorithm is interesting for use cases where a certain token budget must not be exceeded. +This benchmark shows the runtime for the appending encoder when a text is encoded byte-by-byte. +For comparison we show the runtime of the backtracking encoder when it encodes the whole text at once. -The benchmark measured the runtime of encoding of slices of lengths 10, 100, 1000, and 1000 from a random 20000 token original using the o200k token set. -The backtracking encoder encoded the final text in one go. -The appending encoder got the text bytes on by one. +The benchmark measured the runtime of encoding of slices of lengths 10, 100, 1000, and 10000 from a random 20000 token original text using the o200k token set. The graph below shows encoding runtime vs slice length. -Runtime of both encoders grows similarly with slice length. -The incremental encoder shows a constant factor overhead. -Note that this is still a huge win for incremental use cases, which would otherwise require retokenization after each append, resulting in a quadratic slowdown. +The overall runtime of the byte-by-byte incremental encoder for encoding the full text is comparable to the runtime of the backtracking encoder, with only a constant factor overhead. +Note that this is a huge win for incremental use cases, which would otherwise require retokenization after each append, resulting in a quadratic slowdown. 
![appending runtime comparison](./benches/result/appending-o200k.svg) From 537699192c9eff01f906e785f549f06eb88217af Mon Sep 17 00:00:00 2001 From: Hendrik van Antwerpen Date: Wed, 2 Oct 2024 12:48:06 +0200 Subject: [PATCH 15/15] Add note about tiktoken prechunking --- crates/bpe/README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/crates/bpe/README.md b/crates/bpe/README.md index 97eab2a..0cd4c58 100644 --- a/crates/bpe/README.md +++ b/crates/bpe/README.md @@ -183,7 +183,12 @@ On average it is about ~4 faster, since the short-cuts usually pay off. ## Benchmarks -We ran several benchmarks to compare performance of different encoders and the [tiktoken-rs](https://crates.io/crates/tiktoken-rs) library (a wrapper around OpenAI's tiktoken implementation): +We ran several benchmarks to compare performance of different encoders and a tiktoken implementation. +For the tiktoken implementation we used the [tiktoken-rs](https://crates.io/crates/tiktoken-rs) library, a wrapper around OpenAI's tiktoken implementation. +Note that tiktoken does not run BPE on the full input text. +Instead it splits it into large chunks using a regex and runs BPE on the individual chunks. +We have not tried to see if that approach is compatible with our BPE implementation. +We benchmarked the following scenarios: - The first measures encoding runtime for our different encoders and the tiktoken Rust implementation. This shows a ~3.5x performance improvement for our fastest correct encoder compared to the tiktoken library.