From 18c7913746e2bee2be300c924905a07ca67f4d43 Mon Sep 17 00:00:00 2001
From: Tarek Mahmoud Sayed <tarekms@microsoft.com>
Date: Thu, 2 Oct 2025 14:51:42 -0700
Subject: [PATCH 1/2] BpeTokenizer Cleanup

---
 .../Model/BPETokenizer.cs                     | 35 ++++++-----
 .../Microsoft.ML.Tokenizers.Tests/BpeTests.cs | 60 +++++++++++++++++++
 2 files changed, 77 insertions(+), 18 deletions(-)
diff --git a/src/Microsoft.ML.Tokenizers/Model/BPETokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/BPETokenizer.cs
index f17845adf8..d015c4626c 100644
--- a/src/Microsoft.ML.Tokenizers/Model/BPETokenizer.cs
+++ b/src/Microsoft.ML.Tokenizers/Model/BPETokenizer.cs
@@ -320,7 +320,7 @@ private BpeTokenizer(
 
             if (beginningOfSentenceToken is not null)
             {
-                if (!_vocab.TryGetValue(beginningOfSentenceToken, out int aId))
+                if (!_vocab.TryGetValue(beginningOfSentenceToken, out int aId) && (specialTokens is null || !specialTokens.TryGetValue(beginningOfSentenceToken, out aId)))
                 {
                     throw new InvalidOperationException($"The beginning of sentence token '{beginningOfSentenceToken}' was not present in the vocabulary.");
                 }
@@ -331,7 +331,7 @@ private BpeTokenizer(
 
             if (endOfSentenceToken is not null)
             {
-                if (!_vocab.TryGetValue(endOfSentenceToken, out int aId))
+                if (!_vocab.TryGetValue(endOfSentenceToken, out int aId) && (specialTokens is null || !specialTokens.TryGetValue(endOfSentenceToken, out aId)))
                 {
                     throw new InvalidOperationException($"The end of sentence token '{endOfSentenceToken}' was not present in the vocabulary.");
                 }
@@ -792,31 +792,30 @@ public string Decode(IEnumerable<int> ids, bool considerSpecialTokens)
 
             ValueStringBuilder sb = new ValueStringBuilder();
 
-            bool decodeUnknownToken = _unknownTokenId.HasValue && considerSpecialTokens;
-
-            if (decodeUnknownToken)
+            foreach (int id in ids)
             {
-                foreach (int id in ids)
+                if (_specialTokensReverse?.TryGetValue(id, out string? token) is true)
                 {
-                    if (MapIdToToken(id) is string s)
+                    if (considerSpecialTokens)
                     {
-                        sb.Append(s);
+                        sb.Append(token);
                     }
+                    continue;
                 }
-            }
-            else
-            {
-                foreach (int id in ids)
+
+                if (id == _unknownTokenId)
                 {
-                    if (id == _unknownTokenId)
+                    if (considerSpecialTokens)
                     {
-                        continue;
+                        Debug.Assert(UnknownToken is not null);
+                        sb.Append(UnknownToken);
                     }
+                    continue;
+                }
 
-                    if (MapIdToToken(id) is string s)
-                    {
-                        sb.Append(s);
-                    }
+                if (MapIdToToken(id) is string s)
+                {
+                    sb.Append(s);
                 }
             }
 
diff --git a/test/Microsoft.ML.Tokenizers.Tests/BpeTests.cs b/test/Microsoft.ML.Tokenizers.Tests/BpeTests.cs
index 7394464b90..5c2da4aece 100644
--- a/test/Microsoft.ML.Tokenizers.Tests/BpeTests.cs
+++ b/test/Microsoft.ML.Tokenizers.Tests/BpeTests.cs
@@ -885,6 +885,66 @@ public void TestDeepSeekR1Tokenizer(string text, int[] ids, string[] tokens, (in
             Assert.Equal(text, tokenizer.Decode(ids, considerSpecialTokens: false));
         }
 
+        [Fact]
+        public void TestTokenizerWithSpecialTokens()
+        {
+            // Exercises a tokenizer whose beginning/end-of-sentence tokens come from SpecialTokens rather than vocab.json, with and without an unknown token.
+            // Test data (presumably fetched from): https://huggingface.co/openai-community/gpt2/raw/main/vocab.json
+            //                                      https://huggingface.co/openai-community/gpt2/raw/main/merges.txt
+            BpeOptions options = new BpeOptions(Path.Combine(@"Gpt-2", "vocab.json"), Path.Combine(@"Gpt-2", "merges.txt"))
+            {
+                UnknownToken = "unk",
+
+                SpecialTokens = new Dictionary<string, int> // special tokens supplied separately; they are not part of the original vocab.json
+                {
+                    { "<|sos|>", 50257 },
+                    { "<|eos|>", 50258 }
+                },
+                BeginningOfSentenceToken = "<|sos|>",
+                EndOfSentenceToken = "<|eos|>"
+            };
+
+            BpeTokenizer bpeTokenizer = BpeTokenizer.Create(options);
+            Assert.True(bpeTokenizer.Vocabulary.TryGetValue(options.UnknownToken, out int unkId));
+
+            string text = "Hello world!\uD800";
+
+            var ids = bpeTokenizer.EncodeToIds(text, considerPreTokenization: false);
+            Assert.Equal([50257, 15496, 2954, 6894, 0, 2954, 50258], ids); // the space and the lone surrogate U+D800 couldn't be encoded, so each produced an unk token (id 2954)
+            Assert.Equal(unkId, ids[ids.Count - 2]);
+            Assert.Equal(options.SpecialTokens["<|sos|>"], ids[0]);
+            Assert.Equal(options.SpecialTokens["<|eos|>"], ids[^1]);
+
+            var tokens = bpeTokenizer.EncodeToTokens(text, out _, considerPreTokenization: false).Select(t => t.Value).ToArray();
+            Assert.Equal(["<|sos|>", "Hello", "unk", "world", "!", "unk", "<|eos|>"], tokens);
+
+            Assert.Equal("<|sos|>Hellounkworld!unk<|eos|>", bpeTokenizer.Decode(ids));
+            Assert.Equal("Helloworld!", bpeTokenizer.Decode(ids, considerSpecialTokens: false));
+
+            BpeOptions options1 = new BpeOptions(options.Vocabulary)
+            {
+                // UnknownToken is left unset (null), which means no unknown token support
+                Merges = options.Merges,
+                SpecialTokens = options.SpecialTokens,
+                BeginningOfSentenceToken = options.BeginningOfSentenceToken,
+                EndOfSentenceToken = options.EndOfSentenceToken
+            };
+
+            bpeTokenizer = BpeTokenizer.Create(options1);
+            ids = bpeTokenizer.EncodeToIds(text, considerPreTokenization: false);
+
+            // Without an unknown token the expected ids differ: the space and U+D800 yield no token and the merges change (e.g. "ellow")
+            Assert.Equal([50257, 39, 5037, 1764, 0, 50258], ids);
+            Assert.Equal(options.SpecialTokens["<|sos|>"], ids[0]);
+            Assert.Equal(options.SpecialTokens["<|eos|>"], ids[^1]);
+
+            tokens = bpeTokenizer.EncodeToTokens(text, out _, considerPreTokenization: false).Select(t => t.Value).ToArray();
+            Assert.Equal(["<|sos|>", "H", "ellow", "orld", "!", "<|eos|>"], tokens);
+
+            Assert.Equal("<|sos|>Helloworld!<|eos|>", bpeTokenizer.Decode(ids));
+            Assert.Equal("Helloworld!", bpeTokenizer.Decode(ids, considerSpecialTokens: false));
+        }
+
         private static BpeTokenizer CreateBpeTokenizerFromJson()
         {
             // @"https://huggingface.co/deepseek-ai/DeepSeek-R1/resolve/main/tokenizer.json?download=true"

From 0aea52c0b307847fab3ff6965664486a5004f08e Mon Sep 17 00:00:00 2001
From: Tarek Mahmoud Sayed <10833894+tarekgh@users.noreply.github.com>
Date: Thu, 2 Oct 2025 15:08:30 -0700
Subject: [PATCH 2/2] Apply suggestions from code review

Co-authored-by: Eric StJohn <ericstj@microsoft.com>
---
 src/Microsoft.ML.Tokenizers/Model/BPETokenizer.cs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Microsoft.ML.Tokenizers/Model/BPETokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/BPETokenizer.cs
index d015c4626c..b9592d2e2b 100644
--- a/src/Microsoft.ML.Tokenizers/Model/BPETokenizer.cs
+++ b/src/Microsoft.ML.Tokenizers/Model/BPETokenizer.cs
@@ -320,7 +320,7 @@ private BpeTokenizer(
 
             if (beginningOfSentenceToken is not null)
             {
-                if (!_vocab.TryGetValue(beginningOfSentenceToken, out int aId) && (specialTokens is null || !specialTokens.TryGetValue(beginningOfSentenceToken, out aId)))
+                if (_vocab.TryGetValue(beginningOfSentenceToken, out int aId) is false && specialTokens?.TryGetValue(beginningOfSentenceToken, out aId) is not true) // "is not true" also rejects a null specialTokens
                 {
                     throw new InvalidOperationException($"The beginning of sentence token '{beginningOfSentenceToken}' was not present in the vocabulary.");
                 }
@@ -331,7 +331,7 @@ private BpeTokenizer(
 
             if (endOfSentenceToken is not null)
             {
-                if (!_vocab.TryGetValue(endOfSentenceToken, out int aId) && (specialTokens is null || !specialTokens.TryGetValue(endOfSentenceToken, out aId)))
+                if (_vocab.TryGetValue(endOfSentenceToken, out int aId) is false && specialTokens?.TryGetValue(endOfSentenceToken, out aId) is not true) // "is not true" also rejects a null specialTokens
                 {
                     throw new InvalidOperationException($"The end of sentence token '{endOfSentenceToken}' was not present in the vocabulary.");
                 }