From 63a3bca86151b9f51b78a4fd3f29d4ad6730ae74 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Tue, 18 Nov 2025 13:44:38 -0800 Subject: [PATCH 1/2] Support gpt-5.1 model in Tiktoken tokenizer --- src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs | 2 ++ test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs index 0b9e64cec9..ca6b75816e 100644 --- a/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs +++ b/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs @@ -1044,6 +1044,7 @@ private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixTo ( "o4-mini-", ModelEncoding.O200kBase ), // e.g. o4-mini // chat + ( "gpt-5.1-", ModelEncoding.O200kBase), ( "gpt-5-", ModelEncoding.O200kBase), ( "gpt-4.1-", ModelEncoding.O200kBase), // e.g., gpt-4.1-mini ( "gpt-4.5-", ModelEncoding.O200kBase), // e.g., gpt-4.5 @@ -1071,6 +1072,7 @@ private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixTo { "o4-mini", ModelEncoding.O200kBase }, // chat + { "gpt-5.1", ModelEncoding.O200kBase }, { "gpt-5", ModelEncoding.O200kBase }, { "gpt-4.1", ModelEncoding.O200kBase }, { "gpt-4o", ModelEncoding.O200kBase }, diff --git a/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs b/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs index e7a0bf5acc..b77f981aec 100644 --- a/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs +++ b/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs @@ -36,6 +36,7 @@ public class TiktokenTests public static Tokenizer P50kEdit { get; } = TiktokenTokenizer.CreateForModel("text-davinci-edit-001"); public static Tokenizer GPT4o { get; } = TiktokenTokenizer.CreateForModel("gpt-4o"); public static Tokenizer GPT5 { get; } = TiktokenTokenizer.CreateForModel("gpt-5"); + public static Tokenizer GPT5_1 { get; } = 
TiktokenTokenizer.CreateForModel("gpt-5.1"); public static Tokenizer Phi4 { get; } = TiktokenTokenizer.CreateForModel("phi-4"); public static TiktokenTokenizer GptOss { get; } = TiktokenTokenizer.CreateForModel("gpt-oss-20b"); @@ -286,7 +287,7 @@ public void TestEncode5() [Fact] public void TestEncodeO200kBaseEncoding() { - foreach (TiktokenTokenizer tokenizer in new[] { GPT4o, GptOss, GPT5 }) + foreach (TiktokenTokenizer tokenizer in new[] { GPT4o, GptOss, GPT5, GPT5_1 }) { string text = ReadAndSanitizeFile("./Data/lib.rs.txt"); IReadOnlyList encoded = tokenizer.EncodeToIds(text); @@ -415,6 +416,8 @@ public void TestEncodeR50kBase() [InlineData("gpt-4o-")] [InlineData("gpt-5")] [InlineData("gpt-5-chat")] + [InlineData("gpt-5.1")] + [InlineData("gpt-5.1-mini")] [InlineData("chatgpt-4o-")] [InlineData("gpt-4")] [InlineData("gpt-4-")] @@ -533,6 +536,7 @@ public void TestEncodingNamesNegativeCases() [InlineData("gpt-4.1")] [InlineData("gpt-4o")] [InlineData("gpt-5")] + [InlineData("gpt-5.1")] [InlineData("o1")] [InlineData("o3")] [InlineData("o4-mini")] From aace8b893ef0a34f6fbcc2e47037491a421350a6 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Tue, 18 Nov 2025 13:59:59 -0800 Subject: [PATCH 2/2] Fix spaces --- .../Model/TiktokenTokenizer.cs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs index ca6b75816e..b322be3a09 100644 --- a/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs +++ b/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs @@ -1044,14 +1044,14 @@ private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixTo ( "o4-mini-", ModelEncoding.O200kBase ), // e.g. 
o4-mini // chat - ( "gpt-5.1-", ModelEncoding.O200kBase), - ( "gpt-5-", ModelEncoding.O200kBase), - ( "gpt-4.1-", ModelEncoding.O200kBase), // e.g., gpt-4.1-mini - ( "gpt-4.5-", ModelEncoding.O200kBase), // e.g., gpt-4.5 - ( "gpt-4o-", ModelEncoding.O200kBase), // e.g., gpt-4o-2024-05-13 - ( "chatgpt-4o-", ModelEncoding.O200kBase), - ( "gpt-4-", ModelEncoding.Cl100kBase), // e.g., gpt-4-0314, etc., plus gpt-4-32k - ( "gpt-3.5-", ModelEncoding.Cl100kBase), // e.g, gpt-3.5-turbo-0301, -0401, etc. + ( "gpt-5.1-", ModelEncoding.O200kBase ), + ( "gpt-5-", ModelEncoding.O200kBase ), + ( "gpt-4.1-", ModelEncoding.O200kBase ), // e.g., gpt-4.1-mini + ( "gpt-4.5-", ModelEncoding.O200kBase ), // e.g., gpt-4.5 + ( "gpt-4o-", ModelEncoding.O200kBase ), // e.g., gpt-4o-2024-05-13 + ( "chatgpt-4o-", ModelEncoding.O200kBase ), + ( "gpt-4-", ModelEncoding.Cl100kBase ), // e.g., gpt-4-0314, etc., plus gpt-4-32k + ( "gpt-3.5-", ModelEncoding.Cl100kBase ), // e.g., gpt-3.5-turbo-0301, -0401, etc. ( "gpt-35-", ModelEncoding.Cl100kBase ), // Azure deployment name ( "gpt-oss-", ModelEncoding.O200kHarmony ),