Adding needed Tokenizer's APIs #7047

Merged: 7 commits, Mar 7, 2024
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Tokenizers/Model/BPE.cs
@@ -177,7 +177,7 @@ private Bpe(Stream vocabStream, Stream? mergesStream, string? unknownToken, stri
/// Encode a text string to a list of tokens.
/// </summary>
/// <param name="text">The text to encode.</param>
/// <param name="isSpecialToken">Indicate if the token is a special token.</param>
/// <param name="isSpecialToken">Indicate if the text is a special token.</param>
/// <returns>The list of tokens generated from the text tokenization.</returns>
public override IReadOnlyList<Token> Encode(string text, bool isSpecialToken = false)
Member:
EncodeToIds takes a span but Encode takes a string. Is that ok? Are there places it'd be valuable for Encode to also take a span, or are there benefits to it being a string?

Member Author:

IIRC we chatted about that before. Encode needs to return tokens; if we pass a ReadOnlySpan here, we'll need to call ToString on it again. EncodeToIds doesn't need to return tokens, which is why we are OK with using spans there. I am open to suggestions if we have better ideas.

Member:

Hmm. Is that because we expect this text to frequently be a single token?

Member Author:

Tokenizers use a Regex to pre-tokenize, and the smaller or partial words that produces can each end up being a single token. So it depends on the pre-tokenization and the vocabulary of the tokenizer. You had another idea that we decided not to go with, but we can reconsider it: passing both the string and the span. That could help in the future if we expose span APIs on the Tokenizer class.
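To make the trade-off concrete, here is a minimal usage sketch (not from the PR; the wrapper class, method, and sample text are hypothetical, while the Encode and EncodeToIds signatures follow the diff above):

using System;
using System.Collections.Generic;
using Microsoft.ML.Tokenizers;

internal static class EncodeExample
{
    internal static void EncodeBothWays(Model model, string text)
    {
        // Encode has to materialize Token objects, which carry each token's string value,
        // so it takes the input as a string.
        IReadOnlyList<Token> tokens = model.Encode(text);

        // EncodeToIds only appends integer IDs, so a ReadOnlySpan<char> is enough and
        // no extra substrings have to be allocated for the input.
        var ids = new List<int>();
        model.EncodeToIds(text.AsSpan(), isSpecialToken: false, accumulatedIds: ids);
    }
}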

{
@@ -193,7 +193,7 @@ public override IReadOnlyList<Token> Encode(string text, bool isSpecialToken = f
/// Encode a split text string to a list of Ids and add them to the accumulatedIds list.
/// </summary>
/// <param name="text">The text to split.</param>
/// <param name="isSpecialToken">Indicate if the token is a special token.</param>
/// <param name="isSpecialToken">Indicate if the text is a special token.</param>
/// <param name="accumulatedIds">The list of accumulated encoded Ids.</param>
public override void EncodeToIds(ReadOnlySpan<char> text, bool isSpecialToken, IList<int> accumulatedIds) => EncodeToIdsWithCache(text, accumulatedIds);

4 changes: 2 additions & 2 deletions src/Microsoft.ML.Tokenizers/Model/EnglishRoberta.cs
@@ -177,7 +177,7 @@ public EnglishRoberta(Stream vocabularyStream, Stream mergeStream, Stream highes
/// Encode a text string to a list of tokens.
/// </summary>
/// <param name="text">The text to encode.</param>
/// <param name="isSpecialToken">Indicate if the token is a special token.</param>
/// <param name="isSpecialToken">Indicate if the text is a special token.</param>
/// <returns>The list of tokens generated from the text tokenization.</returns>
public override IReadOnlyList<Token> Encode(string text, bool isSpecialToken = false)
{
@@ -225,7 +225,7 @@ public override IReadOnlyList<Token> Encode(string text, bool isSpecialToken = f
/// Encode a split text string to a list of Ids and add them to the accumulatedIds list.
/// </summary>
/// <param name="text">The text to split.</param>
/// <param name="isSpecialToken">Indicate if the token is a special token.</param>
/// <param name="isSpecialToken">Indicate if the text is a special token.</param>
/// <param name="accumulatedIds">The list of accumulated encoded Ids.</param>
public override void EncodeToIds(ReadOnlySpan<char> text, bool isSpecialToken, IList<int> accumulatedIds) => EncodeToIds(text, accumulatedIds);

4 changes: 2 additions & 2 deletions src/Microsoft.ML.Tokenizers/Model/Model.cs
@@ -17,15 +17,15 @@ public abstract class Model
/// Encode a text to a list of tokens.
/// </summary>
/// <param name="text">The text to encode.</param>
/// <param name="isSpecialToken">Indicate if the token is a special token.</param>
/// <param name="isSpecialToken">Indicate if the text is a special token.</param>
/// <returns>The list of tokens generated from the text tokenization.</returns>
public abstract IReadOnlyList<Token> Encode(string text, bool isSpecialToken = false);

/// <summary>
/// Encode a text to a list of Ids and add them to the accumulatedIds list.
/// </summary>
/// <param name="text">The text to encode.</param>
/// <param name="isSpecialToken">Indicate if the token is a special token.</param>
/// <param name="isSpecialToken">Indicate if the text is a special token.</param>
/// <param name="accumulatedIds">The list of accumulated encoded Ids.</param>
/// <remarks>
/// This method does the default implementation that uses the Encode method to get the token's Ids.
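The remark above says the base Model class ships a default EncodeToIds built on top of Encode; the body is collapsed in this diff, but a hedged guess at its shape (a sketch only, not the PR's actual code) would be a member along these lines:

public virtual void EncodeToIds(ReadOnlySpan<char> text, bool isSpecialToken, IList<int> accumulatedIds)
{
    // Fall back to Encode: materialize the tokens for this split, then copy over their IDs.
    foreach (Token token in Encode(text.ToString(), isSpecialToken))
    {
        accumulatedIds.Add(token.Id);
    }
}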
96 changes: 88 additions & 8 deletions src/Microsoft.ML.Tokenizers/Model/Tiktoken.cs
@@ -9,6 +9,7 @@
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;

@@ -100,6 +101,83 @@ private Tiktoken(Stream vocabStream, IReadOnlyDictionary<string, int>? specialTo
}
}

/// <summary>
/// Create a Tiktoken tokenizer based on model name and vocab file.
/// </summary>
/// <param name="modelName">Model name</param>
/// <param name="vocabStream">The stream to the BPE vocab file.</param>
/// <param name="extraSpecialTokens">Extra special tokens other than the built-in ones for the model</param>
/// <param name="cacheSize">The size of the cache to use.</param>
/// <param name="normalizer">To normalize the text before tokenization</param>
/// <returns>The tokenizer</returns>
public static Tokenizer CreateByModelName(
string modelName,
Stream vocabStream,
IReadOnlyDictionary<string, int>? extraSpecialTokens = null,
int cacheSize = LruCache<int[]>.DefaultCacheSize,
Normalizer? normalizer = null)
{
if (string.IsNullOrEmpty(modelName))
{
throw new ArgumentNullException(nameof(modelName));
}

(Dictionary<string, int> SpecialTokens, Regex Regex) tiktokenConfiguration = Tokenizer.GetTiktokenConfigurations(modelName);

if (extraSpecialTokens is not null)
{
foreach (var extraSpecialToken in extraSpecialTokens)
{
tiktokenConfiguration.SpecialTokens.Add(extraSpecialToken.Key, extraSpecialToken.Value);
}
}

return new Tokenizer(
new Tiktoken(vocabStream, tiktokenConfiguration.SpecialTokens, cacheSize),
new TikTokenPreTokenizer(tiktokenConfiguration.Regex, tiktokenConfiguration.SpecialTokens),
normalizer);
}

/// <summary>
/// Create a Tiktoken tokenizer based on model name and vocab file.
/// </summary>
/// <param name="modelName">Model name</param>
/// <param name="vocabStream">The stream to the BPE vocab file.</param>
/// <param name="extraSpecialTokens">Extra special tokens other than the built-in ones for the model</param>
/// <param name="cacheSize">The size of the cache to use.</param>
/// <param name="normalizer">To normalize the text before tokenization</param>
/// <param name="cancellationToken"><see cref="CancellationToken"/> used to request cancellation of the operation.</param>
/// <returns>The tokenizer</returns>
public static async Task<Tokenizer> CreateByModelNameAsync(
string modelName,
Stream vocabStream,
IReadOnlyDictionary<string, int>? extraSpecialTokens = null,
int cacheSize = LruCache<int[]>.DefaultCacheSize,
Normalizer? normalizer = null,
CancellationToken cancellationToken = default)
{
if (string.IsNullOrEmpty(modelName))
{
throw new ArgumentNullException(nameof(modelName));
}

(Dictionary<string, int> SpecialTokens, Regex Regex) tiktokenConfiguration = Tokenizer.GetTiktokenConfigurations(modelName);

if (extraSpecialTokens is not null)
{
foreach (var extraSpecialToken in extraSpecialTokens)
{
tiktokenConfiguration.SpecialTokens.Add(extraSpecialToken.Key, extraSpecialToken.Value);
}
}

return new Tokenizer(
await CreateAsync(vocabStream, tiktokenConfiguration.SpecialTokens, cacheSize, cancellationToken).ConfigureAwait(false),
new TikTokenPreTokenizer(tiktokenConfiguration.Regex, tiktokenConfiguration.SpecialTokens),
normalizer);
}
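For context, a usage sketch of the two new factory methods (not part of this PR; the model name, the extra special token text, and its ID are made-up examples):

using System.Collections.Generic;
using System.IO;
using System.Threading.Tasks;
using Microsoft.ML.Tokenizers;

internal static class TiktokenFactoryExample
{
    // Synchronous creation: the model name selects the built-in special tokens and
    // pre-tokenization regex, and the stream supplies the BPE vocab.
    internal static Tokenizer Load(Stream vocabStream)
        => Tiktoken.CreateByModelName("gpt-4", vocabStream);

    // Asynchronous creation, adding one extra special token on top of the built-ins.
    internal static Task<Tokenizer> LoadAsync(Stream vocabStream)
        => Tiktoken.CreateByModelNameAsync(
            "gpt-4",
            vocabStream,
            extraSpecialTokens: new Dictionary<string, int> { { "<|custom|>", 100300 } });
}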


private static (Dictionary<StringSpanOrdinalKey, int>?, Dictionary<int, string>?) CreateEncoderDecoder(IReadOnlyDictionary<string, int>? specialTokens)
{
if (specialTokens is not null)
@@ -231,9 +309,9 @@ private static (Dictionary<StringSpanOrdinalKey, int>?, Dictionary<int, string>?
/// Encode a split text string to a list of tokens.
/// </summary>
/// <param name="text">The text to encode.</param>
/// <param name="isSpecialToken">Indicate if the token is a special token.</param>
/// <param name="isSpecialToken">Indicate if the text is a special token.</param>
/// <returns>The list of tokens generated from the text tokenization.</returns>
public override IReadOnlyList<Token> Encode(string text, bool isSpecialToken)
public override IReadOnlyList<Token> Encode(string text, bool isSpecialToken = false)
{
Token[] tokens;

@@ -294,7 +372,7 @@ public override IReadOnlyList<Token> Encode(string text, bool isSpecialToken)
/// Encode text to a list of Ids.
/// </summary>
/// <param name="text">The text to encode.</param>
/// <param name="isSpecialToken">Indicate if the token is a special token.</param>
/// <param name="isSpecialToken">Indicate if the text is a special token.</param>
/// <param name="accumulatedIds">The list of accumulated Ids.</param>
public override void EncodeToIds(ReadOnlySpan<char> text, bool isSpecialToken, IList<int> accumulatedIds)
{
@@ -462,12 +540,14 @@ public override int CountTokens(ReadOnlySpan<char> text, bool isSpecialToken)
/// <returns>The decoded string.</returns>
public override string? Decode(IEnumerable<int> ids, TokenizerDecoder? decoder = null, bool considerSpecialTokens = true)
{
// Tiktoken does not ensure a one-to-one mapping between IDs and tokens. Consequently, decoding individual IDs into tokens is not supported;
// instead, decoding all IDs must be done collectively.
// Here is example of case that map one character to multiple Ids:
// '⭐' U-2B50 is mapped to Ids [2928, 99834] in the Tiktoken model.
// In other words, the character '⭐' has UTF-8 code point 0xE2, 0xAD, 0x90, Tiktoken will map 0xE2 to [2928] and 0xAD, 0x90 to [99834].

// Tiktoken doesn't guarantee a one-to-one correspondence between IDs and UTF-16 words.
// Consequently, decoding individual IDs into UTF-16 string is not supported; instead, decoding all IDs must be performed collectively.
// Here's an example case that maps one character to multiple IDs:
// '⭐' U-2B50 is mapped to IDs [2928, 99834] in the Tiktoken model.
// In other words, the character '⭐' with UTF-8 code point 0xE2, 0xAD, 0x90 will be mapped by Tiktoken as follows: 0xE2 to [2928]
// and 0xAD, 0x90 to [99834]. Decoding 2928 and 99834 individually won't reconstruct the original UTF-16 string '⭐' U-2B50;
// decoding all IDs together is required to get the expected result.
Member:

I could imagine someone wanting an API like IEnumerable<byte> Decode(IEnumerable<int> ids, ...). Presumably if that was desired we could always add it in the future.
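To make the new doc comment concrete, a small decoding sketch (an illustration only; the two IDs are the ones quoted for '⭐' above, and the class and method names are made up):

using Microsoft.ML.Tokenizers;

internal static class DecodeExample
{
    internal static void DecodeStar(Tiktoken model)
    {
        // '⭐' (U+2B50) maps to two IDs in the Tiktoken model, per the comment in the diff.
        int[] ids = { 2928, 99834 };

        // Decoding the IDs together lets the model reassemble the full UTF-8 byte
        // sequence, so the character comes back intact.
        string? together = model.Decode(ids);

        // Decoding each ID on its own cannot do that, because each ID covers only part
        // of the character's UTF-8 bytes; the partial results are not '⭐'.
        string? first = model.Decode(new[] { 2928 });
        string? second = model.Decode(new[] { 99834 });
    }
}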

if (ids is null)
{
return null;