Skip to content

Commit

Permalink
Adding needed Tokenizer's APIs (#7047)
Browse files Browse the repository at this point in the history
* Adding needed Tokenizer's APIs

* Address the feedback

* Small update to the newly exposed APIs

* fix comments

* Update the APIs signatures

* More feedback addressing

* Fix the comments
  • Loading branch information
tarekgh committed Mar 7, 2024
1 parent 8d31a8e commit bad8298
Show file tree
Hide file tree
Showing 11 changed files with 541 additions and 363 deletions.
91 changes: 0 additions & 91 deletions src/Microsoft.ML.Tokenizers/AddedToken.cs

This file was deleted.

14 changes: 7 additions & 7 deletions src/Microsoft.ML.Tokenizers/Model/BPE.cs
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,8 @@ private Bpe(Stream vocabStream, Stream? mergesStream, string? unknownToken, stri
/// <summary>
/// Encode a text string to a list of tokens.
/// </summary>
/// <param name="text">The text to encode.</param>
/// <param name="isSpecialToken">Indicate if the token is a special token.</param>
/// <param name="text">The text to encode. If the value of the parameter <paramref name="isSpecialToken"/> is true, the entire text will be treated as a special token.</param>
/// <param name="isSpecialToken">Specifies whether the entire <paramref name="text"/> is considered a special token. This parameter is ignored in this model.</param>
/// <returns>The list of tokens generated from the text tokenization.</returns>
public override IReadOnlyList<Token> Encode(string text, bool isSpecialToken = false)
{
Expand All @@ -192,17 +192,17 @@ public override IReadOnlyList<Token> Encode(string text, bool isSpecialToken = f
/// <summary>
/// Encode a split text string to a list of Ids and add them to the accumulatedIds list.
/// </summary>
/// <param name="text">The text to split.</param>
/// <param name="isSpecialToken">Indicate if the token is a special token.</param>
/// <param name="text">The text to encode. If the value of the parameter <paramref name="isSpecialToken"/> is true, the entire text will be treated as a special token.</param>
/// <param name="isSpecialToken">Specifies whether the entire <paramref name="text"/> is considered a special token. This parameter is ignored in this model.</param>
/// <param name="accumulatedIds">The list of accumulated encoded Ids.</param>
public override void EncodeToIds(ReadOnlySpan<char> text, bool isSpecialToken, IList<int> accumulatedIds) => EncodeToIdsWithCache(text, accumulatedIds);

/// <summary>
/// Get the number of tokens that the input text will be encoded to.
/// </summary>
/// <param name="text">The text to encode.</param>
/// <param name="isSpecialToken">Indicate if the token is special token.</param>
/// <returns>The number of tokens that the input text will be encoded to.</returns>
/// <param name="text">The text to encode. If the value of the parameter <paramref name="isSpecialToken"/> is true, the entire text will be treated as a special token.</param>
/// <param name="isSpecialToken">Specifies whether the entire <paramref name="text"/> is considered a special token. This parameter is ignored in this model.</param>
/// <returns>The number of tokens that the input text will be encoded to. This parameter is ignored in this model.</returns>
public override int CountTokens(ReadOnlySpan<char> text, bool isSpecialToken) => EncodeToIdsWithCache(text, null);

/// <summary>
Expand Down
12 changes: 6 additions & 6 deletions src/Microsoft.ML.Tokenizers/Model/EnglishRoberta.cs
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,8 @@ public EnglishRoberta(Stream vocabularyStream, Stream mergeStream, Stream highes
/// <summary>
/// Encode a text string to a list of tokens.
/// </summary>
/// <param name="text">The text to encode.</param>
/// <param name="isSpecialToken">Indicate if the token is a special token.</param>
/// <param name="text">The text to encode. If the value of the parameter <paramref name="isSpecialToken"/> is true, the entire text will be treated as a special token.</param>
/// <param name="isSpecialToken">Specifies whether the entire <paramref name="text"/> is considered a special token. This parameter is ignored in this model.</param>
/// <returns>The list of tokens generated from the text tokenization.</returns>
public override IReadOnlyList<Token> Encode(string text, bool isSpecialToken = false)
{
Expand Down Expand Up @@ -224,16 +224,16 @@ public override IReadOnlyList<Token> Encode(string text, bool isSpecialToken = f
/// <summary>
/// Encode a split text string to a list of Ids and add them to the accumulatedIds list.
/// </summary>
/// <param name="text">The text to split.</param>
/// <param name="isSpecialToken">Indicate if the token is a special token.</param>
/// <param name="text">The text to encode. If the value of the parameter <paramref name="isSpecialToken"/> is true, the entire text will be treated as a special token.</param>
/// <param name="isSpecialToken">Specifies whether the entire <paramref name="text"/> is considered a special token. This parameter is ignored in this model.</param>
/// <param name="accumulatedIds">The list of accumulated encoded Ids.</param>
public override void EncodeToIds(ReadOnlySpan<char> text, bool isSpecialToken, IList<int> accumulatedIds) => EncodeToIds(text, accumulatedIds);

/// <summary>
/// Get the number of tokens that the input text will be encoded to.
/// </summary>
/// <param name="text">The text to encode.</param>
/// <param name="isSpecialToken">Indicate if the token is special token.</param>
/// <param name="text">The text to encode. If the value of the parameter <paramref name="isSpecialToken"/> is true, the entire text will be treated as a special token.</param>
/// <param name="isSpecialToken">Specifies whether the entire <paramref name="text"/> is considered a special token. This parameter is ignored in this model.</param>
/// <returns>The number of tokens that the input text will be encoded to.</returns>
public override int CountTokens(ReadOnlySpan<char> text, bool isSpecialToken) => EncodeToIds(text, null);

Expand Down
12 changes: 6 additions & 6 deletions src/Microsoft.ML.Tokenizers/Model/Model.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,16 @@ public abstract class Model
/// <summary>
/// Encode a text to a list of tokens.
/// </summary>
/// <param name="text">The text to encode.</param>
/// <param name="isSpecialToken">Indicate if the token is a special token.</param>
/// <param name="text">The text to encode. If the value of the parameter <paramref name="isSpecialToken"/> is true, the entire text will be treated as a special token.</param>
/// <param name="isSpecialToken">Specifies whether the entire <paramref name="text"/> is considered a special token.</param>
/// <returns>The list of tokens generated from the text tokenization.</returns>
public abstract IReadOnlyList<Token> Encode(string text, bool isSpecialToken = false);

/// <summary>
/// Encode a text to a list of Ids and add them to the accumulatedIds list.
/// </summary>
/// <param name="text">The text to encode.</param>
/// <param name="isSpecialToken">Indicate if the token is a special token.</param>
/// <param name="text">The text to encode. If the value of the parameter <paramref name="isSpecialToken"/> is true, the entire text will be treated as a special token.</param>
/// <param name="isSpecialToken">Specifies whether the entire <paramref name="text"/> is considered a special token.</param>
/// <param name="accumulatedIds">The list of accumulated encoded Ids.</param>
/// <remarks>
/// This method does the default implementation that uses the Encode method to get the token's Ids.
Expand All @@ -49,8 +49,8 @@ public virtual void EncodeToIds(ReadOnlySpan<char> text, bool isSpecialToken, IL
/// <summary>
/// Get the number of tokens that the input text will be encoded to.
/// </summary>
/// <param name="text">The text to encode.</param>
/// <param name="isSpecialToken">Indicate if the token is special token.</param>
/// <param name="text">The text to encode. If the value of the parameter <paramref name="isSpecialToken"/> is true, the entire text will be treated as a special token.</param>
/// <param name="isSpecialToken">Specifies whether the entire <paramref name="text"/> is considered a special token.</param>
/// <returns>The number of tokens that the input text will be encoded to.</returns>
/// <remarks>
/// This method does the default implementation that uses the EncodeToIds method to get the number of token's Ids.
Expand Down

0 comments on commit bad8298

Please sign in to comment.