Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Polish char- and word-level tokenizers & stopword removers #2916

Merged
merged 7 commits into from
Mar 13, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/code/MlNetCookBook.md
Expand Up @@ -775,12 +775,12 @@ var pipeline =
ngramLength: 2, useAllLengths: false))

// NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting.
.Append(mlContext.Transforms.Text.TokenizeCharacters("MessageChars", "Message"))
.Append(mlContext.Transforms.Text.ProduceCharactersAsKeys("MessageChars", "Message"))
.Append(new NgramExtractingEstimator(mlContext, "BagOfTrichar", "MessageChars",
ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf))

// NLP pipeline 4: word embeddings.
.Append(mlContext.Transforms.Text.TokenizeWords("TokenizedMessage", "NormalizedMessage"))
.Append(mlContext.Transforms.Text.ProduceWordTokens("TokenizedMessage", "NormalizedMessage"))
.Append(mlContext.Transforms.Text.ExtractWordEmbeddings("Embeddings", "TokenizedMessage",
WordEmbeddingsExtractingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding));

Expand Down
Expand Up @@ -30,15 +30,15 @@ public static void Example()
// making use of default settings.
string defaultColumnName = "DefaultKeys";
// REVIEW create through the catalog extension
var default_pipeline = ml.Transforms.Text.TokenizeWords("Review")
var default_pipeline = ml.Transforms.Text.TokenizeIntoWords("Review")
.Append(ml.Transforms.Conversion.MapValueToKey(defaultColumnName, "Review"));

// Another pipeline, that customizes the advanced settings of the ValueToKeyMappingEstimator.
// We can change the maximumNumberOfKeys to limit how many keys will get generated out of the set of words,
// and condition the order in which they get evaluated by changing keyOrdinality from the default ByOccurrence (order in which they get encountered)
// to value/alphabetically.
string customizedColumnName = "CustomizedKeys";
var customized_pipeline = ml.Transforms.Text.TokenizeWords("Review")
var customized_pipeline = ml.Transforms.Text.TokenizeIntoWords("Review")
.Append(ml.Transforms.Conversion.MapValueToKey(customizedColumnName, "Review", maximumNumberOfKeys: 10, keyOrdinality: ValueToKeyMappingEstimator.KeyOrdinality.ByValue));

// The transformed data.
Expand Down
Expand Up @@ -26,7 +26,7 @@ public static void NgramTransform()
// A pipeline to tokenize text as characters and then combine them together into ngrams
// The pipeline uses the default settings to featurize.

var charsPipeline = ml.Transforms.Text.TokenizeCharacters("Chars", "SentimentText", useMarkerCharacters: false);
var charsPipeline = ml.Transforms.Text.TokenizeIntoCharactersAsKeys("Chars", "SentimentText", useMarkerCharacters: false);
var ngramOnePipeline = ml.Transforms.Text.ProduceNgrams("CharsUnigrams", "Chars", ngramLength: 1);
var ngramTwpPipeline = ml.Transforms.Text.ProduceNgrams("CharsTwograms", "Chars");
var oneCharsPipeline = charsPipeline.Append(ngramOnePipeline);
Expand Down
Expand Up @@ -25,7 +25,7 @@ public static void Example()

// Let's take SentimentText column and break it into vector of words.
string originalTextColumnName = "Words";
var words = ml.Transforms.Text.TokenizeWords("SentimentText", originalTextColumnName);
var words = ml.Transforms.Text.TokenizeIntoWords("SentimentText", originalTextColumnName);

// Default pipeline will apply default stop word remover which is based on a predefined set of words for certain languages.
var defaultPipeline = words.Append(ml.Transforms.Text.RemoveDefaultStopWords(originalTextColumnName, "DefaultRemover"));
Expand Down
Expand Up @@ -68,7 +68,7 @@ public static void Example()
j.Features = features;
};

var engine = mlContext.Transforms.Text.TokenizeWords("TokenizedWords", "Sentiment_Text")
var engine = mlContext.Transforms.Text.TokenizeIntoWords("TokenizedWords", "Sentiment_Text")
.Append(mlContext.Transforms.Conversion.MapValue(lookupMap, "Words", "Ids", new ColumnOptions[] { ("VariableLenghtFeatures", "TokenizedWords") }))
.Append(mlContext.Transforms.CustomMapping(ResizeFeaturesAction, "Resize"))
.Append(tensorFlowModel.ScoreTensorFlowModel(new[] { "Prediction/Softmax" }, new[] { "Features" }))
Expand Down
Expand Up @@ -26,7 +26,7 @@ public static void Example()

// Pipeline which goes through SentimentText and normalizes it, tokenizes it by words, and removes default stopwords.
var wordsPipeline = ml.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false)
.Append(ml.Transforms.Text.TokenizeWords("Words", "NormalizedText"))
.Append(ml.Transforms.Text.TokenizeIntoWords("Words", "NormalizedText"))
.Append(ml.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words"));

var wordsDataview = wordsPipeline.Fit(trainData).Transform(trainData);
Expand Down
8 changes: 4 additions & 4 deletions src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs
Expand Up @@ -55,7 +55,7 @@ public Reconciler(char[] separators)
/// </summary>
/// <param name="input">The column to apply to.</param>
/// <param name="separators">The separators to use (uses space character by default).</param>
public static VarVector<string> TokenizeText(this Scalar<string> input, char[] separators = null) => new OutPipelineColumn(input, separators);
public static VarVector<string> TokenizeIntoWords(this Scalar<string> input, char[] separators = null) => new OutPipelineColumn(input, separators);
}

/// <summary>
Expand Down Expand Up @@ -109,7 +109,7 @@ public bool Equals(Reconciler other)
/// </summary>
/// <param name="input">The column to apply to.</param>
/// <param name="useMarkerCharacters">Whether to use marker characters to separate words.</param>
public static VarVector<Key<ushort, string>> TokenizeIntoCharacters(this Scalar<string> input, bool useMarkerCharacters = true) => new OutPipelineColumn(input, useMarkerCharacters);
public static VarVector<Key<ushort, string>> TokenizeIntoCharactersAsKeys(this Scalar<string> input, bool useMarkerCharacters = true) => new OutPipelineColumn(input, useMarkerCharacters);
}

/// <summary>
Expand Down Expand Up @@ -162,8 +162,8 @@ public bool Equals(Reconciler other)
/// Remove stop words from incoming text.
/// </summary>
/// <param name="input">The column to apply to.</param>
/// <param name="language">Language of the input text.</param>
public static VarVector<string> RemoveStopwords(this VarVector<string> input,
/// <param name="language">Language of the input text. It will be used to retrieve a built-in stopword list.</param>
public static VarVector<string> RemoveDefaultStopWords(this VarVector<string> input,
StopWordsRemovingEstimator.Language language = StopWordsRemovingEstimator.Language.English) => new OutPipelineColumn(input, language);
}

Expand Down
Expand Up @@ -133,7 +133,7 @@ private static VersionInfo GetVersionInfo()
/// <summary>
/// Defines the behavior of the transformer.
/// </summary>
public IReadOnlyCollection<StopWordsRemovingEstimator.ColumnOptions> Columns => _columns.AsReadOnly();
internal IReadOnlyCollection<StopWordsRemovingEstimator.ColumnOptions> Columns => _columns.AsReadOnly();

private readonly StopWordsRemovingEstimator.ColumnOptions[] _columns;
private static volatile NormStr.Pool[] _stopWords;
Expand Down Expand Up @@ -828,7 +828,7 @@ private static VersionInfo GetStopwordsManagerVersionInfo()
/// <summary>
/// The names of the input output column pairs on which this transformation is applied.
/// </summary>
public IReadOnlyCollection<(string outputColumnName, string inputColumnName)> Columns => ColumnPairs.AsReadOnly();
internal IReadOnlyCollection<(string outputColumnName, string inputColumnName)> Columns => ColumnPairs.AsReadOnly();

/// <summary>
/// Custom stopword remover removes specified list of stop words.
Expand Down
61 changes: 8 additions & 53 deletions src/Microsoft.ML.Transforms/Text/TextCatalog.cs
Expand Up @@ -55,8 +55,9 @@ public static class TextCatalog
/// <param name="catalog">The text-related transform's catalog.</param>
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
/// <param name="useMarkerCharacters">Whether to use marker characters to separate words.</param>
public static TokenizingByCharactersEstimator TokenizeCharacters(this TransformsCatalog.TextTransforms catalog,
/// <param name="useMarkerCharacters">Whether to prepend a marker character, <see langword="0x02"/>, to the beginning,
/// and append another marker character, <see langword="0x03"/>, to the end of the output vector of characters.</param>
public static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this TransformsCatalog.TextTransforms catalog,
string outputColumnName,
string inputColumnName = null,
bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters)
Expand All @@ -67,10 +68,11 @@ public static class TextCatalog
/// Tokenize incoming text in input columns and output the tokens as output columns.
/// </summary>
/// <param name="catalog">The text-related transform's catalog.</param>
/// <param name="useMarkerCharacters">Whether to use marker characters to separate words.</param>
/// <param name="useMarkerCharacters">Whether to prepend a marker character, <see langword="0x02"/>, to the beginning,
/// and append another marker character, <see langword="0x03"/>, to the end of the output vector of characters.</param>
/// <param name="columns">Pairs of columns to run the tokenization on.</param>

public static TokenizingByCharactersEstimator TokenizeCharacters(this TransformsCatalog.TextTransforms catalog,
public static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this TransformsCatalog.TextTransforms catalog,
bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters,
params ColumnOptions[] columns)
=> new TokenizingByCharactersEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), useMarkerCharacters, ColumnOptions.ConvertToValueTuples(columns));
Expand Down Expand Up @@ -157,29 +159,18 @@ public static class TextCatalog
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
/// <param name="separators">The separators to use (uses space character by default).</param>
public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextTransforms catalog,
public static WordTokenizingEstimator TokenizeIntoWords(this TransformsCatalog.TextTransforms catalog,
string outputColumnName,
string inputColumnName = null,
char[] separators = null)
=> new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, separators);

/// <summary>
/// Tokenizes incoming text in input columns and outputs the tokens using <paramref name="separators"/> as separators.
/// </summary>
/// <param name="catalog">The text-related transform's catalog.</param>
/// <param name="columns">Pairs of columns to run the tokenization on.</param>
/// <param name="separators">The separators to use (uses space character by default).</param>
public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextTransforms catalog,
(string outputColumnName, string inputColumnName)[] columns,
char[] separators = null)
=> new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, separators);

/// <summary>
/// Tokenizes incoming text in input columns, using per-column configurations, and outputs the tokens.
/// </summary>
/// <param name="catalog">The text-related transform's catalog.</param>
/// <param name="columns">Pairs of columns to run the tokenization on.</param>
public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextTransforms catalog,
public static WordTokenizingEstimator TokenizeIntoWords(this TransformsCatalog.TextTransforms catalog,
params WordTokenizingEstimator.ColumnOptions[] columns)
=> new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns);

Expand Down Expand Up @@ -243,24 +234,6 @@ public static class TextCatalog
StopWordsRemovingEstimator.Language language = StopWordsRemovingEstimator.Language.English)
=> new StopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, language);

/// <summary>
/// Removes stop words from incoming token streams in input columns
/// and outputs the token streams without stop words as output columns.
/// </summary>
/// <param name="catalog">The text-related transform's catalog.</param>
/// <param name="columns">Pairs of columns to remove stop words on.</param>
/// <param name="language">Language of the input text columns <paramref name="columns"/>.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[FastTree](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs)]
/// ]]></format>
/// </example>
public static StopWordsRemovingEstimator RemoveDefaultStopWords(this TransformsCatalog.TextTransforms catalog,
(string outputColumnName, string inputColumnName)[] columns,
StopWordsRemovingEstimator.Language language = StopWordsRemovingEstimator.Language.English)
=> new StopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, language);

/// <summary>
/// Removes stop words from incoming token streams in <paramref name="inputColumnName"/>
/// and outputs the token streams without stopwords as <paramref name="outputColumnName"/>.
Expand All @@ -281,24 +254,6 @@ public static class TextCatalog
params string[] stopwords)
=> new CustomStopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, stopwords);

/// <summary>
/// Removes stop words from incoming token streams in input columns
/// and outputs the token streams without stop words as output columns.
/// </summary>
/// <param name="catalog">The text-related transform's catalog.</param>
/// <param name="columns">Pairs of columns to remove stop words on.</param>
/// <param name="stopwords">Array of words to remove.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[FastTree](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs)]
/// ]]></format>
/// </example>
public static CustomStopWordsRemovingEstimator RemoveStopWords(this TransformsCatalog.TextTransforms catalog,
(string outputColumnName, string inputColumnName)[] columns,
params string[] stopwords)
=> new CustomStopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, stopwords);

/// <summary>
/// Produces a bag of counts of ngrams (sequences of consecutive words) in <paramref name="inputColumnName"/>
/// and outputs bag of word vector as <paramref name="outputColumnName"/>
Expand Down
12 changes: 8 additions & 4 deletions src/Microsoft.ML.Transforms/Text/TokenizingByCharacters.cs
Expand Up @@ -102,7 +102,8 @@ private static VersionInfo GetVersionInfo()
/// Tokenize incoming text in input columns and output the tokens as output columns.
/// </summary>
/// <param name="env">The environment.</param>
/// <param name="useMarkerCharacters">Whether to use marker characters to separate words.</param>
/// <param name="useMarkerCharacters">Whether to prepend a marker character, <see langword="0x02"/>, to the beginning,
/// and append another marker character, <see langword="0x03"/>, to the end of the output vector of characters.</param>
/// <param name="columns">Pairs of columns to run the tokenization on.</param>
internal TokenizingByCharactersTransformer(IHostEnvironment env, bool useMarkerCharacters = TokenizingByCharactersEstimator.Defaults.UseMarkerCharacters,
params (string outputColumnName, string inputColumnName)[] columns) :
Expand All @@ -114,7 +115,7 @@ private static VersionInfo GetVersionInfo()
/// <summary>
/// The names of the output and input column pairs on which the transformation is applied.
/// </summary>
public IReadOnlyCollection<(string outputColumnName, string inputColumnName)> Columns => ColumnPairs.AsReadOnly();
internal IReadOnlyCollection<(string outputColumnName, string inputColumnName)> Columns => ColumnPairs.AsReadOnly();

private protected override void CheckInputColumn(DataViewSchema inputSchema, int col, int srcCol)
{
Expand Down Expand Up @@ -555,6 +556,7 @@ internal static class Defaults
{
public const bool UseMarkerCharacters = true;
}

internal static bool IsColumnTypeValid(DataViewType type) => type.GetItemType() is TextDataViewType;

internal const string ExpectedColumnType = "Text";
Expand All @@ -565,7 +567,8 @@ internal static class Defaults
/// <param name="env">The environment.</param>
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
/// <param name="useMarkerCharacters">Whether to use marker characters to separate words.</param>
/// <param name="useMarkerCharacters">Whether to prepend a marker character, <see langword="0x02"/>, to the beginning,
/// and append another marker character, <see langword="0x03"/>, to the end of the output vector of characters.</param>
internal TokenizingByCharactersEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null,
bool useMarkerCharacters = Defaults.UseMarkerCharacters)
: this(env, useMarkerCharacters, new[] { (outputColumnName, inputColumnName ?? outputColumnName) })
Expand All @@ -576,7 +579,8 @@ internal static class Defaults
/// Tokenize incoming text in input columns and output the tokens as output columns.
/// </summary>
/// <param name="env">The environment.</param>
/// <param name="useMarkerCharacters">Whether to use marker characters to separate words.</param>
/// <param name="useMarkerCharacters">Whether to prepend a marker character, <see langword="0x02"/>, to the beginning,
/// and append another marker character, <see langword="0x03"/>, to the end of the output vector of characters.</param>
/// <param name="columns">Pairs of columns to run the tokenization on.</param>

internal TokenizingByCharactersEstimator(IHostEnvironment env, bool useMarkerCharacters = Defaults.UseMarkerCharacters,
Expand Down