Polish word-level tokenizers
wschin committed Mar 11, 2019
1 parent 305b2a6 commit 145bf74
Showing 15 changed files with 32 additions and 31 deletions.
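At a glance, this commit renames the word-level tokenization entry points: TokenizeWords on the dynamic API and TokenizeText on the static API both become ProduceWordTokens. A minimal before/after sketch of a caller; the column names "Tokens" and "Review" are illustrative, not from this commit:

using Microsoft.ML;

var ml = new MLContext();
// Before this commit:
//     var pipeline = ml.Transforms.Text.TokenizeWords("Tokens", "Review");
// After this commit:
var pipeline = ml.Transforms.Text.ProduceWordTokens("Tokens", "Review");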
@@ -30,15 +30,15 @@ public static void Example()
// making use of default settings.
string defaultColumnName = "DefaultKeys";
// REVIEW create through the catalog extension
- var default_pipeline = ml.Transforms.Text.TokenizeWords("Review")
+ var default_pipeline = ml.Transforms.Text.ProduceWordTokens("Review")
.Append(ml.Transforms.Conversion.MapValueToKey(defaultColumnName, "Review"));

// Another pipeline, that customizes the advanced settings of the ValueToKeyMappingEstimator.
// We can change maxNumKeys to limit how many keys get generated out of the set of words,
// and control the order in which they are evaluated by changing sort from the default Occurrence (the order in which they are encountered)
// to Value (alphabetical).
string customizedColumnName = "CustomizedKeys";
- var customized_pipeline = ml.Transforms.Text.TokenizeWords("Review")
+ var customized_pipeline = ml.Transforms.Text.ProduceWordTokens("Review")
.Append(ml.Transforms.Conversion.MapValueToKey(customizedColumnName, "Review", maxNumKeys: 10, sort: ValueToKeyMappingEstimator.SortOrder.Value));

// The transformed data.
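For context, a hedged sketch of applying the customized pipeline above, assuming trainData is an IDataView with a string "Review" column (the variable name is illustrative, not from this commit):

var model = customized_pipeline.Fit(trainData);
var transformed = model.Transform(trainData);
// "CustomizedKeys" now contains at most 10 keys, assigned in alphabetical (Value) order
// rather than in order of first occurrence.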
@@ -25,7 +25,7 @@ public static void Example()

// Let's take the SentimentText column and break it into a vector of words.
string originalTextColumnName = "Words";
- var words = ml.Transforms.Text.TokenizeWords("SentimentText", originalTextColumnName);
+ var words = ml.Transforms.Text.ProduceWordTokens("SentimentText", originalTextColumnName);

// The default pipeline applies the default stop-word remover, which is based on a predefined set of words for certain languages.
var defaultPipeline = words.Append(ml.Transforms.Text.RemoveDefaultStopWords(originalTextColumnName, "DefaultRemover"));
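The default remover relies on a predefined per-language stop-word list; a caller-supplied list is also supported (see the RemoveStopWords call with explicit words further down in this diff). A minimal sketch, assuming the same ml context and illustrative column names:

var customPipeline = ml.Transforms.Text.ProduceWordTokens("Words", "SentimentText")
    .Append(ml.Transforms.Text.RemoveStopWords("CleanWords", "Words", "the", "a", "is"));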
@@ -68,7 +68,7 @@ public static void Example()
j.Features = features;
};

- var engine = mlContext.Transforms.Text.TokenizeWords("TokenizedWords", "Sentiment_Text")
+ var engine = mlContext.Transforms.Text.ProduceWordTokens("TokenizedWords", "Sentiment_Text")
.Append(mlContext.Transforms.Conversion.ValueMap(lookupMap, "Words", "Ids", new ColumnOptions[] { ("VariableLenghtFeatures", "TokenizedWords") }))
.Append(mlContext.Transforms.CustomMapping(ResizeFeaturesAction, "Resize"))
.Append(tensorFlowModel.ScoreTensorFlowModel(new[] { "Prediction/Softmax" }, new[] { "Features" }))
@@ -26,7 +26,7 @@ public static void Example()

// A pipeline which takes SentimentText, normalizes it, tokenizes it into words, and removes the default stop words.
var wordsPipeline = ml.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false)
- .Append(ml.Transforms.Text.TokenizeWords("Words", "NormalizedText"))
+ .Append(ml.Transforms.Text.ProduceWordTokens("Words", "NormalizedText"))
.Append(ml.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words"));

var wordsDataview = wordsPipeline.Fit(trainData).Transform(trainData);
2 changes: 1 addition & 1 deletion src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs
@@ -55,7 +55,7 @@ public Reconciler(char[] separators)
/// </summary>
/// <param name="input">The column to apply to.</param>
/// <param name="separators">The separators to use (uses space character by default).</param>
- public static VarVector<string> TokenizeText(this Scalar<string> input, char[] separators = null) => new OutPipelineColumn(input, separators);
+ public static VarVector<string> ProduceWordTokens(this Scalar<string> input, char[] separators = null) => new OutPipelineColumn(input, separators);
}

/// <summary>
15 changes: 2 additions & 13 deletions src/Microsoft.ML.Transforms/Text/TextCatalog.cs
@@ -160,29 +160,18 @@ public static class TextCatalog
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
/// <param name="separators">The separators to use (uses space character by default).</param>
- public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextTransforms catalog,
+ public static WordTokenizingEstimator ProduceWordTokens(this TransformsCatalog.TextTransforms catalog,
string outputColumnName,
string inputColumnName = null,
char[] separators = null)
=> new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, separators);

- /// <summary>
- /// Tokenizes incoming text in input columns and outputs the tokens using <paramref name="separators"/> as separators.
- /// </summary>
- /// <param name="catalog">The text-related transform's catalog.</param>
- /// <param name="columns">Pairs of columns to run the tokenization on.</param>
- /// <param name="separators">The separators to use (uses space character by default).</param>
- public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextTransforms catalog,
- (string outputColumnName, string inputColumnName)[] columns,
- char[] separators = null)
- => new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, separators);
-
/// <summary>
/// Tokenizes incoming text in input columns, using per-column configurations, and outputs the tokens.
/// </summary>
/// <param name="catalog">The text-related transform's catalog.</param>
/// <param name="columns">Pairs of columns to run the tokenization on.</param>
- public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextTransforms catalog,
+ public static WordTokenizingEstimator ProduceWordTokens(this TransformsCatalog.TextTransforms catalog,
params WordTokenizingEstimator.ColumnOptions[] columns)
=> new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns);

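Note that the (string outputColumnName, string inputColumnName)[] overload is deleted outright rather than renamed, so multi-column callers move to the params ColumnOptions[] overload. A hedged migration sketch, assuming an MLContext named ml; the ColumnOptions constructor shape (name, inputColumnName, separators) is an assumption, not confirmed by this diff:

// Before: ml.Transforms.Text.TokenizeWords(new[] { ("TokensA", "TextA"), ("TokensB", "TextB") });
// After (assumed ColumnOptions constructor shape):
var est = ml.Transforms.Text.ProduceWordTokens(
    new WordTokenizingEstimator.ColumnOptions("TokensA", "TextA"),
    new WordTokenizingEstimator.ColumnOptions("TokensB", "TextB"));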
12 changes: 12 additions & 0 deletions src/Microsoft.ML.Transforms/Text/WordTokenizing.cs
@@ -441,9 +441,21 @@ internal WordTokenizingEstimator(IHostEnvironment env, params ColumnOptions[] co
}
public sealed class ColumnOptions
{
+ /// <summary>
+ /// Output column name that will be used to store the tokenization result of the <see cref="InputColumnName"/> column.
+ /// </summary>
public readonly string Name;
+ /// <summary>
+ /// Input column name that will be tokenized into words.
+ /// </summary>
public readonly string InputColumnName;
+ /// <summary>
+ /// Separator list used to tokenize the input string. If not specified, space will be used.
+ /// </summary>
public IReadOnlyList<char> Separators => SeparatorsArray;
+ /// <summary>
+ /// State of <see cref="Separators"/>. Since <see langword="char"/>[] is mutable, it is not safe to expose this field directly to users.
+ /// </summary>
internal readonly char[] SeparatorsArray;

/// <summary>
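The defensive pattern documented above (an internal char[] backing field surfaced only as IReadOnlyList<char>) exists because arrays are mutable, so handing the raw array to callers would let them change the separators after configuration. A small self-contained sketch of the idea with hypothetical names, not the actual ML.NET types; a defensive copy in the constructor is one way to keep the backing state isolated:

using System;
using System.Collections.Generic;

class TokenizerOptions
{
    // Private backing array; callers only ever see a read-only view of it.
    private readonly char[] _separators;
    public IReadOnlyList<char> Separators => _separators;
    public TokenizerOptions(char[] separators) => _separators = (char[])separators.Clone();
}

class Demo
{
    static void Main()
    {
        var seps = new[] { ' ', ',' };
        var opts = new TokenizerOptions(seps);
        seps[0] = '#'; // mutating the caller's array does not affect the options
        Console.WriteLine(string.Join("", opts.Separators)); // prints " ," (space, comma)
    }
}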
8 changes: 4 additions & 4 deletions test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs
@@ -519,7 +519,7 @@ public void Tokenize()
var est = data.MakeNewEstimator()
.Append(r => (
r.label,
- tokens: r.text.TokenizeText(),
+ tokens: r.text.ProduceWordTokens(),
chars: r.text.ProduceCharacterTokens()));

var tdata = est.Fit(data).Transform(data);
@@ -547,7 +547,7 @@ public void NormalizeTextAndRemoveStopWords()
.Append(r => (
r.label,
normalized_text: r.text.NormalizeText(),
- words_without_stopwords: r.text.TokenizeText().RemoveStopwords()));
+ words_without_stopwords: r.text.ProduceWordTokens().RemoveStopwords()));

var tdata = est.Fit(data).Transform(data);
var schema = tdata.AsDynamic.Schema;
@@ -604,8 +604,8 @@ public void Ngrams()
var est = data.MakeNewEstimator()
.Append(r => (
r.label,
- ngrams: r.text.TokenizeText().ToKey().ToNgrams(),
- ngramshash: r.text.TokenizeText().ToKey().ToNgramsHash()));
+ ngrams: r.text.ProduceWordTokens().ToKey().ToNgrams(),
+ ngramshash: r.text.ProduceWordTokens().ToKey().ToNgramsHash()));

var tdata = est.Fit(data).Transform(data);
var schema = tdata.AsDynamic.Schema;
@@ -472,7 +472,7 @@ private void TextFeaturizationOn(string dataPath)
// NLP pipeline 4: word embeddings.
// PretrainedModelKind.Sswe is used here for performance of the test. In a real
// scenario, it is best to use a different model for more accuracy.
- Embeddings: r.Message.NormalizeText().TokenizeText().WordEmbeddings(WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)
+ Embeddings: r.Message.NormalizeText().ProduceWordTokens().WordEmbeddings(WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)
));

// Let's train our pipeline, and then apply it to the same data.
@@ -312,7 +312,7 @@ private void TextFeaturizationOn(string dataPath)
// NLP pipeline 4: word embeddings.
// PretrainedModelKind.Sswe is used here for performance of the test. In a real
// scenario, it is best to use a different model for more accuracy.
- .Append(mlContext.Transforms.Text.TokenizeWords("TokenizedMessage", "NormalizedMessage"))
+ .Append(mlContext.Transforms.Text.ProduceWordTokens("TokenizedMessage", "NormalizedMessage"))
.Append(mlContext.Transforms.Text.ApplyWordEmbedding("Embeddings", "TokenizedMessage",
WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding));

@@ -984,7 +984,7 @@ public void TensorFlowSentimentClassificationTest()
// The first pipeline 'dataPipe' tokenizes the string into words and maps each word to an integer, which is an index in the dictionary.
// Then this integer vector is retrieved from the pipeline and resized to a fixed length.
// The second pipeline 'tfEnginePipe' takes the resized integer vector and passes it to TensorFlow to get the classification scores.
- var estimator = mlContext.Transforms.Text.TokenizeWords("TokenizedWords", "Sentiment_Text")
+ var estimator = mlContext.Transforms.Text.ProduceWordTokens("TokenizedWords", "Sentiment_Text")
.Append(mlContext.Transforms.Conversion.ValueMap(lookupMap, "Words", "Ids", new ColumnOptions[] { ("Features", "TokenizedWords") }));
var dataPipe = estimator.Fit(dataView)
.CreatePredictionEngine<TensorFlowSentiment, TensorFlowSentiment>(mlContext);
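The "resized to a fixed length" step mentioned above pads or truncates the variable-length token-id vector so TensorFlow sees a fixed input shape. A minimal sketch of that logic in isolation, with hypothetical names; the test's actual resize action is not shown in this diff:

using System;

static class Resize
{
    // Zero-initialized array gives implicit zero padding; Array.Copy truncates when too long.
    public static int[] ToFixedLength(int[] ids, int fixedLength)
    {
        var resized = new int[fixedLength];
        Array.Copy(ids, resized, Math.Min(ids.Length, fixedLength));
        return resized;
    }
}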
@@ -76,7 +76,7 @@ public void CategoricalHashStatic()
row.ScalarString,
row.VectorString,
// Create a VarVector column
- VarVectorString: row.ScalarString.TokenizeText())).
+ VarVectorString: row.ScalarString.ProduceWordTokens())).
Append(row => (
A: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Ind),
B: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Ind),
@@ -143,7 +143,7 @@ public void TextNormalizationAndStopwordRemoverWorkout()
text: ctx.LoadFloat(1)), hasHeader: true)
.Load(sentimentDataPath);
var est = ML.Transforms.Text.NormalizeText("text")
- .Append(ML.Transforms.Text.TokenizeWords("words", "text"))
+ .Append(ML.Transforms.Text.ProduceWordTokens("words", "text"))
.Append(ML.Transforms.Text.RemoveDefaultStopWords("NoDefaultStopwords", "words"))
.Append(ML.Transforms.Text.RemoveStopWords("NoStopWords", "words", "xbox", "this", "is", "a", "the", "THAT", "bY"));

2 changes: 1 addition & 1 deletion test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs
@@ -546,7 +546,7 @@ public void ValueMappingInputIsVectorWorkout()
var keys = new List<ReadOnlyMemory<char>>() { "foo".AsMemory(), "bar".AsMemory(), "test".AsMemory(), "wahoo".AsMemory() };
var values = new List<int>() { 1, 2, 3, 4 };

- var est = ML.Transforms.Text.TokenizeWords("TokenizeB", "B")
+ var est = ML.Transforms.Text.ProduceWordTokens("TokenizeB", "B")
.Append(ML.Transforms.Conversion.ValueMap(keys, values, new ColumnOptions[] { ("VecB", "TokenizeB") }));
TestEstimatorCore(est, validFitInput: dataView, invalidInput: badDataView);
}
4 changes: 2 additions & 2 deletions test/Microsoft.ML.Tests/Transformers/WordEmbeddingsTests.cs
@@ -35,7 +35,7 @@ public void TestWordEmbeddings()
}).Load(GetDataPath(dataPath));

var est = ML.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false)
- .Append(ML.Transforms.Text.TokenizeWords("Words", "NormalizedText"))
+ .Append(ML.Transforms.Text.ProduceWordTokens("Words", "NormalizedText"))
.Append(ML.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words"));
var words = est.Fit(data).Transform(data);

@@ -70,7 +70,7 @@ public void TestCustomWordEmbeddings()
}).Load(GetDataPath(dataPath));

var est = ML.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false)
- .Append(ML.Transforms.Text.TokenizeWords("Words", "NormalizedText"))
+ .Append(ML.Transforms.Text.ProduceWordTokens("Words", "NormalizedText"))
.Append(ML.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words"));
var words = est.Fit(data).Transform(data);
var pathToCustomModel = DeleteOutputPath("custommodel.txt");
