diff --git a/test/Microsoft.ML.Benchmarks/KMeansAndLogisticRegressionBench.cs b/test/Microsoft.ML.Benchmarks/KMeansAndLogisticRegressionBench.cs new file mode 100644 index 0000000000..f13bb7260e --- /dev/null +++ b/test/Microsoft.ML.Benchmarks/KMeansAndLogisticRegressionBench.cs @@ -0,0 +1,96 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Running; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.Api; +using Microsoft.ML.Runtime.CommandLine; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Learners; + +namespace Microsoft.ML.Benchmarks +{ + public class KMeansAndLogisticRegressionBench /* Benchmark class: trains the "KMeans" component on features loaded from "adult.train", then trains LogisticRegression on those features combined with the "Score" column. */ + { + private static string s_dataPath; /* Path of the training data file; assigned in Setup() and read by TrainKMeansAndLRCore(). */ + + [Benchmark] /* Benchmark entry point: forwards to TrainKMeansAndLRCore() and returns the predictor it produces. */ + public IPredictor TrainKMeansAndLR() => TrainKMeansAndLRCore(); + + [GlobalSetup] /* Resolves "adult.train" through Program.GetDataPath and stores the result in s_dataPath. */ + public void Setup() + { + s_dataPath = Program.GetDataPath("adult.train"); + } + + private static IPredictor TrainKMeansAndLRCore() /* Builds the loader and transform chain over s_dataPath and returns the result of LogisticRegression.Train; the TlcEnvironment (seed: 1) lives only for the duration of the using block. */ + { + string dataPath = s_dataPath; + + using (var env = new TlcEnvironment(seed: 1)) + { + // Pipeline + var loader = new TextLoader(env, + new TextLoader.Arguments() + { + HasHeader = true, + Separator = ",", + Column = new[] { + new TextLoader.Column() + { + Name = "Label", + Source = new [] { new TextLoader.Range() { Min = 14, Max = 14} }, + Type = DataKind.R4 /* Label: source range 14..14, loaded as R4. */ + }, + new TextLoader.Column() + { + Name = "CatFeatures", + Source = new [] { + new TextLoader.Range() { Min = 1, Max = 1 }, + new TextLoader.Range() { Min = 3, Max = 3 }, + new TextLoader.Range() { Min = 5, Max = 9 }, + new TextLoader.Range() { Min = 13, Max = 13 } + }, + Type = DataKind.TX /* CatFeatures: source ranges 1, 3, 5..9 and 13, loaded as TX. */ + }, + new TextLoader.Column() + { + Name = "NumFeatures", + Source = new [] { + new TextLoader.Range() { Min = 0, Max = 0 }, + new TextLoader.Range() { Min = 2, Max = 2 }, + new TextLoader.Range() { Min = 4, Max = 4 }, + new 
TextLoader.Range() { Min = 10, Max = 12 } + }, + Type = DataKind.R4 /* NumFeatures: source ranges 0, 2, 4 and 10..12, loaded as R4. */ + } + } + }, new MultiFileSource(dataPath)); + + IDataTransform trans = CategoricalTransform.Create(env, new CategoricalTransform.Arguments + { + Column = new[] + { + new CategoricalTransform.Column { Name = "CatFeatures", Source = "CatFeatures" } /* Output column name equals the source column name. */ + } + }, loader); + + trans = NormalizeTransform.CreateMinMaxNormalizer(env, trans, "NumFeatures"); /* Min-max normalizer applied to the NumFeatures column only. */ + trans = new ConcatTransform(env, trans, "Features", "NumFeatures", "CatFeatures"); /* "Features" is built from NumFeatures and CatFeatures. */ + trans = TrainAndScoreTransform.Create(env, new TrainAndScoreTransform.Arguments + { + Trainer = new SubComponent("KMeans", "k=100"), /* Trainer kind "KMeans" with settings string "k=100". */ + FeatureColumn = "Features" + }, trans); /* NOTE(review): presumably this transform is what adds the "Score" column consumed by the next concat - confirm. */ + trans = new ConcatTransform(env, trans, "Features", "Features", "Score"); /* Re-declares "Features" from the previous "Features" column and "Score". */ + + // Train + var trainer = new LogisticRegression(env, new LogisticRegression.Arguments() { EnforceNonNegativity = true, OptTol = 1e-3f }); /* Non-default arguments: EnforceNonNegativity = true, OptTol = 1e-3f. */ + var trainRoles = new RoleMappedData(trans, label: "Label", feature: "Features"); + return trainer.Train(trainRoles); + } + } + } +} diff --git a/test/Microsoft.ML.Benchmarks/Microsoft.ML.Benchmarks.csproj b/test/Microsoft.ML.Benchmarks/Microsoft.ML.Benchmarks.csproj index 7f0af862e0..5a9f3e7467 100644 --- a/test/Microsoft.ML.Benchmarks/Microsoft.ML.Benchmarks.csproj +++ b/test/Microsoft.ML.Benchmarks/Microsoft.ML.Benchmarks.csproj @@ -13,6 +13,7 @@ + diff --git a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs index a01cf8f613..ce8cb1aad0 100644 --- a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs +++ b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs @@ -4,9 +4,11 @@ using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Engines; -using Microsoft.ML.Data; using Microsoft.ML.Models; +using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Api; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Learners; 
using Microsoft.ML.Trainers; using Microsoft.ML.Transforms; using System; @@ -19,6 +21,7 @@ public class StochasticDualCoordinateAscentClassifierBench internal static ClassificationMetrics s_metrics; private static PredictionModel s_trainedModel; private static string s_dataPath; + private static string s_sentimentDataPath; /* Path of the sentiment data file; assigned in Setup() and read by TrainSentimentCore(). */ private static IrisData[][] s_batches; private static readonly int[] s_batchSizes = new int[] { 1, 2, 5 }; private readonly Random r = new Random(0); @@ -35,10 +38,11 @@ public class StochasticDualCoordinateAscentClassifierBench public void Setup() { s_dataPath = Program.GetDataPath("iris.txt"); + s_sentimentDataPath = Program.GetDataPath("wikipedia-detox-250-line-data.tsv"); s_trainedModel = TrainCore(); IrisPrediction prediction = s_trainedModel.Predict(s_example); - var testData = new TextLoader(s_dataPath).CreateFrom(useHeader: true); + var testData = new Data.TextLoader(s_dataPath).CreateFrom(useHeader: true); /* NOTE(review): TextLoader is now qualified as Data.TextLoader, presumably to avoid a clash with the TextLoader used in TrainSentimentCore() after the using changes - confirm. */ var evaluator = new ClassificationEvaluator(); s_metrics = evaluator.Evaluate(s_trainedModel, testData); @@ -69,6 +73,9 @@ public void Setup() [Benchmark] public void PredictIrisBatchOf5() => Consume(s_trainedModel.Predict(s_batches[2])); + [Benchmark] /* Benchmark entry point: forwards to TrainSentimentCore() and returns the predictor it produces. */ + public IPredictor TrainSentiment() => TrainSentimentCore(); + private void Consume(IEnumerable predictions) { foreach (var prediction in predictions) @@ -79,7 +86,7 @@ private static PredictionModel TrainCore() { var pipeline = new LearningPipeline(); - pipeline.Add(new TextLoader(s_dataPath).CreateFrom(useHeader: true)); + pipeline.Add(new Data.TextLoader(s_dataPath).CreateFrom(useHeader: true)); pipeline.Add(new ColumnConcatenator(outputColumn: "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth")); @@ -89,6 +96,76 @@ private static PredictionModel TrainCore() return model; } + private static IPredictor TrainSentimentCore() /* Loads s_sentimentDataPath, runs TextTransform and WordEmbeddingsTransform over the text column, and returns the result of SdcaMultiClassTrainer.Train; the TlcEnvironment (seed: 1) lives only for the duration of the using block. */ + { + var dataPath = s_sentimentDataPath; + using (var env = new TlcEnvironment(seed: 1)) + { + // Pipeline + var loader = new TextLoader(env, 
new TextLoader.Arguments() + { + AllowQuoting = false, + AllowSparse = false, + Separator = "tab", + HasHeader = true, + Column = new[] + { + new TextLoader.Column() + { + Name = "Label", + Source = new [] { new TextLoader.Range() { Min=0, Max=0} }, + Type = DataKind.Num /* Label: source range 0..0, loaded as Num. */ + }, + + new TextLoader.Column() + { + Name = "SentimentText", + Source = new [] { new TextLoader.Range() { Min=1, Max=1} }, + Type = DataKind.Text /* SentimentText: source range 1..1, loaded as Text. */ + } + } + }, new MultiFileSource(dataPath)); + + var text = TextTransform.Create(env, + new TextTransform.Arguments() + { + Column = new TextTransform.Column + { + Name = "WordEmbeddings", + Source = new[] { "SentimentText" } + }, + KeepDiacritics = false, + KeepPunctuations = false, + TextCase = Runtime.TextAnalytics.TextNormalizerTransform.CaseNormalizationMode.Lower, + OutputTokens = true, /* NOTE(review): presumably this is what makes the "WordEmbeddings_TransformedText" column available to the next transform - confirm. */ + StopWordsRemover = new Runtime.TextAnalytics.PredefinedStopWordsRemoverFactory(), + VectorNormalizer = TextTransform.TextNormKind.None, + CharFeatureExtractor = null, + WordFeatureExtractor = null, /* Both feature extractors are explicitly set to null. */ + }, loader); + + var trans = new WordEmbeddingsTransform(env, + new WordEmbeddingsTransform.Arguments() + { + Column = new WordEmbeddingsTransform.Column[1] + { + new WordEmbeddingsTransform.Column + { + Name = "Features", + Source = "WordEmbeddings_TransformedText" /* Reads the token column named after the TextTransform output "WordEmbeddings"; writes "Features". */ + } + }, + ModelKind = WordEmbeddingsTransform.PretrainedModelKind.Sswe, /* Selects the Sswe pretrained model kind. */ + }, text); + + // Train + var trainer = new SdcaMultiClassTrainer(env, new SdcaMultiClassTrainer.Arguments() { MaxIterations = 20 }); /* Only non-default argument: MaxIterations = 20. */ + var trainRoles = new RoleMappedData(trans, label: "Label", feature: "Features"); + return trainer.Train(trainRoles); + } + } + public class IrisData { [Column("0")]