diff --git a/src/Microsoft.ML/Models/ClassificationMetrics.cs b/src/Microsoft.ML/Models/ClassificationMetrics.cs
index 6c1c139278..d7bde3dd08 100644
--- a/src/Microsoft.ML/Models/ClassificationMetrics.cs
+++ b/src/Microsoft.ML/Models/ClassificationMetrics.cs
@@ -15,6 +15,11 @@ namespace Microsoft.ML.Models
     ///
     public sealed class ClassificationMetrics
     {
+        /// <summary>
+        /// Shared placeholder instance created through the private parameterless constructor.
+        /// Declared readonly so that no caller can replace the shared instance.
+        /// </summary>
+        public static readonly ClassificationMetrics Empty = new ClassificationMetrics();
         private ClassificationMetrics()
         {
         }
diff --git a/test/Microsoft.ML.Benchmarks/KMeansAndLogisticRegressionBench.cs b/test/Microsoft.ML.Benchmarks/KMeansAndLogisticRegressionBench.cs
new file mode 100644
index 0000000000..1cf302c265
--- /dev/null
+++ b/test/Microsoft.ML.Benchmarks/KMeansAndLogisticRegressionBench.cs
@@ -0,0 +1,134 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Running;
+using Microsoft.ML.Runtime;
+using Microsoft.ML.Runtime.Api;
+using Microsoft.ML.Runtime.CommandLine;
+using Microsoft.ML.Runtime.Data;
+using Microsoft.ML.Runtime.Learners;
+
+namespace Microsoft.ML.Benchmarks
+{
+    /// <summary>
+    /// Benchmark that trains a "KMeans" scorer and then a logistic regression model on the "adult.train" data.
+    /// </summary>
+    public class KMeansAndLogisticRegressionBench
+    {
+        private static string s_dataPath;
+
+        /// <summary>Benchmarked entry point; forwards to <see cref="TrainKMeansAndLRCore"/>.</summary>
+        [Benchmark]
+        public IPredictor TrainKMeansAndLR() => TrainKMeansAndLRCore();
+
+        /// <summary>Resolves the data file path and assigns the shared metrics holder before the benchmark runs.</summary>
+        [GlobalSetup]
+        public void Setup()
+        {
+            s_dataPath = Program.GetDataPath("adult.train");
+            // NOTE(review): this benchmark never evaluates a model; presumably the placeholder keeps
+            // whatever reads s_metrics from seeing null -- confirm against the reader.
+            StochasticDualCoordinateAscentClassifierBench.s_metrics = Models.ClassificationMetrics.Empty;
+        }
+
+        /// <summary>
+        /// Loads the data, builds the transform pipeline, and trains the logistic regression predictor.
+        /// </summary>
+        /// <returns>The predictor returned by the trainer.</returns>
+        private static IPredictor TrainKMeansAndLRCore()
+        {
+            string dataPath = s_dataPath;
+
+            using (var env = new TlcEnvironment(seed: 1))
+            {
+                // Pipeline
+                var loader = new TextLoader(env,
+                    new TextLoader.Arguments()
+                    {
+                        HasHeader = true,
+                        Separator = ",",
+                        Column = new[] {
+                            new TextLoader.Column()
+                            {
+                                Name = "Label",
+                                Source = new [] { new TextLoader.Range() { Min = 14, Max = 14} },
+                                Type = DataKind.R4
+                            },
+                            new TextLoader.Column()
+                            {
+                                Name = "CatFeatures",
+                                Source = new [] {
+                                    new TextLoader.Range() { Min = 1, Max = 1 },
+                                    new TextLoader.Range() { Min = 3, Max = 3 },
+                                    new TextLoader.Range() { Min = 5, Max = 9 },
+                                    new TextLoader.Range() { Min = 13, Max = 13 }
+                                },
+                                Type = DataKind.TX
+                            },
+                            new TextLoader.Column()
+                            {
+                                Name = "NumFeatures",
+                                Source = new [] {
+                                    new TextLoader.Range() { Min = 0, Max = 0 },
+                                    new TextLoader.Range() { Min = 2, Max = 2 },
+                                    new TextLoader.Range() { Min = 4, Max = 4 },
+                                    new TextLoader.Range() { Min = 10, Max = 12 }
+                                },
+                                Type = DataKind.R4
+                            }
+                        }
+                    }, new MultiFileSource(dataPath));
+
+                IDataTransform trans = CategoricalTransform.Create(env, new CategoricalTransform.Arguments
+                {
+                    Column = new[]
+                    {
+                        new CategoricalTransform.Column { Name = "CatFeatures", Source = "CatFeatures" }
+                    }
+                }, loader);
+
+                trans = NormalizeTransform.CreateMinMaxNormalizer(env, trans, "NumFeatures");
+                trans = new ConcatTransform(env, trans, "Features", "NumFeatures", "CatFeatures");
+                trans = TrainAndScoreTransform.Create(env, new TrainAndScoreTransform.Arguments
+                {
+                    Trainer = new SubComponent("KMeans", "k=100"),
+                    FeatureColumn = "Features"
+                }, trans);
+                trans = new ConcatTransform(env, trans, "Features", "Features", "Score");
+
+                // Train
+                var trainer = new LogisticRegression(env, new LogisticRegression.Arguments() { EnforceNonNegativity = true, OptTol = 1e-3f });
+                var trainRoles = new RoleMappedData(trans, label: "Label", feature: "Features");
+                return trainer.Train(trainRoles);
+            }
+        }
+
+        // NOTE(review): IrisData and IrisPrediction are not referenced anywhere in this class;
+        // they look like leftovers from another benchmark -- confirm before removing.
+        public class IrisData
+        {
+            [Column("0")]
+            public float Label;
+
+            [Column("1")]
+            public float SepalLength;
+
+            [Column("2")]
+            public float SepalWidth;
+
+            [Column("3")]
+            public float PetalLength;
+
+            [Column("4")]
+            public float PetalWidth;
+        }
+
+        public class IrisPrediction
+        {
+            [ColumnName("Score")]
+            public float[] PredictedLabels;
+        }
+    }
+}
diff --git 
a/test/Microsoft.ML.Benchmarks/Microsoft.ML.Benchmarks.csproj b/test/Microsoft.ML.Benchmarks/Microsoft.ML.Benchmarks.csproj
index 7f0af862e0..5a9f3e7467 100644
--- a/test/Microsoft.ML.Benchmarks/Microsoft.ML.Benchmarks.csproj
+++ b/test/Microsoft.ML.Benchmarks/Microsoft.ML.Benchmarks.csproj
@@ -13,6 +13,7 @@
+
diff --git a/test/Microsoft.ML.Benchmarks/Program.cs b/test/Microsoft.ML.Benchmarks/Program.cs
index 0b4e9edc52..a5634fd08d 100644
--- a/test/Microsoft.ML.Benchmarks/Program.cs
+++ b/test/Microsoft.ML.Benchmarks/Program.cs
@@ -28,9 +28,10 @@ static void Main(string[] args)
         private static IConfig CreateCustomConfig()
             => DefaultConfig.Instance
                 .With(Job.Default
+                    .WithWarmupCount(1) // 1 warmup iteration is enough for the benchmarks we have here
                     .WithMaxIterationCount(20)
                     .With(InProcessToolchain.Instance))
-                .With(new ClassificationMetricsColumn("AccuracyMacro", "Macro-average accuracy of the model"))
+                .With(new ClassificationMetricsColumn(nameof(ClassificationMetrics.AccuracyMacro), "Macro-average accuracy of the model"))
                 .With(MemoryDiagnoser.Default);
 
         internal static string GetDataPath(string name)
diff --git a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs
index a01cf8f613..46259468d4 100644
--- a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs
+++ b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs
@@ -4,13 +4,17 @@
 
 using BenchmarkDotNet.Attributes;
 using BenchmarkDotNet.Engines;
-using Microsoft.ML.Data;
 using Microsoft.ML.Models;
+using Microsoft.ML.Runtime;
 using Microsoft.ML.Runtime.Api;
+using Microsoft.ML.Runtime.Data;
+using Microsoft.ML.Runtime.Learners;
 using Microsoft.ML.Trainers;
 using Microsoft.ML.Transforms;
 using System;
 using System.Collections.Generic;
+using System.Globalization;
+using System.Threading;
 
 namespace Microsoft.ML.Benchmarks
 {
@@ -19,6 +23,7 @@ public class StochasticDualCoordinateAscentClassifierBench
         internal static ClassificationMetrics s_metrics;
         private static PredictionModel s_trainedModel;
         private static string s_dataPath;
+        private static string s_sentimentDataPath;
         private static IrisData[][] s_batches;
         private static readonly int[] s_batchSizes = new int[] { 1, 2, 5 };
         private readonly Random r = new Random(0);
@@ -31,14 +36,18 @@ public class StochasticDualCoordinateAscentClassifierBench
             PetalWidth = 5.1f,
         };
 
+        [Benchmark]
+        public IPredictor TrainSentiment() => TrainSentimentCore();
+
         [GlobalSetup]
         public void Setup()
         {
             s_dataPath = Program.GetDataPath("iris.txt");
+            s_sentimentDataPath = Program.GetDataPath("wikipedia-detox-250-line-data.tsv");
             s_trainedModel = TrainCore();
             IrisPrediction prediction = s_trainedModel.Predict(s_example);
 
-            var testData = new TextLoader(s_dataPath).CreateFrom(useHeader: true);
+            var testData = new Data.TextLoader(s_dataPath).CreateFrom(useHeader: true);
             var evaluator = new ClassificationEvaluator();
             s_metrics = evaluator.Evaluate(s_trainedModel, testData);
 
@@ -79,7 +88,7 @@ private static PredictionModel TrainCore()
         {
             var pipeline = new LearningPipeline();
 
-            pipeline.Add(new TextLoader(s_dataPath).CreateFrom(useHeader: true));
+            pipeline.Add(new Data.TextLoader(s_dataPath).CreateFrom(useHeader: true));
             pipeline.Add(new ColumnConcatenator(outputColumn: "Features",
                 "SepalLength", "SepalWidth", "PetalLength", "PetalWidth"));
 
@@ -89,6 +98,78 @@ private static PredictionModel TrainCore()
             return model;
         }
 
+        private static IPredictor TrainSentimentCore()
+        {
+            // Numbers in the input file are written with a `.` decimal separator, hence the invariant culture.
+            Thread.CurrentThread.CurrentCulture = CultureInfo.InvariantCulture;
+
+            using (var env = new TlcEnvironment(seed: 1))
+            {
+                // Load: tab-separated file with a header row; column 0 is the label, column 1 the text.
+                var loaderArgs = new TextLoader.Arguments()
+                {
+                    AllowQuoting = false,
+                    AllowSparse = false,
+                    Separator = "tab",
+                    HasHeader = true,
+                    Column = new[]
+                    {
+                        new TextLoader.Column()
+                        {
+                            Name = "Label",
+                            Source = new [] { new TextLoader.Range() { Min=0, Max=0} },
+                            Type = DataKind.Num
+                        },
+                        new TextLoader.Column()
+                        {
+                            Name = "SentimentText",
+                            Source = new [] { new TextLoader.Range() { Min=1, Max=1} },
+                            Type = DataKind.Text
+                        }
+                    }
+                };
+                var source = new TextLoader(env, loaderArgs, new MultiFileSource(s_sentimentDataPath));
+
+                // Text transform settings; both feature extractors are set to null.
+                var textArgs = new TextTransform.Arguments()
+                {
+                    Column = new TextTransform.Column
+                    {
+                        Name = "WordEmbeddings",
+                        Source = new[] { "SentimentText" }
+                    },
+                    KeepDiacritics = false,
+                    KeepPunctuations = false,
+                    TextCase = Runtime.TextAnalytics.TextNormalizerTransform.CaseNormalizationMode.Lower,
+                    OutputTokens = true,
+                    StopWordsRemover = new Runtime.TextAnalytics.PredefinedStopWordsRemoverFactory(),
+                    VectorNormalizer = TextTransform.TextNormKind.None,
+                    CharFeatureExtractor = null,
+                    WordFeatureExtractor = null,
+                };
+                var tokenized = TextTransform.Create(env, textArgs, source);
+
+                var embeddingArgs = new WordEmbeddingsTransform.Arguments()
+                {
+                    Column = new[]
+                    {
+                        new WordEmbeddingsTransform.Column
+                        {
+                            Name = "Features",
+                            Source = "WordEmbeddings_TransformedText"
+                        }
+                    },
+                    ModelKind = WordEmbeddingsTransform.PretrainedModelKind.Sswe,
+                };
+                var embedded = new WordEmbeddingsTransform(env, embeddingArgs, tokenized);
+
+                // Train
+                var trainer = new SdcaMultiClassTrainer(env, new SdcaMultiClassTrainer.Arguments() { MaxIterations = 20 });
+                var roles = new RoleMappedData(embedded, label: "Label", feature: "Features");
+                return trainer.Train(roles);
+            }
+        }
+
         public class IrisData
         {
             [Column("0")]