dotnet · Ivanidzo4ka · Aug 20, 2018 · Aug 13, 2018 · Aug 13, 2018 · Aug 14, 2018
diff --git a/test/Microsoft.ML.Tests/Scenarios/PipelineApi/CrossValidation.cs b/test/Microsoft.ML.Tests/Scenarios/PipelineApi/CrossValidation.cs
@@ -0,0 +1,40 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Data;
+using Microsoft.ML.Models;
+using Microsoft.ML.Trainers;
+using Microsoft.ML.Transforms;
+using Xunit;
+
+namespace Microsoft.ML.Tests.Scenarios.PipelineApi
+{
+    public partial class PipelineApiScenarioTests
+    {
+        /// <summary>
+        /// Cross-validation: Have a mechanism to do cross validation, that is, you come up with
+        /// a data source (optionally with stratification column), come up with an instantiable transform
+        /// and trainer pipeline, and it will handle (1) splitting up the data, (2) training the separate
+        /// pipelines on in-fold data, (3) scoring on the out-fold data, (4) returning the set of
+        /// evaluations and optionally trained pipes. (People always want metrics out of xfold,
+        /// they sometimes want the actual models too.)
+        /// </summary>
+        [Fact]
+        public void CrossValidation()
+        {
+            var dataPath = GetDataPath(SentimentDataPath);
+
+            // Loader -> text featurization -> FastTree (5 leaves, 5 trees) -> predicted-label conversion.
+            var pipeline = new LearningPipeline();
+            pipeline.Add(new TextLoader(dataPath).CreateFrom<SentimentData>());
+            pipeline.Add(MakeSentimentTextTransform());
+            pipeline.Add(new FastTreeBinaryClassifier() { NumLeaves = 5, NumTrees = 5, MinDocumentsInLeafs = 2 });
+            pipeline.Add(new PredictedLabelColumnOriginalValueConverter() { PredictedLabelColumn = "PredictedLabel" });
+
+            var cv = new CrossValidator().CrossValidate<SentimentData, SentimentPrediction>(pipeline);
+
+            // The scenario requires metrics to be available from the cross-validation output.
+            var metrics = cv.BinaryClassificationMetrics[0];
+            Assert.NotNull(metrics);
+
+            // The trained models must be usable for prediction as well.
+            var singlePrediction = cv.PredictorModels[0].Predict(new SentimentData() { SentimentText = "Not big fan of this." });
+            Assert.True(singlePrediction.Sentiment);
+        }
+    }
+}
diff --git a/test/Microsoft.ML.Tests/Scenarios/PipelineApi/Evaluation.cs b/test/Microsoft.ML.Tests/Scenarios/PipelineApi/Evaluation.cs
@@ -0,0 +1,40 @@
+using Microsoft.ML.Data;
+using Microsoft.ML.Models;
+using Microsoft.ML.Trainers;
+using Microsoft.ML.Transforms;
+using System;
+using System.Collections.Generic;
+using System.Text;
+using Xunit;
+
+namespace Microsoft.ML.Tests.Scenarios.PipelineApi
+{
+    public partial class PipelineApiScenarioTests
+    {
+        /// <summary>
+        /// Evaluation: Similar to the simple train scenario, except instead of having some
+        /// predictive structure, be able to score another "test" data file, run the result
+        /// through an evaluator and get metrics like AUC, accuracy, PR curves, and whatnot.
+        /// Getting metrics out of this should be as straightforward and unannoying as possible.
+        /// </summary>
+        [Fact]
+        public void Evaluation()
+        {
+            var dataPath = GetDataPath(SentimentDataPath);
+            // Evaluate against the dedicated test file instead of re-scoring the training data.
+            var testDataPath = GetDataPath(SentimentTestPath);
+
+            var pipeline = new LearningPipeline();
+            pipeline.Add(new TextLoader(dataPath).CreateFrom<SentimentData>());
+            pipeline.Add(MakeSentimentTextTransform());
+            pipeline.Add(new FastTreeBinaryClassifier() { NumLeaves = 5, NumTrees = 5, MinDocumentsInLeafs = 2 });
+            pipeline.Add(new PredictedLabelColumnOriginalValueConverter() { PredictedLabelColumn = "PredictedLabel" });
+            var model = pipeline.Train<SentimentData, SentimentPrediction>();
+
+            // Run the trained model over the test loader and collect the metrics.
+            var testLearningPipelineItem = new TextLoader(testDataPath).CreateFrom<SentimentData>();
+            var evaluator = new BinaryClassificationEvaluator();
+            var metrics = evaluator.Evaluate(model, testLearningPipelineItem);
+            Assert.NotNull(metrics);
+
+            var singlePrediction = model.Predict(new SentimentData() { SentimentText = "Not big fan of this." });
+            Assert.True(singlePrediction.Sentiment);
+        }
+    }
+}
diff --git a/test/Microsoft.ML.Tests/Scenarios/PipelineApi/Metacomponents.cs b/test/Microsoft.ML.Tests/Scenarios/PipelineApi/Metacomponents.cs
@@ -0,0 +1,42 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Data;
+using Microsoft.ML.Models;
+using Microsoft.ML.Trainers;
+using Microsoft.ML.Transforms;
+using Xunit;
+
+namespace Microsoft.ML.Tests.Scenarios.PipelineApi
+{
+    public partial class PipelineApiScenarioTests
+    {
+        /// <summary>
+        /// Meta-components: Meta-components (e.g., components that themselves instantiate components) should not be booby-trapped.
+        /// When specifying what trainer OVA should use, a user will be able to specify any binary classifier.
+        /// If they specify a regression or multi-class classifier ideally that should be a compile error.
+        /// </summary>
+        [Fact]
+        public void Metacomponents()
+        {
+            var dataPath = GetDataPath(IrisDataPath);
+            var pipeline = new LearningPipeline(seed: 1, conc: 1);
+            pipeline.Add(new TextLoader(dataPath).CreateFrom<IrisData>(useHeader: false));
+            pipeline.Add(new Dictionarizer(new[] { "Label" }));
+            pipeline.Add(new ColumnConcatenator(outputColumn: "Features",
+                "SepalLength", "SepalWidth", "PetalLength", "PetalWidth"));
+
+            // Specifying anything other than a binary classifier here is not caught at compile time;
+            // it throws an exception at training time instead.
+            pipeline.Add(OneVersusAll.With(new StochasticDualCoordinateAscentBinaryClassifier()));
+
+            var model = pipeline.Train<IrisData, IrisPrediction>();
+
+            // Evaluate on the same file that was used for training.
+            var testData = new TextLoader(dataPath).CreateFrom<IrisData>(useHeader: false);
+            var evaluator = new ClassificationEvaluator();
+            ClassificationMetrics metrics = evaluator.Evaluate(model, testData);
+            Assert.NotNull(metrics);
+
+            var prediction = model.Predict(new IrisData { PetalLength = 1, PetalWidth = 2, SepalLength = 1.4f, SepalWidth = 1.6f });
+            Assert.NotNull(prediction);
+        }
+    }
+}
diff --git a/test/Microsoft.ML.Tests/Scenarios/PipelineApi/MultithreadedPrediction.cs b/test/Microsoft.ML.Tests/Scenarios/PipelineApi/MultithreadedPrediction.cs
@@ -0,0 +1,57 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Data;
+using Microsoft.ML.Runtime.Api;
+using Microsoft.ML.Runtime.Internal.Utilities;
+using Microsoft.ML.Trainers;
+using Microsoft.ML.Transforms;
+using System.Collections.Generic;
+using System.Threading;
+using System.Threading.Tasks;
+using Xunit;
+
+namespace Microsoft.ML.Tests.Scenarios.PipelineApi
+{
+    public partial class PipelineApiScenarioTests
+    {
+        /// <summary>
+        /// Multi-threaded prediction. A twist on "Simple train and predict", where we account that
+        /// multiple threads may want predictions at the same time. Because we deliberately do not
+        /// reallocate internal memory buffers on every single prediction, the PredictionEngine
+        /// (or its estimator/transformer based successor) is, like most stateful .NET objects,
+        /// fundamentally not thread safe. This is deliberate and as designed. However, some mechanism
+        /// to enable multi-threaded scenarios (e.g., a web server servicing requests) should be possible
+        /// and performant in the new API.
+        /// </summary>
+        [Fact]
+        public void MultithreadedPrediction()
+        {
+            var dataPath = GetDataPath(SentimentDataPath);
+            var pipeline = new LearningPipeline();
+
+            pipeline.Add(new TextLoader(dataPath).CreateFrom<SentimentData>());
+
+            pipeline.Add(MakeSentimentTextTransform());
+
+            pipeline.Add(new FastTreeBinaryClassifier() { NumLeaves = 5, NumTrees = 5, MinDocumentsInLeafs = 2 });
+
+            pipeline.Add(new PredictedLabelColumnOriginalValueConverter() { PredictedLabelColumn = "PredictedLabel" });
+            var model = pipeline.Train<SentimentData, SentimentPrediction>();
+
+            // Build a batch of identical inputs to be scored from several threads.
+            var collection = new List<SentimentData>();
+            int numExamples = 100;
+            for (int i = 0; i < numExamples; i++)
+                collection.Add(new SentimentData() { SentimentText = "Let's predict this one!" });
+
+            Parallel.ForEach(collection, (input) =>
+            {
+                // Prediction is not thread safe (see summary), so access to the model is serialized.
+                lock (model)
+                {
+                    model.Predict(input);
+                }
+            });
+        }
+    }
+}
diff --git a/test/Microsoft.ML.Tests/Scenarios/PipelineApi/PipelineApiScenarioTests.cs b/test/Microsoft.ML.Tests/Scenarios/PipelineApi/PipelineApiScenarioTests.cs
@@ -0,0 +1,63 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Runtime.Api;
+using Microsoft.ML.TestFramework;
+using Xunit.Abstractions;
+
+namespace Microsoft.ML.Tests.Scenarios.PipelineApi
+{
+    /// <summary>
+    /// Shared part of the <see cref="PipelineApiScenarioTests"/> partial class: declares the
+    /// constructor, the data file names and the input/output data classes used by the scenario tests.
+    /// </summary>
+    public partial class PipelineApiScenarioTests : BaseTestClass
+    {
+        /// <summary>
+        /// Forwards the xUnit output helper to <see cref="BaseTestClass"/>.
+        /// </summary>
+        public PipelineApiScenarioTests(ITestOutputHelper output) : base(output)
+        {
+        }
+
+        // Data file names; the tests resolve them to full paths through GetDataPath.
+        public const string IrisDataPath = "iris.data";
+        public const string SentimentDataPath = "wikipedia-detox-250-line-data.tsv";
+        public const string SentimentTestPath = "wikipedia-detox-250-line-test.tsv";
+
+        /// <summary>
+        /// Iris example with its label: the four measurements of <see cref="IrisDataNoLabel"/> plus a string label.
+        /// </summary>
+        public class IrisData : IrisDataNoLabel
+        {
+            // NOTE(review): the Column argument is presumably the column position in the input file - confirm.
+            [Column("0")]
+            public string Label;
+        }
+
+        /// <summary>
+        /// The four numeric iris measurements, without a label.
+        /// </summary>
+        public class IrisDataNoLabel
+        {
+            [Column("1")]
+            public float SepalLength;
+
+            [Column("2")]
+            public float SepalWidth;
+
+            [Column("3")]
+            public float PetalLength;
+
+            [Column("4")]
+            public float PetalWidth;
+        }
+
+        /// <summary>
+        /// Prediction output type used with <see cref="IrisData"/>; exposes an array of scores.
+        /// </summary>
+        public class IrisPrediction
+        {
+            public float[] Score;
+        }
+
+        /// <summary>
+        /// Sentiment example: a boolean sentiment value and the text it belongs to.
+        /// </summary>
+        public class SentimentData
+        {
+            // The field is called Sentiment, but the attribute gives the column the name "Label".
+            [Column("0", name: "Label")]
+            public bool Sentiment;
+            [Column("1")]
+            public string SentimentText;
+        }
+
+        /// <summary>
+        /// Prediction output type used with <see cref="SentimentData"/>.
+        /// </summary>
+        public class SentimentPrediction
+        {
+            // Bound by name to the "PredictedLabel" column.
+            [ColumnName("PredictedLabel")]
+            public bool Sentiment;
+
+            public float Score;
+        }
+    }
+}
diff --git a/test/Microsoft.ML.Tests/Scenarios/PipelineApi/SimpleTrainAndPredict.cs b/test/Microsoft.ML.Tests/Scenarios/PipelineApi/SimpleTrainAndPredict.cs
@@ -0,0 +1,56 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Data;
+using Microsoft.ML.Models;
+using Microsoft.ML.Runtime;
+using Microsoft.ML.Trainers;
+using Microsoft.ML.Transforms;
+using Xunit;
+
+namespace Microsoft.ML.Tests.Scenarios.PipelineApi
+{
+    public partial class PipelineApiScenarioTests
+    {
+        /// <summary>
+        /// Start with a dataset in a text file. Run text featurization on text values.
+        /// Train a linear model over that. (I am thinking sentiment classification.)
+        /// Out of the result, produce some structure over which you can get predictions programmatically
+        /// (e.g., the prediction does not happen over a file as it did during training).
+        /// </summary>
+        [Fact]
+        public void SimpleTrainAndPredict()
+        {
+            var dataPath = GetDataPath(SentimentDataPath);
+            var pipeline = new LearningPipeline();
+
+            pipeline.Add(new TextLoader(dataPath).CreateFrom<SentimentData>());
+
+            pipeline.Add(MakeSentimentTextTransform());
+
+            pipeline.Add(new FastTreeBinaryClassifier() { NumLeaves = 5, NumTrees = 5, MinDocumentsInLeafs = 2 });
+
+            pipeline.Add(new PredictedLabelColumnOriginalValueConverter() { PredictedLabelColumn = "PredictedLabel" });
+            var model = pipeline.Train<SentimentData, SentimentPrediction>();
+
+            // Predict on an in-memory example rather than on a file.
+            var singlePrediction = model.Predict(new SentimentData() { SentimentText = "Not big fan of this." });
+            Assert.True(singlePrediction.Sentiment);
+        }
+
+        /// <summary>
+        /// Creates the text featurizer used by the sentiment scenario tests. It is constructed with the
+        /// column names "Features" and "SentimentText" and configured with lower-casing, stop-word removal,
+        /// L2 vector normalization, and char (length 3) and word (length 2) n-gram extractors.
+        /// </summary>
+        private static TextFeaturizer MakeSentimentTextTransform()
+        {
+            var featurizer = new TextFeaturizer("Features", "SentimentText");
+
+            // Text clean-up options.
+            featurizer.KeepDiacritics = false;
+            featurizer.KeepPunctuations = false;
+            featurizer.TextCase = TextNormalizerTransformCaseNormalizationMode.Lower;
+            featurizer.OutputTokens = true;
+            featurizer.StopWordsRemover = new PredefinedStopWordsRemover();
+
+            // Normalization and n-gram extraction options.
+            featurizer.VectorNormalizer = TextTransformTextNormKind.L2;
+            featurizer.CharFeatureExtractor = new NGramNgramExtractor() { NgramLength = 3, AllLengths = false };
+            featurizer.WordFeatureExtractor = new NGramNgramExtractor() { NgramLength = 2, AllLengths = true };
+
+            return featurizer;
+        }
+    }
+}
diff --git a/test/Microsoft.ML.Tests/Scenarios/PipelineApi/TrainSaveModelAndPredict.cs b/test/Microsoft.ML.Tests/Scenarios/PipelineApi/TrainSaveModelAndPredict.cs
@@ -0,0 +1,42 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Data;
+using Microsoft.ML.Trainers;
+using Microsoft.ML.Transforms;
+using Xunit;
+
+namespace Microsoft.ML.Tests.Scenarios.PipelineApi
+{
+    public partial class PipelineApiScenarioTests
+    {
+        /// <summary>
+        /// Train, save/load model, predict:
+        /// Serve the scenario where training and prediction happen in different processes (or even different machines).
+        /// The actual test will not run in different processes, but will simulate the idea that the
+        /// "communication pipe" is just a serialized model of some form.
+        /// </summary>
+        /// <returns>A task that completes once the model has been written, read back and used for a prediction.</returns>
+        [Fact]
+        public async System.Threading.Tasks.Task TrainSaveModelAndPredict()
+        {
+            var dataPath = GetDataPath(SentimentDataPath);
+            var pipeline = new LearningPipeline();
+
+            pipeline.Add(new TextLoader(dataPath).CreateFrom<SentimentData>());
+            pipeline.Add(MakeSentimentTextTransform());
+            pipeline.Add(new FastTreeBinaryClassifier() { NumLeaves = 5, NumTrees = 5, MinDocumentsInLeafs = 2 });
+            pipeline.Add(new PredictedLabelColumnOriginalValueConverter() { PredictedLabelColumn = "PredictedLabel" });
+
+            var model = pipeline.Train<SentimentData, SentimentPrediction>();
+
+            // NOTE(review): DeleteOutputPath receives the bare file name, and the same bare name is then used
+            // as the path for WriteAsync/ReadAsync - confirm both refer to the same location.
+            var modelName = "trainSaveAndPredictdModel.zip";
+            DeleteOutputPath(modelName);
+
+            // Round-trip the model through its serialized form and predict with the loaded copy.
+            await model.WriteAsync(modelName);
+            var loadedModel = await PredictionModel.ReadAsync<SentimentData, SentimentPrediction>(modelName);
+            var singlePrediction = loadedModel.Predict(new SentimentData() { SentimentText = "Not big fan of this." });
+            Assert.True(singlePrediction.Sentiment);
+        }
+    }
+}