From 8d8f1b8b085aeae335a39dd27760843aebb43aa7 Mon Sep 17 00:00:00 2001
From: "Harish S. Kulkarni"
Date: Wed, 18 Dec 2019 10:42:17 -0800
Subject: [PATCH 1/5] Added onnx export support for SelectColumns

---
 .../Transforms/ColumnSelecting.cs             |   43 +-
 test/Microsoft.ML.Tests/OnnxConversionTest.cs |   55 +
 .../OnnxConversionTest.cs.orig                | 1560 +++++++++++++++++
 3 files changed, 1652 insertions(+), 6 deletions(-)
 create mode 100644 test/Microsoft.ML.Tests/OnnxConversionTest.cs.orig

diff --git a/src/Microsoft.ML.Data/Transforms/ColumnSelecting.cs b/src/Microsoft.ML.Data/Transforms/ColumnSelecting.cs
index 346025ca94..d2cd9b0e9d 100644
--- a/src/Microsoft.ML.Data/Transforms/ColumnSelecting.cs
+++ b/src/Microsoft.ML.Data/Transforms/ColumnSelecting.cs
@@ -9,6 +9,7 @@
 using Microsoft.ML.CommandLine;
 using Microsoft.ML.Data;
 using Microsoft.ML.Internal.Utilities;
+using Microsoft.ML.Model.OnnxConverter;
 using Microsoft.ML.Runtime;
 using Microsoft.ML.Transforms;
 
@@ -520,7 +521,7 @@ private sealed class Mapper
         {
             private readonly IHost _host;
             private readonly DataViewSchema _inputSchema;
-            private readonly int[] _outputToInputMap;
+            public readonly int[] OutputToInputMap;
 
             public DataViewSchema InputSchema => _inputSchema;
 
@@ -531,17 +532,17 @@ public Mapper(ColumnSelectingTransformer transform, DataViewSchema inputSchema)
                 _host = transform._host.Register(nameof(Mapper));
                 _inputSchema = inputSchema;
 
-                _outputToInputMap = BuildOutputToInputMap(transform.SelectColumns,
+                OutputToInputMap = BuildOutputToInputMap(transform.SelectColumns,
                                                             transform.KeepColumns,
                                                             transform.KeepHidden,
                                                             _inputSchema);
-                OutputSchema = GenerateOutputSchema(_outputToInputMap, _inputSchema);
+                OutputSchema = GenerateOutputSchema(OutputToInputMap, _inputSchema);
             }
 
             public int GetInputIndex(int outputIndex)
             {
-                _host.Assert(0 <= outputIndex && outputIndex < _outputToInputMap.Length);
-                return _outputToInputMap[outputIndex];
+                _host.Assert(0 <= outputIndex && outputIndex < OutputToInputMap.Length);
+                return OutputToInputMap[outputIndex];
             }
 
             private static int[] BuildOutputToInputMap(IEnumerable<string> selectedColumns,
@@ -648,7 +649,7 @@ public override ValueGetter<TValue> GetGetter<TValue>(DataViewSchema.Column colu
             public override bool IsColumnActive(DataViewSchema.Column column) => true;
         }
 
-        private sealed class SelectColumnsDataTransform : IDataTransform, IRowToRowMapper, ITransformTemplate
+        private sealed class SelectColumnsDataTransform : IDataTransform, IRowToRowMapper, ITransformTemplate, ITransformCanSaveOnnx
         {
             private readonly IHost _host;
             private readonly ColumnSelectingTransformer _transform;
@@ -725,6 +726,36 @@ DataViewRow IRowToRowMapper.GetRow(DataViewRow input, IEnumerable<DataViewSchema
 
             IDataTransform ITransformTemplate.ApplyToData(IHostEnvironment env, IDataView newSource)
                 => new SelectColumnsDataTransform(env, _transform, new Mapper(_transform, newSource.Schema), newSource);
+
+            /// <summary>
+            /// Column selection can always be exported, so this unconditionally returns true.
+            /// </summary>
+            public bool CanSaveOnnx(OnnxContext ctx) => true;
+
+            /// <summary>
+            /// Emits one ONNX "Identity" node per exported output column, forwarding the variable of the
+            /// selected input column to a newly registered output variable.
+            /// </summary>
+            public void SaveAsOnnx(OnnxContext ctx)
+            {
+                var outputToInputMap = _mapper.OutputToInputMap;
+                for (int i = 0; i < outputToInputMap.Length; i++)
+                {
+                    var srcCol = InputSchema[outputToInputMap[i]];
+                    var dstCol = OutputSchema[i];
+
+                    // Only forward columns the ONNX context already tracks; asking it for the variable of an
+                    // untracked column would fail the whole export. Hidden outputs are skipped as well since
+                    // they are shadowed by a later column of the same name.
+                    if (!ctx.ContainsColumn(srcCol.Name) || dstCol.IsHidden)
+                        continue;
+
+                    var srcVariable = ctx.GetVariableName(srcCol.Name);
+                    var dstVariable = ctx.AddIntermediateVariable(dstCol.Type, dstCol.Name, true);
+                    string opType = "Identity";
+                    ctx.CreateNode(opType, srcVariable, dstVariable, ctx.GetNodeName(opType), "");
+                }
+            }
         }
 
         private sealed class Cursor : SynchronizedCursorBase
diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs
index 0ab47d8a27..5bd0db013a 100644
--- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs
+++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs
@@ -1307,6 +1307,61 @@ public void FeatureSelectionOnnxTest()
 
         }
 
+
+        [Fact]
+        public void SelectColumnsOnnxTest()
+        {
+            var mlContext = new MLContext(seed: 1);
+
+            string dataPath = GetDataPath("breast-cancer.txt");
+
+            var dataView = mlContext.Data.LoadFromTextFile(dataPath, new[] {
+                new TextLoader.Column("Label", DataKind.Boolean, 0),
+                new TextLoader.Column("Thickness", DataKind.Int32, 1),
+                new TextLoader.Column("Size", DataKind.Int32, 2),
+                new TextLoader.Column("Shape", DataKind.Int32, 3),
+                new TextLoader.Column("Adhesion", DataKind.Int32, 4),
+                new TextLoader.Column("EpithelialSize", DataKind.Int32, 5),
+                new TextLoader.Column("BareNuclei", DataKind.Single, 6),
+                new TextLoader.Column("BlandChromatin", DataKind.Int32, 7),
+                new TextLoader.Column("NormalNucleoli", DataKind.Int32, 8),
+                new TextLoader.Column("Mitoses", DataKind.Int32, 9),
+            });
+
+            var pipeline = mlContext.Transforms.ReplaceMissingValues("BareNuclei")
+                .Append(mlContext.Transforms.SelectColumns(new[] { "Size", "Shape", "Thickness", "Label" }));
+
+            var model = pipeline.Fit(dataView);
+            var transformedData = model.Transform(dataView);
+            var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView);
+
+            var onnxFileName = "selectcolumns.onnx";
+            var onnxModelPath = GetOutputPath(onnxFileName);
+
+            SaveOnnxModel(onnxModel, onnxModelPath, null);
+
+            if (IsOnnxRuntimeSupported())
+            {
+                // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline.
+                string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray();
+                string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray();
+                var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath);
+                var onnxTransformer = onnxEstimator.Fit(dataView);
+                var onnxResult = onnxTransformer.Transform(dataView);
+
+                Assert.Equal("Size1", outputNames[0]);
+                Assert.Equal("Shape1", outputNames[1]);
+                Assert.Equal("Thickness1", outputNames[2]);
+                Assert.Equal("Label1", outputNames[3]);
+
+                CompareSelectedScalarColumns("Size", "Size1", transformedData, onnxResult);
+                CompareSelectedScalarColumns("Shape", "Shape1", transformedData, onnxResult);
+                CompareSelectedScalarColumns("Thickness", "Thickness1", transformedData, onnxResult);
+                CompareSelectedScalarColumns("Label", "Label1", transformedData, onnxResult);
+            }
+            Done();
+        }
+
         private void CompareResults(string leftColumnName, string rightColumnName, IDataView left, IDataView right)
         {
             var leftColumn = left.Schema[leftColumnName];
diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs.orig b/test/Microsoft.ML.Tests/OnnxConversionTest.cs.orig
new file mode 100644
index 0000000000..b944d047e6
--- /dev/null
+++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs.orig
@@ -0,0 +1,1560 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Runtime.InteropServices; +using System.Text.RegularExpressions; +using Google.Protobuf; +using Microsoft.ML.Data; +using Microsoft.ML.EntryPoints; +using Microsoft.ML.Model.OnnxConverter; +using Microsoft.ML.RunTests; +using Microsoft.ML.Runtime; +using Microsoft.ML.TestFramework.Attributes; +using Microsoft.ML.TestFrameworkCommon; +using Microsoft.ML.TestFrameworkCommon.Attributes; +using Microsoft.ML.Tools; +using Microsoft.ML.Trainers; +using Microsoft.ML.Trainers.LightGbm; +using Microsoft.ML.Transforms; +using Microsoft.ML.Transforms.Onnx; +using Microsoft.ML.Transforms.Text; +using Newtonsoft.Json; +using Xunit; +using Xunit.Abstractions; +using static Microsoft.ML.Model.OnnxConverter.OnnxCSharpToProtoWrapper; + +#pragma warning disable CS0649 // Field 'fieldName' is never assigned to, and will always have its default value null + +namespace Microsoft.ML.Tests +{ + public class OnnxConversionTest : BaseTestBaseline + { + private class AdultData + { + [LoadColumn(0, 10), ColumnName("FeatureVector")] + public float Features { get; set; } + + [LoadColumn(11)] + public float Target { get; set; } + } + + public OnnxConversionTest(ITestOutputHelper output) : base(output) + { + } + + private bool IsOnnxRuntimeSupported() + { + return OnnxFactAttribute.IsOnnxRuntimeSupported; + } + + /// + /// In this test, we convert a trained into ONNX file and then + /// call to evaluate that file. The outputs of are checked against the original + /// ML.NET model's outputs. + /// + [Fact] + public void SimpleEndToEndOnnxConversionTest() + { + // Step 1: Create and train a ML.NET pipeline. 
+ var trainDataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); + var mlContext = new MLContext(seed: 1); + var data = mlContext.Data.LoadFromTextFile(trainDataPath, + separatorChar: ';' +, + hasHeader: true); + var cachedTrainData = mlContext.Data.Cache(data); + var dynamicPipeline = + mlContext.Transforms.NormalizeMinMax("FeatureVector") + .AppendCacheCheckpoint(mlContext) + .Append(mlContext.Regression.Trainers.Sdca(new SdcaRegressionTrainer.Options() { + LabelColumnName = "Target", + FeatureColumnName = "FeatureVector", + NumberOfThreads = 1 + })); + var model = dynamicPipeline.Fit(data); + var transformedData = model.Transform(data); + + // Step 2: Convert ML.NET model to ONNX format and save it as a file. + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); + var onnxFileName = "model.onnx"; + var onnxModelPath = GetOutputPath(onnxFileName); + SaveOnnxModel(onnxModel, onnxModelPath, null); + + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && Environment.Is64BitProcess) + { + // Step 3: Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. + string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(data); + var onnxResult = onnxTransformer.Transform(data); + + // Step 4: Compare ONNX and ML.NET results. + CompareSelectedR4ScalarColumns("Score", "Score0", transformedData, onnxResult, 1); + } + + // Step 5: Check ONNX model's text format. This test will be not necessary if Step 3 and Step 4 can run on Linux and + // Mac to support cross-platform tests. 
+ var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "Regression", "Adult"); + var onnxTextName = "SimplePipeline.txt"; + var onnxTextPath = GetOutputPath(subDir, onnxTextName); + SaveOnnxModel(onnxModel, null, onnxTextPath); + CheckEquality(subDir, onnxTextName, digitsOfPrecision: 3); + + Done(); + } + + private class BreastCancerFeatureVector + { + [LoadColumn(1, 9), VectorType(9)] + public float[] Features; + } + + private class BreastCancerCatFeatureExample + { + [LoadColumn(0)] + public bool Label; + + [LoadColumn(1)] + public float F1; + + [LoadColumn(2)] + public string F2; + } + + private class BreastCancerMulticlassExample + { + [LoadColumn(1)] + public string Label; + + [LoadColumn(2, 9), VectorType(8)] + public float[] Features; + } + + private class BreastCancerBinaryClassification + { + [LoadColumn(0)] + public bool Label; + + [LoadColumn(2, 9), VectorType(8)] + public float[] Features; + } + + [LessThanNetCore30OrNotNetCoreFact("netcoreapp3.0 output differs from Baseline. Tracked by https://github.com/dotnet/machinelearning/issues/2087")] + public void KmeansOnnxConversionTest() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + var mlContext = new MLContext(seed: 1); + + string dataPath = GetDataPath("breast-cancer.txt"); + // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). + var data = mlContext.Data.LoadFromTextFile(dataPath, + separatorChar: '\t', + hasHeader: true); + + var pipeline = mlContext.Transforms.NormalizeMinMax("Features"). 
+ Append(mlContext.Clustering.Trainers.KMeans(new Trainers.KMeansTrainer.Options + { + FeatureColumnName = DefaultColumnNames.Features, + MaximumNumberOfIterations = 1, + NumberOfClusters = 4, + NumberOfThreads = 1, + InitializationAlgorithm = Trainers.KMeansTrainer.InitializationAlgorithm.Random + })); + + var model = pipeline.Fit(data); + var transformedData = model.Transform(data); + + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); + + // Compare results produced by ML.NET and ONNX's runtime. + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && Environment.Is64BitProcess) + { + var onnxFileName = "model.onnx"; + var onnxModelPath = GetOutputPath(onnxFileName); + SaveOnnxModel(onnxModel, onnxModelPath, null); + + // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. + string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(data); + var onnxResult = onnxTransformer.Transform(data); + CompareSelectedR4VectorColumns("Score", "Score0", transformedData, onnxResult, 3); + } + + // Check ONNX model's text format. We save the produced ONNX model as a text file and compare it against + // the associated file in ML.NET repo. Such a comparison can be retired if ONNXRuntime ported to ML.NET + // can support Linux and Mac. 
+ var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "Cluster", "BreastCancer"); + var onnxTextName = "Kmeans.txt"; + var onnxTextPath = GetOutputPath(subDir, onnxTextName); + SaveOnnxModel(onnxModel, null, onnxTextPath); + CheckEquality(subDir, onnxTextName, digitsOfPrecision: 2); + Done(); + } + + [Fact] + public void RegressionTrainersOnnxConversionTest() + { + var mlContext = new MLContext(seed: 1); + string dataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); + + // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). + var dataView = mlContext.Data.LoadFromTextFile(dataPath, + separatorChar: ';', + hasHeader: true); + List> estimators = new List>() + { + mlContext.Regression.Trainers.Sdca("Target","FeatureVector"), + mlContext.Regression.Trainers.Ols("Target","FeatureVector"), + mlContext.Regression.Trainers.OnlineGradientDescent("Target","FeatureVector"), + mlContext.Regression.Trainers.FastForest("Target", "FeatureVector"), + mlContext.Regression.Trainers.FastTree("Target", "FeatureVector"), + mlContext.Regression.Trainers.FastTreeTweedie("Target", "FeatureVector"), + mlContext.Regression.Trainers.LbfgsPoissonRegression("Target", "FeatureVector"), + }; + if (Environment.Is64BitProcess) + { + estimators.Add(mlContext.Regression.Trainers.LightGbm("Target", "FeatureVector")); + } + foreach (var estimator in estimators) + { + var model = estimator.Fit(dataView); + var transformedData = model.Transform(dataView); + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); + // Compare model scores produced by ML.NET and ONNX's runtime + if (IsOnnxRuntimeSupported()) + { + var onnxFileName = $"{estimator.ToString()}.onnx"; + var onnxModelPath = GetOutputPath(onnxFileName); + SaveOnnxModel(onnxModel, onnxModelPath, null); + // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. 
+ string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(dataView); + var onnxResult = onnxTransformer.Transform(dataView); + CompareSelectedR4ScalarColumns(transformedData.Schema[2].Name, outputNames[2], transformedData, onnxResult, 3); // compare score results + } + // Compare the Onnx graph to a baseline if OnnxRuntime is not supported + else + { + var onnxFileName = $"{estimator.ToString()}.txt"; + var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "Regression", "Adult"); + var onnxTextModelPath = GetOutputPath(subDir, onnxFileName); + SaveOnnxModel(onnxModel, null, onnxTextModelPath); + CheckEquality(subDir, onnxFileName, digitsOfPrecision: 1); + } + } + Done(); + } + + [Fact] + public void BinaryClassificationTrainersOnnxConversionTest() + { + var mlContext = new MLContext(seed: 1); + string dataPath = GetDataPath("breast-cancer.txt"); + // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). 
+ var dataView = mlContext.Data.LoadFromTextFile(dataPath, separatorChar: '\t', hasHeader: true); + List> estimators = new List>() + { + mlContext.BinaryClassification.Trainers.AveragedPerceptron(), + mlContext.BinaryClassification.Trainers.FastForest(), + mlContext.BinaryClassification.Trainers.FastTree(), + mlContext.BinaryClassification.Trainers.LbfgsLogisticRegression(), + mlContext.BinaryClassification.Trainers.LinearSvm(), + mlContext.BinaryClassification.Trainers.Prior(), + mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(), + mlContext.BinaryClassification.Trainers.SdcaNonCalibrated(), + mlContext.BinaryClassification.Trainers.SgdCalibrated(), + mlContext.BinaryClassification.Trainers.SgdNonCalibrated(), + mlContext.BinaryClassification.Trainers.SymbolicSgdLogisticRegression(), + }; + if (Environment.Is64BitProcess) + { + estimators.Add(mlContext.BinaryClassification.Trainers.LightGbm()); + } + + var initialPipeline = mlContext.Transforms.ReplaceMissingValues("Features"). + Append(mlContext.Transforms.NormalizeMinMax("Features")); + foreach (var estimator in estimators) + { + var pipeline = initialPipeline.Append(estimator); + var model = pipeline.Fit(dataView); + var transformedData = model.Transform(dataView); + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); + // Compare model scores produced by ML.NET and ONNX's runtime. + if (IsOnnxRuntimeSupported()) + { + var onnxFileName = $"{estimator.ToString()}.onnx"; + var onnxModelPath = GetOutputPath(onnxFileName); + SaveOnnxModel(onnxModel, onnxModelPath, null); + // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. 
+ string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(dataView); + var onnxResult = onnxTransformer.Transform(dataView); + CompareSelectedR4ScalarColumns(transformedData.Schema[5].Name, outputNames[3], transformedData, onnxResult, 3); //compare scores + CompareSelectedScalarColumns(transformedData.Schema[4].Name, outputNames[2], transformedData, onnxResult); //compare predicted labels + } + } + Done(); + } + + private class DataPoint + { + [VectorType(3)] + public float[] Features { get; set; } + } + + [Theory] + [CombinatorialData] + public void LpNormOnnxConversionTest( + bool ensureZeroMean, + LpNormNormalizingEstimatorBase.NormFunction norm) + { + var mlContext = new MLContext(seed: 1); + + var samples = new List() + { + new DataPoint() { Features = new float[3] {0.01f, 0.02f, 0.03f} }, + new DataPoint() { Features = new float[3] {0.04f, 0.05f, 0.06f} }, + new DataPoint() { Features = new float[3] {0.07f, 0.08f, 0.09f} }, + new DataPoint() { Features = new float[3] {0.10f, 0.11f, 0.12f} }, + new DataPoint() { Features = new float[3] {0.13f, 0.14f, 0.15f} } + }; + var dataView = mlContext.Data.LoadFromEnumerable(samples); + + var pipe = mlContext.Transforms.NormalizeLpNorm(nameof(DataPoint.Features), norm:norm, ensureZeroMean: ensureZeroMean); + + var model = pipe.Fit(dataView); + var transformedData = model.Transform(dataView); + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); + + var onnxFileName = $"LpNorm-{norm.ToString()}-{ensureZeroMean}.onnx"; + var onnxModelPath = GetOutputPath(onnxFileName); + + SaveOnnxModel(onnxModel, onnxModelPath, null); + + // Compare results produced by ML.NET and ONNX's runtime. 
+ if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && Environment.Is64BitProcess) + { + // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. + string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(dataView); + var onnxResult = onnxTransformer.Transform(dataView); + CompareSelectedR4VectorColumns(nameof(DataPoint.Features), outputNames[0], transformedData, onnxResult, 3); + } + + Done(); + } + + [Fact] + public void CommandLineOnnxConversionTest() + { + string dataPath = GetDataPath("breast-cancer.txt"); + string modelPath = GetOutputPath("ModelWithLessIO.zip"); + var trainingPathArgs = $"data={dataPath} out={modelPath}"; + var trainingArgs = " loader=text{col=Label:BL:0 col=F1:R4:1-8 col=F2:TX:9} xf=Cat{col=F2} xf=Concat{col=Features:F1,F2} tr=ft{numberOfThreads=1 numberOfLeaves=8 numberOfTrees=3} seed=1"; + Assert.Equal(0, Maml.Main(new[] { "train " + trainingPathArgs + trainingArgs })); + + var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "BinaryClassification", "BreastCancer"); + var onnxTextName = "ModelWithLessIO.txt"; + var onnxFileName = "ModelWithLessIO.onnx"; + var onnxTextPath = GetOutputPath(subDir, onnxTextName); + var onnxFilePath = GetOutputPath(subDir, onnxFileName); + string conversionCommand = $"saveonnx in={modelPath} onnx={onnxFilePath} json={onnxTextPath} domain=machinelearning.dotnet name=modelWithLessIO inputsToDrop=Label outputsToDrop=F1,F2,Features,Label"; + Assert.Equal(0, Maml.Main(new[] { conversionCommand })); + + var fileText = File.ReadAllText(onnxTextPath); + fileText = Regex.Replace(fileText, "\"producerVersion\": \".*\"", "\"producerVersion\": \"##VERSION##\""); + 
File.WriteAllText(onnxTextPath, fileText); + + CheckEquality(subDir, onnxTextName); + Done(); + } + + [Fact] + public void KeyToVectorWithBagOnnxConversionTest() + { + var mlContext = new MLContext(seed: 1); + + string dataPath = GetDataPath("breast-cancer.txt"); + + var data = mlContext.Data.LoadFromTextFile(dataPath, + separatorChar: '\t', + hasHeader: true); + + var pipeline = mlContext.Transforms.Categorical.OneHotEncoding("F2", "F2", Transforms.OneHotEncodingEstimator.OutputKind.Bag) + .Append(mlContext.Transforms.ReplaceMissingValues(new MissingValueReplacingEstimator.ColumnOptions("F2"))) + .Append(mlContext.Transforms.Concatenate("Features", "F1", "F2")) + .Append(mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features", numberOfLeaves: 2, numberOfTrees: 1, minimumExampleCountPerLeaf: 2)); + + var model = pipeline.Fit(data); + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); + + // Check ONNX model's text format. We save the produced ONNX model as a text file and compare it against + // the associated file in ML.NET repo. Such a comparison can be retired if ONNXRuntime ported to ML.NET + // can support Linux and Mac. 
+ var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "BinaryClassification", "BreastCancer"); + var onnxTextName = "OneHotBagPipeline.txt"; + var onnxFileName = "OneHotBagPipeline.onnx"; + var onnxTextPath = GetOutputPath(subDir, onnxTextName); + var onnxFilePath = GetOutputPath(subDir, onnxFileName); + SaveOnnxModel(onnxModel, onnxFilePath, onnxTextPath); + CheckEquality(subDir, onnxTextName); + Done(); + } + + [Fact] + public void InitializerCreationTest() + { + var env = new MLContext(); + // Create the actual implementation + var ctxImpl = new OnnxContextImpl(env, "model", "ML.NET", "0", 0, "com.test", Model.OnnxConverter.OnnxVersion.Stable); + + // Use implementation as in the actual conversion code + var ctx = ctxImpl as OnnxContext; + ctx.AddInitializer(9.4f, "float"); + ctx.AddInitializer(17L, "int64"); + ctx.AddInitializer("36", "string"); + ctx.AddInitializer(new List { 9.4f, 1.7f, 3.6f }, new List { 1, 3 }, "floats"); + ctx.AddInitializer(new List { 94L, 17L, 36L }, new List { 1, 3 }, "int64s"); + ctx.AddInitializer(new List { "94", "17", "36" }, new List { 1, 3 }, "strings"); + + var model = ctxImpl.MakeModel(); + + var floatScalar = model.Graph.Initializer[0]; + Assert.True(floatScalar.Name == "float"); + Assert.True(floatScalar.Dims.Count == 0); + Assert.True(floatScalar.FloatData.Count == 1); + Assert.True(floatScalar.FloatData[0] == 9.4f); + + var int64Scalar = model.Graph.Initializer[1]; + Assert.True(int64Scalar.Name == "int64"); + Assert.True(int64Scalar.Dims.Count == 0); + Assert.True(int64Scalar.Int64Data.Count == 1); + Assert.True(int64Scalar.Int64Data[0] == 17L); + + var stringScalar = model.Graph.Initializer[2]; + Assert.True(stringScalar.Name == "string"); + Assert.True(stringScalar.Dims.Count == 0); + Assert.True(stringScalar.StringData.Count == 1); + Assert.True(stringScalar.StringData[0].ToStringUtf8() == "36"); + + var floatsTensor = model.Graph.Initializer[3]; + Assert.True(floatsTensor.Name == "floats"); + 
Assert.True(floatsTensor.Dims.Count == 2); + Assert.True(floatsTensor.Dims[0] == 1); + Assert.True(floatsTensor.Dims[1] == 3); + Assert.True(floatsTensor.FloatData.Count == 3); + Assert.True(floatsTensor.FloatData[0] == 9.4f); + Assert.True(floatsTensor.FloatData[1] == 1.7f); + Assert.True(floatsTensor.FloatData[2] == 3.6f); + + var int64sTensor = model.Graph.Initializer[4]; + Assert.True(int64sTensor.Name == "int64s"); + Assert.True(int64sTensor.Dims.Count == 2); + Assert.True(int64sTensor.Dims[0] == 1); + Assert.True(int64sTensor.Dims[1] == 3); + Assert.True(int64sTensor.Int64Data.Count == 3); + Assert.True(int64sTensor.Int64Data[0] == 94L); + Assert.True(int64sTensor.Int64Data[1] == 17L); + Assert.True(int64sTensor.Int64Data[2] == 36L); + + var stringsTensor = model.Graph.Initializer[5]; + Assert.True(stringsTensor.Name == "strings"); + Assert.True(stringsTensor.Dims.Count == 2); + Assert.True(stringsTensor.Dims[0] == 1); + Assert.True(stringsTensor.Dims[1] == 3); + Assert.True(stringsTensor.StringData.Count == 3); + Assert.True(stringsTensor.StringData[0].ToStringUtf8() == "94"); + Assert.True(stringsTensor.StringData[1].ToStringUtf8() == "17"); + Assert.True(stringsTensor.StringData[2].ToStringUtf8() == "36"); + } + + [Fact] + public void LogisticRegressionOnnxConversionTest() + { + // Step 1: Create and train a ML.NET pipeline. 
+ var trainDataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); + var mlContext = new MLContext(seed: 1); + var data = mlContext.Data.LoadFromTextFile(trainDataPath, + separatorChar: ';' +, + hasHeader: true); + var cachedTrainData = mlContext.Data.Cache(data); + var dynamicPipeline = + mlContext.Transforms.NormalizeMinMax("FeatureVector") + .AppendCacheCheckpoint(mlContext) + .Append(mlContext.Regression.Trainers.Sdca(new SdcaRegressionTrainer.Options() { + LabelColumnName = "Target", + FeatureColumnName = "FeatureVector", + NumberOfThreads = 1 + })); + var model = dynamicPipeline.Fit(data); + + // Step 2: Convert ML.NET model to ONNX format and save it as a file. + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); + + // Step 3: Save ONNX model as binary and text files. + var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "BinaryClassification", "BreastCancer"); + var onnxFileName = "LogisticRegressionSaveModelToOnnxTest.onnx"; + var onnxFilePath = GetOutputPath(subDir, onnxFileName); + var onnxTextName = "LogisticRegressionSaveModelToOnnxTest.txt"; + var onnxTextPath = GetOutputPath(subDir, onnxTextName); + SaveOnnxModel(onnxModel, onnxFilePath, onnxTextPath); + + // Step 4: Check ONNX model's text format. + CheckEquality(subDir, onnxTextName, digitsOfPrecision: 3); + Done(); + } + + [LightGBMFact] + public void LightGbmBinaryClassificationOnnxConversionTest() + { + // Step 1: Create and train a ML.NET pipeline. 
+ var trainDataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); + var mlContext = new MLContext(seed: 1); + var data = mlContext.Data.LoadFromTextFile(trainDataPath, + separatorChar: ';' +, + hasHeader: true); + var cachedTrainData = mlContext.Data.Cache(data); + var dynamicPipeline = + mlContext.Transforms.NormalizeMinMax("FeatureVector") + .AppendCacheCheckpoint(mlContext) + .Append(mlContext.Regression.Trainers.LightGbm(labelColumnName: "Target", featureColumnName: "FeatureVector", numberOfIterations: 3, numberOfLeaves: 16, minimumExampleCountPerLeaf: 100)); + var model = dynamicPipeline.Fit(data); + + // Step 2: Convert ML.NET model to ONNX format and save it as a file. + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); + + // Step 3: Save ONNX model as binary and text files. + var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "BinaryClassification", "BreastCancer"); + var onnxFileName = "LightGbmBinaryClassificationOnnxConversionTest.onnx"; + var onnxFilePath = GetOutputPath(subDir, onnxFileName); + var onnxTextName = "LightGbmBinaryClassificationOnnxConversionTest.txt"; + var onnxTextPath = GetOutputPath(subDir, onnxTextName); + SaveOnnxModel(onnxModel, onnxFilePath, onnxTextPath); + + // Step 4: Check ONNX model's text format. + CheckEquality(subDir, onnxTextName, digitsOfPrecision: 3); + Done(); + } + + [Fact] + public void MulticlassLogisticRegressionOnnxConversionTest() + { + var mlContext = new MLContext(seed: 1); + + string dataPath = GetDataPath("breast-cancer.txt"); + var data = mlContext.Data.LoadFromTextFile(dataPath, + separatorChar: '\t', + hasHeader: true); + + var pipeline = mlContext.Transforms.NormalizeMinMax("Features"). + Append(mlContext.Transforms.Conversion.MapValueToKey("Label")). 
+ Append(mlContext.MulticlassClassification.Trainers.LbfgsMaximumEntropy(new LbfgsMaximumEntropyMulticlassTrainer.Options() { NumberOfThreads = 1 })); + + var model = pipeline.Fit(data); + var transformedData = model.Transform(data); + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); + + var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "MultiClassClassification", "BreastCancer"); + var onnxFileName = "MultiClassificationLogisticRegressionSaveModelToOnnxTest.onnx"; + var onnxFilePath = GetOutputPath(subDir, onnxFileName); + var onnxTextName = "MultiClassificationLogisticRegressionSaveModelToOnnxTest.txt"; + var onnxTextPath = GetOutputPath(subDir, onnxTextName); + + SaveOnnxModel(onnxModel, onnxFilePath, onnxTextPath); + + CheckEquality(subDir, onnxTextName, digitsOfPrecision: 2); + Done(); + } + + [Fact] + public void LoadingPredictorModelAndOnnxConversionTest() + { + string dataPath = GetDataPath("iris.txt"); + string modelPath = Path.GetTempPath() + Guid.NewGuid().ToString() + ".model.bin"; + string onnxPath = Path.GetTempPath() + Guid.NewGuid().ToString() + ".model.onnx"; + string onnxJsonPath = Path.GetTempPath() + Guid.NewGuid().ToString() + ".model.onnx.json"; + + string inputGraph = string.Format(@" + {{ + 'Inputs': {{ + 'inputFile': '{0}' + }}, + 'Nodes': [ + {{ + 'Name': 'Data.TextLoader', + 'Inputs': + {{ + 'InputFile': '$inputFile', + 'Arguments': + {{ + 'UseThreads': true, + 'HeaderFile': null, + 'MaxRows': null, + 'AllowQuoting': true, + 'AllowSparse': true, + 'InputSize': null, + 'TrimWhitespace': false, + 'HasHeader': false, + 'Column': + [ + {{'Name':'Sepal_Width','Type':null,'Source':[{{'Min':2,'Max':2,'AutoEnd':false,'VariableEnd':false,'AllOther':false,'ForceVector':false}}],'KeyCount':null}}, + {{'Name':'Petal_Length','Type':null,'Source':[{{'Min':3,'Max':4,'AutoEnd':false,'VariableEnd':false,'AllOther':false,'ForceVector':false}}],'KeyCount':null}}, + ] + }} + }}, + 'Outputs': + {{ + 'Data': 
'$training_data' + }} + }}, + {{ + 'Inputs': {{ + 'FeatureColumnName': 'Petal_Length', + 'LabelColumnName': 'Sepal_Width', + 'TrainingData': '$training_data', + }}, + 'Name': 'Trainers.StochasticDualCoordinateAscentRegressor', + 'Outputs': {{ + 'PredictorModel': '$output_model' + }} + }} + ], + 'Outputs': {{ + 'output_model': '{1}' + }} + }}", dataPath.Replace("\\", "\\\\"), modelPath.Replace("\\", "\\\\")); + + // Write entry point graph into file so that it can be invoke by graph runner below. + var jsonPath = DeleteOutputPath("graph.json"); + File.WriteAllLines(jsonPath, new[] { inputGraph }); + + // Execute the saved entry point graph to produce a predictive model. + var args = new ExecuteGraphCommand.Arguments() { GraphPath = jsonPath }; + var cmd = new ExecuteGraphCommand(Env, args); + cmd.Run(); + + // Make entry point graph to conduct ONNX conversion. + inputGraph = string.Format(@" + {{ + 'Inputs': {{ + 'model': '{0}' + }}, + 'Nodes': [ + {{ + 'Inputs': {{ + 'Domain': 'com.microsoft.models', + 'Json': '{1}', + 'PredictiveModel': '$model', + 'Onnx': '{2}', + 'OnnxVersion': 'Experimental' + }}, + 'Name': 'Models.OnnxConverter', + 'Outputs': {{}} + }} + ], + 'Outputs': {{}} + }} + ", modelPath.Replace("\\", "\\\\"), onnxJsonPath.Replace("\\", "\\\\"), onnxPath.Replace("\\", "\\\\")); + + // Write entry point graph for ONNX conversion into file so that it can be invoke by graph runner below. + jsonPath = DeleteOutputPath("graph.json"); + File.WriteAllLines(jsonPath, new[] { inputGraph }); + + // Onnx converter's assembly is not loaded by default, so we need to register it before calling it. + Env.ComponentCatalog.RegisterAssembly(typeof(OnnxExportExtensions).Assembly); + + // Execute the saved entry point graph to convert the saved model to ONNX format. 
+ args = new ExecuteGraphCommand.Arguments() { GraphPath = jsonPath }; + cmd = new ExecuteGraphCommand(Env, args); + cmd.Run(); + + // Load the resulted ONNX model from the file so that we can check if the conversion looks good. + var model = new OnnxCSharpToProtoWrapper.ModelProto(); + using (var modelStream = File.OpenRead(onnxPath)) + model = OnnxCSharpToProtoWrapper.ModelProto.Parser.ParseFrom(modelStream); + + // Make sure a PredictorModel is loaded by seeing if a predictive model exists. In this the + // predictive model is "LinearRegressor" (converted from StochasticDualCoordinateAscentRegressor + // in the original training entry-point graph. + Assert.Equal("Scaler", model.Graph.Node[0].OpType); + Assert.Equal("LinearRegressor", model.Graph.Node[1].OpType); + + File.Delete(modelPath); + File.Delete(onnxPath); + File.Delete(onnxJsonPath); + + Done(); + } + + + [Fact] + public void RemoveVariablesInPipelineTest() + { + var mlContext = new MLContext(seed: 1); + + string dataPath = GetDataPath("breast-cancer.txt"); + var data = mlContext.Data.LoadFromTextFile(dataPath, + separatorChar: '\t', + hasHeader: true); + + var pipeline = mlContext.Transforms.Categorical.OneHotEncoding("F2", "F2", Transforms.OneHotEncodingEstimator.OutputKind.Bag) + .Append(mlContext.Transforms.ReplaceMissingValues(new MissingValueReplacingEstimator.ColumnOptions("F2"))) + .Append(mlContext.Transforms.Concatenate("Features", "F1", "F2")) + .Append(mlContext.Transforms.NormalizeMinMax("Features")) + .Append(mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features", numberOfLeaves: 2, numberOfTrees: 1, minimumExampleCountPerLeaf: 2)); + + var model = pipeline.Fit(data); + var transformedData = model.Transform(data); + + var onnxConversionContext = new OnnxContextImpl(mlContext, "A Simple Pipeline", "ML.NET", "0", 0, "machinelearning.dotnet", OnnxVersion.Stable); + + LinkedList transforms = null; + using (var conversionChannel = (mlContext as 
IChannelProvider).Start("ONNX conversion")) + { + SaveOnnxCommand.GetPipe(onnxConversionContext, conversionChannel, transformedData, out IDataView root, out IDataView sink, out transforms); + // Input columns' names to be excluded in the resulted ONNX model. + var redundantInputColumnNames = new HashSet { "Label" }; + // Output columns' names to be excluded in the resulted ONNX model. + var redundantOutputColumnNames = new HashSet { "Label", "F1", "F2", "Features" }; + var onnxModel = SaveOnnxCommand.ConvertTransformListToOnnxModel(onnxConversionContext, conversionChannel, root, sink, transforms, + redundantInputColumnNames, redundantOutputColumnNames); + + // Check ONNX model's text format. We save the produced ONNX model as a text file and compare it against + // the associated file in ML.NET repo. Such a comparison can be retired if ONNXRuntime ported to ML.NET + // can support Linux and Mac. + var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "BinaryClassification", "BreastCancer"); + var onnxTextName = "ExcludeVariablesInOnnxConversion.txt"; + var onnxFileName = "ExcludeVariablesInOnnxConversion.onnx"; + var onnxTextPath = GetOutputPath(subDir, onnxTextName); + var onnxFilePath = GetOutputPath(subDir, onnxFileName); + SaveOnnxModel(onnxModel, onnxFilePath, onnxTextPath); + CheckEquality(subDir, onnxTextName, digitsOfPrecision: 3); + } + Done(); + } + + private class SmallSentimentExample + { + [LoadColumn(0, 3), VectorType(4)] + public string[] Tokens; + } + + [Fact] + public void WordEmbeddingsTest() + { + var mlContext = new MLContext(seed: 1); + var dataPath = GetDataPath(@"small-sentiment-test.tsv"); + var embedNetworkPath = GetDataPath(@"shortsentiment.emd"); + var data = mlContext.Data.LoadFromTextFile(dataPath, separatorChar: '\t', hasHeader: false); + + var pipeline = mlContext.Transforms.Text.ApplyWordEmbedding("Embed", embedNetworkPath, "Tokens"); + var model = pipeline.Fit(data); + var transformedData = model.Transform(data); 
+ + var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "Transforms", "Sentiment"); + var onnxTextName = "SmallWordEmbed.txt"; + var onnxFileName = "SmallWordEmbed.onnx"; + var onnxTextPath = GetOutputPath(subDir, onnxTextName); + var onnxFilePath = GetOutputPath(subDir, onnxFileName); + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); + SaveOnnxModel(onnxModel, onnxFilePath, onnxTextPath); + + CheckEquality(subDir, onnxTextName, parseOption: NumberParseOption.UseSingle); + Done(); + } + + [Theory] + // These are the supported conversions + // ML.NET does not allow any conversions between signed and unsigned numeric types + // Onnx does not seem to support casting a string to any type + // Though the onnx docs claim support for byte and sbyte, + // CreateNamedOnnxValue in OnnxUtils.cs throws a NotImplementedException for those two + [InlineData(DataKind.Int16, DataKind.Int16)] + [InlineData(DataKind.Int16, DataKind.Int32)] + [InlineData(DataKind.Int16, DataKind.Int64)] + [InlineData(DataKind.Int16, DataKind.Single)] + [InlineData(DataKind.Int16, DataKind.Double)] + [InlineData(DataKind.UInt16, DataKind.UInt16)] + [InlineData(DataKind.UInt16, DataKind.UInt32)] + [InlineData(DataKind.UInt16, DataKind.UInt64)] + [InlineData(DataKind.UInt16, DataKind.Single)] + [InlineData(DataKind.UInt16, DataKind.Double)] + [InlineData(DataKind.Int32, DataKind.Int16)] + [InlineData(DataKind.Int32, DataKind.Int32)] + [InlineData(DataKind.Int32, DataKind.Int64)] + [InlineData(DataKind.Int32, DataKind.Single)] + [InlineData(DataKind.Int32, DataKind.Double)] + [InlineData(DataKind.Int64, DataKind.Int16)] + [InlineData(DataKind.Int64, DataKind.Int32)] + [InlineData(DataKind.Int64, DataKind.Int64)] + [InlineData(DataKind.Int64, DataKind.Single)] + [InlineData(DataKind.Int64, DataKind.Double)] + [InlineData(DataKind.UInt64, DataKind.UInt16)] + [InlineData(DataKind.UInt64, DataKind.UInt32)] + [InlineData(DataKind.UInt64, DataKind.UInt64)] + 
[InlineData(DataKind.UInt64, DataKind.Single)] + [InlineData(DataKind.UInt64, DataKind.Double)] + [InlineData(DataKind.Single, DataKind.Single)] + [InlineData(DataKind.Single, DataKind.Double)] + [InlineData(DataKind.Double, DataKind.Single)] + [InlineData(DataKind.Double, DataKind.Double)] + public void OnnxTypeConversionTest(DataKind fromKind, DataKind toKind) + { + var mlContext = new MLContext(seed: 1); + string filePath = GetDataPath("type-conversion.txt"); + + TextLoader.Column[] columns = new [] + { + new TextLoader.Column("Value", fromKind, 0, 0) + }; + var dataView = mlContext.Data.LoadFromTextFile(filePath, columns); + + var pipeline = mlContext.Transforms.Conversion.ConvertType("ValueConverted", "Value", outputKind: toKind); + var model = pipeline.Fit(dataView); + var mlnetResult = model.Transform(dataView); + + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); + var onnxFileName = "typeconversion.onnx"; + var onnxModelPath = GetOutputPath(onnxFileName); + SaveOnnxModel(onnxModel, onnxModelPath, null); + + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && Environment.Is64BitProcess) + { + string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(dataView); + var onnxResult = onnxTransformer.Transform(dataView); + + CompareResults(model.ColumnPairs[0].outputColumnName, outputNames[1], mlnetResult, onnxResult); + } + + Done(); + } + + [Fact] + public void PcaOnnxConversionTest() + { + var dataSource = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); + + var mlContext = new MLContext(seed: 1); + var dataView = mlContext.Data.LoadFromTextFile(dataSource, new[] { + new TextLoader.Column("label", DataKind.Single, 11), + new 
TextLoader.Column("features", DataKind.Single, 0, 10) + }, hasHeader: true, separatorChar: ';'); + + bool[] zeroMeans = { true, false }; + foreach (var zeroMean in zeroMeans) + { + var pipeline = ML.Transforms.ProjectToPrincipalComponents("pca", "features", rank: 5, seed: 1, ensureZeroMean: zeroMean); + var model = pipeline.Fit(dataView); + var transformedData = model.Transform(dataView); + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); + + var onnxFileName = "pca.onnx"; + var onnxModelPath = GetOutputPath(onnxFileName); + + SaveOnnxModel(onnxModel, onnxModelPath, null); + + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && Environment.Is64BitProcess) + { + // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. + string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(dataView); + var onnxResult = onnxTransformer.Transform(dataView); + CompareSelectedR4VectorColumns(model.ColumnPairs[0].outputColumnName, outputNames[2], transformedData, onnxResult); + } + } + + Done(); + } + + private class TransformedDataPoint : DataPoint, IEquatable + { + [VectorType(3)] + public int[] MissingIndicator { get; set; } + + public bool Equals(TransformedDataPoint other) + { + return Enumerable.SequenceEqual(MissingIndicator, other.MissingIndicator); + } + } + + [Fact] + public void IndicateMissingValuesOnnxConversionTest() + { + var mlContext = new MLContext(seed: 1); + + var samples = new List() + { + new DataPoint() { Features = new float[3] {1, 1, 0}, }, + new DataPoint() { Features = new float[3] {0, float.NaN, 1}, }, + new DataPoint() { Features = new float[3] {-1, float.NaN, float.PositiveInfinity}, }, + }; + var dataView = 
mlContext.Data.LoadFromEnumerable(samples); + + // IsNaN outputs a binary tensor. Support for this has been added in the latest version + // of Onnxruntime, but that hasn't been released yet. + // So we need to convert its type to Int32 until then. + // ConvertType part of the pipeline can be removed once we pick up a new release of the Onnx runtime + + var pipeline = mlContext.Transforms.IndicateMissingValues(new[] { new InputOutputColumnPair("MissingIndicator", "Features"), }) + .Append(mlContext.Transforms.Conversion.ConvertType("MissingIndicator", outputKind: DataKind.Int32)); + + var model = pipeline.Fit(dataView); + var transformedData = model.Transform(dataView); + var mlnetData = mlContext.Data.CreateEnumerable(transformedData, false); + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); + + var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "Transforms"); + var onnxFileName = "IndicateMissingValues.onnx"; + var onnxTextName = "IndicateMissingValues.txt"; + var onnxModelPath = GetOutputPath(onnxFileName); + var onnxTextPath = GetOutputPath(subDir, onnxTextName); + + SaveOnnxModel(onnxModel, onnxModelPath, onnxTextPath); + + // Compare results produced by ML.NET and ONNX's runtime. + if (IsOnnxRuntimeSupported()) + { + // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. 
+ string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(dataView); + var onnxResult = onnxTransformer.Transform(dataView); + CompareSelectedVectorColumns(model.LastTransformer.ColumnPairs[0].outputColumnName, outputNames[1], transformedData, onnxResult); + } + + CheckEquality(subDir, onnxTextName, parseOption: NumberParseOption.UseSingle); + Done(); + } + + [Theory] + [InlineData(DataKind.Single)] + [InlineData(DataKind.String)] + public void ValueToKeyMappingOnnxConversionTest(DataKind valueType) + { + var mlContext = new MLContext(seed: 1); + string filePath = GetDataPath("type-conversion.txt"); + + TextLoader.Column[] columns = new[] + { + new TextLoader.Column("Value", valueType, 0, 0) + }; + var dataView = mlContext.Data.LoadFromTextFile(filePath, columns); + + var pipeline = mlContext.Transforms.Conversion.MapValueToKey("Key", "Value"); + var model = pipeline.Fit(dataView); + var mlnetResult = model.Transform(dataView); + + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); + var onnxFileName = "ValueToKey.onnx"; + var onnxModelPath = GetOutputPath(onnxFileName); + SaveOnnxModel(onnxModel, onnxModelPath, null); + + if (IsOnnxRuntimeSupported()) + { + string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(dataView); + var onnxResult = onnxTransformer.Transform(dataView); + + CompareSelectedVectorColumns(model.ColumnPairs[0].outputColumnName, outputNames[1], 
mlnetResult, onnxResult); + } + + Done(); + } + + private class TextData + { + public string Text { get; set; } + } + + [Fact] + public void WordTokenizerOnnxConversionTest() + { + var mlContext = new MLContext(seed: 1); + + var samples = new List() + { + new TextData(){ Text = "cat sat on mat" }, + new TextData(){ Text = "mat not fit cat" }, + new TextData(){ Text = "cat think mat bad" }, + }; + + var dataView = mlContext.Data.LoadFromEnumerable(samples); + + var pipe = mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text", new[] { ' ' }); + + var model = pipe.Fit(dataView); + var transformedData = model.Transform(dataView); + + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); + var onnxFilename = "Tokenizer.onnx"; + var onnxFilePath = GetOutputPath(onnxFilename); + SaveOnnxModel(onnxModel, onnxFilePath, null); + if (IsOnnxRuntimeSupported()) + { + // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. + string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxFilePath); + var onnxTransformer = onnxEstimator.Fit(dataView); + var onnxResult = onnxTransformer.Transform(dataView); + CompareSelectedVectorColumns>(transformedData.Schema[1].Name, outputNames[1], transformedData, onnxResult); + } + + Done(); + } + + [Theory] + [CombinatorialData] + public void NgramOnnxConnversionTest( + [CombinatorialValues(1, 2, 3)] int ngramLength, + bool useAllLength, + NgramExtractingEstimator.WeightingCriteria weighting) + { + var mlContext = new MLContext(seed: 1); + + var samples = new List() + { + new TextData(){ Text = "cat sat on mat" }, + new TextData(){ Text = "mat not fit cat" }, + new TextData(){ Text = "cat think mat bad" }, + }; + + // Convert training data to IDataView. 
+ var dataView = mlContext.Data.LoadFromEnumerable(samples); + + IEstimator[] pipelines = + { + mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text", new[] { ' ' }) + .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens")) + .Append(mlContext.Transforms.Text.ProduceNgrams("NGrams", "Tokens", + ngramLength: ngramLength, + useAllLengths: useAllLength, + weighting: weighting)), + + mlContext.Transforms.Text.ProduceWordBags("Tokens", "Text", + ngramLength: ngramLength, + useAllLengths: useAllLength, + weighting: weighting) + }; + + for (int i = 0; i < pipelines.Length; i++) + { + var pipe = pipelines[i]; + var model = pipe.Fit(dataView); + var transformedData = model.Transform(dataView); + + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); + var onnxFilename = $"Ngram-{i}-{ngramLength}-{useAllLength}-{weighting}.onnx"; + var onnxFilePath = GetOutputPath(onnxFilename); + SaveOnnxModel(onnxModel, onnxFilePath, null); + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && Environment.Is64BitProcess) + { + // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. 
+ string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxFilePath); + var onnxTransformer = onnxEstimator.Fit(dataView); + var onnxResult = onnxTransformer.Transform(dataView); + CompareSelectedR4VectorColumns(transformedData.Schema[3].Name, outputNames[outputNames.Length-1], transformedData, onnxResult, 3); + } + } + + Done(); + } + + [Fact] + public void OptionalColumnOnnxTest() + { + var mlContext = new MLContext(seed: 1); + + var samples = new List() + { + new BreastCancerCatFeatureExample() { Label = false, F1 = 0.0f, F2 = "F2"}, + new BreastCancerCatFeatureExample() { Label = true, F1 = 0.1f, F2 = "F2"}, + }; + IHostEnvironment env = mlContext as IHostEnvironment; + var dataView = mlContext.Data.LoadFromEnumerable(samples); + var args = new OptionalColumnTransform.Arguments { Columns = new[] { "F1" }, Data = dataView }; + var transform = OptionalColumnTransform.MakeOptional(env, args); + + var ctx = new OnnxContextImpl(mlContext, "model", "ML.NET", "0", 0, "machinelearning.dotnet", OnnxVersion.Stable); + var outputData = transform.OutputData; + LinkedList transforms = null; + ModelProto onnxModel; + using (var ch = env.Start("ONNX conversion")) + { + SaveOnnxCommand.GetPipe(ctx, ch, outputData, out IDataView root, out IDataView sink, out transforms); + onnxModel = SaveOnnxCommand.ConvertTransformListToOnnxModel(ctx, ch, root, sink, transforms, null, null); + } + + var onnxFileName = "optionalcol.onnx"; + var onnxModelPath = GetOutputPath(onnxFileName); + var onnxTextFileName = "optionalcol.txt"; + var onnxTextPath = GetOutputPath(onnxTextFileName); + + SaveOnnxModel(onnxModel, onnxModelPath, onnxTextPath); + if (IsOnnxRuntimeSupported()) + { + string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => 
valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(dataView); + var onnxResult = onnxTransformer.Transform(dataView); + CompareSelectedR4ScalarColumns(transform.Model.OutputSchema[2].Name, outputNames[1], outputData, onnxResult); + } + Done(); + } + + [Fact] + public void KeyToValueOnnxConversionTest() + { + var mlContext = new MLContext(seed: 1); + + string dataPath = GetDataPath("breast-cancer.txt"); + var dataView = mlContext.Data.LoadFromTextFile(dataPath, + separatorChar: '\t', + hasHeader: true); + + var pipeline = mlContext.Transforms.Conversion.MapValueToKey("LabelKey", "Label"). + Append(mlContext.Transforms.Conversion.MapKeyToValue("LabelValue", "LabelKey")); + + var model = pipeline.Fit(dataView); + var transformedData = model.Transform(dataView); + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); + + var onnxFileName = "KeyToValue.onnx"; + var onnxModelPath = GetOutputPath(onnxFileName); + + SaveOnnxModel(onnxModel, onnxModelPath, null); + + if (IsOnnxRuntimeSupported()) + { + // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. 
+ string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(dataView); + var onnxResult = onnxTransformer.Transform(dataView); + CompareSelectedScalarColumns>(transformedData.Schema[3].Name, outputNames[3], transformedData, onnxResult); + } + + Done(); + } + + [Fact] + public void MulticlassTrainersOnnxConversionTest() + { + var mlContext = new MLContext(seed: 1); + + string dataPath = GetDataPath("breast-cancer.txt"); + var dataView = mlContext.Data.LoadFromTextFile(dataPath, separatorChar: '\t', hasHeader: true); + + List> estimators = new List>() + { + mlContext.MulticlassClassification.Trainers.LbfgsMaximumEntropy(), + mlContext.MulticlassClassification.Trainers.NaiveBayes(), + mlContext.MulticlassClassification.Trainers.OneVersusAll( + mlContext.BinaryClassification.Trainers.LbfgsLogisticRegression(), useProbabilities:false), + mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy(), + mlContext.MulticlassClassification.Trainers.SdcaNonCalibrated() + }; + + if (Environment.Is64BitProcess) + { + estimators.Add(mlContext.MulticlassClassification.Trainers.LightGbm()); + estimators.Add(mlContext.MulticlassClassification.Trainers.LightGbm( + new LightGbmMulticlassTrainer.Options { UseSoftmax = true })); + } + + var initialPipeline = mlContext.Transforms.ReplaceMissingValues("Features") + .Append(mlContext.Transforms.NormalizeMinMax("Features")) + .Append(mlContext.Transforms.Conversion.MapValueToKey("Label")); + + foreach (var estimator in estimators) + { + var pipeline = initialPipeline.Append(estimator); + var model = pipeline.Fit(dataView); + var transformedData = model.Transform(dataView); + + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); + 
var onnxFileName = $"{estimator.ToString()}.onnx"; + var onnxModelPath = GetOutputPath(onnxFileName); + + SaveOnnxModel(onnxModel, onnxModelPath, null); + + // Compare results produced by ML.NET and ONNX's runtime. + if (IsOnnxRuntimeSupported()) + { + // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. + string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(dataView); + var onnxResult = onnxTransformer.Transform(dataView); + CompareSelectedScalarColumns(transformedData.Schema[5].Name, outputNames[2], transformedData, onnxResult); + } + } + Done(); + } + + [Fact] + public void CopyColumnsOnnxTest() + { + var mlContext = new MLContext(seed: 1); + + var trainDataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); + var dataView = mlContext.Data.LoadFromTextFile(trainDataPath, + separatorChar: ';', + hasHeader: true); + + var pipeline = mlContext.Transforms.CopyColumns("Target1", "Target"); + var model = pipeline.Fit(dataView); + var transformedData = model.Transform(dataView); + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); + + var onnxFileName = "copycolumns.onnx"; + var onnxModelPath = GetOutputPath(onnxFileName); + + SaveOnnxModel(onnxModel, onnxModelPath, null); + + if (IsOnnxRuntimeSupported()) + { + // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. 
+ string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(dataView); + var onnxResult = onnxTransformer.Transform(dataView); + CompareSelectedR4ScalarColumns(model.ColumnPairs[0].outputColumnName, outputNames[2], transformedData, onnxResult); + } + Done(); + } + +<<<<<<< HEAD + [Fact] + public void FeatureSelectionOnnxTest() + { + var mlContext = new MLContext(seed: 1); + + string dataPath = GetDataPath("breast-cancer.txt"); + + var dataView = ML.Data.LoadFromTextFile(dataPath, new[] { + new TextLoader.Column("ScalarFloat", DataKind.Single, 6), + new TextLoader.Column("VectorFloat", DataKind.Single, 1, 4), + new TextLoader.Column("VectorDouble", DataKind.Double, 4, 8), + new TextLoader.Column("Label", DataKind.Boolean, 0) + }); + + var columns = new[] { + new CountFeatureSelectingEstimator.ColumnOptions("FeatureSelectDouble", "VectorDouble", count: 1), + new CountFeatureSelectingEstimator.ColumnOptions("ScalFeatureSelectMissing690", "ScalarFloat", count: 690), + new CountFeatureSelectingEstimator.ColumnOptions("ScalFeatureSelectMissing100", "ScalarFloat", count: 100), + new CountFeatureSelectingEstimator.ColumnOptions("VecFeatureSelectMissing690", "VectorDouble", count: 690), + new CountFeatureSelectingEstimator.ColumnOptions("VecFeatureSelectMissing100", "VectorDouble", count: 100) + }; + var pipeline = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount("FeatureSelect", "VectorFloat", count: 1) + .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount(columns)) + .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("FeatureSelectMIScalarFloat", "ScalarFloat")) + 
.Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("FeatureSelectMIVectorFloat", "VectorFloat")); + + var model = pipeline.Fit(dataView); + var transformedData = model.Transform(dataView); + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); + + var onnxFileName = "countfeatures.onnx"; + var onnxModelPath = GetOutputPath(onnxFileName); + + SaveOnnxModel(onnxModel, onnxModelPath, null); + + if (IsOnnxRuntimeSupported()) + { + // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. + string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(dataView); + var onnxResult = onnxTransformer.Transform(dataView); + CompareSelectedR4ScalarColumns("FeatureSelectMIScalarFloat", "FeatureSelectMIScalarFloat0", transformedData, onnxResult); + CompareSelectedR4VectorColumns("FeatureSelectMIVectorFloat", "FeatureSelectMIVectorFloat0", transformedData, onnxResult); + CompareSelectedR4ScalarColumns("ScalFeatureSelectMissing690", "ScalFeatureSelectMissing6900", transformedData, onnxResult); + CompareSelectedR8VectorColumns("VecFeatureSelectMissing690", "VecFeatureSelectMissing6900", transformedData, onnxResult); + } + Done(); + } + + +||||||| constructed merge base +======= + [Fact] + public void SelectColumnsOnnxTest() + { + var mlContext = new MLContext(seed: 1); + + string dataPath = GetDataPath("breast-cancer.txt"); + + var dataView = ML.Data.LoadFromTextFile(dataPath, new[] { + new TextLoader.Column("Label", DataKind.Boolean, 0), + new TextLoader.Column("Thickness", DataKind.Int32, 1), + new TextLoader.Column("Size", DataKind.Int32, 2), + new TextLoader.Column("Shape", DataKind.Int32, 3), + new 
TextLoader.Column("Adhesion", DataKind.Int32, 4), + new TextLoader.Column("EpithelialSize", DataKind.Int32, 5), + new TextLoader.Column("BareNuclei", DataKind.Single, 6), + new TextLoader.Column("BlandChromatin", DataKind.Int32, 7), + new TextLoader.Column("NormalNucleoli", DataKind.Int32, 8), + new TextLoader.Column("Mitoses", DataKind.Int32, 9), + }); + + var pipeline = mlContext.Transforms.ReplaceMissingValues("BareNuclei") + .Append(mlContext.Transforms.SelectColumns(new[] { "Size", "Shape", "Thickness", "Label" })); + + var model = pipeline.Fit(dataView); + var transformedData = model.Transform(dataView); + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); + + var onnxFileName = "selectcolumns.onnx"; + var onnxModelPath = GetOutputPath(onnxFileName); + + SaveOnnxModel(onnxModel, onnxModelPath, null); + + if (IsOnnxRuntimeSupported()) + { + // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. + string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(dataView); + var onnxResult = onnxTransformer.Transform(dataView); + + Assert.Equal("Size1", outputNames[0]); + Assert.Equal("Shape1", outputNames[1]); + Assert.Equal("Thickness1", outputNames[2]); + Assert.Equal("Label1", outputNames[3]); + + CompareSelectedScalarColumns("Size", "Size1", transformedData, onnxResult); + CompareSelectedScalarColumns("Shape", "Shape1", transformedData, onnxResult); + CompareSelectedScalarColumns("Thickness", "Thickness1", transformedData, onnxResult); + CompareSelectedScalarColumns("Label", "Label1", transformedData, onnxResult); + } + Done(); + } + +>>>>>>> Added onnx export support for SelectColumns + private void CompareResults(string 
leftColumnName, string rightColumnName, IDataView left, IDataView right) + { + var leftColumn = left.Schema[leftColumnName]; + var rightColumn = right.Schema[rightColumnName]; + var leftType = leftColumn.Type.GetItemType(); + var rightType = rightColumn.Type.GetItemType(); + Assert.Equal(leftType, rightType); + + if (leftType == NumberDataViewType.SByte) + CompareSelectedVectorColumns(leftColumnName, rightColumnName, left, right); + else if (leftType == NumberDataViewType.Byte) + CompareSelectedVectorColumns(leftColumnName, rightColumnName, left, right); + else if (leftType == NumberDataViewType.Int16) + CompareSelectedVectorColumns(leftColumnName, rightColumnName, left, right); + else if (leftType == NumberDataViewType.UInt16) + CompareSelectedVectorColumns(leftColumnName, rightColumnName, left, right); + else if (leftType == NumberDataViewType.Int32) + CompareSelectedVectorColumns(leftColumnName, rightColumnName, left, right); + else if (leftType == NumberDataViewType.UInt32) + CompareSelectedVectorColumns(leftColumnName, rightColumnName, left, right); + else if (leftType == NumberDataViewType.Int64) + CompareSelectedVectorColumns(leftColumnName, rightColumnName, left, right); + else if (leftType == NumberDataViewType.UInt64) + CompareSelectedVectorColumns(leftColumnName, rightColumnName, left, right); + else if (leftType == NumberDataViewType.Single) + CompareSelectedR4VectorColumns(leftColumnName, rightColumnName, left, right); + else if (leftType == NumberDataViewType.Double) + CompareSelectedVectorColumns(leftColumnName, rightColumnName, left, right); + } + + private void CompareSelectedVectorColumns(string leftColumnName, string rightColumnName, IDataView left, IDataView right) + { + var leftColumn = left.Schema[leftColumnName]; + var rightColumn = right.Schema[rightColumnName]; + + using (var expectedCursor = left.GetRowCursor(leftColumn)) + using (var actualCursor = right.GetRowCursor(rightColumn)) + { + VBuffer expected = default; + VBuffer actual = 
default; + var expectedGetter = expectedCursor.GetGetter>(leftColumn); + var actualGetter = actualCursor.GetGetter>(rightColumn); + while (expectedCursor.MoveNext() && actualCursor.MoveNext()) + { + expectedGetter(ref expected); + actualGetter(ref actual); + + Assert.Equal(expected.Length, actual.Length); + for (int i = 0; i < expected.Length; ++i) + if (typeof(T) == typeof(ReadOnlyMemory)) + Assert.Equal(expected.GetItemOrDefault(i).ToString(), actual.GetItemOrDefault(i).ToString()); + else + Assert.Equal(expected.GetItemOrDefault(i), actual.GetItemOrDefault(i)); + } + } + } + + private void CompareSelectedR8VectorColumns(string leftColumnName, string rightColumnName, IDataView left, IDataView right, int precision = 6) + { + var leftColumn = left.Schema[leftColumnName]; + var rightColumn = right.Schema[rightColumnName]; + + using (var expectedCursor = left.GetRowCursor(leftColumn)) + using (var actualCursor = right.GetRowCursor(rightColumn)) + { + VBuffer expected = default; + VBuffer actual = default; + var expectedGetter = expectedCursor.GetGetter>(leftColumn); + var actualGetter = actualCursor.GetGetter>(rightColumn); + while (expectedCursor.MoveNext() && actualCursor.MoveNext()) + { + expectedGetter(ref expected); + actualGetter(ref actual); + + Assert.Equal(expected.Length, actual.Length); + for (int i = 0; i < expected.Length; ++i) + Assert.Equal(expected.GetItemOrDefault(i), actual.GetItemOrDefault(i), precision); + } + } + } + + private void CompareSelectedR4VectorColumns(string leftColumnName, string rightColumnName, IDataView left, IDataView right, int precision = 6) + { + var leftColumn = left.Schema[leftColumnName]; + var rightColumn = right.Schema[rightColumnName]; + + using (var expectedCursor = left.GetRowCursor(leftColumn)) + using (var actualCursor = right.GetRowCursor(rightColumn)) + { + VBuffer expected = default; + VBuffer actual = default; + var expectedGetter = expectedCursor.GetGetter>(leftColumn); + var actualGetter = 
actualCursor.GetGetter>(rightColumn); + while (expectedCursor.MoveNext() && actualCursor.MoveNext()) + { + expectedGetter(ref expected); + actualGetter(ref actual); + + Assert.Equal(expected.Length, actual.Length); + for (int i = 0; i < expected.Length; ++i) + { + // We are using float values. But the Assert.Equal function takes doubles. + // And sometimes the converted doubles are different in their precision. + // So make sure we compare floats + float exp = expected.GetItemOrDefault(i); + float act = actual.GetItemOrDefault(i); + CompareNumbersWithTolerance(exp, act, null, precision); + } + } + } + } + + private void CompareSelectedR4ScalarColumns(string leftColumnName, string rightColumnName, IDataView left, IDataView right, int precision = 6) + { + var leftColumn = left.Schema[leftColumnName]; + var rightColumn = right.Schema[rightColumnName]; + + using (var expectedCursor = left.GetRowCursor(leftColumn)) + using (var actualCursor = right.GetRowCursor(rightColumn)) + { + float expected = default; + VBuffer actual = default; + var expectedGetter = expectedCursor.GetGetter(leftColumn); + var actualGetter = actualCursor.GetGetter>(rightColumn); + while (expectedCursor.MoveNext() && actualCursor.MoveNext()) + { + expectedGetter(ref expected); + actualGetter(ref actual); + + // Scalar such as R4 (float) is converted to [1, 1]-tensor in ONNX format for consitency of making batch prediction. 
+ Assert.Equal(1, actual.Length); + CompareNumbersWithTolerance(expected, actual.GetItemOrDefault(0), null, precision); + } + } + } + + private void CompareSelectedScalarColumns(string leftColumnName, string rightColumnName, IDataView left, IDataView right) + { + var leftColumn = left.Schema[leftColumnName]; + var rightColumn = right.Schema[rightColumnName]; + + using (var expectedCursor = left.GetRowCursor(leftColumn)) + using (var actualCursor = right.GetRowCursor(rightColumn)) + { + T expected = default; + VBuffer actual = default; + var expectedGetter = expectedCursor.GetGetter(leftColumn); + var actualGetter = actualCursor.GetGetter>(rightColumn); + while (expectedCursor.MoveNext() && actualCursor.MoveNext()) + { + expectedGetter(ref expected); + actualGetter(ref actual); + var actualVal = actual.GetItemOrDefault(0); + + Assert.Equal(1, actual.Length); + + if (typeof(T) == typeof(ReadOnlyMemory)) + Assert.Equal(expected.ToString(), actualVal.ToString()); + else + Assert.Equal(expected, actualVal); + } + } + } + + private void SaveOnnxModel(ModelProto model, string binaryFormatPath, string textFormatPath) + { + DeleteOutputPath(binaryFormatPath); // Clean if such a file exists. + DeleteOutputPath(textFormatPath); + + if (binaryFormatPath != null) + using (var file = Env.CreateOutputFile(binaryFormatPath)) + using (var stream = file.CreateWriteStream()) + model.WriteTo(stream); + + if (textFormatPath != null) + { + using (var file = Env.CreateOutputFile(textFormatPath)) + using (var stream = file.CreateWriteStream()) + using (var writer = new StreamWriter(stream)) + { + var parsedJson = JsonConvert.DeserializeObject(model.ToString()); + writer.Write(JsonConvert.SerializeObject(parsedJson, Formatting.Indented)); + } + + // Strip the version information. 
+ var fileText = File.ReadAllText(textFormatPath); + + fileText = Regex.Replace(fileText, "\"producerVersion\": \".*\"", "\"producerVersion\": \"##VERSION##\""); + File.WriteAllText(textFormatPath, fileText); + } + } + } +} From 8b26e17f01dcabb27f098cdefe0e01c37fd195a9 Mon Sep 17 00:00:00 2001 From: "Harish S. Kulkarni" Date: Wed, 18 Dec 2019 11:07:31 -0800 Subject: [PATCH 2/5] Deleted file that was added accidentally --- .../OnnxConversionTest.cs.orig | 1560 ----------------- 1 file changed, 1560 deletions(-) delete mode 100644 test/Microsoft.ML.Tests/OnnxConversionTest.cs.orig diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs.orig b/test/Microsoft.ML.Tests/OnnxConversionTest.cs.orig deleted file mode 100644 index b944d047e6..0000000000 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs.orig +++ /dev/null @@ -1,1560 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. 
- -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Runtime.InteropServices; -using System.Text.RegularExpressions; -using Google.Protobuf; -using Microsoft.ML.Data; -using Microsoft.ML.EntryPoints; -using Microsoft.ML.Model.OnnxConverter; -using Microsoft.ML.RunTests; -using Microsoft.ML.Runtime; -using Microsoft.ML.TestFramework.Attributes; -using Microsoft.ML.TestFrameworkCommon; -using Microsoft.ML.TestFrameworkCommon.Attributes; -using Microsoft.ML.Tools; -using Microsoft.ML.Trainers; -using Microsoft.ML.Trainers.LightGbm; -using Microsoft.ML.Transforms; -using Microsoft.ML.Transforms.Onnx; -using Microsoft.ML.Transforms.Text; -using Newtonsoft.Json; -using Xunit; -using Xunit.Abstractions; -using static Microsoft.ML.Model.OnnxConverter.OnnxCSharpToProtoWrapper; - -#pragma warning disable CS0649 // Field 'fieldName' is never assigned to, and will always have its default value null - -namespace Microsoft.ML.Tests -{ - public class OnnxConversionTest : BaseTestBaseline - { - private class AdultData - { - [LoadColumn(0, 10), ColumnName("FeatureVector")] - public float Features { get; set; } - - [LoadColumn(11)] - public float Target { get; set; } - } - - public OnnxConversionTest(ITestOutputHelper output) : base(output) - { - } - - private bool IsOnnxRuntimeSupported() - { - return OnnxFactAttribute.IsOnnxRuntimeSupported; - } - - /// - /// In this test, we convert a trained into ONNX file and then - /// call to evaluate that file. The outputs of are checked against the original - /// ML.NET model's outputs. - /// - [Fact] - public void SimpleEndToEndOnnxConversionTest() - { - // Step 1: Create and train a ML.NET pipeline. 
- var trainDataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); - var mlContext = new MLContext(seed: 1); - var data = mlContext.Data.LoadFromTextFile(trainDataPath, - separatorChar: ';' -, - hasHeader: true); - var cachedTrainData = mlContext.Data.Cache(data); - var dynamicPipeline = - mlContext.Transforms.NormalizeMinMax("FeatureVector") - .AppendCacheCheckpoint(mlContext) - .Append(mlContext.Regression.Trainers.Sdca(new SdcaRegressionTrainer.Options() { - LabelColumnName = "Target", - FeatureColumnName = "FeatureVector", - NumberOfThreads = 1 - })); - var model = dynamicPipeline.Fit(data); - var transformedData = model.Transform(data); - - // Step 2: Convert ML.NET model to ONNX format and save it as a file. - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); - var onnxFileName = "model.onnx"; - var onnxModelPath = GetOutputPath(onnxFileName); - SaveOnnxModel(onnxModel, onnxModelPath, null); - - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && Environment.Is64BitProcess) - { - // Step 3: Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. - string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); - var onnxTransformer = onnxEstimator.Fit(data); - var onnxResult = onnxTransformer.Transform(data); - - // Step 4: Compare ONNX and ML.NET results. - CompareSelectedR4ScalarColumns("Score", "Score0", transformedData, onnxResult, 1); - } - - // Step 5: Check ONNX model's text format. This test will be not necessary if Step 3 and Step 4 can run on Linux and - // Mac to support cross-platform tests. 
- var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "Regression", "Adult"); - var onnxTextName = "SimplePipeline.txt"; - var onnxTextPath = GetOutputPath(subDir, onnxTextName); - SaveOnnxModel(onnxModel, null, onnxTextPath); - CheckEquality(subDir, onnxTextName, digitsOfPrecision: 3); - - Done(); - } - - private class BreastCancerFeatureVector - { - [LoadColumn(1, 9), VectorType(9)] - public float[] Features; - } - - private class BreastCancerCatFeatureExample - { - [LoadColumn(0)] - public bool Label; - - [LoadColumn(1)] - public float F1; - - [LoadColumn(2)] - public string F2; - } - - private class BreastCancerMulticlassExample - { - [LoadColumn(1)] - public string Label; - - [LoadColumn(2, 9), VectorType(8)] - public float[] Features; - } - - private class BreastCancerBinaryClassification - { - [LoadColumn(0)] - public bool Label; - - [LoadColumn(2, 9), VectorType(8)] - public float[] Features; - } - - [LessThanNetCore30OrNotNetCoreFact("netcoreapp3.0 output differs from Baseline. Tracked by https://github.com/dotnet/machinelearning/issues/2087")] - public void KmeansOnnxConversionTest() - { - // Create a new context for ML.NET operations. It can be used for exception tracking and logging, - // as a catalog of available operations and as the source of randomness. - var mlContext = new MLContext(seed: 1); - - string dataPath = GetDataPath("breast-cancer.txt"); - // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). - var data = mlContext.Data.LoadFromTextFile(dataPath, - separatorChar: '\t', - hasHeader: true); - - var pipeline = mlContext.Transforms.NormalizeMinMax("Features"). 
- Append(mlContext.Clustering.Trainers.KMeans(new Trainers.KMeansTrainer.Options - { - FeatureColumnName = DefaultColumnNames.Features, - MaximumNumberOfIterations = 1, - NumberOfClusters = 4, - NumberOfThreads = 1, - InitializationAlgorithm = Trainers.KMeansTrainer.InitializationAlgorithm.Random - })); - - var model = pipeline.Fit(data); - var transformedData = model.Transform(data); - - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); - - // Compare results produced by ML.NET and ONNX's runtime. - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && Environment.Is64BitProcess) - { - var onnxFileName = "model.onnx"; - var onnxModelPath = GetOutputPath(onnxFileName); - SaveOnnxModel(onnxModel, onnxModelPath, null); - - // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. - string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); - var onnxTransformer = onnxEstimator.Fit(data); - var onnxResult = onnxTransformer.Transform(data); - CompareSelectedR4VectorColumns("Score", "Score0", transformedData, onnxResult, 3); - } - - // Check ONNX model's text format. We save the produced ONNX model as a text file and compare it against - // the associated file in ML.NET repo. Such a comparison can be retired if ONNXRuntime ported to ML.NET - // can support Linux and Mac. 
- var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "Cluster", "BreastCancer"); - var onnxTextName = "Kmeans.txt"; - var onnxTextPath = GetOutputPath(subDir, onnxTextName); - SaveOnnxModel(onnxModel, null, onnxTextPath); - CheckEquality(subDir, onnxTextName, digitsOfPrecision: 2); - Done(); - } - - [Fact] - public void RegressionTrainersOnnxConversionTest() - { - var mlContext = new MLContext(seed: 1); - string dataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); - - // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). - var dataView = mlContext.Data.LoadFromTextFile(dataPath, - separatorChar: ';', - hasHeader: true); - List> estimators = new List>() - { - mlContext.Regression.Trainers.Sdca("Target","FeatureVector"), - mlContext.Regression.Trainers.Ols("Target","FeatureVector"), - mlContext.Regression.Trainers.OnlineGradientDescent("Target","FeatureVector"), - mlContext.Regression.Trainers.FastForest("Target", "FeatureVector"), - mlContext.Regression.Trainers.FastTree("Target", "FeatureVector"), - mlContext.Regression.Trainers.FastTreeTweedie("Target", "FeatureVector"), - mlContext.Regression.Trainers.LbfgsPoissonRegression("Target", "FeatureVector"), - }; - if (Environment.Is64BitProcess) - { - estimators.Add(mlContext.Regression.Trainers.LightGbm("Target", "FeatureVector")); - } - foreach (var estimator in estimators) - { - var model = estimator.Fit(dataView); - var transformedData = model.Transform(dataView); - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); - // Compare model scores produced by ML.NET and ONNX's runtime - if (IsOnnxRuntimeSupported()) - { - var onnxFileName = $"{estimator.ToString()}.onnx"; - var onnxModelPath = GetOutputPath(onnxFileName); - SaveOnnxModel(onnxModel, onnxModelPath, null); - // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. 
- string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); - var onnxTransformer = onnxEstimator.Fit(dataView); - var onnxResult = onnxTransformer.Transform(dataView); - CompareSelectedR4ScalarColumns(transformedData.Schema[2].Name, outputNames[2], transformedData, onnxResult, 3); // compare score results - } - // Compare the Onnx graph to a baseline if OnnxRuntime is not supported - else - { - var onnxFileName = $"{estimator.ToString()}.txt"; - var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "Regression", "Adult"); - var onnxTextModelPath = GetOutputPath(subDir, onnxFileName); - SaveOnnxModel(onnxModel, null, onnxTextModelPath); - CheckEquality(subDir, onnxFileName, digitsOfPrecision: 1); - } - } - Done(); - } - - [Fact] - public void BinaryClassificationTrainersOnnxConversionTest() - { - var mlContext = new MLContext(seed: 1); - string dataPath = GetDataPath("breast-cancer.txt"); - // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). 
- var dataView = mlContext.Data.LoadFromTextFile(dataPath, separatorChar: '\t', hasHeader: true); - List> estimators = new List>() - { - mlContext.BinaryClassification.Trainers.AveragedPerceptron(), - mlContext.BinaryClassification.Trainers.FastForest(), - mlContext.BinaryClassification.Trainers.FastTree(), - mlContext.BinaryClassification.Trainers.LbfgsLogisticRegression(), - mlContext.BinaryClassification.Trainers.LinearSvm(), - mlContext.BinaryClassification.Trainers.Prior(), - mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(), - mlContext.BinaryClassification.Trainers.SdcaNonCalibrated(), - mlContext.BinaryClassification.Trainers.SgdCalibrated(), - mlContext.BinaryClassification.Trainers.SgdNonCalibrated(), - mlContext.BinaryClassification.Trainers.SymbolicSgdLogisticRegression(), - }; - if (Environment.Is64BitProcess) - { - estimators.Add(mlContext.BinaryClassification.Trainers.LightGbm()); - } - - var initialPipeline = mlContext.Transforms.ReplaceMissingValues("Features"). - Append(mlContext.Transforms.NormalizeMinMax("Features")); - foreach (var estimator in estimators) - { - var pipeline = initialPipeline.Append(estimator); - var model = pipeline.Fit(dataView); - var transformedData = model.Transform(dataView); - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); - // Compare model scores produced by ML.NET and ONNX's runtime. - if (IsOnnxRuntimeSupported()) - { - var onnxFileName = $"{estimator.ToString()}.onnx"; - var onnxModelPath = GetOutputPath(onnxFileName); - SaveOnnxModel(onnxModel, onnxModelPath, null); - // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. 
- string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); - var onnxTransformer = onnxEstimator.Fit(dataView); - var onnxResult = onnxTransformer.Transform(dataView); - CompareSelectedR4ScalarColumns(transformedData.Schema[5].Name, outputNames[3], transformedData, onnxResult, 3); //compare scores - CompareSelectedScalarColumns(transformedData.Schema[4].Name, outputNames[2], transformedData, onnxResult); //compare predicted labels - } - } - Done(); - } - - private class DataPoint - { - [VectorType(3)] - public float[] Features { get; set; } - } - - [Theory] - [CombinatorialData] - public void LpNormOnnxConversionTest( - bool ensureZeroMean, - LpNormNormalizingEstimatorBase.NormFunction norm) - { - var mlContext = new MLContext(seed: 1); - - var samples = new List() - { - new DataPoint() { Features = new float[3] {0.01f, 0.02f, 0.03f} }, - new DataPoint() { Features = new float[3] {0.04f, 0.05f, 0.06f} }, - new DataPoint() { Features = new float[3] {0.07f, 0.08f, 0.09f} }, - new DataPoint() { Features = new float[3] {0.10f, 0.11f, 0.12f} }, - new DataPoint() { Features = new float[3] {0.13f, 0.14f, 0.15f} } - }; - var dataView = mlContext.Data.LoadFromEnumerable(samples); - - var pipe = mlContext.Transforms.NormalizeLpNorm(nameof(DataPoint.Features), norm:norm, ensureZeroMean: ensureZeroMean); - - var model = pipe.Fit(dataView); - var transformedData = model.Transform(dataView); - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); - - var onnxFileName = $"LpNorm-{norm.ToString()}-{ensureZeroMean}.onnx"; - var onnxModelPath = GetOutputPath(onnxFileName); - - SaveOnnxModel(onnxModel, onnxModelPath, null); - - // Compare results produced by ML.NET and ONNX's runtime. 
- if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && Environment.Is64BitProcess) - { - // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. - string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); - var onnxTransformer = onnxEstimator.Fit(dataView); - var onnxResult = onnxTransformer.Transform(dataView); - CompareSelectedR4VectorColumns(nameof(DataPoint.Features), outputNames[0], transformedData, onnxResult, 3); - } - - Done(); - } - - [Fact] - public void CommandLineOnnxConversionTest() - { - string dataPath = GetDataPath("breast-cancer.txt"); - string modelPath = GetOutputPath("ModelWithLessIO.zip"); - var trainingPathArgs = $"data={dataPath} out={modelPath}"; - var trainingArgs = " loader=text{col=Label:BL:0 col=F1:R4:1-8 col=F2:TX:9} xf=Cat{col=F2} xf=Concat{col=Features:F1,F2} tr=ft{numberOfThreads=1 numberOfLeaves=8 numberOfTrees=3} seed=1"; - Assert.Equal(0, Maml.Main(new[] { "train " + trainingPathArgs + trainingArgs })); - - var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "BinaryClassification", "BreastCancer"); - var onnxTextName = "ModelWithLessIO.txt"; - var onnxFileName = "ModelWithLessIO.onnx"; - var onnxTextPath = GetOutputPath(subDir, onnxTextName); - var onnxFilePath = GetOutputPath(subDir, onnxFileName); - string conversionCommand = $"saveonnx in={modelPath} onnx={onnxFilePath} json={onnxTextPath} domain=machinelearning.dotnet name=modelWithLessIO inputsToDrop=Label outputsToDrop=F1,F2,Features,Label"; - Assert.Equal(0, Maml.Main(new[] { conversionCommand })); - - var fileText = File.ReadAllText(onnxTextPath); - fileText = Regex.Replace(fileText, "\"producerVersion\": \".*\"", "\"producerVersion\": \"##VERSION##\""); - 
File.WriteAllText(onnxTextPath, fileText); - - CheckEquality(subDir, onnxTextName); - Done(); - } - - [Fact] - public void KeyToVectorWithBagOnnxConversionTest() - { - var mlContext = new MLContext(seed: 1); - - string dataPath = GetDataPath("breast-cancer.txt"); - - var data = mlContext.Data.LoadFromTextFile(dataPath, - separatorChar: '\t', - hasHeader: true); - - var pipeline = mlContext.Transforms.Categorical.OneHotEncoding("F2", "F2", Transforms.OneHotEncodingEstimator.OutputKind.Bag) - .Append(mlContext.Transforms.ReplaceMissingValues(new MissingValueReplacingEstimator.ColumnOptions("F2"))) - .Append(mlContext.Transforms.Concatenate("Features", "F1", "F2")) - .Append(mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features", numberOfLeaves: 2, numberOfTrees: 1, minimumExampleCountPerLeaf: 2)); - - var model = pipeline.Fit(data); - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); - - // Check ONNX model's text format. We save the produced ONNX model as a text file and compare it against - // the associated file in ML.NET repo. Such a comparison can be retired if ONNXRuntime ported to ML.NET - // can support Linux and Mac. 
- var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "BinaryClassification", "BreastCancer"); - var onnxTextName = "OneHotBagPipeline.txt"; - var onnxFileName = "OneHotBagPipeline.onnx"; - var onnxTextPath = GetOutputPath(subDir, onnxTextName); - var onnxFilePath = GetOutputPath(subDir, onnxFileName); - SaveOnnxModel(onnxModel, onnxFilePath, onnxTextPath); - CheckEquality(subDir, onnxTextName); - Done(); - } - - [Fact] - public void InitializerCreationTest() - { - var env = new MLContext(); - // Create the actual implementation - var ctxImpl = new OnnxContextImpl(env, "model", "ML.NET", "0", 0, "com.test", Model.OnnxConverter.OnnxVersion.Stable); - - // Use implementation as in the actual conversion code - var ctx = ctxImpl as OnnxContext; - ctx.AddInitializer(9.4f, "float"); - ctx.AddInitializer(17L, "int64"); - ctx.AddInitializer("36", "string"); - ctx.AddInitializer(new List { 9.4f, 1.7f, 3.6f }, new List { 1, 3 }, "floats"); - ctx.AddInitializer(new List { 94L, 17L, 36L }, new List { 1, 3 }, "int64s"); - ctx.AddInitializer(new List { "94", "17", "36" }, new List { 1, 3 }, "strings"); - - var model = ctxImpl.MakeModel(); - - var floatScalar = model.Graph.Initializer[0]; - Assert.True(floatScalar.Name == "float"); - Assert.True(floatScalar.Dims.Count == 0); - Assert.True(floatScalar.FloatData.Count == 1); - Assert.True(floatScalar.FloatData[0] == 9.4f); - - var int64Scalar = model.Graph.Initializer[1]; - Assert.True(int64Scalar.Name == "int64"); - Assert.True(int64Scalar.Dims.Count == 0); - Assert.True(int64Scalar.Int64Data.Count == 1); - Assert.True(int64Scalar.Int64Data[0] == 17L); - - var stringScalar = model.Graph.Initializer[2]; - Assert.True(stringScalar.Name == "string"); - Assert.True(stringScalar.Dims.Count == 0); - Assert.True(stringScalar.StringData.Count == 1); - Assert.True(stringScalar.StringData[0].ToStringUtf8() == "36"); - - var floatsTensor = model.Graph.Initializer[3]; - Assert.True(floatsTensor.Name == "floats"); - 
Assert.True(floatsTensor.Dims.Count == 2); - Assert.True(floatsTensor.Dims[0] == 1); - Assert.True(floatsTensor.Dims[1] == 3); - Assert.True(floatsTensor.FloatData.Count == 3); - Assert.True(floatsTensor.FloatData[0] == 9.4f); - Assert.True(floatsTensor.FloatData[1] == 1.7f); - Assert.True(floatsTensor.FloatData[2] == 3.6f); - - var int64sTensor = model.Graph.Initializer[4]; - Assert.True(int64sTensor.Name == "int64s"); - Assert.True(int64sTensor.Dims.Count == 2); - Assert.True(int64sTensor.Dims[0] == 1); - Assert.True(int64sTensor.Dims[1] == 3); - Assert.True(int64sTensor.Int64Data.Count == 3); - Assert.True(int64sTensor.Int64Data[0] == 94L); - Assert.True(int64sTensor.Int64Data[1] == 17L); - Assert.True(int64sTensor.Int64Data[2] == 36L); - - var stringsTensor = model.Graph.Initializer[5]; - Assert.True(stringsTensor.Name == "strings"); - Assert.True(stringsTensor.Dims.Count == 2); - Assert.True(stringsTensor.Dims[0] == 1); - Assert.True(stringsTensor.Dims[1] == 3); - Assert.True(stringsTensor.StringData.Count == 3); - Assert.True(stringsTensor.StringData[0].ToStringUtf8() == "94"); - Assert.True(stringsTensor.StringData[1].ToStringUtf8() == "17"); - Assert.True(stringsTensor.StringData[2].ToStringUtf8() == "36"); - } - - [Fact] - public void LogisticRegressionOnnxConversionTest() - { - // Step 1: Create and train a ML.NET pipeline. 
- var trainDataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); - var mlContext = new MLContext(seed: 1); - var data = mlContext.Data.LoadFromTextFile(trainDataPath, - separatorChar: ';' -, - hasHeader: true); - var cachedTrainData = mlContext.Data.Cache(data); - var dynamicPipeline = - mlContext.Transforms.NormalizeMinMax("FeatureVector") - .AppendCacheCheckpoint(mlContext) - .Append(mlContext.Regression.Trainers.Sdca(new SdcaRegressionTrainer.Options() { - LabelColumnName = "Target", - FeatureColumnName = "FeatureVector", - NumberOfThreads = 1 - })); - var model = dynamicPipeline.Fit(data); - - // Step 2: Convert ML.NET model to ONNX format and save it as a file. - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); - - // Step 3: Save ONNX model as binary and text files. - var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "BinaryClassification", "BreastCancer"); - var onnxFileName = "LogisticRegressionSaveModelToOnnxTest.onnx"; - var onnxFilePath = GetOutputPath(subDir, onnxFileName); - var onnxTextName = "LogisticRegressionSaveModelToOnnxTest.txt"; - var onnxTextPath = GetOutputPath(subDir, onnxTextName); - SaveOnnxModel(onnxModel, onnxFilePath, onnxTextPath); - - // Step 4: Check ONNX model's text format. - CheckEquality(subDir, onnxTextName, digitsOfPrecision: 3); - Done(); - } - - [LightGBMFact] - public void LightGbmBinaryClassificationOnnxConversionTest() - { - // Step 1: Create and train a ML.NET pipeline. 
- var trainDataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); - var mlContext = new MLContext(seed: 1); - var data = mlContext.Data.LoadFromTextFile(trainDataPath, - separatorChar: ';' -, - hasHeader: true); - var cachedTrainData = mlContext.Data.Cache(data); - var dynamicPipeline = - mlContext.Transforms.NormalizeMinMax("FeatureVector") - .AppendCacheCheckpoint(mlContext) - .Append(mlContext.Regression.Trainers.LightGbm(labelColumnName: "Target", featureColumnName: "FeatureVector", numberOfIterations: 3, numberOfLeaves: 16, minimumExampleCountPerLeaf: 100)); - var model = dynamicPipeline.Fit(data); - - // Step 2: Convert ML.NET model to ONNX format and save it as a file. - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); - - // Step 3: Save ONNX model as binary and text files. - var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "BinaryClassification", "BreastCancer"); - var onnxFileName = "LightGbmBinaryClassificationOnnxConversionTest.onnx"; - var onnxFilePath = GetOutputPath(subDir, onnxFileName); - var onnxTextName = "LightGbmBinaryClassificationOnnxConversionTest.txt"; - var onnxTextPath = GetOutputPath(subDir, onnxTextName); - SaveOnnxModel(onnxModel, onnxFilePath, onnxTextPath); - - // Step 4: Check ONNX model's text format. - CheckEquality(subDir, onnxTextName, digitsOfPrecision: 3); - Done(); - } - - [Fact] - public void MulticlassLogisticRegressionOnnxConversionTest() - { - var mlContext = new MLContext(seed: 1); - - string dataPath = GetDataPath("breast-cancer.txt"); - var data = mlContext.Data.LoadFromTextFile(dataPath, - separatorChar: '\t', - hasHeader: true); - - var pipeline = mlContext.Transforms.NormalizeMinMax("Features"). - Append(mlContext.Transforms.Conversion.MapValueToKey("Label")). 
- Append(mlContext.MulticlassClassification.Trainers.LbfgsMaximumEntropy(new LbfgsMaximumEntropyMulticlassTrainer.Options() { NumberOfThreads = 1 })); - - var model = pipeline.Fit(data); - var transformedData = model.Transform(data); - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); - - var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "MultiClassClassification", "BreastCancer"); - var onnxFileName = "MultiClassificationLogisticRegressionSaveModelToOnnxTest.onnx"; - var onnxFilePath = GetOutputPath(subDir, onnxFileName); - var onnxTextName = "MultiClassificationLogisticRegressionSaveModelToOnnxTest.txt"; - var onnxTextPath = GetOutputPath(subDir, onnxTextName); - - SaveOnnxModel(onnxModel, onnxFilePath, onnxTextPath); - - CheckEquality(subDir, onnxTextName, digitsOfPrecision: 2); - Done(); - } - - [Fact] - public void LoadingPredictorModelAndOnnxConversionTest() - { - string dataPath = GetDataPath("iris.txt"); - string modelPath = Path.GetTempPath() + Guid.NewGuid().ToString() + ".model.bin"; - string onnxPath = Path.GetTempPath() + Guid.NewGuid().ToString() + ".model.onnx"; - string onnxJsonPath = Path.GetTempPath() + Guid.NewGuid().ToString() + ".model.onnx.json"; - - string inputGraph = string.Format(@" - {{ - 'Inputs': {{ - 'inputFile': '{0}' - }}, - 'Nodes': [ - {{ - 'Name': 'Data.TextLoader', - 'Inputs': - {{ - 'InputFile': '$inputFile', - 'Arguments': - {{ - 'UseThreads': true, - 'HeaderFile': null, - 'MaxRows': null, - 'AllowQuoting': true, - 'AllowSparse': true, - 'InputSize': null, - 'TrimWhitespace': false, - 'HasHeader': false, - 'Column': - [ - {{'Name':'Sepal_Width','Type':null,'Source':[{{'Min':2,'Max':2,'AutoEnd':false,'VariableEnd':false,'AllOther':false,'ForceVector':false}}],'KeyCount':null}}, - {{'Name':'Petal_Length','Type':null,'Source':[{{'Min':3,'Max':4,'AutoEnd':false,'VariableEnd':false,'AllOther':false,'ForceVector':false}}],'KeyCount':null}}, - ] - }} - }}, - 'Outputs': - {{ - 'Data': 
'$training_data' - }} - }}, - {{ - 'Inputs': {{ - 'FeatureColumnName': 'Petal_Length', - 'LabelColumnName': 'Sepal_Width', - 'TrainingData': '$training_data', - }}, - 'Name': 'Trainers.StochasticDualCoordinateAscentRegressor', - 'Outputs': {{ - 'PredictorModel': '$output_model' - }} - }} - ], - 'Outputs': {{ - 'output_model': '{1}' - }} - }}", dataPath.Replace("\\", "\\\\"), modelPath.Replace("\\", "\\\\")); - - // Write entry point graph into file so that it can be invoke by graph runner below. - var jsonPath = DeleteOutputPath("graph.json"); - File.WriteAllLines(jsonPath, new[] { inputGraph }); - - // Execute the saved entry point graph to produce a predictive model. - var args = new ExecuteGraphCommand.Arguments() { GraphPath = jsonPath }; - var cmd = new ExecuteGraphCommand(Env, args); - cmd.Run(); - - // Make entry point graph to conduct ONNX conversion. - inputGraph = string.Format(@" - {{ - 'Inputs': {{ - 'model': '{0}' - }}, - 'Nodes': [ - {{ - 'Inputs': {{ - 'Domain': 'com.microsoft.models', - 'Json': '{1}', - 'PredictiveModel': '$model', - 'Onnx': '{2}', - 'OnnxVersion': 'Experimental' - }}, - 'Name': 'Models.OnnxConverter', - 'Outputs': {{}} - }} - ], - 'Outputs': {{}} - }} - ", modelPath.Replace("\\", "\\\\"), onnxJsonPath.Replace("\\", "\\\\"), onnxPath.Replace("\\", "\\\\")); - - // Write entry point graph for ONNX conversion into file so that it can be invoke by graph runner below. - jsonPath = DeleteOutputPath("graph.json"); - File.WriteAllLines(jsonPath, new[] { inputGraph }); - - // Onnx converter's assembly is not loaded by default, so we need to register it before calling it. - Env.ComponentCatalog.RegisterAssembly(typeof(OnnxExportExtensions).Assembly); - - // Execute the saved entry point graph to convert the saved model to ONNX format. 
- args = new ExecuteGraphCommand.Arguments() { GraphPath = jsonPath }; - cmd = new ExecuteGraphCommand(Env, args); - cmd.Run(); - - // Load the resulted ONNX model from the file so that we can check if the conversion looks good. - var model = new OnnxCSharpToProtoWrapper.ModelProto(); - using (var modelStream = File.OpenRead(onnxPath)) - model = OnnxCSharpToProtoWrapper.ModelProto.Parser.ParseFrom(modelStream); - - // Make sure a PredictorModel is loaded by seeing if a predictive model exists. In this the - // predictive model is "LinearRegressor" (converted from StochasticDualCoordinateAscentRegressor - // in the original training entry-point graph. - Assert.Equal("Scaler", model.Graph.Node[0].OpType); - Assert.Equal("LinearRegressor", model.Graph.Node[1].OpType); - - File.Delete(modelPath); - File.Delete(onnxPath); - File.Delete(onnxJsonPath); - - Done(); - } - - - [Fact] - public void RemoveVariablesInPipelineTest() - { - var mlContext = new MLContext(seed: 1); - - string dataPath = GetDataPath("breast-cancer.txt"); - var data = mlContext.Data.LoadFromTextFile(dataPath, - separatorChar: '\t', - hasHeader: true); - - var pipeline = mlContext.Transforms.Categorical.OneHotEncoding("F2", "F2", Transforms.OneHotEncodingEstimator.OutputKind.Bag) - .Append(mlContext.Transforms.ReplaceMissingValues(new MissingValueReplacingEstimator.ColumnOptions("F2"))) - .Append(mlContext.Transforms.Concatenate("Features", "F1", "F2")) - .Append(mlContext.Transforms.NormalizeMinMax("Features")) - .Append(mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features", numberOfLeaves: 2, numberOfTrees: 1, minimumExampleCountPerLeaf: 2)); - - var model = pipeline.Fit(data); - var transformedData = model.Transform(data); - - var onnxConversionContext = new OnnxContextImpl(mlContext, "A Simple Pipeline", "ML.NET", "0", 0, "machinelearning.dotnet", OnnxVersion.Stable); - - LinkedList transforms = null; - using (var conversionChannel = (mlContext as 
IChannelProvider).Start("ONNX conversion")) - { - SaveOnnxCommand.GetPipe(onnxConversionContext, conversionChannel, transformedData, out IDataView root, out IDataView sink, out transforms); - // Input columns' names to be excluded in the resulted ONNX model. - var redundantInputColumnNames = new HashSet { "Label" }; - // Output columns' names to be excluded in the resulted ONNX model. - var redundantOutputColumnNames = new HashSet { "Label", "F1", "F2", "Features" }; - var onnxModel = SaveOnnxCommand.ConvertTransformListToOnnxModel(onnxConversionContext, conversionChannel, root, sink, transforms, - redundantInputColumnNames, redundantOutputColumnNames); - - // Check ONNX model's text format. We save the produced ONNX model as a text file and compare it against - // the associated file in ML.NET repo. Such a comparison can be retired if ONNXRuntime ported to ML.NET - // can support Linux and Mac. - var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "BinaryClassification", "BreastCancer"); - var onnxTextName = "ExcludeVariablesInOnnxConversion.txt"; - var onnxFileName = "ExcludeVariablesInOnnxConversion.onnx"; - var onnxTextPath = GetOutputPath(subDir, onnxTextName); - var onnxFilePath = GetOutputPath(subDir, onnxFileName); - SaveOnnxModel(onnxModel, onnxFilePath, onnxTextPath); - CheckEquality(subDir, onnxTextName, digitsOfPrecision: 3); - } - Done(); - } - - private class SmallSentimentExample - { - [LoadColumn(0, 3), VectorType(4)] - public string[] Tokens; - } - - [Fact] - public void WordEmbeddingsTest() - { - var mlContext = new MLContext(seed: 1); - var dataPath = GetDataPath(@"small-sentiment-test.tsv"); - var embedNetworkPath = GetDataPath(@"shortsentiment.emd"); - var data = mlContext.Data.LoadFromTextFile(dataPath, separatorChar: '\t', hasHeader: false); - - var pipeline = mlContext.Transforms.Text.ApplyWordEmbedding("Embed", embedNetworkPath, "Tokens"); - var model = pipeline.Fit(data); - var transformedData = model.Transform(data); 
- - var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "Transforms", "Sentiment"); - var onnxTextName = "SmallWordEmbed.txt"; - var onnxFileName = "SmallWordEmbed.onnx"; - var onnxTextPath = GetOutputPath(subDir, onnxTextName); - var onnxFilePath = GetOutputPath(subDir, onnxFileName); - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); - SaveOnnxModel(onnxModel, onnxFilePath, onnxTextPath); - - CheckEquality(subDir, onnxTextName, parseOption: NumberParseOption.UseSingle); - Done(); - } - - [Theory] - // These are the supported conversions - // ML.NET does not allow any conversions between signed and unsigned numeric types - // Onnx does not seem to support casting a string to any type - // Though the onnx docs claim support for byte and sbyte, - // CreateNamedOnnxValue in OnnxUtils.cs throws a NotImplementedException for those two - [InlineData(DataKind.Int16, DataKind.Int16)] - [InlineData(DataKind.Int16, DataKind.Int32)] - [InlineData(DataKind.Int16, DataKind.Int64)] - [InlineData(DataKind.Int16, DataKind.Single)] - [InlineData(DataKind.Int16, DataKind.Double)] - [InlineData(DataKind.UInt16, DataKind.UInt16)] - [InlineData(DataKind.UInt16, DataKind.UInt32)] - [InlineData(DataKind.UInt16, DataKind.UInt64)] - [InlineData(DataKind.UInt16, DataKind.Single)] - [InlineData(DataKind.UInt16, DataKind.Double)] - [InlineData(DataKind.Int32, DataKind.Int16)] - [InlineData(DataKind.Int32, DataKind.Int32)] - [InlineData(DataKind.Int32, DataKind.Int64)] - [InlineData(DataKind.Int32, DataKind.Single)] - [InlineData(DataKind.Int32, DataKind.Double)] - [InlineData(DataKind.Int64, DataKind.Int16)] - [InlineData(DataKind.Int64, DataKind.Int32)] - [InlineData(DataKind.Int64, DataKind.Int64)] - [InlineData(DataKind.Int64, DataKind.Single)] - [InlineData(DataKind.Int64, DataKind.Double)] - [InlineData(DataKind.UInt64, DataKind.UInt16)] - [InlineData(DataKind.UInt64, DataKind.UInt32)] - [InlineData(DataKind.UInt64, DataKind.UInt64)] - 
[InlineData(DataKind.UInt64, DataKind.Single)] - [InlineData(DataKind.UInt64, DataKind.Double)] - [InlineData(DataKind.Single, DataKind.Single)] - [InlineData(DataKind.Single, DataKind.Double)] - [InlineData(DataKind.Double, DataKind.Single)] - [InlineData(DataKind.Double, DataKind.Double)] - public void OnnxTypeConversionTest(DataKind fromKind, DataKind toKind) - { - var mlContext = new MLContext(seed: 1); - string filePath = GetDataPath("type-conversion.txt"); - - TextLoader.Column[] columns = new [] - { - new TextLoader.Column("Value", fromKind, 0, 0) - }; - var dataView = mlContext.Data.LoadFromTextFile(filePath, columns); - - var pipeline = mlContext.Transforms.Conversion.ConvertType("ValueConverted", "Value", outputKind: toKind); - var model = pipeline.Fit(dataView); - var mlnetResult = model.Transform(dataView); - - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); - var onnxFileName = "typeconversion.onnx"; - var onnxModelPath = GetOutputPath(onnxFileName); - SaveOnnxModel(onnxModel, onnxModelPath, null); - - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && Environment.Is64BitProcess) - { - string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); - var onnxTransformer = onnxEstimator.Fit(dataView); - var onnxResult = onnxTransformer.Transform(dataView); - - CompareResults(model.ColumnPairs[0].outputColumnName, outputNames[1], mlnetResult, onnxResult); - } - - Done(); - } - - [Fact] - public void PcaOnnxConversionTest() - { - var dataSource = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); - - var mlContext = new MLContext(seed: 1); - var dataView = mlContext.Data.LoadFromTextFile(dataSource, new[] { - new TextLoader.Column("label", DataKind.Single, 11), - new 
TextLoader.Column("features", DataKind.Single, 0, 10) - }, hasHeader: true, separatorChar: ';'); - - bool[] zeroMeans = { true, false }; - foreach (var zeroMean in zeroMeans) - { - var pipeline = ML.Transforms.ProjectToPrincipalComponents("pca", "features", rank: 5, seed: 1, ensureZeroMean: zeroMean); - var model = pipeline.Fit(dataView); - var transformedData = model.Transform(dataView); - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); - - var onnxFileName = "pca.onnx"; - var onnxModelPath = GetOutputPath(onnxFileName); - - SaveOnnxModel(onnxModel, onnxModelPath, null); - - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && Environment.Is64BitProcess) - { - // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. - string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); - var onnxTransformer = onnxEstimator.Fit(dataView); - var onnxResult = onnxTransformer.Transform(dataView); - CompareSelectedR4VectorColumns(model.ColumnPairs[0].outputColumnName, outputNames[2], transformedData, onnxResult); - } - } - - Done(); - } - - private class TransformedDataPoint : DataPoint, IEquatable - { - [VectorType(3)] - public int[] MissingIndicator { get; set; } - - public bool Equals(TransformedDataPoint other) - { - return Enumerable.SequenceEqual(MissingIndicator, other.MissingIndicator); - } - } - - [Fact] - public void IndicateMissingValuesOnnxConversionTest() - { - var mlContext = new MLContext(seed: 1); - - var samples = new List() - { - new DataPoint() { Features = new float[3] {1, 1, 0}, }, - new DataPoint() { Features = new float[3] {0, float.NaN, 1}, }, - new DataPoint() { Features = new float[3] {-1, float.NaN, float.PositiveInfinity}, }, - }; - var dataView = 
mlContext.Data.LoadFromEnumerable(samples); - - // IsNaN outputs a binary tensor. Support for this has been added in the latest version - // of Onnxruntime, but that hasn't been released yet. - // So we need to convert its type to Int32 until then. - // ConvertType part of the pipeline can be removed once we pick up a new release of the Onnx runtime - - var pipeline = mlContext.Transforms.IndicateMissingValues(new[] { new InputOutputColumnPair("MissingIndicator", "Features"), }) - .Append(mlContext.Transforms.Conversion.ConvertType("MissingIndicator", outputKind: DataKind.Int32)); - - var model = pipeline.Fit(dataView); - var transformedData = model.Transform(dataView); - var mlnetData = mlContext.Data.CreateEnumerable(transformedData, false); - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); - - var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "Transforms"); - var onnxFileName = "IndicateMissingValues.onnx"; - var onnxTextName = "IndicateMissingValues.txt"; - var onnxModelPath = GetOutputPath(onnxFileName); - var onnxTextPath = GetOutputPath(subDir, onnxTextName); - - SaveOnnxModel(onnxModel, onnxModelPath, onnxTextPath); - - // Compare results produced by ML.NET and ONNX's runtime. - if (IsOnnxRuntimeSupported()) - { - // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. 
- string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); - var onnxTransformer = onnxEstimator.Fit(dataView); - var onnxResult = onnxTransformer.Transform(dataView); - CompareSelectedVectorColumns(model.LastTransformer.ColumnPairs[0].outputColumnName, outputNames[1], transformedData, onnxResult); - } - - CheckEquality(subDir, onnxTextName, parseOption: NumberParseOption.UseSingle); - Done(); - } - - [Theory] - [InlineData(DataKind.Single)] - [InlineData(DataKind.String)] - public void ValueToKeyMappingOnnxConversionTest(DataKind valueType) - { - var mlContext = new MLContext(seed: 1); - string filePath = GetDataPath("type-conversion.txt"); - - TextLoader.Column[] columns = new[] - { - new TextLoader.Column("Value", valueType, 0, 0) - }; - var dataView = mlContext.Data.LoadFromTextFile(filePath, columns); - - var pipeline = mlContext.Transforms.Conversion.MapValueToKey("Key", "Value"); - var model = pipeline.Fit(dataView); - var mlnetResult = model.Transform(dataView); - - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); - var onnxFileName = "ValueToKey.onnx"; - var onnxModelPath = GetOutputPath(onnxFileName); - SaveOnnxModel(onnxModel, onnxModelPath, null); - - if (IsOnnxRuntimeSupported()) - { - string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); - var onnxTransformer = onnxEstimator.Fit(dataView); - var onnxResult = onnxTransformer.Transform(dataView); - - CompareSelectedVectorColumns(model.ColumnPairs[0].outputColumnName, outputNames[1], 
mlnetResult, onnxResult); - } - - Done(); - } - - private class TextData - { - public string Text { get; set; } - } - - [Fact] - public void WordTokenizerOnnxConversionTest() - { - var mlContext = new MLContext(seed: 1); - - var samples = new List() - { - new TextData(){ Text = "cat sat on mat" }, - new TextData(){ Text = "mat not fit cat" }, - new TextData(){ Text = "cat think mat bad" }, - }; - - var dataView = mlContext.Data.LoadFromEnumerable(samples); - - var pipe = mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text", new[] { ' ' }); - - var model = pipe.Fit(dataView); - var transformedData = model.Transform(dataView); - - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); - var onnxFilename = "Tokenizer.onnx"; - var onnxFilePath = GetOutputPath(onnxFilename); - SaveOnnxModel(onnxModel, onnxFilePath, null); - if (IsOnnxRuntimeSupported()) - { - // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. - string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxFilePath); - var onnxTransformer = onnxEstimator.Fit(dataView); - var onnxResult = onnxTransformer.Transform(dataView); - CompareSelectedVectorColumns>(transformedData.Schema[1].Name, outputNames[1], transformedData, onnxResult); - } - - Done(); - } - - [Theory] - [CombinatorialData] - public void NgramOnnxConnversionTest( - [CombinatorialValues(1, 2, 3)] int ngramLength, - bool useAllLength, - NgramExtractingEstimator.WeightingCriteria weighting) - { - var mlContext = new MLContext(seed: 1); - - var samples = new List() - { - new TextData(){ Text = "cat sat on mat" }, - new TextData(){ Text = "mat not fit cat" }, - new TextData(){ Text = "cat think mat bad" }, - }; - - // Convert training data to IDataView. 
- var dataView = mlContext.Data.LoadFromEnumerable(samples); - - IEstimator[] pipelines = - { - mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text", new[] { ' ' }) - .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens")) - .Append(mlContext.Transforms.Text.ProduceNgrams("NGrams", "Tokens", - ngramLength: ngramLength, - useAllLengths: useAllLength, - weighting: weighting)), - - mlContext.Transforms.Text.ProduceWordBags("Tokens", "Text", - ngramLength: ngramLength, - useAllLengths: useAllLength, - weighting: weighting) - }; - - for (int i = 0; i < pipelines.Length; i++) - { - var pipe = pipelines[i]; - var model = pipe.Fit(dataView); - var transformedData = model.Transform(dataView); - - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); - var onnxFilename = $"Ngram-{i}-{ngramLength}-{useAllLength}-{weighting}.onnx"; - var onnxFilePath = GetOutputPath(onnxFilename); - SaveOnnxModel(onnxModel, onnxFilePath, null); - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && Environment.Is64BitProcess) - { - // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. 
- string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxFilePath); - var onnxTransformer = onnxEstimator.Fit(dataView); - var onnxResult = onnxTransformer.Transform(dataView); - CompareSelectedR4VectorColumns(transformedData.Schema[3].Name, outputNames[outputNames.Length-1], transformedData, onnxResult, 3); - } - } - - Done(); - } - - [Fact] - public void OptionalColumnOnnxTest() - { - var mlContext = new MLContext(seed: 1); - - var samples = new List() - { - new BreastCancerCatFeatureExample() { Label = false, F1 = 0.0f, F2 = "F2"}, - new BreastCancerCatFeatureExample() { Label = true, F1 = 0.1f, F2 = "F2"}, - }; - IHostEnvironment env = mlContext as IHostEnvironment; - var dataView = mlContext.Data.LoadFromEnumerable(samples); - var args = new OptionalColumnTransform.Arguments { Columns = new[] { "F1" }, Data = dataView }; - var transform = OptionalColumnTransform.MakeOptional(env, args); - - var ctx = new OnnxContextImpl(mlContext, "model", "ML.NET", "0", 0, "machinelearning.dotnet", OnnxVersion.Stable); - var outputData = transform.OutputData; - LinkedList transforms = null; - ModelProto onnxModel; - using (var ch = env.Start("ONNX conversion")) - { - SaveOnnxCommand.GetPipe(ctx, ch, outputData, out IDataView root, out IDataView sink, out transforms); - onnxModel = SaveOnnxCommand.ConvertTransformListToOnnxModel(ctx, ch, root, sink, transforms, null, null); - } - - var onnxFileName = "optionalcol.onnx"; - var onnxModelPath = GetOutputPath(onnxFileName); - var onnxTextFileName = "optionalcol.txt"; - var onnxTextPath = GetOutputPath(onnxTextFileName); - - SaveOnnxModel(onnxModel, onnxModelPath, onnxTextPath); - if (IsOnnxRuntimeSupported()) - { - string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => 
valueInfoProto.Name).ToArray(); - string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); - var onnxTransformer = onnxEstimator.Fit(dataView); - var onnxResult = onnxTransformer.Transform(dataView); - CompareSelectedR4ScalarColumns(transform.Model.OutputSchema[2].Name, outputNames[1], outputData, onnxResult); - } - Done(); - } - - [Fact] - public void KeyToValueOnnxConversionTest() - { - var mlContext = new MLContext(seed: 1); - - string dataPath = GetDataPath("breast-cancer.txt"); - var dataView = mlContext.Data.LoadFromTextFile(dataPath, - separatorChar: '\t', - hasHeader: true); - - var pipeline = mlContext.Transforms.Conversion.MapValueToKey("LabelKey", "Label"). - Append(mlContext.Transforms.Conversion.MapKeyToValue("LabelValue", "LabelKey")); - - var model = pipeline.Fit(dataView); - var transformedData = model.Transform(dataView); - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); - - var onnxFileName = "KeyToValue.onnx"; - var onnxModelPath = GetOutputPath(onnxFileName); - - SaveOnnxModel(onnxModel, onnxModelPath, null); - - if (IsOnnxRuntimeSupported()) - { - // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. 
- string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); - var onnxTransformer = onnxEstimator.Fit(dataView); - var onnxResult = onnxTransformer.Transform(dataView); - CompareSelectedScalarColumns>(transformedData.Schema[3].Name, outputNames[3], transformedData, onnxResult); - } - - Done(); - } - - [Fact] - public void MulticlassTrainersOnnxConversionTest() - { - var mlContext = new MLContext(seed: 1); - - string dataPath = GetDataPath("breast-cancer.txt"); - var dataView = mlContext.Data.LoadFromTextFile(dataPath, separatorChar: '\t', hasHeader: true); - - List> estimators = new List>() - { - mlContext.MulticlassClassification.Trainers.LbfgsMaximumEntropy(), - mlContext.MulticlassClassification.Trainers.NaiveBayes(), - mlContext.MulticlassClassification.Trainers.OneVersusAll( - mlContext.BinaryClassification.Trainers.LbfgsLogisticRegression(), useProbabilities:false), - mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy(), - mlContext.MulticlassClassification.Trainers.SdcaNonCalibrated() - }; - - if (Environment.Is64BitProcess) - { - estimators.Add(mlContext.MulticlassClassification.Trainers.LightGbm()); - estimators.Add(mlContext.MulticlassClassification.Trainers.LightGbm( - new LightGbmMulticlassTrainer.Options { UseSoftmax = true })); - } - - var initialPipeline = mlContext.Transforms.ReplaceMissingValues("Features") - .Append(mlContext.Transforms.NormalizeMinMax("Features")) - .Append(mlContext.Transforms.Conversion.MapValueToKey("Label")); - - foreach (var estimator in estimators) - { - var pipeline = initialPipeline.Append(estimator); - var model = pipeline.Fit(dataView); - var transformedData = model.Transform(dataView); - - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); - 
var onnxFileName = $"{estimator.ToString()}.onnx"; - var onnxModelPath = GetOutputPath(onnxFileName); - - SaveOnnxModel(onnxModel, onnxModelPath, null); - - // Compare results produced by ML.NET and ONNX's runtime. - if (IsOnnxRuntimeSupported()) - { - // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. - string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); - var onnxTransformer = onnxEstimator.Fit(dataView); - var onnxResult = onnxTransformer.Transform(dataView); - CompareSelectedScalarColumns(transformedData.Schema[5].Name, outputNames[2], transformedData, onnxResult); - } - } - Done(); - } - - [Fact] - public void CopyColumnsOnnxTest() - { - var mlContext = new MLContext(seed: 1); - - var trainDataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); - var dataView = mlContext.Data.LoadFromTextFile(trainDataPath, - separatorChar: ';', - hasHeader: true); - - var pipeline = mlContext.Transforms.CopyColumns("Target1", "Target"); - var model = pipeline.Fit(dataView); - var transformedData = model.Transform(dataView); - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); - - var onnxFileName = "copycolumns.onnx"; - var onnxModelPath = GetOutputPath(onnxFileName); - - SaveOnnxModel(onnxModel, onnxModelPath, null); - - if (IsOnnxRuntimeSupported()) - { - // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. 
- string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); - var onnxTransformer = onnxEstimator.Fit(dataView); - var onnxResult = onnxTransformer.Transform(dataView); - CompareSelectedR4ScalarColumns(model.ColumnPairs[0].outputColumnName, outputNames[2], transformedData, onnxResult); - } - Done(); - } - -<<<<<<< HEAD - [Fact] - public void FeatureSelectionOnnxTest() - { - var mlContext = new MLContext(seed: 1); - - string dataPath = GetDataPath("breast-cancer.txt"); - - var dataView = ML.Data.LoadFromTextFile(dataPath, new[] { - new TextLoader.Column("ScalarFloat", DataKind.Single, 6), - new TextLoader.Column("VectorFloat", DataKind.Single, 1, 4), - new TextLoader.Column("VectorDouble", DataKind.Double, 4, 8), - new TextLoader.Column("Label", DataKind.Boolean, 0) - }); - - var columns = new[] { - new CountFeatureSelectingEstimator.ColumnOptions("FeatureSelectDouble", "VectorDouble", count: 1), - new CountFeatureSelectingEstimator.ColumnOptions("ScalFeatureSelectMissing690", "ScalarFloat", count: 690), - new CountFeatureSelectingEstimator.ColumnOptions("ScalFeatureSelectMissing100", "ScalarFloat", count: 100), - new CountFeatureSelectingEstimator.ColumnOptions("VecFeatureSelectMissing690", "VectorDouble", count: 690), - new CountFeatureSelectingEstimator.ColumnOptions("VecFeatureSelectMissing100", "VectorDouble", count: 100) - }; - var pipeline = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount("FeatureSelect", "VectorFloat", count: 1) - .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount(columns)) - .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("FeatureSelectMIScalarFloat", "ScalarFloat")) - 
.Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("FeatureSelectMIVectorFloat", "VectorFloat")); - - var model = pipeline.Fit(dataView); - var transformedData = model.Transform(dataView); - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); - - var onnxFileName = "countfeatures.onnx"; - var onnxModelPath = GetOutputPath(onnxFileName); - - SaveOnnxModel(onnxModel, onnxModelPath, null); - - if (IsOnnxRuntimeSupported()) - { - // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. - string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); - var onnxTransformer = onnxEstimator.Fit(dataView); - var onnxResult = onnxTransformer.Transform(dataView); - CompareSelectedR4ScalarColumns("FeatureSelectMIScalarFloat", "FeatureSelectMIScalarFloat0", transformedData, onnxResult); - CompareSelectedR4VectorColumns("FeatureSelectMIVectorFloat", "FeatureSelectMIVectorFloat0", transformedData, onnxResult); - CompareSelectedR4ScalarColumns("ScalFeatureSelectMissing690", "ScalFeatureSelectMissing6900", transformedData, onnxResult); - CompareSelectedR8VectorColumns("VecFeatureSelectMissing690", "VecFeatureSelectMissing6900", transformedData, onnxResult); - } - Done(); - } - - -||||||| constructed merge base -======= - [Fact] - public void SelectColumnsOnnxTest() - { - var mlContext = new MLContext(seed: 1); - - string dataPath = GetDataPath("breast-cancer.txt"); - - var dataView = ML.Data.LoadFromTextFile(dataPath, new[] { - new TextLoader.Column("Label", DataKind.Boolean, 0), - new TextLoader.Column("Thickness", DataKind.Int32, 1), - new TextLoader.Column("Size", DataKind.Int32, 2), - new TextLoader.Column("Shape", DataKind.Int32, 3), - new 
TextLoader.Column("Adhesion", DataKind.Int32, 4), - new TextLoader.Column("EpithelialSize", DataKind.Int32, 5), - new TextLoader.Column("BareNuclei", DataKind.Single, 6), - new TextLoader.Column("BlandChromatin", DataKind.Int32, 7), - new TextLoader.Column("NormalNucleoli", DataKind.Int32, 8), - new TextLoader.Column("Mitoses", DataKind.Int32, 9), - }); - - var pipeline = mlContext.Transforms.ReplaceMissingValues("BareNuclei") - .Append(mlContext.Transforms.SelectColumns(new[] { "Size", "Shape", "Thickness", "Label" })); - - var model = pipeline.Fit(dataView); - var transformedData = model.Transform(dataView); - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); - - var onnxFileName = "selectcolumns.onnx"; - var onnxModelPath = GetOutputPath(onnxFileName); - - SaveOnnxModel(onnxModel, onnxModelPath, null); - - if (IsOnnxRuntimeSupported()) - { - // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. - string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); - var onnxTransformer = onnxEstimator.Fit(dataView); - var onnxResult = onnxTransformer.Transform(dataView); - - Assert.Equal("Size1", outputNames[0]); - Assert.Equal("Shape1", outputNames[1]); - Assert.Equal("Thickness1", outputNames[2]); - Assert.Equal("Label1", outputNames[3]); - - CompareSelectedScalarColumns("Size", "Size1", transformedData, onnxResult); - CompareSelectedScalarColumns("Shape", "Shape1", transformedData, onnxResult); - CompareSelectedScalarColumns("Thickness", "Thickness1", transformedData, onnxResult); - CompareSelectedScalarColumns("Label", "Label1", transformedData, onnxResult); - } - Done(); - } - ->>>>>>> Added onnx export support for SelectColumns - private void CompareResults(string 
leftColumnName, string rightColumnName, IDataView left, IDataView right) - { - var leftColumn = left.Schema[leftColumnName]; - var rightColumn = right.Schema[rightColumnName]; - var leftType = leftColumn.Type.GetItemType(); - var rightType = rightColumn.Type.GetItemType(); - Assert.Equal(leftType, rightType); - - if (leftType == NumberDataViewType.SByte) - CompareSelectedVectorColumns(leftColumnName, rightColumnName, left, right); - else if (leftType == NumberDataViewType.Byte) - CompareSelectedVectorColumns(leftColumnName, rightColumnName, left, right); - else if (leftType == NumberDataViewType.Int16) - CompareSelectedVectorColumns(leftColumnName, rightColumnName, left, right); - else if (leftType == NumberDataViewType.UInt16) - CompareSelectedVectorColumns(leftColumnName, rightColumnName, left, right); - else if (leftType == NumberDataViewType.Int32) - CompareSelectedVectorColumns(leftColumnName, rightColumnName, left, right); - else if (leftType == NumberDataViewType.UInt32) - CompareSelectedVectorColumns(leftColumnName, rightColumnName, left, right); - else if (leftType == NumberDataViewType.Int64) - CompareSelectedVectorColumns(leftColumnName, rightColumnName, left, right); - else if (leftType == NumberDataViewType.UInt64) - CompareSelectedVectorColumns(leftColumnName, rightColumnName, left, right); - else if (leftType == NumberDataViewType.Single) - CompareSelectedR4VectorColumns(leftColumnName, rightColumnName, left, right); - else if (leftType == NumberDataViewType.Double) - CompareSelectedVectorColumns(leftColumnName, rightColumnName, left, right); - } - - private void CompareSelectedVectorColumns(string leftColumnName, string rightColumnName, IDataView left, IDataView right) - { - var leftColumn = left.Schema[leftColumnName]; - var rightColumn = right.Schema[rightColumnName]; - - using (var expectedCursor = left.GetRowCursor(leftColumn)) - using (var actualCursor = right.GetRowCursor(rightColumn)) - { - VBuffer expected = default; - VBuffer actual = 
default; - var expectedGetter = expectedCursor.GetGetter>(leftColumn); - var actualGetter = actualCursor.GetGetter>(rightColumn); - while (expectedCursor.MoveNext() && actualCursor.MoveNext()) - { - expectedGetter(ref expected); - actualGetter(ref actual); - - Assert.Equal(expected.Length, actual.Length); - for (int i = 0; i < expected.Length; ++i) - if (typeof(T) == typeof(ReadOnlyMemory)) - Assert.Equal(expected.GetItemOrDefault(i).ToString(), actual.GetItemOrDefault(i).ToString()); - else - Assert.Equal(expected.GetItemOrDefault(i), actual.GetItemOrDefault(i)); - } - } - } - - private void CompareSelectedR8VectorColumns(string leftColumnName, string rightColumnName, IDataView left, IDataView right, int precision = 6) - { - var leftColumn = left.Schema[leftColumnName]; - var rightColumn = right.Schema[rightColumnName]; - - using (var expectedCursor = left.GetRowCursor(leftColumn)) - using (var actualCursor = right.GetRowCursor(rightColumn)) - { - VBuffer expected = default; - VBuffer actual = default; - var expectedGetter = expectedCursor.GetGetter>(leftColumn); - var actualGetter = actualCursor.GetGetter>(rightColumn); - while (expectedCursor.MoveNext() && actualCursor.MoveNext()) - { - expectedGetter(ref expected); - actualGetter(ref actual); - - Assert.Equal(expected.Length, actual.Length); - for (int i = 0; i < expected.Length; ++i) - Assert.Equal(expected.GetItemOrDefault(i), actual.GetItemOrDefault(i), precision); - } - } - } - - private void CompareSelectedR4VectorColumns(string leftColumnName, string rightColumnName, IDataView left, IDataView right, int precision = 6) - { - var leftColumn = left.Schema[leftColumnName]; - var rightColumn = right.Schema[rightColumnName]; - - using (var expectedCursor = left.GetRowCursor(leftColumn)) - using (var actualCursor = right.GetRowCursor(rightColumn)) - { - VBuffer expected = default; - VBuffer actual = default; - var expectedGetter = expectedCursor.GetGetter>(leftColumn); - var actualGetter = 
actualCursor.GetGetter>(rightColumn); - while (expectedCursor.MoveNext() && actualCursor.MoveNext()) - { - expectedGetter(ref expected); - actualGetter(ref actual); - - Assert.Equal(expected.Length, actual.Length); - for (int i = 0; i < expected.Length; ++i) - { - // We are using float values. But the Assert.Equal function takes doubles. - // And sometimes the converted doubles are different in their precision. - // So make sure we compare floats - float exp = expected.GetItemOrDefault(i); - float act = actual.GetItemOrDefault(i); - CompareNumbersWithTolerance(exp, act, null, precision); - } - } - } - } - - private void CompareSelectedR4ScalarColumns(string leftColumnName, string rightColumnName, IDataView left, IDataView right, int precision = 6) - { - var leftColumn = left.Schema[leftColumnName]; - var rightColumn = right.Schema[rightColumnName]; - - using (var expectedCursor = left.GetRowCursor(leftColumn)) - using (var actualCursor = right.GetRowCursor(rightColumn)) - { - float expected = default; - VBuffer actual = default; - var expectedGetter = expectedCursor.GetGetter(leftColumn); - var actualGetter = actualCursor.GetGetter>(rightColumn); - while (expectedCursor.MoveNext() && actualCursor.MoveNext()) - { - expectedGetter(ref expected); - actualGetter(ref actual); - - // Scalar such as R4 (float) is converted to [1, 1]-tensor in ONNX format for consitency of making batch prediction. 
- Assert.Equal(1, actual.Length); - CompareNumbersWithTolerance(expected, actual.GetItemOrDefault(0), null, precision); - } - } - } - - private void CompareSelectedScalarColumns(string leftColumnName, string rightColumnName, IDataView left, IDataView right) - { - var leftColumn = left.Schema[leftColumnName]; - var rightColumn = right.Schema[rightColumnName]; - - using (var expectedCursor = left.GetRowCursor(leftColumn)) - using (var actualCursor = right.GetRowCursor(rightColumn)) - { - T expected = default; - VBuffer actual = default; - var expectedGetter = expectedCursor.GetGetter(leftColumn); - var actualGetter = actualCursor.GetGetter>(rightColumn); - while (expectedCursor.MoveNext() && actualCursor.MoveNext()) - { - expectedGetter(ref expected); - actualGetter(ref actual); - var actualVal = actual.GetItemOrDefault(0); - - Assert.Equal(1, actual.Length); - - if (typeof(T) == typeof(ReadOnlyMemory)) - Assert.Equal(expected.ToString(), actualVal.ToString()); - else - Assert.Equal(expected, actualVal); - } - } - } - - private void SaveOnnxModel(ModelProto model, string binaryFormatPath, string textFormatPath) - { - DeleteOutputPath(binaryFormatPath); // Clean if such a file exists. - DeleteOutputPath(textFormatPath); - - if (binaryFormatPath != null) - using (var file = Env.CreateOutputFile(binaryFormatPath)) - using (var stream = file.CreateWriteStream()) - model.WriteTo(stream); - - if (textFormatPath != null) - { - using (var file = Env.CreateOutputFile(textFormatPath)) - using (var stream = file.CreateWriteStream()) - using (var writer = new StreamWriter(stream)) - { - var parsedJson = JsonConvert.DeserializeObject(model.ToString()); - writer.Write(JsonConvert.SerializeObject(parsedJson, Formatting.Indented)); - } - - // Strip the version information. 
- var fileText = File.ReadAllText(textFormatPath); - - fileText = Regex.Replace(fileText, "\"producerVersion\": \".*\"", "\"producerVersion\": \"##VERSION##\""); - File.WriteAllText(textFormatPath, fileText); - } - } - } -} From f773b0e4daa3cea9d5cc9dfa7d4b9dd3e107d319 Mon Sep 17 00:00:00 2001 From: "Harish S. Kulkarni" Date: Wed, 18 Dec 2019 11:41:59 -0800 Subject: [PATCH 3/5] Added doc entry for onnx --- src/Microsoft.ML.Data/Transforms/ColumnSelecting.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Microsoft.ML.Data/Transforms/ColumnSelecting.cs b/src/Microsoft.ML.Data/Transforms/ColumnSelecting.cs index d2cd9b0e9d..78dcbbc6d1 100644 --- a/src/Microsoft.ML.Data/Transforms/ColumnSelecting.cs +++ b/src/Microsoft.ML.Data/Transforms/ColumnSelecting.cs @@ -44,6 +44,7 @@ namespace Microsoft.ML.Transforms /// | -- | -- | /// | Does this estimator need to look at the data to train its parameters? | No | /// | Input columns data type | Any | + /// | Exportable to ONNX | Yes | /// /// The resulting /// operates on the schema of a given by dropping or keeping selected columns from the schema. From 1b6cb4f142553606b62bcf6520981c34aca2906b Mon Sep 17 00:00:00 2001 From: "Harish S. 
Kulkarni" Date: Mon, 13 Jan 2020 11:05:53 -0800 Subject: [PATCH 4/5] Added custom data transform for OnnxTransform to fix output schema format --- .../Transforms/RowToRowTransformerBase.cs | 8 +- .../OnnxTransform.cs | 203 ++++++++++- .../Common/Onnx/Transforms/SelectColumns.txt | 338 ++++++++++++++++++ test/Microsoft.ML.Tests/OnnxConversionTest.cs | 13 +- 4 files changed, 552 insertions(+), 10 deletions(-) create mode 100644 test/BaselineOutput/Common/Onnx/Transforms/SelectColumns.txt diff --git a/src/Microsoft.ML.Data/Transforms/RowToRowTransformerBase.cs b/src/Microsoft.ML.Data/Transforms/RowToRowTransformerBase.cs index baf3400eaf..f2f1f6e3cf 100644 --- a/src/Microsoft.ML.Data/Transforms/RowToRowTransformerBase.cs +++ b/src/Microsoft.ML.Data/Transforms/RowToRowTransformerBase.cs @@ -37,14 +37,18 @@ IRowToRowMapper ITransformer.GetRowToRowMapper(DataViewSchema inputSchema) [BestFriend] private protected abstract IRowMapper MakeRowMapper(DataViewSchema schema); - public DataViewSchema GetOutputSchema(DataViewSchema inputSchema) + public DataViewSchema GetOutputSchema(DataViewSchema inputSchema) => GetOutputSchemaCore(inputSchema); + + protected virtual DataViewSchema GetOutputSchemaCore(DataViewSchema inputSchema) { Host.CheckValue(inputSchema, nameof(inputSchema)); var mapper = MakeRowMapper(inputSchema); return RowToRowMapperTransform.GetOutputSchema(inputSchema, mapper); } - public IDataView Transform(IDataView input) => MakeDataTransform(input); + public IDataView Transform(IDataView input) => MakeDataTransformCore(input); + + private protected virtual IDataView MakeDataTransformCore(IDataView input) => MakeDataTransform(input); [BestFriend] private protected RowToRowMapperTransform MakeDataTransform(IDataView input) diff --git a/src/Microsoft.ML.OnnxTransformer/OnnxTransform.cs b/src/Microsoft.ML.OnnxTransformer/OnnxTransform.cs index 65efbaf419..e021611df2 100644 --- a/src/Microsoft.ML.OnnxTransformer/OnnxTransform.cs +++ 
b/src/Microsoft.ML.OnnxTransformer/OnnxTransform.cs @@ -6,7 +6,6 @@ using System.Collections.Generic; using System.IO; using System.Linq; -using System.Text; using Microsoft.ML; using Microsoft.ML.CommandLine; using Microsoft.ML.Data; @@ -117,6 +116,8 @@ internal sealed class Options : TransformInputBase /// internal DataViewType[] OutputTypes { get; } + public readonly DataViewSchema OutputSchema; + private static VersionInfo GetVersionInfo() { return new VersionInfo( @@ -133,12 +134,18 @@ private static VersionInfo GetVersionInfo() // Factory method for SignatureDataTransform private static IDataTransform Create(IHostEnvironment env, Options options, IDataView input) { - return new OnnxTransformer(env, options).MakeDataTransform(input); + var transformer = new OnnxTransformer(env, options); + var mapper = new Mapper(transformer, input.Schema); + return new OnnxDataTransform(env, input, mapper); } // Factory method for SignatureLoadDataTransform private static IDataTransform Create(IHostEnvironment env, ModelLoadContext ctx, IDataView input) - => Create(env, ctx).MakeDataTransform(input); + { + var transformer = OnnxTransformer.Create(env, ctx); + var mapper = new Mapper(transformer, input.Schema); + return new OnnxDataTransform(env, input, mapper); + } // Factory method for SignatureLoadModel. private static OnnxTransformer Create(IHostEnvironment env, ModelLoadContext ctx) @@ -187,8 +194,7 @@ private static OnnxTransformer Create(IHostEnvironment env, ModelLoadContext ctx } // Factory method for SignatureLoadRowMapper. 
- private static IRowMapper Create(IHostEnvironment env, ModelLoadContext ctx, DataViewSchema inputSchema) - => Create(env, ctx).MakeRowMapper(inputSchema); + private static IRowMapper Create(IHostEnvironment env, ModelLoadContext ctx, DataViewSchema inputSchema) => new Mapper(Create(env, ctx), inputSchema); private OnnxTransformer(IHostEnvironment env, Options options, byte[] modelBytes = null) : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(OnnxTransformer))) @@ -243,6 +249,12 @@ private OnnxTransformer(IHostEnvironment env, Options options, byte[] modelBytes OutputTypes[i] = outputInfo.DataViewType; } _options = options; + + var schemaBuilder = new DataViewSchema.Builder(); + for (var i = 0; i < Outputs.Length; i++) + schemaBuilder.AddColumn(Outputs[i], OutputTypes[i]); + + OutputSchema = schemaBuilder.ToSchema(); } /// @@ -326,6 +338,14 @@ private protected override void SaveModel(ModelSaveContext ctx) private protected override IRowMapper MakeRowMapper(DataViewSchema inputSchema) => new Mapper(this, inputSchema); + protected override DataViewSchema GetOutputSchemaCore(DataViewSchema inputSchema) => OutputSchema; + + private protected override IDataView MakeDataTransformCore(IDataView input) + { + Host.CheckValue(input, nameof(input)); + return new OnnxDataTransform(Host, input, new Mapper(this, input.Schema)); + } + /// /// This design assumes that all unknown dimensions are 1s. It also convert scalar shape [] in ONNX to [1]. /// [TODO] We should infer the unknown shape from input data instead of forcing them to be 1. 
@@ -356,6 +376,8 @@ private sealed class Mapper : MapperBase /// private readonly Type[] _inputOnnxTypes; + public DataViewSchema OutputSchema => _parent.GetOutputSchema(InputSchema); + public Mapper(OnnxTransformer parent, DataViewSchema inputSchema) : base(Contracts.CheckRef(parent, nameof(parent)).Host.Register(nameof(Mapper)), inputSchema, parent) { @@ -401,6 +423,8 @@ public Mapper(OnnxTransformer parent, DataViewSchema inputSchema) : } } + public DataViewSchema.DetachedColumn[] GetOutputColumns() => GetOutputColumnsCore(); + protected override DataViewSchema.DetachedColumn[] GetOutputColumnsCore() { var info = new DataViewSchema.DetachedColumn[_parent.Outputs.Length]; @@ -409,6 +433,8 @@ protected override DataViewSchema.DetachedColumn[] GetOutputColumnsCore() return info; } + public Func GetDependencies(Func activeOutput) => GetDependenciesCore(activeOutput); + private protected override Func GetDependenciesCore(Func activeOutput) { return col => Enumerable.Range(0, _parent.Outputs.Length).Any(i => activeOutput(i)) && _inputColIndices.Any(i => i == col); @@ -646,6 +672,173 @@ public NamedOnnxValue GetNamedOnnxValue() } } } + + private class OnnxDataTransform : TransformBase, IRowToRowMapper + { + private readonly Mapper _mapper; + private readonly IRowMapper _mapperIf; + + public OnnxDataTransform(IHostEnvironment env, IDataView input, Mapper mapper) + :base(env.Register(nameof(OnnxDataTransform)), input) + { + _mapper = mapper; + _mapperIf = mapper as IRowMapper; + } + + public DataViewSchema Schema => OutputSchema; + + public DataViewSchema InputSchema => Source.Schema; + + public override DataViewSchema OutputSchema => _mapper.OutputSchema; + + public override long? 
GetRowCount() => Source.GetRowCount(); + + public void Save(ModelSaveContext ctx) => _mapperIf.Save(ctx); + + public IEnumerable GetDependencies(IEnumerable dependingColumns) + { + return Source.Schema; + } + + public DataViewRow GetRow(DataViewRow input, IEnumerable activeColumns) + { + Host.CheckValue(input, nameof(input)); + Host.CheckValue(activeColumns, nameof(activeColumns)); + Host.Check(input.Schema == Source.Schema, "Schema of input row must be the same as the schema the mapper is bound to"); + + using (var ch = Host.Start("GetEntireRow")) + { + var pred = RowCursorUtils.FromColumnsToPredicate(activeColumns, Schema); + var getters = _mapperIf.CreateGetters(input, pred, out Action disp); + return new RowImpl(input, this, Schema, getters, disp); + } + } + + protected override DataViewRowCursor GetRowCursorCore(IEnumerable columnsNeeded, Random rand = null) + { + var predicate = RowCursorUtils.FromColumnsToPredicate(columnsNeeded, Schema); + var active = Utils.BuildArray(Schema.Count, predicate); + return new Cursor(Host, Source.GetRowCursor(Source.Schema, rand), this, active); + } + + public override DataViewRowCursor[] GetRowCursorSet(IEnumerable columnsNeeded, int n, Random rand = null) + { + Host.CheckValueOrNull(rand); + + var predicate = RowCursorUtils.FromColumnsToPredicate(columnsNeeded, OutputSchema); + var active = Utils.BuildArray(Schema.Count, predicate); + + var inputs = Source.GetRowCursorSet(Source.Schema, n, rand); + Host.AssertNonEmpty(inputs); + + if (inputs.Length == 1 && n > 1 && Enumerable.Range(0, Schema.Count).Any(predicate)) + inputs = DataViewUtils.CreateSplitCursors(Host, inputs[0], n); + Host.AssertNonEmpty(inputs); + + var cursors = new DataViewRowCursor[inputs.Length]; + for (int i = 0; i < inputs.Length; i++) + cursors[i] = new Cursor(Host, inputs[i], this, active); + return cursors; + } + + private protected override void SaveModel(ModelSaveContext ctx) => _mapperIf.Save(ctx); + + protected override bool? 
ShouldUseParallelCursors(Func predicate) + { + return true; + } + + private sealed class RowImpl : WrappingRow + { + private readonly Delegate[] _getters; + private readonly OnnxDataTransform _parent; + private readonly Action _disposer; + + public override DataViewSchema Schema { get; } + + public RowImpl(DataViewRow input, OnnxDataTransform parent, DataViewSchema schema, Delegate[] getters, Action disposer) + : base(input) + { + _parent = parent; + Schema = schema; + _getters = getters; + _disposer = disposer; + } + + protected override void DisposeCore(bool disposing) + { + if (disposing) + _disposer?.Invoke(); + } + + /// + /// Returns a value getter delegate to fetch the value of column with the given columnIndex, from the row. + /// This throws if the column is not active in this row, or if the type + /// differs from this column's type. + /// + /// is the column's content type. + /// is the output column whose getter should be returned. + public override ValueGetter GetGetter(DataViewSchema.Column column) + { + int index = column.Index; + Contracts.Assert(_getters[index] != null); + var fn = _getters[index] as ValueGetter; + if (fn == null) + throw Contracts.Except("Invalid TValue in GetGetter: '{0}'", typeof(TValue)); + return fn; + } + + /// + /// Returns whether the given column is active in this row. 
+ /// + public override bool IsColumnActive(DataViewSchema.Column column) + { + return _getters[column.Index] != null; + } + } + + private sealed class Cursor : SynchronizedCursorBase + { + private readonly OnnxDataTransform _parent; + private readonly Delegate[] _getters; + private readonly bool[] _active; + private readonly Action _disposer; + private bool _disposed; + + public Cursor(IChannelProvider provider, DataViewRowCursor input, OnnxDataTransform parent, bool[] active) + : base(provider, input) + { + _parent = parent; + Func pred = c => active[c]; + _getters = parent._mapperIf.CreateGetters(input, pred, out _disposer); + _active = active; + } + + public override DataViewSchema Schema => _parent._mapper.OutputSchema; + + public override ValueGetter GetGetter(DataViewSchema.Column column) + { + var getter = _getters[column.Index]; + Ch.Assert(getter != null); + var fn = getter as ValueGetter; + if (fn == null) + throw Ch.Except("Invalid TValue in GetGetter: '{0}'", typeof(TValue)); + return fn; + } + + protected override void Dispose(bool disposing) + { + if (_disposed) + return; + if (disposing) + _disposer?.Invoke(); + _disposed = true; + base.Dispose(disposing); + } + + public override bool IsColumnActive(DataViewSchema.Column column) => _active[column.Index]; + } + } } /// diff --git a/test/BaselineOutput/Common/Onnx/Transforms/SelectColumns.txt b/test/BaselineOutput/Common/Onnx/Transforms/SelectColumns.txt new file mode 100644 index 0000000000..5672bbc9da --- /dev/null +++ b/test/BaselineOutput/Common/Onnx/Transforms/SelectColumns.txt @@ -0,0 +1,338 @@ +{ + "irVersion": "6", + "producerName": "ML.NET", + "producerVersion": "##VERSION##", + "domain": "machinelearning.dotnet", + "graph": { + "node": [ + { + "input": [ + "Size" + ], + "output": [ + "Size0" + ], + "name": "Identity", + "opType": "Identity" + }, + { + "input": [ + "Shape" + ], + "output": [ + "Shape0" + ], + "name": "Identity0", + "opType": "Identity" + }, + { + "input": [ + "Thickness" + ], 
+ "output": [ + "Thickness0" + ], + "name": "Identity1", + "opType": "Identity" + }, + { + "input": [ + "Label" + ], + "output": [ + "Label0" + ], + "name": "Identity2", + "opType": "Identity" + }, + { + "input": [ + "Size0" + ], + "output": [ + "Size1" + ], + "name": "Identity3", + "opType": "Identity" + }, + { + "input": [ + "Shape0" + ], + "output": [ + "Shape1" + ], + "name": "Identity4", + "opType": "Identity" + }, + { + "input": [ + "Thickness0" + ], + "output": [ + "Thickness1" + ], + "name": "Identity5", + "opType": "Identity" + }, + { + "input": [ + "Label0" + ], + "output": [ + "Label1" + ], + "name": "Identity6", + "opType": "Identity" + } + ], + "name": "model", + "input": [ + { + "name": "Label", + "type": { + "tensorType": { + "elemType": 9, + "shape": { + "dim": [ + { + "dimValue": "1" + }, + { + "dimValue": "1" + } + ] + } + } + } + }, + { + "name": "Thickness", + "type": { + "tensorType": { + "elemType": 6, + "shape": { + "dim": [ + { + "dimValue": "1" + }, + { + "dimValue": "1" + } + ] + } + } + } + }, + { + "name": "Size", + "type": { + "tensorType": { + "elemType": 6, + "shape": { + "dim": [ + { + "dimValue": "1" + }, + { + "dimValue": "1" + } + ] + } + } + } + }, + { + "name": "Shape", + "type": { + "tensorType": { + "elemType": 6, + "shape": { + "dim": [ + { + "dimValue": "1" + }, + { + "dimValue": "1" + } + ] + } + } + } + }, + { + "name": "Adhesion", + "type": { + "tensorType": { + "elemType": 6, + "shape": { + "dim": [ + { + "dimValue": "1" + }, + { + "dimValue": "1" + } + ] + } + } + } + }, + { + "name": "EpithelialSize", + "type": { + "tensorType": { + "elemType": 6, + "shape": { + "dim": [ + { + "dimValue": "1" + }, + { + "dimValue": "1" + } + ] + } + } + } + }, + { + "name": "BlandChromatin", + "type": { + "tensorType": { + "elemType": 6, + "shape": { + "dim": [ + { + "dimValue": "1" + }, + { + "dimValue": "1" + } + ] + } + } + } + }, + { + "name": "NormalNucleoli", + "type": { + "tensorType": { + "elemType": 6, + "shape": { + "dim": [ 
+ { + "dimValue": "1" + }, + { + "dimValue": "1" + } + ] + } + } + } + }, + { + "name": "Mitoses", + "type": { + "tensorType": { + "elemType": 6, + "shape": { + "dim": [ + { + "dimValue": "1" + }, + { + "dimValue": "1" + } + ] + } + } + } + } + ], + "output": [ + { + "name": "Size1", + "type": { + "tensorType": { + "elemType": 6, + "shape": { + "dim": [ + { + "dimValue": "1" + }, + { + "dimValue": "1" + } + ] + } + } + } + }, + { + "name": "Shape1", + "type": { + "tensorType": { + "elemType": 6, + "shape": { + "dim": [ + { + "dimValue": "1" + }, + { + "dimValue": "1" + } + ] + } + } + } + }, + { + "name": "Thickness1", + "type": { + "tensorType": { + "elemType": 6, + "shape": { + "dim": [ + { + "dimValue": "1" + }, + { + "dimValue": "1" + } + ] + } + } + } + }, + { + "name": "Label1", + "type": { + "tensorType": { + "elemType": 9, + "shape": { + "dim": [ + { + "dimValue": "1" + }, + { + "dimValue": "1" + } + ] + } + } + } + } + ] + }, + "opsetImport": [ + { + "domain": "ai.onnx.ml", + "version": "2" + }, + { + "version": "11" + } + ] +} \ No newline at end of file diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index 5bd0db013a..1bdadba595 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -1322,14 +1322,12 @@ public void SelectColumnsOnnxTest() new TextLoader.Column("Shape", DataKind.Int32, 3), new TextLoader.Column("Adhesion", DataKind.Int32, 4), new TextLoader.Column("EpithelialSize", DataKind.Int32, 5), - new TextLoader.Column("BareNuclei", DataKind.Single, 6), new TextLoader.Column("BlandChromatin", DataKind.Int32, 7), new TextLoader.Column("NormalNucleoli", DataKind.Int32, 8), new TextLoader.Column("Mitoses", DataKind.Int32, 9), }); - var pipeline = mlContext.Transforms.ReplaceMissingValues("BareNuclei") - .Append(mlContext.Transforms.SelectColumns(new[] { "Size", "Shape", "Thickness", "Label" })); + var pipeline = 
mlContext.Transforms.SelectColumns(new[] { "Size", "Shape", "Thickness", "Label" }); var model = pipeline.Fit(dataView); var transformedData = model.Transform(dataView); @@ -1349,6 +1347,8 @@ public void SelectColumnsOnnxTest() var onnxTransformer = onnxEstimator.Fit(dataView); var onnxResult = onnxTransformer.Transform(dataView); + // Verify that onnx output has only the four columns we selected from the input + Assert.Equal(4, outputNames.Length); Assert.Equal("Size1", outputNames[0]); Assert.Equal("Shape1", outputNames[1]); Assert.Equal("Thickness1", outputNames[2]); @@ -1359,6 +1359,13 @@ public void SelectColumnsOnnxTest() CompareSelectedScalarColumns("Thickness", "Thickness1", transformedData, onnxResult); CompareSelectedScalarColumns("Label", "Label1", transformedData, onnxResult); } + + onnxFileName = "SelectColumns.txt"; + var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "Transforms"); + var onnxTextModelPath = GetOutputPath(subDir, onnxFileName); + SaveOnnxModel(onnxModel, null, onnxTextModelPath); + CheckEquality(subDir, onnxFileName, digitsOfPrecision: 1); + Done(); } From 6cc16d9b624c55f02a24035619191dc2c321f959 Mon Sep 17 00:00:00 2001 From: "Harish S. 
Kulkarni" Date: Tue, 21 Jan 2020 12:45:13 -0800 Subject: [PATCH 5/5] Backed out earlier changes from RowToRowTransformerBase.cs and OnnxTransform.cs and updated OnnxContext to include a facility to remove input variables --- .../Model/Onnx/OnnxContext.cs | 6 + .../Transforms/ColumnSelecting.cs | 9 + .../Transforms/RowToRowTransformerBase.cs | 8 +- .../OnnxContextImpl.cs | 9 + .../OnnxTransform.cs | 203 +----------------- .../Common/Onnx/Transforms/SelectColumns.txt | 90 -------- 6 files changed, 31 insertions(+), 294 deletions(-) diff --git a/src/Microsoft.ML.Data/Model/Onnx/OnnxContext.cs b/src/Microsoft.ML.Data/Model/Onnx/OnnxContext.cs index 6183bb06c7..a6542c56a8 100644 --- a/src/Microsoft.ML.Data/Model/Onnx/OnnxContext.cs +++ b/src/Microsoft.ML.Data/Model/Onnx/OnnxContext.cs @@ -60,6 +60,12 @@ internal abstract class OnnxContext /// IDataView column to stop tracking public abstract void RemoveVariable(string variableName, bool removeColumn); + /// + /// Removes a variable from the input columns list. This function is used only by the ColumnSelectingTransformer. + /// + /// ONNX variable to remove. + public abstract void RemoveInputVariable(string variableName); + /// /// ONNX variables are referred to by name. 
At each stage of a ML.NET pipeline, the corresponding /// 's column names will map to a variable in the ONNX graph if the intermediate steps diff --git a/src/Microsoft.ML.Data/Transforms/ColumnSelecting.cs b/src/Microsoft.ML.Data/Transforms/ColumnSelecting.cs index 78dcbbc6d1..df3665fa90 100644 --- a/src/Microsoft.ML.Data/Transforms/ColumnSelecting.cs +++ b/src/Microsoft.ML.Data/Transforms/ColumnSelecting.cs @@ -732,6 +732,8 @@ IDataTransform ITransformTemplate.ApplyToData(IHostEnvironment env, IDataView ne public void SaveAsOnnx(OnnxContext ctx) { + var droppedCols = new HashSet(Enumerable.Range(0, InputSchema.Count)); + var outputToInputMap = _mapper.OutputToInputMap; for(int i = 0; i < outputToInputMap.Length; i++) { @@ -741,6 +743,13 @@ public void SaveAsOnnx(OnnxContext ctx) var dstVariable = ctx.AddIntermediateVariable(dstCol.Type, dstCol.Name, true); string opType = "Identity"; ctx.CreateNode(opType, srcVariable, dstVariable, ctx.GetNodeName(opType), ""); + + droppedCols.Remove(srcCol.Index); + } + + foreach (var srcCol in droppedCols) + { + ctx.RemoveInputVariable(InputSchema[srcCol].Name); } } } diff --git a/src/Microsoft.ML.Data/Transforms/RowToRowTransformerBase.cs b/src/Microsoft.ML.Data/Transforms/RowToRowTransformerBase.cs index f2f1f6e3cf..baf3400eaf 100644 --- a/src/Microsoft.ML.Data/Transforms/RowToRowTransformerBase.cs +++ b/src/Microsoft.ML.Data/Transforms/RowToRowTransformerBase.cs @@ -37,18 +37,14 @@ IRowToRowMapper ITransformer.GetRowToRowMapper(DataViewSchema inputSchema) [BestFriend] private protected abstract IRowMapper MakeRowMapper(DataViewSchema schema); - public DataViewSchema GetOutputSchema(DataViewSchema inputSchema) => GetOutputSchemaCore(inputSchema); - - protected virtual DataViewSchema GetOutputSchemaCore(DataViewSchema inputSchema) + public DataViewSchema GetOutputSchema(DataViewSchema inputSchema) { Host.CheckValue(inputSchema, nameof(inputSchema)); var mapper = MakeRowMapper(inputSchema); return 
RowToRowMapperTransform.GetOutputSchema(inputSchema, mapper); } - public IDataView Transform(IDataView input) => MakeDataTransformCore(input); - - private protected virtual IDataView MakeDataTransformCore(IDataView input) => MakeDataTransform(input); + public IDataView Transform(IDataView input) => MakeDataTransform(input); [BestFriend] private protected RowToRowMapperTransform MakeDataTransform(IDataView input) diff --git a/src/Microsoft.ML.OnnxConverter/OnnxContextImpl.cs b/src/Microsoft.ML.OnnxConverter/OnnxContextImpl.cs index 8105a81126..41e05a7053 100644 --- a/src/Microsoft.ML.OnnxConverter/OnnxContextImpl.cs +++ b/src/Microsoft.ML.OnnxConverter/OnnxContextImpl.cs @@ -247,6 +247,15 @@ public void AddInputVariable(DataViewType type, string colName) _inputs.Add(OnnxUtils.GetModelArgs(type, colName)); } + public override void RemoveInputVariable(string colName) + { + var variableName = TryGetVariableName(colName); + _host.CheckValue(variableName, nameof(variableName)); + + RemoveVariable(variableName, true); + _inputs.Remove(_inputs.Single(modelArg => modelArg.Name == variableName)); + } + /// /// Retrieve the shape of an ONNX variable. Returns null if no shape for the specified variable can be found. 
/// diff --git a/src/Microsoft.ML.OnnxTransformer/OnnxTransform.cs b/src/Microsoft.ML.OnnxTransformer/OnnxTransform.cs index e021611df2..65efbaf419 100644 --- a/src/Microsoft.ML.OnnxTransformer/OnnxTransform.cs +++ b/src/Microsoft.ML.OnnxTransformer/OnnxTransform.cs @@ -6,6 +6,7 @@ using System.Collections.Generic; using System.IO; using System.Linq; +using System.Text; using Microsoft.ML; using Microsoft.ML.CommandLine; using Microsoft.ML.Data; @@ -116,8 +117,6 @@ internal sealed class Options : TransformInputBase /// internal DataViewType[] OutputTypes { get; } - public readonly DataViewSchema OutputSchema; - private static VersionInfo GetVersionInfo() { return new VersionInfo( @@ -134,18 +133,12 @@ private static VersionInfo GetVersionInfo() // Factory method for SignatureDataTransform private static IDataTransform Create(IHostEnvironment env, Options options, IDataView input) { - var transformer = new OnnxTransformer(env, options); - var mapper = new Mapper(transformer, input.Schema); - return new OnnxDataTransform(env, input, mapper); + return new OnnxTransformer(env, options).MakeDataTransform(input); } // Factory method for SignatureLoadDataTransform private static IDataTransform Create(IHostEnvironment env, ModelLoadContext ctx, IDataView input) - { - var transformer = OnnxTransformer.Create(env, ctx); - var mapper = new Mapper(transformer, input.Schema); - return new OnnxDataTransform(env, input, mapper); - } + => Create(env, ctx).MakeDataTransform(input); // Factory method for SignatureLoadModel. private static OnnxTransformer Create(IHostEnvironment env, ModelLoadContext ctx) @@ -194,7 +187,8 @@ private static OnnxTransformer Create(IHostEnvironment env, ModelLoadContext ctx } // Factory method for SignatureLoadRowMapper. 
- private static IRowMapper Create(IHostEnvironment env, ModelLoadContext ctx, DataViewSchema inputSchema) => new Mapper(Create(env, ctx), inputSchema); + private static IRowMapper Create(IHostEnvironment env, ModelLoadContext ctx, DataViewSchema inputSchema) + => Create(env, ctx).MakeRowMapper(inputSchema); private OnnxTransformer(IHostEnvironment env, Options options, byte[] modelBytes = null) : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(OnnxTransformer))) @@ -249,12 +243,6 @@ private OnnxTransformer(IHostEnvironment env, Options options, byte[] modelBytes OutputTypes[i] = outputInfo.DataViewType; } _options = options; - - var schemaBuilder = new DataViewSchema.Builder(); - for (var i = 0; i < Outputs.Length; i++) - schemaBuilder.AddColumn(Outputs[i], OutputTypes[i]); - - OutputSchema = schemaBuilder.ToSchema(); } /// @@ -338,14 +326,6 @@ private protected override void SaveModel(ModelSaveContext ctx) private protected override IRowMapper MakeRowMapper(DataViewSchema inputSchema) => new Mapper(this, inputSchema); - protected override DataViewSchema GetOutputSchemaCore(DataViewSchema inputSchema) => OutputSchema; - - private protected override IDataView MakeDataTransformCore(IDataView input) - { - Host.CheckValue(input, nameof(input)); - return new OnnxDataTransform(Host, input, new Mapper(this, input.Schema)); - } - /// /// This design assumes that all unknown dimensions are 1s. It also convert scalar shape [] in ONNX to [1]. /// [TODO] We should infer the unknown shape from input data instead of forcing them to be 1. 
@@ -376,8 +356,6 @@ private sealed class Mapper : MapperBase /// private readonly Type[] _inputOnnxTypes; - public DataViewSchema OutputSchema => _parent.GetOutputSchema(InputSchema); - public Mapper(OnnxTransformer parent, DataViewSchema inputSchema) : base(Contracts.CheckRef(parent, nameof(parent)).Host.Register(nameof(Mapper)), inputSchema, parent) { @@ -423,8 +401,6 @@ public Mapper(OnnxTransformer parent, DataViewSchema inputSchema) : } } - public DataViewSchema.DetachedColumn[] GetOutputColumns() => GetOutputColumnsCore(); - protected override DataViewSchema.DetachedColumn[] GetOutputColumnsCore() { var info = new DataViewSchema.DetachedColumn[_parent.Outputs.Length]; @@ -433,8 +409,6 @@ protected override DataViewSchema.DetachedColumn[] GetOutputColumnsCore() return info; } - public Func GetDependencies(Func activeOutput) => GetDependenciesCore(activeOutput); - private protected override Func GetDependenciesCore(Func activeOutput) { return col => Enumerable.Range(0, _parent.Outputs.Length).Any(i => activeOutput(i)) && _inputColIndices.Any(i => i == col); @@ -672,173 +646,6 @@ public NamedOnnxValue GetNamedOnnxValue() } } } - - private class OnnxDataTransform : TransformBase, IRowToRowMapper - { - private readonly Mapper _mapper; - private readonly IRowMapper _mapperIf; - - public OnnxDataTransform(IHostEnvironment env, IDataView input, Mapper mapper) - :base(env.Register(nameof(OnnxDataTransform)), input) - { - _mapper = mapper; - _mapperIf = mapper as IRowMapper; - } - - public DataViewSchema Schema => OutputSchema; - - public DataViewSchema InputSchema => Source.Schema; - - public override DataViewSchema OutputSchema => _mapper.OutputSchema; - - public override long? 
GetRowCount() => Source.GetRowCount(); - - public void Save(ModelSaveContext ctx) => _mapperIf.Save(ctx); - - public IEnumerable GetDependencies(IEnumerable dependingColumns) - { - return Source.Schema; - } - - public DataViewRow GetRow(DataViewRow input, IEnumerable activeColumns) - { - Host.CheckValue(input, nameof(input)); - Host.CheckValue(activeColumns, nameof(activeColumns)); - Host.Check(input.Schema == Source.Schema, "Schema of input row must be the same as the schema the mapper is bound to"); - - using (var ch = Host.Start("GetEntireRow")) - { - var pred = RowCursorUtils.FromColumnsToPredicate(activeColumns, Schema); - var getters = _mapperIf.CreateGetters(input, pred, out Action disp); - return new RowImpl(input, this, Schema, getters, disp); - } - } - - protected override DataViewRowCursor GetRowCursorCore(IEnumerable columnsNeeded, Random rand = null) - { - var predicate = RowCursorUtils.FromColumnsToPredicate(columnsNeeded, Schema); - var active = Utils.BuildArray(Schema.Count, predicate); - return new Cursor(Host, Source.GetRowCursor(Source.Schema, rand), this, active); - } - - public override DataViewRowCursor[] GetRowCursorSet(IEnumerable columnsNeeded, int n, Random rand = null) - { - Host.CheckValueOrNull(rand); - - var predicate = RowCursorUtils.FromColumnsToPredicate(columnsNeeded, OutputSchema); - var active = Utils.BuildArray(Schema.Count, predicate); - - var inputs = Source.GetRowCursorSet(Source.Schema, n, rand); - Host.AssertNonEmpty(inputs); - - if (inputs.Length == 1 && n > 1 && Enumerable.Range(0, Schema.Count).Any(predicate)) - inputs = DataViewUtils.CreateSplitCursors(Host, inputs[0], n); - Host.AssertNonEmpty(inputs); - - var cursors = new DataViewRowCursor[inputs.Length]; - for (int i = 0; i < inputs.Length; i++) - cursors[i] = new Cursor(Host, inputs[i], this, active); - return cursors; - } - - private protected override void SaveModel(ModelSaveContext ctx) => _mapperIf.Save(ctx); - - protected override bool? 
ShouldUseParallelCursors(Func predicate) - { - return true; - } - - private sealed class RowImpl : WrappingRow - { - private readonly Delegate[] _getters; - private readonly OnnxDataTransform _parent; - private readonly Action _disposer; - - public override DataViewSchema Schema { get; } - - public RowImpl(DataViewRow input, OnnxDataTransform parent, DataViewSchema schema, Delegate[] getters, Action disposer) - : base(input) - { - _parent = parent; - Schema = schema; - _getters = getters; - _disposer = disposer; - } - - protected override void DisposeCore(bool disposing) - { - if (disposing) - _disposer?.Invoke(); - } - - /// - /// Returns a value getter delegate to fetch the value of column with the given columnIndex, from the row. - /// This throws if the column is not active in this row, or if the type - /// differs from this column's type. - /// - /// is the column's content type. - /// is the output column whose getter should be returned. - public override ValueGetter GetGetter(DataViewSchema.Column column) - { - int index = column.Index; - Contracts.Assert(_getters[index] != null); - var fn = _getters[index] as ValueGetter; - if (fn == null) - throw Contracts.Except("Invalid TValue in GetGetter: '{0}'", typeof(TValue)); - return fn; - } - - /// - /// Returns whether the given column is active in this row. 
- /// - public override bool IsColumnActive(DataViewSchema.Column column) - { - return _getters[column.Index] != null; - } - } - - private sealed class Cursor : SynchronizedCursorBase - { - private readonly OnnxDataTransform _parent; - private readonly Delegate[] _getters; - private readonly bool[] _active; - private readonly Action _disposer; - private bool _disposed; - - public Cursor(IChannelProvider provider, DataViewRowCursor input, OnnxDataTransform parent, bool[] active) - : base(provider, input) - { - _parent = parent; - Func pred = c => active[c]; - _getters = parent._mapperIf.CreateGetters(input, pred, out _disposer); - _active = active; - } - - public override DataViewSchema Schema => _parent._mapper.OutputSchema; - - public override ValueGetter GetGetter(DataViewSchema.Column column) - { - var getter = _getters[column.Index]; - Ch.Assert(getter != null); - var fn = getter as ValueGetter; - if (fn == null) - throw Ch.Except("Invalid TValue in GetGetter: '{0}'", typeof(TValue)); - return fn; - } - - protected override void Dispose(bool disposing) - { - if (_disposed) - return; - if (disposing) - _disposer?.Invoke(); - _disposed = true; - base.Dispose(disposing); - } - - public override bool IsColumnActive(DataViewSchema.Column column) => _active[column.Index]; - } - } } /// diff --git a/test/BaselineOutput/Common/Onnx/Transforms/SelectColumns.txt b/test/BaselineOutput/Common/Onnx/Transforms/SelectColumns.txt index 5672bbc9da..636d1f4da9 100644 --- a/test/BaselineOutput/Common/Onnx/Transforms/SelectColumns.txt +++ b/test/BaselineOutput/Common/Onnx/Transforms/SelectColumns.txt @@ -159,96 +159,6 @@ } } } - }, - { - "name": "Adhesion", - "type": { - "tensorType": { - "elemType": 6, - "shape": { - "dim": [ - { - "dimValue": "1" - }, - { - "dimValue": "1" - } - ] - } - } - } - }, - { - "name": "EpithelialSize", - "type": { - "tensorType": { - "elemType": 6, - "shape": { - "dim": [ - { - "dimValue": "1" - }, - { - "dimValue": "1" - } - ] - } - } - } - }, - { - 
"name": "BlandChromatin", - "type": { - "tensorType": { - "elemType": 6, - "shape": { - "dim": [ - { - "dimValue": "1" - }, - { - "dimValue": "1" - } - ] - } - } - } - }, - { - "name": "NormalNucleoli", - "type": { - "tensorType": { - "elemType": 6, - "shape": { - "dim": [ - { - "dimValue": "1" - }, - { - "dimValue": "1" - } - ] - } - } - } - }, - { - "name": "Mitoses", - "type": { - "tensorType": { - "elemType": 6, - "shape": { - "dim": [ - { - "dimValue": "1" - }, - { - "dimValue": "1" - } - ] - } - } - } } ], "output": [