From fcfec4baa6e196013d9f7d1573f398ecfaad3278 Mon Sep 17 00:00:00 2001 From: Lynx1820 Date: Mon, 4 May 2020 16:39:18 -0700 Subject: [PATCH 01/10] adding vector test for key2val --- test/Microsoft.ML.Tests/OnnxConversionTest.cs | 43 ++++++++++++------- test/data/type-conversion.txt | 2 +- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index b0adedb6ed..6feb2ec633 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -1372,11 +1372,20 @@ public void KeyToValueMappingOnnxConversionTest(DataKind valueType) var mlContext = new MLContext(seed: 1); string filePath = (valueType == DataKind.Boolean) ? GetDataPath("type-conversion-boolean.txt") : GetDataPath("type-conversion.txt"); - TextLoader.Column[] columns = new[] + TextLoader.Column[] columnsVector = new[] { - new TextLoader.Column("Value", valueType, 0, 0) + new TextLoader.Column("Value", valueType, 0, 3) }; - var dataView = mlContext.Data.LoadFromTextFile(filePath, columns); + TextLoader.Column[] columnsScalar = new[] + { + new TextLoader.Column("Value", valueType, 0) + }; + + IDataView[] dataViews = { + mlContext.Data.LoadFromTextFile(filePath, columnsScalar, separatorChar: '\t'), //scalar + mlContext.Data.LoadFromTextFile(filePath, columnsVector , separatorChar: '\t') //vector + }; + IEstimator[] pipelines = { mlContext.Transforms.Conversion.MapValueToKey("Key", "Value"). @@ -1385,22 +1394,26 @@ public void KeyToValueMappingOnnxConversionTest(DataKind valueType) mlContext.Transforms.Conversion.MapValueToKey("Value"). Append(mlContext.Transforms.Conversion.MapKeyToValue("Value")) }; + for (int i = 0; i < pipelines.Length; i++) { - var model = pipelines[i].Fit(dataView); - var mlnetResult = model.Transform(dataView); + for (int j = 0; j < dataViews.Length; j++) + { + var model = pipelines[i].Fit(dataViews[i]); + var mlnetResult = model.Transform(dataViews[i]); - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); - var onnxFileName = "KeyToValue.onnx"; - var onnxModelPath = GetOutputPath(onnxFileName); - SaveOnnxModel(onnxModel, onnxModelPath, null); + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataViews[i]); + var onnxFileName = "KeyToValue.onnx"; + var onnxModelPath = GetOutputPath(onnxFileName); + SaveOnnxModel(onnxModel, onnxModelPath, null); - if (IsOnnxRuntimeSupported()) - { - var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(onnxModelPath); - var onnxTransformer = onnxEstimator.Fit(dataView); - var onnxResult = onnxTransformer.Transform(dataView); - CompareResults("Value", "Value", mlnetResult, onnxResult); + if (IsOnnxRuntimeSupported()) + { + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(dataViews[i]); + var onnxResult = onnxTransformer.Transform(dataViews[i]); + CompareResults("Value", "Value", mlnetResult, onnxResult); + } } } Done(); diff --git a/test/data/type-conversion.txt b/test/data/type-conversion.txt index e440e5c842..be64f93dd7 100644 --- a/test/data/type-conversion.txt +++ b/test/data/type-conversion.txt @@ -1 +1 @@ -3 \ No newline at end of file +3 23 32 4 \ No newline at end of file From 31bc4b9c77117e6c31aca5468562f31a15bd8571 Mon Sep 17 00:00:00 2001 From: Lynx1820 Date: Mon, 4 May 2020 17:06:20 -0700 Subject: [PATCH 02/10] fix for key2value --- src/Microsoft.ML.Data/Transforms/KeyToValue.cs | 7 ++++--- .../Transforms/ValueToKeyMappingTransformer.cs | 8 +++++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/KeyToValue.cs b/src/Microsoft.ML.Data/Transforms/KeyToValue.cs index 8a256116c8..983f9605cd 100644 --- a/src/Microsoft.ML.Data/Transforms/KeyToValue.cs +++ b/src/Microsoft.ML.Data/Transforms/KeyToValue.cs @@ -505,7 +505,8 @@ public override bool SaveOnnx(OnnxContext ctx, string srcVariableName, string ds // Onnx expects the input keys to be int64s. But the input data can come from an ML.NET node that // may output a uint32. So cast it here to ensure that the data is treated correctly opType = "Cast"; - var castNodeOutput = ctx.AddIntermediateVariable(NumberDataViewType.Int64, "CastNodeOutput"); + var srcShape = (int)ctx.RetrieveShapeOrNull(srcVariableName)[1]; + var castNodeOutput = ctx.AddIntermediateVariable(new VectorDataViewType(NumberDataViewType.Int64, srcShape), "CastNodeOutput"); var castNode = ctx.CreateNode(opType, srcVariableName, castNodeOutput, ctx.GetNodeName(opType), ""); var t = InternalDataKindExtensions.ToInternalDataKind(DataKind.Int64).ToType(); castNode.AddAttribute("to", t); @@ -513,11 +514,11 @@ public override bool SaveOnnx(OnnxContext ctx, string srcVariableName, string ds var labelEncoderOutput = dstVariableName; var labelEncoderInput = srcVariableName; if (TypeOutput == NumberDataViewType.Double || TypeOutput == BooleanDataViewType.Instance) - labelEncoderOutput = ctx.AddIntermediateVariable(NumberDataViewType.Single, "CastNodeOutput"); + labelEncoderOutput = ctx.AddIntermediateVariable(new VectorDataViewType(NumberDataViewType.Single, srcShape), "CastNodeOutput"); else if (TypeOutput == NumberDataViewType.Int64 || TypeOutput == NumberDataViewType.UInt16 || TypeOutput == NumberDataViewType.Int32 || TypeOutput == NumberDataViewType.Int16 || TypeOutput == NumberDataViewType.UInt64 || TypeOutput == NumberDataViewType.UInt32) - labelEncoderOutput = ctx.AddIntermediateVariable(TextDataViewType.Instance, "CastNodeOutput"); + labelEncoderOutput = ctx.AddIntermediateVariable(new VectorDataViewType(TextDataViewType.Instance, srcShape), "CastNodeOutput"); opType = "LabelEncoder"; var node = ctx.CreateNode(opType, castNodeOutput, labelEncoderOutput, ctx.GetNodeName(opType)); diff --git a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs index cfc315187b..27257b168f 100644 --- a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs +++ b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs @@ -787,7 +787,8 @@ private IEnumerable GetTermsAndIds(int iinfo, out long[] termIds) private void CastInputToString(OnnxContext ctx, out OnnxNode node, out long[] termIds, string srcVariableName, int iinfo, string opType, string labelEncoderOutput) { - var castOutput = ctx.AddIntermediateVariable(TextDataViewType.Instance, "castOutput"); + var srcShape = ctx.RetrieveShapeOrNull(srcVariableName); + var castOutput = ctx.AddIntermediateVariable(new VectorDataViewType(TextDataViewType.Instance, (int)srcShape[1]), "castOutput"); var castNode = ctx.CreateNode("Cast", srcVariableName, castOutput, ctx.GetNodeName("Cast"), ""); var t = InternalDataKindExtensions.ToInternalDataKind(DataKind.String).ToType(); castNode.AddAttribute("to", t); @@ -799,7 +800,8 @@ private void CastInputToString(OnnxContext ctx, out OnnxNode node, out long[] private void CastInputToFloat(OnnxContext ctx, out OnnxNode node, out long[] termIds, string srcVariableName, int iinfo, string opType, string labelEncoderOutput) { - var castOutput = ctx.AddIntermediateVariable(NumberDataViewType.Single, "castOutput"); + var srcShape = ctx.RetrieveShapeOrNull(srcVariableName); + var castOutput = ctx.AddIntermediateVariable(new VectorDataViewType(NumberDataViewType.Single, (int)srcShape[1]), "castOutput"); var castNode = ctx.CreateNode("Cast", srcVariableName, castOutput, ctx.GetNodeName("Cast"), ""); var t = InternalDataKindExtensions.ToInternalDataKind(DataKind.Single).ToType(); castNode.AddAttribute("to", t); @@ -813,7 +815,7 @@ private bool SaveAsOnnxCore(OnnxContext ctx, int iinfo, ColInfo info, string src long[] termIds; string opType = "LabelEncoder"; OnnxNode castNode; - var labelEncoderOutput = ctx.AddIntermediateVariable(NumberDataViewType.Int64, "LabelEncoderOutput"); + var labelEncoderOutput = ctx.AddIntermediateVariable(new VectorDataViewType(NumberDataViewType.Int64, _types[iinfo].GetValueCount()), "LabelEncoderOutput"); var type = info.TypeSrc.GetItemType(); if (type.Equals(TextDataViewType.Instance)) From 5655100b28fb583656980f2dba899decf86f521f Mon Sep 17 00:00:00 2001 From: Lynx1820 Date: Tue, 5 May 2020 12:00:43 -0700 Subject: [PATCH 03/10] adding value2key test --- test/Microsoft.ML.Tests/OnnxConversionTest.cs | 43 ++++++++++++------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index 6feb2ec633..2fb2cd43f5 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -1331,27 +1331,38 @@ public void ValueToKeyMappingOnnxConversionTest(DataKind valueType) var mlContext = new MLContext(seed: 1); string filePath = (valueType == DataKind.Boolean) ? GetDataPath("type-conversion-boolean.txt") : GetDataPath("type-conversion.txt"); - TextLoader.Column[] columns = new[] + TextLoader.Column[] columnsVector = new[] +{ + new TextLoader.Column("Value", valueType, 0, 3) + }; + TextLoader.Column[] columnsScalar = new[] { - new TextLoader.Column("Value", valueType, 0, 0) + new TextLoader.Column("Value", valueType, 0) + }; + IDataView[] dataViews = { + mlContext.Data.LoadFromTextFile(filePath, columnsScalar, separatorChar: '\t'), //scalar + mlContext.Data.LoadFromTextFile(filePath, columnsVector , separatorChar: '\t') //vector }; - var dataView = mlContext.Data.LoadFromTextFile(filePath, columns); - var pipeline = mlContext.Transforms.Conversion.MapValueToKey("Key", "Value"); - var model = pipeline.Fit(dataView); - var mlnetResult = model.Transform(dataView); + for (int j = 0; j < dataViews.Length; j++) + { + var pipeline = mlContext.Transforms.Conversion.MapValueToKey("Key", "Value"); - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); - var onnxFileName = "ValueToKey.onnx"; - var onnxModelPath = GetOutputPath(onnxFileName); - SaveOnnxModel(onnxModel, onnxModelPath, null); + var model = pipeline.Fit(dataViews[j]); + var mlnetResult = model.Transform(dataViews[j]); - if (IsOnnxRuntimeSupported()) - { - var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(onnxModelPath); - var onnxTransformer = onnxEstimator.Fit(dataView); - var onnxResult = onnxTransformer.Transform(dataView); - CompareSelectedColumns("Key", "Key", mlnetResult, onnxResult); + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataViews[j]); + var onnxFileName = "ValueToKey.onnx"; + var onnxModelPath = GetOutputPath(onnxFileName); + SaveOnnxModel(onnxModel, onnxModelPath, null); + + if (IsOnnxRuntimeSupported()) + { + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(dataViews[j]); + var onnxResult = onnxTransformer.Transform(dataViews[j]); + CompareSelectedColumns("Key", "Key", mlnetResult, onnxResult); + } } Done(); } From 540ba50fd44ac1f45630e4d5d533d504b3c32f61 Mon Sep 17 00:00:00 2001 From: Lynx1820 Date: Tue, 5 May 2020 14:58:29 -0700 Subject: [PATCH 04/10] keytovector fix --- .../Transforms/KeyToVector.cs | 21 ++++++++++++++----- test/Microsoft.ML.Tests/OnnxConversionTest.cs | 20 +++++++++++------- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs index 20309ea883..c47109f302 100644 --- a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs +++ b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs @@ -695,21 +695,32 @@ private void SaveAsOnnxCore(OnnxContext ctx, int iinfo, ColInfo info, string src Contracts.CheckValue(shape, nameof(shape)); string opType = "Cast"; - var castOutput = ctx.AddIntermediateVariable(NumberDataViewType.Int64, opType); + var srcShape = ctx.RetrieveShapeOrNull(srcVariableName); + var castOutput = ctx.AddIntermediateVariable(new VectorDataViewType(NumberDataViewType.Int64, (int)srcShape[1]), opType); var castNode = ctx.CreateNode(opType, srcVariableName, castOutput, ctx.GetNodeName(opType), ""); castNode.AddAttribute("to", typeof(long)); opType = "OneHotEncoder"; + var isOutputCountVector = _parent._columns[iinfo].OutputCountVector; var categoryRange = info.TypeSrc.GetItemType().GetKeyCountAsInt32(Host); - var encodedVariableName = ctx.AddIntermediateVariable(new VectorDataViewType(NumberDataViewType.Single, 1, categoryRange), "encoded"); + var typeShape = ((int)shape[1] > 1 & isOutputCountVector) ? new VectorDataViewType(NumberDataViewType.Single, (int)shape[1], categoryRange) : new VectorDataViewType(NumberDataViewType.Single, categoryRange); + + var encodedVariableName = (isOutputCountVector) ? ctx.AddIntermediateVariable(typeShape, "encoded") : dstVariableName; var node = ctx.CreateNode(opType, castOutput, encodedVariableName, ctx.GetNodeName(opType)); node.AddAttribute("cats_int64s", Enumerable.Range(1, categoryRange).Select(x => (long)x)); node.AddAttribute("zeros", true); + if (_parent._columns[iinfo].OutputCountVector) + { + opType = "ReduceSum"; + var reduceNode = ctx.CreateNode(opType, encodedVariableName, dstVariableName, ctx.GetNodeName(opType), ""); + reduceNode.AddAttribute("axes", new long[] { shape.Count - 1 }); + reduceNode.AddAttribute("keepdims", 0); + } // OneHotEncoder adds one additional dimension, so we remove it below - opType = "Squeeze"; - var reduceNode = ctx.CreateNode(opType, encodedVariableName, dstVariableName, ctx.GetNodeName(opType), ""); - reduceNode.AddAttribute("axes", new long[] { shape.Count - 1 }); + //opType = "Squeeze"; + //var reduceNode = ctx.CreateNode(opType, encodedVariableName, dstVariableName, ctx.GetNodeName(opType), ""); + //reduceNode.AddAttribute("axes", new long[] { shape.Count - 1 }); } } } diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index 2fb2cd43f5..594db5f089 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -550,13 +550,16 @@ public void CommandLineOnnxConversionTest() public void KeyToVectorTest(OneHotEncodingEstimator.OutputKind outputKind) { var mlContext = new MLContext(seed: 1); + string filePath = (valueType == DataKind.Boolean) ? GetDataPath("type-conversion-boolean.txt") : GetDataPath("type-conversion.txt"); - string dataPath = GetDataPath("breast-cancer.txt"); - - var data = mlContext.Data.LoadFromTextFile(dataPath, - separatorChar: '\t', - hasHeader: true); - + TextLoader.Column[] columnsVector = new[] +{ + new TextLoader.Column("Value", valueType, 0, 3) + }; + TextLoader.Column[] columnsScalar = new[] + { + new TextLoader.Column("Value", valueType, 0) + }; var pipeline = mlContext.Transforms.Categorical.OneHotEncoding("F2", "F2", outputKind); var model = pipeline.Fit(data); @@ -1326,10 +1329,11 @@ public void IndicateMissingValuesOnnxConversionTest() [InlineData(DataKind.Double)] [InlineData(DataKind.String)] [InlineData(DataKind.Boolean)] - public void ValueToKeyMappingOnnxConversionTest(DataKind valueType) + public void ValueToKeyMappingOnnxConversionTest(Combinatorial[()]DataKind valueType) { var mlContext = new MLContext(seed: 1); - string filePath = (valueType == DataKind.Boolean) ? GetDataPath("type-conversion-boolean.txt") : GetDataPath("type-conversion.txt"); + string filePath = (valueType == DataKind.Boolean) ? GetDataPath("type-conversion-boolean.txt") + : GetDataPath("type-conversion.txt"); TextLoader.Column[] columnsVector = new[] { From 2fadb2152799c2fb5246ac52ea85c2f2f7ab5708 Mon Sep 17 00:00:00 2001 From: Lynx1820 Date: Tue, 5 May 2020 16:12:11 -0700 Subject: [PATCH 05/10] adding test --- .../Transforms/KeyToVector.cs | 7 +- test/Microsoft.ML.Tests/OnnxConversionTest.cs | 82 ++++++++++--------- 2 files changed, 49 insertions(+), 40 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs index c47109f302..fa4ea78e8e 100644 --- a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs +++ b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs @@ -703,14 +703,15 @@ private void SaveAsOnnxCore(OnnxContext ctx, int iinfo, ColInfo info, string src opType = "OneHotEncoder"; var isOutputCountVector = _parent._columns[iinfo].OutputCountVector; var categoryRange = info.TypeSrc.GetItemType().GetKeyCountAsInt32(Host); - var typeShape = ((int)shape[1] > 1 & isOutputCountVector) ? new VectorDataViewType(NumberDataViewType.Single, (int)shape[1], categoryRange) : new VectorDataViewType(NumberDataViewType.Single, categoryRange); + var typeShape = new VectorDataViewType(NumberDataViewType.Single, info.TypeSrc.GetValueCount(), categoryRange); - var encodedVariableName = (isOutputCountVector) ? ctx.AddIntermediateVariable(typeShape, "encoded") : dstVariableName; + var encodedVariableName = (isOutputCountVector && info.TypeSrc is VectorDataViewType) ? + ctx.AddIntermediateVariable(typeShape, "encoded") : dstVariableName; var node = ctx.CreateNode(opType, castOutput, encodedVariableName, ctx.GetNodeName(opType)); node.AddAttribute("cats_int64s", Enumerable.Range(1, categoryRange).Select(x => (long)x)); node.AddAttribute("zeros", true); - if (_parent._columns[iinfo].OutputCountVector) + if (_parent._columns[iinfo].OutputCountVector && info.TypeSrc is VectorDataViewType) { opType = "ReduceSum"; var reduceNode = ctx.CreateNode(opType, encodedVariableName, dstVariableName, ctx.GetNodeName(opType), ""); diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index 594db5f089..9fa372c45a 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -547,44 +547,55 @@ public void CommandLineOnnxConversionTest() [Theory] [CombinatorialData] - public void KeyToVectorTest(OneHotEncodingEstimator.OutputKind outputKind) + public void KeyToVectorTest([CombinatorialValues(DataKind.Single, DataKind.Int64, DataKind.Int32, DataKind.Int16, DataKind.UInt64, + DataKind.UInt32, DataKind.UInt16, DataKind.Double, DataKind.String, DataKind.Boolean)] DataKind valueType, + OneHotEncodingEstimator.OutputKind outputKind) { var mlContext = new MLContext(seed: 1); string filePath = (valueType == DataKind.Boolean) ? GetDataPath("type-conversion-boolean.txt") : GetDataPath("type-conversion.txt"); TextLoader.Column[] columnsVector = new[] { - new TextLoader.Column("Value", valueType, 0, 3) + new TextLoader.Column("Key", valueType, 0, 3) }; TextLoader.Column[] columnsScalar = new[] { - new TextLoader.Column("Value", valueType, 0) + new TextLoader.Column("Key", valueType, 0) + }; + IDataView[] dataViews = { + mlContext.Data.LoadFromTextFile(filePath, columnsScalar, separatorChar: '\t'), //scalar + mlContext.Data.LoadFromTextFile(filePath, columnsVector , separatorChar: '\t') //vector }; - var pipeline = mlContext.Transforms.Categorical.OneHotEncoding("F2", "F2", outputKind); - var model = pipeline.Fit(data); - var transformedData = model.Transform(data); - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); + var pipeline = mlContext.Transforms.Categorical.OneHotEncoding("Vector", "Key", outputKind); - // Check ONNX model's text format. We save the produced ONNX model as a text file and compare it against - // the associated file in ML.NET repo. Such a comparison can be retired if ONNXRuntime ported to ML.NET - // can support Linux and Mac. - var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "BinaryClassification", "BreastCancer"); - var onnxTextName = "KeyToVector.txt"; - var onnxFileName = "KeyToVector.onnx"; - var onnxTextPath = GetOutputPath(subDir, onnxTextName); - var onnxModelPath = GetOutputPath(subDir, onnxFileName); - SaveOnnxModel(onnxModel, onnxModelPath, onnxTextPath); - - // Binary OutputKind is currently not supported. - if (IsOnnxRuntimeSupported() && OneHotEncodingEstimator.OutputKind.Binary != outputKind) + for (int j = 0; j < dataViews.Length; j++) { - // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. - var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(onnxModelPath); - var onnxTransformer = onnxEstimator.Fit(data); - var onnxResult = onnxTransformer.Transform(data); - CompareResults("F2", "F2", transformedData, onnxResult); + if (OneHotEncodingEstimator.OutputKind.Binary == outputKind) break; // not currently supported + var model = pipeline.Fit(dataViews[j]); + var transformedData = model.Transform(dataViews[j]); + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataViews[j]); + + // Check ONNX model's text format. We save the produced ONNX model as a text file and compare it against + // the associated file in ML.NET repo. Such a comparison can be retired if ONNXRuntime ported to ML.NET + // can support Linux and Mac. + var onnxTextName = "KeyToVector.txt"; + var onnxFileName = "KeyToVector.onnx"; + var onnxTextPath = GetOutputPath(onnxTextName); + var onnxModelPath = GetOutputPath(onnxFileName); + SaveOnnxModel(onnxModel, onnxModelPath, onnxTextPath); + + // Binary OutputKind is currently not supported. + if (IsOnnxRuntimeSupported() && OneHotEncodingEstimator.OutputKind.Binary != outputKind) + { + // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(dataViews[j]); + var onnxResult = onnxTransformer.Transform(dataViews[j]); + CompareResults("Vector", "Vector", transformedData, onnxResult); + } } + Done(); } @@ -1319,24 +1330,19 @@ public void IndicateMissingValuesOnnxConversionTest() } [Theory] - [InlineData(DataKind.Single)] - [InlineData(DataKind.Int64)] - [InlineData(DataKind.Int32)] - [InlineData(DataKind.Int16)] - [InlineData(DataKind.UInt64)] - [InlineData(DataKind.UInt32)] - [InlineData(DataKind.UInt16)] - [InlineData(DataKind.Double)] - [InlineData(DataKind.String)] - [InlineData(DataKind.Boolean)] - public void ValueToKeyMappingOnnxConversionTest(Combinatorial[()]DataKind valueType) + [CombinatorialData] + public void ValueToKeyMappingOnnxConversionTest( + [CombinatorialValues(DataKind.Single, DataKind.Int64, DataKind.Int32, DataKind.Int16, DataKind.UInt64, + DataKind.UInt32, DataKind.UInt16, DataKind.Double, DataKind.String, DataKind.Boolean)] DataKind valueType, + [CombinatorialValues(1,2)] int maximumNumberOfKeys, ValueToKeyMappingEstimator.KeyOrdinality keyOrdinality, + bool addKeyValueAnnotationsAsText) { var mlContext = new MLContext(seed: 1); string filePath = (valueType == DataKind.Boolean) ? GetDataPath("type-conversion-boolean.txt") : GetDataPath("type-conversion.txt"); TextLoader.Column[] columnsVector = new[] -{ + { new TextLoader.Column("Value", valueType, 0, 3) }; TextLoader.Column[] columnsScalar = new[] @@ -1350,7 +1356,9 @@ public void ValueToKeyMappingOnnxConversionTest(Combinatorial[()]DataKind valueT for (int j = 0; j < dataViews.Length; j++) { - var pipeline = mlContext.Transforms.Conversion.MapValueToKey("Key", "Value"); + var pipeline = mlContext.Transforms.Conversion.MapValueToKey("Key", "Value", + maximumNumberOfKeys:maximumNumberOfKeys, keyOrdinality:keyOrdinality, + addKeyValueAnnotationsAsText: addKeyValueAnnotationsAsText); var model = pipeline.Fit(dataViews[j]); var mlnetResult = model.Transform(dataViews[j]); From 5d95f586892f75da388472252afed5166a2680a0 Mon Sep 17 00:00:00 2001 From: Lynx1820 Date: Tue, 5 May 2020 16:35:01 -0700 Subject: [PATCH 06/10] format changes --- .../Transforms/KeyToVector.cs | 16 +++------ test/Microsoft.ML.Tests/OnnxConversionTest.cs | 35 ++++++++++--------- 2 files changed, 23 insertions(+), 28 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs index fa4ea78e8e..cf04031325 100644 --- a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs +++ b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs @@ -689,21 +689,17 @@ private JToken SaveAsPfaCore(BoundPfaContext ctx, int iinfo, ColInfo info, JToke private void SaveAsOnnxCore(OnnxContext ctx, int iinfo, ColInfo info, string srcVariableName, string dstVariableName) { - var shape = ctx.RetrieveShapeOrNull(srcVariableName); - // Make sure that shape must present for calculating the reduction axes. The shape here is generally not null - // because inputs and outputs of a transform are declared with shapes. - Contracts.CheckValue(shape, nameof(shape)); + var dim = info.TypeSrc.GetValueCount(); string opType = "Cast"; - var srcShape = ctx.RetrieveShapeOrNull(srcVariableName); - var castOutput = ctx.AddIntermediateVariable(new VectorDataViewType(NumberDataViewType.Int64, (int)srcShape[1]), opType); + var castOutput = ctx.AddIntermediateVariable(new VectorDataViewType(NumberDataViewType.Int64, dim), opType); var castNode = ctx.CreateNode(opType, srcVariableName, castOutput, ctx.GetNodeName(opType), ""); castNode.AddAttribute("to", typeof(long)); opType = "OneHotEncoder"; var isOutputCountVector = _parent._columns[iinfo].OutputCountVector; var categoryRange = info.TypeSrc.GetItemType().GetKeyCountAsInt32(Host); - var typeShape = new VectorDataViewType(NumberDataViewType.Single, info.TypeSrc.GetValueCount(), categoryRange); + var typeShape = new VectorDataViewType(NumberDataViewType.Single, dim, categoryRange); var encodedVariableName = (isOutputCountVector && info.TypeSrc is VectorDataViewType) ? ctx.AddIntermediateVariable(typeShape, "encoded") : dstVariableName; @@ -715,13 +711,9 @@ private void SaveAsOnnxCore(OnnxContext ctx, int iinfo, ColInfo info, string src { opType = "ReduceSum"; var reduceNode = ctx.CreateNode(opType, encodedVariableName, dstVariableName, ctx.GetNodeName(opType), ""); - reduceNode.AddAttribute("axes", new long[] { shape.Count - 1 }); + reduceNode.AddAttribute("axes", new long[] { 1 }); reduceNode.AddAttribute("keepdims", 0); } - // OneHotEncoder adds one additional dimension, so we remove it below - //opType = "Squeeze"; - //var reduceNode = ctx.CreateNode(opType, encodedVariableName, dstVariableName, ctx.GetNodeName(opType), ""); - //reduceNode.AddAttribute("axes", new long[] { shape.Count - 1 }); } } } diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index 9fa372c45a..5163dd59d5 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -72,7 +72,8 @@ public void SimpleEndToEndOnnxConversionTest() var dynamicPipeline = mlContext.Transforms.NormalizeMinMax("FeatureVector") .AppendCacheCheckpoint(mlContext) - .Append(mlContext.Regression.Trainers.Sdca(new SdcaRegressionTrainer.Options() { + .Append(mlContext.Regression.Trainers.Sdca(new SdcaRegressionTrainer.Options() + { LabelColumnName = "Target", FeatureColumnName = "FeatureVector", NumberOfThreads = 1 @@ -496,7 +497,7 @@ public void LpNormOnnxConversionTest( }; var dataView = mlContext.Data.LoadFromEnumerable(samples); - var pipe = mlContext.Transforms.NormalizeLpNorm(nameof(DataPoint.Features), norm:norm, ensureZeroMean: ensureZeroMean); + var pipe = mlContext.Transforms.NormalizeLpNorm(nameof(DataPoint.Features), norm: norm, ensureZeroMean: ensureZeroMean); var model = pipe.Fit(dataView); var transformedData = model.Transform(dataView); @@ -555,14 +556,15 @@ public void KeyToVectorTest([CombinatorialValues(DataKind.Single, DataKind.Int64 string filePath = (valueType == DataKind.Boolean) ? GetDataPath("type-conversion-boolean.txt") : GetDataPath("type-conversion.txt"); TextLoader.Column[] columnsVector = new[] -{ + { new TextLoader.Column("Key", valueType, 0, 3) }; TextLoader.Column[] columnsScalar = new[] { new TextLoader.Column("Key", valueType, 0) }; - IDataView[] dataViews = { + IDataView[] dataViews = + { mlContext.Data.LoadFromTextFile(filePath, columnsScalar, separatorChar: '\t'), //scalar mlContext.Data.LoadFromTextFile(filePath, columnsVector , separatorChar: '\t') //vector }; @@ -571,7 +573,7 @@ public void KeyToVectorTest([CombinatorialValues(DataKind.Single, DataKind.Int64 for (int j = 0; j < dataViews.Length; j++) { - if (OneHotEncodingEstimator.OutputKind.Binary == outputKind) break; // not currently supported + if (OneHotEncodingEstimator.OutputKind.Binary == outputKind) break; var model = pipeline.Fit(dataViews[j]); var transformedData = model.Transform(dataViews[j]); var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataViews[j]); @@ -595,7 +597,6 @@ public void KeyToVectorTest([CombinatorialValues(DataKind.Single, DataKind.Int64 CompareResults("Vector", "Vector", transformedData, onnxResult); } } - Done(); } @@ -680,7 +681,8 @@ public void LogisticRegressionOnnxConversionTest() var dynamicPipeline = mlContext.Transforms.NormalizeMinMax("FeatureVector") .AppendCacheCheckpoint(mlContext) - .Append(mlContext.Regression.Trainers.Sdca(new SdcaRegressionTrainer.Options() { + .Append(mlContext.Regression.Trainers.Sdca(new SdcaRegressionTrainer.Options() + { LabelColumnName = "Target", FeatureColumnName = "FeatureVector", NumberOfThreads = 1 @@ -1029,7 +1031,7 @@ public void TokenizingByCharactersOnnxConversionTest(bool useMarkerCharacters) var model = pipeline.Fit(dataView); var transformedData = model.Transform(dataView); var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); - + // Compare model scores produced by ML.NET and ONNX's runtime. var onnxFileName = $"TokenizingByCharacters.onnx"; var onnxModelPath = GetOutputPath(onnxFileName); @@ -1096,7 +1098,7 @@ public void OnnxTypeConversionTest(DataKind fromKind, DataKind toKind) var mlContext = new MLContext(seed: 1); string filePath = GetDataPath("type-conversion.txt"); - TextLoader.Column[] columns = new [] + TextLoader.Column[] columns = new[] { new TextLoader.Column("Value", fromKind, 0, 0) }; @@ -1334,11 +1336,11 @@ public void IndicateMissingValuesOnnxConversionTest() public void ValueToKeyMappingOnnxConversionTest( [CombinatorialValues(DataKind.Single, DataKind.Int64, DataKind.Int32, DataKind.Int16, DataKind.UInt64, DataKind.UInt32, DataKind.UInt16, DataKind.Double, DataKind.String, DataKind.Boolean)] DataKind valueType, - [CombinatorialValues(1,2)] int maximumNumberOfKeys, ValueToKeyMappingEstimator.KeyOrdinality keyOrdinality, + [CombinatorialValues(1, 2)] int maximumNumberOfKeys, ValueToKeyMappingEstimator.KeyOrdinality keyOrdinality, bool addKeyValueAnnotationsAsText) { var mlContext = new MLContext(seed: 1); - string filePath = (valueType == DataKind.Boolean) ? GetDataPath("type-conversion-boolean.txt") + string filePath = (valueType == DataKind.Boolean) ? GetDataPath("type-conversion-boolean.txt") : GetDataPath("type-conversion.txt"); TextLoader.Column[] columnsVector = new[] @@ -1349,7 +1351,8 @@ public void ValueToKeyMappingOnnxConversionTest( { new TextLoader.Column("Value", valueType, 0) }; - IDataView[] dataViews = { + IDataView[] dataViews = + { mlContext.Data.LoadFromTextFile(filePath, columnsScalar, separatorChar: '\t'), //scalar mlContext.Data.LoadFromTextFile(filePath, columnsVector , separatorChar: '\t') //vector }; @@ -1357,7 +1360,7 @@ public void ValueToKeyMappingOnnxConversionTest( for (int j = 0; j < dataViews.Length; j++) { var pipeline = mlContext.Transforms.Conversion.MapValueToKey("Key", "Value", - maximumNumberOfKeys:maximumNumberOfKeys, keyOrdinality:keyOrdinality, + maximumNumberOfKeys: maximumNumberOfKeys, keyOrdinality: keyOrdinality, addKeyValueAnnotationsAsText: addKeyValueAnnotationsAsText); var model = pipeline.Fit(dataViews[j]); @@ -1403,8 +1406,8 @@ public void KeyToValueMappingOnnxConversionTest(DataKind valueType) { new TextLoader.Column("Value", valueType, 0) }; - - IDataView[] dataViews = { + IDataView[] dataViews = + { mlContext.Data.LoadFromTextFile(filePath, columnsScalar, separatorChar: '\t'), //scalar mlContext.Data.LoadFromTextFile(filePath, columnsVector , separatorChar: '\t') //vector }; @@ -1545,7 +1548,7 @@ public void NgramOnnxConversionTest( CompareSelectedColumns(columnName, columnName, transformedData, onnxResult, 3); VBuffer> mlNetSlots = default; - VBuffer> onnxSlots= default; + VBuffer> onnxSlots = default; transformedData.Schema[columnName].GetSlotNames(ref mlNetSlots); onnxResult.Schema[columnName].GetSlotNames(ref onnxSlots); Assert.Equal(mlNetSlots.Length, onnxSlots.Length); From 472458279bdcb17ff0dbbaa341daed9367502b58 Mon Sep 17 00:00:00 2001 From: Lynx1820 Date: Wed, 6 May 2020 12:51:51 -0700 Subject: [PATCH 07/10] adding baselines --- .../ExcludeVariablesInOnnxConversion.txt | 42 +------------------ .../BreastCancer/ModelWithLessIO.txt | 42 +------------------ 2 files changed, 2 insertions(+), 82 deletions(-) diff --git a/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/ExcludeVariablesInOnnxConversion.txt b/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/ExcludeVariablesInOnnxConversion.txt index 8441d25ca0..b8cac6aa64 100644 --- a/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/ExcludeVariablesInOnnxConversion.txt +++ b/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/ExcludeVariablesInOnnxConversion.txt @@ -94,7 +94,7 @@ "Cast" ], "output": [ - "encoded" + "F21" ], "name": "OneHotEncoder", "opType": "OneHotEncoder", @@ -123,25 +123,6 @@ ], "domain": "ai.onnx.ml" }, - { - "input": [ - "encoded" - ], - "output": [ - "F21" - ], - "name": "Squeeze", - "opType": "Squeeze", - "attribute": [ - { - "name": "axes", - "ints": [ - "1" - ], - "type": "INTS" - } - ] - }, { "input": [ "F21" @@ -673,27 +654,6 @@ } } }, - { - "name": "encoded", - "type": { - "tensorType": { - "elemType": 1, - "shape": { - "dim": [ - { - "dimValue": "-1" - }, - { - "dimValue": "1" - }, - { - "dimValue": "10" - } - ] - } - } - } - }, { "name": "F22", "type": { diff --git a/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/ModelWithLessIO.txt b/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/ModelWithLessIO.txt index 3b70a3b2e3..126e1ac29a 100644 --- a/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/ModelWithLessIO.txt +++ b/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/ModelWithLessIO.txt @@ -92,7 +92,7 @@ "Cast" ], "output": [ - "encoded" + "F21" ], "name": "OneHotEncoder", "opType": "OneHotEncoder", @@ -120,25 +120,6 @@ ], "domain": "ai.onnx.ml" }, - { - "input": [ - "encoded" - ], - "output": [ - "F21" - ], - "name": "Squeeze", - "opType": "Squeeze", - "attribute": [ - { - "name": "axes", - "ints": [ - "1" - ], - "type": "INTS" - } - ] - }, { "input": [ "F1", @@ -1022,27 +1003,6 @@ } } }, - { - "name": "encoded", - "type": { - "tensorType": { - "elemType": 1, - "shape": { - "dim": [ - { - "dimValue": "-1" - }, - { - "dimValue": "1" - }, - { - "dimValue": "9" - } - ] - } - } - } - }, { "name": "VectorFeaturizerOutput", "type": { From 44970383c0578ae37bef9be7c0caccdbca9c2fab Mon Sep 17 00:00:00 2001 From: Lynx1820 Date: Wed, 6 May 2020 14:36:51 -0700 Subject: [PATCH 08/10] fixing unknown value mapping --- .../Transforms/ValueToKeyMappingTransformer.cs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs index 27257b168f..fb7e35b9c5 100644 --- a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs +++ b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs @@ -878,7 +878,10 @@ private bool SaveAsOnnxCore(OnnxContext ctx, int iinfo, ColInfo info, string src return false; } - node.AddAttribute("default_int64", -1); + //Unknown keys should map to 0 + node.AddAttribute("default_int64", 0); + node.AddAttribute("default_string", "0"); + node.AddAttribute("default_float", 0f); node.AddAttribute("values_int64s", termIds); // Onnx outputs an Int64, but ML.NET outputs a keytype. So cast it here From ebea3a04ad963a91be9f284473c428a9b7da40da Mon Sep 17 00:00:00 2001 From: Lynx1820 Date: Wed, 6 May 2020 15:28:45 -0700 Subject: [PATCH 09/10] updating baselines --- .../BreastCancer/ExcludeVariablesInOnnxConversion.txt | 10 +++++++++- .../BreastCancer/ModelWithLessIO.txt | 10 +++++++++- ...sificationLogisticRegressionSaveModelToOnnxTest.txt | 10 +++++++++- 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/ExcludeVariablesInOnnxConversion.txt b/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/ExcludeVariablesInOnnxConversion.txt index b8cac6aa64..d24d7e1c3f 100644 --- a/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/ExcludeVariablesInOnnxConversion.txt +++ b/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/ExcludeVariablesInOnnxConversion.txt @@ -33,9 +33,17 @@ }, { "name": "default_int64", - "i": "-1", "type": "INT" }, + { + "name": "default_string", + "s": "MA==", + "type": "STRING" + }, + { + "name": "default_float", + "type": "FLOAT" + }, { "name": "values_int64s", "ints": [ diff --git a/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/ModelWithLessIO.txt b/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/ModelWithLessIO.txt index 126e1ac29a..9091a61338 100644 --- a/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/ModelWithLessIO.txt +++ b/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/ModelWithLessIO.txt @@ -32,9 +32,17 @@ }, { "name": "default_int64", - "i": "-1", "type": "INT" }, + { + "name": "default_string", + "s": "MA==", + "type": "STRING" + }, + { + "name": "default_float", + "type": "FLOAT" + }, { "name": "values_int64s", "ints": [ diff --git a/test/BaselineOutput/Common/Onnx/MultiClassClassification/BreastCancer/MultiClassificationLogisticRegressionSaveModelToOnnxTest.txt b/test/BaselineOutput/Common/Onnx/MultiClassClassification/BreastCancer/MultiClassificationLogisticRegressionSaveModelToOnnxTest.txt index 799ca5125b..50a604c84c 100644 --- a/test/BaselineOutput/Common/Onnx/MultiClassClassification/BreastCancer/MultiClassificationLogisticRegressionSaveModelToOnnxTest.txt +++ b/test/BaselineOutput/Common/Onnx/MultiClassClassification/BreastCancer/MultiClassificationLogisticRegressionSaveModelToOnnxTest.txt @@ -99,9 +99,17 @@ }, { "name": "default_int64", - "i": "-1", "type": "INT" }, + { + "name": "default_string", + "s": "MA==", + "type": "STRING" + }, + { + "name": "default_float", + "type": "FLOAT" + }, { "name": "values_int64s", "ints": [ From 14bae44351502d23bb9f2b3d5ceeff52a5220d7d Mon Sep 17 00:00:00 2001 From: Lynx1820 Date: Thu, 7 May 2020 14:41:57 -0700 Subject: [PATCH 10/10] adding new line --- src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs index fb7e35b9c5..297b475159 100644 --- a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs +++ b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs @@ -809,6 +809,7 @@ private void CastInputToFloat(OnnxContext ctx, out OnnxNode node, out long[] var terms = GetTermsAndIds(iinfo, out termIds); node.AddAttribute("keys_floats", terms.Select(item => Convert.ToSingle(item))); } + private bool SaveAsOnnxCore(OnnxContext ctx, int iinfo, ColInfo info, string srcVariableName, string dstVariableName) { OnnxNode node;