diff --git a/src/Microsoft.ML.AutoML/SweepableEstimator/Converter/SweepablePipelineConverter.cs b/src/Microsoft.ML.AutoML/SweepableEstimator/Converter/SweepablePipelineConverter.cs
new file mode 100644
index 0000000000..a38ce402d7
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/SweepableEstimator/Converter/SweepablePipelineConverter.cs
@@ -0,0 +1,65 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Text.Json;
+using System.Text.Json.Nodes;
+using System.Text.Json.Serialization;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// Serializes a <see cref="SweepablePipeline"/> as a JSON object with the properties
+    /// "schema", "currentSchema" and "estimators", and reads it back.
+    /// </summary>
+    internal class SweepablePipelineConverter : JsonConverter<SweepablePipeline>
+    {
+        private const string SchemaKey = "schema";
+        private const string CurrentSchemaKey = "currentSchema";
+        private const string EstimatorsKey = "estimators";
+
+        /// <summary>
+        /// Rebuilds a pipeline from the JSON object emitted by <see cref="Write"/>.
+        /// </summary>
+        /// <exception cref="JsonException">The "schema" or "estimators" property is missing.</exception>
+        public override SweepablePipeline Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
+        {
+            var root = JsonNode.Parse(ref reader);
+            var schemaNode = root?[SchemaKey];
+            var estimatorsNode = root?[EstimatorsKey];
+            if (schemaNode is null || estimatorsNode is null)
+            {
+                throw new JsonException($"A sweepable pipeline needs both '{SchemaKey}' and '{EstimatorsKey}'.");
+            }
+
+            // "currentSchema" may be absent; the SweepablePipeline constructor then falls back
+            // to the first term of the schema.
+            var currentSchema = root[CurrentSchemaKey]?.GetValue<string>();
+            var schema = schemaNode.GetValue<string>();
+
+            // Write emits the key "estimators" (plural) and its value is a JSON object, so it
+            // must be deserialized: GetValue<T> only works on JsonValue nodes.
+            var estimators = estimatorsNode.Deserialize<Dictionary<string, SweepableEstimator>>(options);
+
+            return new SweepablePipeline(estimators, Entity.FromExpression(schema), currentSchema);
+        }
+
+        /// <summary>
+        /// Writes the pipeline's schema expression, its currently selected schema and its
+        /// estimator table as a single JSON object.
+        /// </summary>
+        public override void Write(Utf8JsonWriter writer, SweepablePipeline value, JsonSerializerOptions options)
+        {
+            var jsonObject = new JsonObject
+            {
+                [SchemaKey] = value.Schema.ToString(),
+                [CurrentSchemaKey] = value.CurrentParameter["_SCHEMA_"].AsType<string>(),
+                [EstimatorsKey] = JsonValue.Create(value.Estimators),
+            };
+
+            jsonObject.WriteTo(writer, options);
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/SweepableEstimator/ISweepable.cs b/src/Microsoft.ML.AutoML/SweepableEstimator/ISweepable.cs
new file mode 100644
index 0000000000..5e7a6f681a
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/SweepableEstimator/ISweepable.cs
@@ -0,0 +1,22 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Text;
+using Microsoft.ML.SearchSpace;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// An object that exposes a search space.
+    /// </summary>
+    internal interface ISweepable
+    {
+        /// <summary>
+        /// Gets the search space of this object.
+        /// </summary>
+        public SearchSpace.SearchSpace SearchSpace { get; }
+    }
+
+    /// <summary>
+    /// An <see cref="ISweepable"/> that can build an estimator of type <typeparamref name="T"/>
+    /// from a parameter.
+    /// </summary>
+    /// <typeparam name="T">
+    /// The estimator type that is built. Declared covariant so that a sweepable producing a
+    /// more specific estimator type can be used where a sweepable of a more general
+    /// estimator type is expected.
+    /// </typeparam>
+    internal interface ISweepable<out T> : ISweepable
+        where T : IEstimator<ITransformer>
+    {
+        /// <summary>
+        /// Builds an estimator from <paramref name="parameter"/>.
+        /// </summary>
+        public T BuildFromOption(MLContext context, Parameter parameter);
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/SweepableEstimator/SweepableEstimator.cs b/src/Microsoft.ML.AutoML/SweepableEstimator/SweepableEstimator.cs index 35a6e0f3a8..b7228d78b9 100644 --- a/src/Microsoft.ML.AutoML/SweepableEstimator/SweepableEstimator.cs +++ b/src/Microsoft.ML.AutoML/SweepableEstimator/SweepableEstimator.cs @@ -14,7 +14,7 @@ namespace Microsoft.ML.AutoML /// Estimator with search space. /// [JsonConverter(typeof(SweepableEstimatorConverter))] - public class SweepableEstimator : Estimator + public class SweepableEstimator : Estimator, ISweepable> { private readonly Func> _factory; @@ -70,6 +70,7 @@ public TOption TParameter public override IEstimator BuildFromOption(MLContext context, Parameter param) { + this.Parameter = param; return BuildFromOption(context, param.AsType()); } }
diff --git a/src/Microsoft.ML.AutoML/SweepableEstimator/SweepablePipeline.cs b/src/Microsoft.ML.AutoML/SweepableEstimator/SweepablePipeline.cs
new file mode 100644
index 0000000000..37c4ac385e
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/SweepableEstimator/SweepablePipeline.cs
@@ -0,0 +1,220 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Text.Json.Serialization;
+using Microsoft.ML.Data;
+using Microsoft.ML.SearchSpace;
+using Microsoft.ML.SearchSpace.Option;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// A family of candidate pipelines. Estimators are kept in a table under generated ids
+    /// ("e0", "e1", ...) and a schema expression over those ids describes how they can be
+    /// combined; one term of that expression is tracked as the current schema.
+    /// </summary>
+    [JsonConverter(typeof(SweepablePipelineConverter))]
+    internal class SweepablePipeline : ISweepable<EstimatorChain<ITransformer>>
+    {
+        private const string SchemaOption = "_SCHEMA_";
+        private const string NilValue = "Nil";
+        private static readonly StringEntity _nilStringEntity = new StringEntity(NilValue);
+        private static readonly EstimatorEntity _nilSweepableEntity = new EstimatorEntity(null);
+        private readonly Entity _schema;
+        private readonly Dictionary<string, SweepableEstimator> _estimators;
+        private string _currentSchema;
+
+        /// <summary>
+        /// Gets a search space made of every estimator's own search space plus a choice
+        /// option, stored under "_SCHEMA_", over all terms of the schema.
+        /// </summary>
+        /// <remarks>
+        /// NOTE(review): each estimator's search space is keyed by its position in the table
+        /// ("0", "1", ...), not by its "eN" id - confirm this is what the consumer of the
+        /// search space expects.
+        /// </remarks>
+        public SearchSpace.SearchSpace SearchSpace
+        {
+            get
+            {
+                var searchSpace = new SearchSpace.SearchSpace();
+                var kvPairs = _estimators.Select((e, i) => new KeyValuePair<string, SearchSpace.SearchSpace>(i.ToString(), e.Value.SearchSpace));
+                foreach (var kv in kvPairs)
+                {
+                    // Estimators without a search space contribute nothing.
+                    if (kv.Value != null)
+                    {
+                        searchSpace.Add(kv.Key, kv.Value);
+                    }
+                }
+
+                var schemaOptions = _schema.ToTerms().Select(t => t.ToString()).ToArray();
+                var choiceOption = new ChoiceOption(schemaOptions);
+                searchSpace.Add(SchemaOption, choiceOption);
+
+                return searchSpace;
+            }
+        }
+
+        /// <summary>
+        /// Gets a nested parameter holding every estimator's current parameter (keyed by its
+        /// position in the table) and the current schema under "_SCHEMA_".
+        /// </summary>
+        public Parameter CurrentParameter
+        {
+            get
+            {
+                var parameter = Parameter.CreateNestedParameter();
+                var kvPairs = _estimators.Select((e, i) => new KeyValuePair<string, Parameter>(i.ToString(), e.Value.Parameter));
+                foreach (var kv in kvPairs)
+                {
+                    if (kv.Value != null)
+                    {
+                        parameter[kv.Key] = kv.Value;
+                    }
+                }
+
+                parameter[SchemaOption] = Parameter.FromString(_currentSchema);
+                return parameter;
+            }
+        }
+
+        /// <summary>
+        /// Creates an empty pipeline: no estimators and no schema.
+        /// </summary>
+        internal SweepablePipeline()
+        {
+            _estimators = new Dictionary<string, SweepableEstimator>();
+            _schema = null;
+        }
+
+        /// <summary>
+        /// Creates a pipeline over an existing estimator table and schema.
+        /// </summary>
+        /// <param name="estimators">Estimators keyed by the ids used in <paramref name="schema"/>.</param>
+        /// <param name="schema">Schema expression over the estimator ids.</param>
+        /// <param name="currentSchema">
+        /// The selected term; when null, the first term of <paramref name="schema"/> is used.
+        /// </param>
+        internal SweepablePipeline(Dictionary<string, SweepableEstimator> estimators, Entity schema, string currentSchema = null)
+        {
+            _estimators = estimators;
+            _schema = schema;
+            _currentSchema = currentSchema ?? schema.ToTerms().First().ToString();
+        }
+
+        /// <summary>
+        /// Gets the estimator table, keyed by estimator id.
+        /// </summary>
+        public Dictionary<string, SweepableEstimator> Estimators => _estimators;
+
+        /// <summary>
+        /// Gets the schema expression over the estimator ids.
+        /// </summary>
+        internal Entity Schema => _schema;
+
+        /// <summary>
+        /// Selects the schema stored under "_SCHEMA_" in <paramref name="parameter"/>, remembers
+        /// it as the current schema, and builds the training pipeline from the estimators that
+        /// schema names ("Nil" entries are skipped).
+        /// </summary>
+        public EstimatorChain<ITransformer> BuildFromOption(MLContext context, Parameter parameter)
+        {
+            _currentSchema = parameter[SchemaOption].AsType<string>();
+            var estimators = Entity.FromExpression(_currentSchema)
+                                   .ValueEntities()
+                                   .OfType<StringEntity>()
+                                   .Where(se => se.Value != NilValue)
+                                   .Select(se => _estimators[se.Value]);
+
+            var pipeline = new SweepableEstimatorPipeline(estimators);
+            return pipeline.BuildTrainingPipeline(context, parameter);
+        }
+
+        /// <summary>
+        /// Appends estimators and/or pipelines. Sweepables passed in the same call are combined
+        /// with the entity <c>+</c> operator, and the combined entity is then chained onto the
+        /// existing schema with <c>*</c>. Elements that are neither a
+        /// <see cref="SweepableEstimator"/> nor a <see cref="SweepablePipeline"/> are ignored.
+        /// </summary>
+        /// <returns>
+        /// A new pipeline built from a copy of the estimator table, or this instance when there
+        /// is nothing to append.
+        /// </returns>
+        /// <exception cref="ArgumentNullException"><paramref name="sweepables"/> is null.</exception>
+        public SweepablePipeline Append(params ISweepable<IEstimator<ITransformer>>[] sweepables)
+        {
+            if (sweepables is null)
+            {
+                throw new ArgumentNullException(nameof(sweepables));
+            }
+
+            Entity entity = null;
+            foreach (var sweepable in sweepables)
+            {
+                if (sweepable is SweepableEstimator estimator)
+                {
+                    if (entity is null)
+                    {
+                        entity = new EstimatorEntity(estimator);
+                    }
+                    else
+                    {
+                        entity += estimator;
+                    }
+                }
+                else if (sweepable is SweepablePipeline pipeline)
+                {
+                    var pipelineEntity = CreateSweepableEntityFromEntity(pipeline._schema, pipeline._estimators);
+                    if (entity is null)
+                    {
+                        entity = pipelineEntity;
+                    }
+                    else
+                    {
+                        entity += pipelineEntity;
+                    }
+                }
+            }
+
+            // Nothing usable was passed. Building a pipeline from a null schema would fail in
+            // the constructor, so hand back the unchanged pipeline instead.
+            if (entity is null)
+            {
+                return this;
+            }
+
+            return AppendEntity(false, entity);
+        }
+
+        /// <summary>
+        /// Registers the estimators found in <paramref name="entity"/> in a copy of the
+        /// estimator table and chains the resulting id expression onto the schema.
+        /// </summary>
+        /// <param name="allowSkip">When true, "Nil" is added as an alternative to the appended entity.</param>
+        /// <param name="entity">Entity tree whose leaves are estimator entities.</param>
+        private SweepablePipeline AppendEntity(bool allowSkip, Entity entity)
+        {
+            var estimators = _estimators.ToDictionary(x => x.Key, x => x.Value);
+            var stringEntity = VisitAndReplaceSweepableEntityWithStringEntity(entity, estimators);
+            if (allowSkip)
+            {
+                stringEntity += _nilStringEntity;
+            }
+
+            var schema = _schema;
+            if (schema is null)
+            {
+                schema = stringEntity;
+            }
+            else
+            {
+                schema *= stringEntity;
+            }
+
+            return new SweepablePipeline(estimators, schema);
+        }
+
+        /// <summary>
+        /// Copies an id-based schema tree into a tree whose leaves hold the estimators that the
+        /// ids refer to in <paramref name="lookupTable"/>.
+        /// </summary>
+        /// <exception cref="ArgumentException">The tree contains an entity type that is not handled.</exception>
+        private Entity CreateSweepableEntityFromEntity(Entity entity, Dictionary<string, SweepableEstimator> lookupTable)
+        {
+            if (entity is null)
+            {
+                return null;
+            }
+
+            if (entity is StringEntity stringEntity)
+            {
+                // Compare by value, as BuildFromOption does: a "Nil" leaf is not necessarily the
+                // shared _nilStringEntity instance, and "Nil" is never a key of the lookup table.
+                if (stringEntity.Value == NilValue)
+                {
+                    return _nilSweepableEntity;
+                }
+
+                return new EstimatorEntity(lookupTable[stringEntity.Value]);
+            }
+            else if (entity is ConcatenateEntity concatenateEntity)
+            {
+                return new ConcatenateEntity()
+                {
+                    Left = CreateSweepableEntityFromEntity(concatenateEntity.Left, lookupTable),
+                    Right = CreateSweepableEntityFromEntity(concatenateEntity.Right, lookupTable),
+                };
+            }
+            else if (entity is OneOfEntity oneOfEntity)
+            {
+                return new OneOfEntity()
+                {
+                    Left = CreateSweepableEntityFromEntity(oneOfEntity.Left, lookupTable),
+                    Right = CreateSweepableEntityFromEntity(oneOfEntity.Right, lookupTable),
+                };
+            }
+
+            throw new ArgumentException($"Unsupported entity type '{entity.GetType().Name}'.", nameof(entity));
+        }
+
+        /// <summary>
+        /// Walks <paramref name="e"/>, stores every estimator leaf in
+        /// <paramref name="estimators"/> under a fresh id and replaces the leaf with that id.
+        /// Inner nodes are updated in place.
+        /// </summary>
+        private Entity VisitAndReplaceSweepableEntityWithStringEntity(Entity e, Dictionary<string, SweepableEstimator> estimators)
+        {
+            if (e is null)
+            {
+                return null;
+            }
+
+            if (e is EstimatorEntity sweepableEntity0)
+            {
+                // A leaf without an estimator is a "skip" marker; it must not be stored in the
+                // table as a null estimator.
+                if (sweepableEntity0 == _nilSweepableEntity || sweepableEntity0.Estimator is null)
+                {
+                    return _nilStringEntity;
+                }
+
+                var id = GetNextId(estimators);
+                estimators[id] = (SweepableEstimator)sweepableEntity0.Estimator;
+                return new StringEntity(id);
+            }
+
+            e.Left = VisitAndReplaceSweepableEntityWithStringEntity(e.Left, estimators);
+            e.Right = VisitAndReplaceSweepableEntityWithStringEntity(e.Right, estimators);
+
+            return e;
+        }
+
+        /// <summary>
+        /// Returns the next free id, "e" followed by the number of estimators already stored.
+        /// </summary>
+        private string GetNextId(Dictionary<string, SweepableEstimator> estimators)
+        {
+            return "e" + estimators.Count.ToString();
+        }
+    }
+}
diff --git a/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepablePipelineTests.SweepablePipeline_Append_SweepableEstimator_Test.approved.txt b/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepablePipelineTests.SweepablePipeline_Append_SweepableEstimator_Test.approved.txt
new file mode 100644
index 0000000000..862b1ba2a7
--- /dev/null
+++ b/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepablePipelineTests.SweepablePipeline_Append_SweepableEstimator_Test.approved.txt
@@ -0,0 +1,267 @@
+{
+ "schema": "e0 * (e1 \u002B e2) * (e3 \u002B e4 * (e5 \u002B e6)) * (e7 * (e8 \u002B e9) * (e10 \u002B e11 * (e12 \u002B 
e13)) \u002B e14 * (e15 \u002B e16) * (e17 \u002B e18 * (e19 \u002B e20)))", + "currentSchema": "e0 * e1 * e3 * e7 * e8 * e10", + "estimators": { + "e0": { + "estimatorType": "Concatenate", + "parameter": { + "InputColumnNames": [ + "a", + "b", + "c" + ], + "OutputColumnName": "a" + } + }, + "e1": { + "estimatorType": "LightGbmBinary", + "parameter": { + "NumberOfLeaves": 4, + "MinimumExampleCountPerLeaf": 20, + "LearningRate": 1, + "NumberOfTrees": 4, + "SubsampleFraction": 1, + "MaximumBinCountPerFeature": 256, + "FeatureFraction": 1, + "L1Regularization": 0.0000000002, + "L2Regularization": 1, + "LabelColumnName": "Label", + "FeatureColumnName": "Feature" + } + }, + "e2": { + "estimatorType": "Concatenate", + "parameter": { + "InputColumnNames": [ + "a", + "b", + "c" + ], + "OutputColumnName": "a" + } + }, + "e3": { + "estimatorType": "Concatenate", + "parameter": { + "InputColumnNames": [ + "a", + "b", + "c" + ], + "OutputColumnName": "a" + } + }, + "e4": { + "estimatorType": "Concatenate", + "parameter": { + "InputColumnNames": [ + "a", + "b", + "c" + ], + "OutputColumnName": "a" + } + }, + "e5": { + "estimatorType": "LightGbmBinary", + "parameter": { + "NumberOfLeaves": 4, + "MinimumExampleCountPerLeaf": 20, + "LearningRate": 1, + "NumberOfTrees": 4, + "SubsampleFraction": 1, + "MaximumBinCountPerFeature": 256, + "FeatureFraction": 1, + "L1Regularization": 0.0000000002, + "L2Regularization": 1, + "LabelColumnName": "Label", + "FeatureColumnName": "Feature" + } + }, + "e6": { + "estimatorType": "Concatenate", + "parameter": { + "InputColumnNames": [ + "a", + "b", + "c" + ], + "OutputColumnName": "a" + } + }, + "e7": { + "estimatorType": "Concatenate", + "parameter": { + "InputColumnNames": [ + "a", + "b", + "c" + ], + "OutputColumnName": "a" + } + }, + "e8": { + "estimatorType": "LightGbmBinary", + "parameter": { + "NumberOfLeaves": 4, + "MinimumExampleCountPerLeaf": 20, + "LearningRate": 1, + "NumberOfTrees": 4, + "SubsampleFraction": 1, + 
"MaximumBinCountPerFeature": 256, + "FeatureFraction": 1, + "L1Regularization": 0.0000000002, + "L2Regularization": 1, + "LabelColumnName": "Label", + "FeatureColumnName": "Feature" + } + }, + "e9": { + "estimatorType": "Concatenate", + "parameter": { + "InputColumnNames": [ + "a", + "b", + "c" + ], + "OutputColumnName": "a" + } + }, + "e10": { + "estimatorType": "Concatenate", + "parameter": { + "InputColumnNames": [ + "a", + "b", + "c" + ], + "OutputColumnName": "a" + } + }, + "e11": { + "estimatorType": "Concatenate", + "parameter": { + "InputColumnNames": [ + "a", + "b", + "c" + ], + "OutputColumnName": "a" + } + }, + "e12": { + "estimatorType": "LightGbmBinary", + "parameter": { + "NumberOfLeaves": 4, + "MinimumExampleCountPerLeaf": 20, + "LearningRate": 1, + "NumberOfTrees": 4, + "SubsampleFraction": 1, + "MaximumBinCountPerFeature": 256, + "FeatureFraction": 1, + "L1Regularization": 0.0000000002, + "L2Regularization": 1, + "LabelColumnName": "Label", + "FeatureColumnName": "Feature" + } + }, + "e13": { + "estimatorType": "Concatenate", + "parameter": { + "InputColumnNames": [ + "a", + "b", + "c" + ], + "OutputColumnName": "a" + } + }, + "e14": { + "estimatorType": "Concatenate", + "parameter": { + "InputColumnNames": [ + "a", + "b", + "c" + ], + "OutputColumnName": "a" + } + }, + "e15": { + "estimatorType": "LightGbmBinary", + "parameter": { + "NumberOfLeaves": 4, + "MinimumExampleCountPerLeaf": 20, + "LearningRate": 1, + "NumberOfTrees": 4, + "SubsampleFraction": 1, + "MaximumBinCountPerFeature": 256, + "FeatureFraction": 1, + "L1Regularization": 0.0000000002, + "L2Regularization": 1, + "LabelColumnName": "Label", + "FeatureColumnName": "Feature" + } + }, + "e16": { + "estimatorType": "Concatenate", + "parameter": { + "InputColumnNames": [ + "a", + "b", + "c" + ], + "OutputColumnName": "a" + } + }, + "e17": { + "estimatorType": "Concatenate", + "parameter": { + "InputColumnNames": [ + "a", + "b", + "c" + ], + "OutputColumnName": "a" + } + }, + "e18": { + 
"estimatorType": "Concatenate", + "parameter": { + "InputColumnNames": [ + "a", + "b", + "c" + ], + "OutputColumnName": "a" + } + }, + "e19": { + "estimatorType": "LightGbmBinary", + "parameter": { + "NumberOfLeaves": 4, + "MinimumExampleCountPerLeaf": 20, + "LearningRate": 1, + "NumberOfTrees": 4, + "SubsampleFraction": 1, + "MaximumBinCountPerFeature": 256, + "FeatureFraction": 1, + "L1Regularization": 0.0000000002, + "L2Regularization": 1, + "LabelColumnName": "Label", + "FeatureColumnName": "Feature" + } + }, + "e20": { + "estimatorType": "Concatenate", + "parameter": { + "InputColumnNames": [ + "a", + "b", + "c" + ], + "OutputColumnName": "a" + } + } + } +} \ No newline at end of file diff --git a/test/Microsoft.ML.AutoML.Tests/SweepablePipelineTests.cs b/test/Microsoft.ML.AutoML.Tests/SweepablePipelineTests.cs new file mode 100644 index 0000000000..f07c7bdc06 --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/SweepablePipelineTests.cs @@ -0,0 +1,73 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+
+using System;
+using System.Collections.Generic;
+using System.Text;
+using Microsoft.ML.AutoML.CodeGen;
+using Microsoft.ML.TestFramework;
+using Xunit;
+using Xunit.Abstractions;
+using ApprovalTests;
+using ApprovalTests.Namers;
+using ApprovalTests.Reporters;
+using System.Text.Json;
+using System.Text.Json.Serialization;
+
+namespace Microsoft.ML.AutoML.Test
+{
+    /// <summary>
+    /// Approval tests for <see cref="SweepablePipeline"/>.
+    /// </summary>
+    public class SweepablePipelineTests : BaseTestClass
+    {
+        private readonly JsonSerializerOptions _jsonSerializerOptions;
+
+        public SweepablePipelineTests(ITestOutputHelper output)
+            : base(output)
+        {
+            _jsonSerializerOptions = new JsonSerializerOptions()
+            {
+                WriteIndented = true,
+                Converters =
+                {
+                    new JsonStringEnumConverter(), new DoubleToDecimalConverter(), new FloatToDecimalConverter(),
+                },
+            };
+
+            // When the HELIX_CORRELATION_ID environment variable is set, approved files are
+            // looked up next to the test assembly.
+            if (Environment.GetEnvironmentVariable("HELIX_CORRELATION_ID") != null)
+            {
+                Approvals.UseAssemblyLocationForApprovedFiles();
+            }
+        }
+
+        /// <summary>
+        /// Appends single estimators, several estimators at once, and whole pipelines, then
+        /// compares the serialized pipeline with the approved JSON.
+        /// </summary>
+        [Fact]
+        [UseReporter(typeof(DiffReporter))]
+        [UseApprovalSubdirectory("ApprovalTests")]
+        public void SweepablePipeline_Append_SweepableEstimator_Test()
+        {
+            var pipeline = new SweepablePipeline();
+            var concatOption = new ConcatOption()
+            {
+                InputColumnNames = new[] { "a", "b", "c" },
+                OutputColumnName = "a",
+            };
+            var lgbmOption = new LgbmOption()
+            {
+                FeatureColumnName = "Feature",
+                LabelColumnName = "Label",
+            };
+
+            // pipeline can append a single sweepable estimator
+            pipeline = pipeline.Append(SweepableEstimatorFactory.CreateConcatenate(concatOption));
+
+            // pipeline can append multiple sweepable estimators.
+            pipeline = pipeline.Append(SweepableEstimatorFactory.CreateLightGbmBinary(lgbmOption), SweepableEstimatorFactory.CreateConcatenate(concatOption));
+
+            // pipeline can append sweepable pipelines mixed with sweepable estimators
+            pipeline = pipeline.Append(SweepableEstimatorFactory.CreateConcatenate(concatOption), pipeline);
+
+            // pipeline can append sweepable pipelines.
+            pipeline = pipeline.Append(pipeline, pipeline);
+
+            Approvals.Verify(JsonSerializer.Serialize(pipeline, _jsonSerializerOptions));
+        }
+    }
+}