From 6ccf47923e4244f667af8fe8ef6eaa183e7dda3b Mon Sep 17 00:00:00 2001 From: Piotr Telman Date: Fri, 30 Oct 2020 18:03:51 +0100 Subject: [PATCH] Auto.ML: Fix issue when parsing float string fails on pl-PL culture set using Regression Experiment (#5163) * Fix issue when parsing float string fails on pl-PL culture set * Added InvariantCulture float parsing as per CodeReview request * Update src/Microsoft.ML.AutoML/Sweepers/SweeperProbabilityUtils.cs Co-authored-by: Justin Ormont * Update Parameters.cs * Added PL test * Added multiple cultures * debugging CI failure * Debug runSpecific * Revert "Debug runSpecific" This reverts commit 95b728099415cacbe8cf3819ec51ce50cec94eb2. * Removed LightGBM and addressed comments * Increased time * Increase time * Increased time Co-authored-by: Justin Ormont Co-authored-by: Antonio Velazquez --- .../Sweepers/Parameters.cs | 2 +- .../Sweepers/SweeperProbabilityUtils.cs | 11 +++- .../Microsoft.ML.AutoML.Tests/AutoFitTests.cs | 66 ++++++++++++++----- 3 files changed, 61 insertions(+), 18 deletions(-) diff --git a/src/Microsoft.ML.AutoML/Sweepers/Parameters.cs b/src/Microsoft.ML.AutoML/Sweepers/Parameters.cs index 2bf22db4c1..54765684d0 100644 --- a/src/Microsoft.ML.AutoML/Sweepers/Parameters.cs +++ b/src/Microsoft.ML.AutoML/Sweepers/Parameters.cs @@ -83,7 +83,7 @@ public LongParameterValue(string name, long value) { _name = name; _value = value; - _valueText = _value.ToString("D"); + _valueText = _value.ToString("D", CultureInfo.InvariantCulture); } public bool Equals(IParameterValue other) diff --git a/src/Microsoft.ML.AutoML/Sweepers/SweeperProbabilityUtils.cs b/src/Microsoft.ML.AutoML/Sweepers/SweeperProbabilityUtils.cs index f848a4e27b..89c13c152f 100644 --- a/src/Microsoft.ML.AutoML/Sweepers/SweeperProbabilityUtils.cs +++ b/src/Microsoft.ML.AutoML/Sweepers/SweeperProbabilityUtils.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; +using System.Globalization; using Microsoft.ML.Internal.CpuMath; namespace 
Microsoft.ML.AutoML @@ -98,13 +99,15 @@ public static float[] ParameterSetAsFloatArray(IValueGenerator[] sweepParams, Pa } else if (sweepParam is LongValueGenerator lvg) { + var longValue = GetIfIParameterValueOfT<long>(pset) ?? long.Parse(pset.ValueText, CultureInfo.InvariantCulture); // Normalizing all numeric parameters to [0,1] range. - result.Add(lvg.NormalizeValue(new LongParameterValue(pset.Name, long.Parse(pset.ValueText)))); + result.Add(lvg.NormalizeValue(new LongParameterValue(pset.Name, longValue))); } else if (sweepParam is FloatValueGenerator fvg) { + var floatValue = GetIfIParameterValueOfT<float>(pset) ?? float.Parse(pset.ValueText, CultureInfo.InvariantCulture); // Normalizing all numeric parameters to [0,1] range. - result.Add(fvg.NormalizeValue(new FloatParameterValue(pset.Name, float.Parse(pset.ValueText)))); + result.Add(fvg.NormalizeValue(new FloatParameterValue(pset.Name, floatValue))); } else { @@ -115,6 +118,10 @@ public static float[] ParameterSetAsFloatArray(IValueGenerator[] sweepParams, Pa return result.ToArray(); } + private static T? GetIfIParameterValueOfT<T>(IParameterValue parameterValue) + where T : struct => + parameterValue is IParameterValue<T> pvt ? pvt.Value : default(T?); + public static ParameterSet FloatArrayAsParameterSet(IValueGenerator[] sweepParams, float[] array, bool expandedCategoricals = true) { Runtime.Contracts.Assert(array.Length == sweepParams.Length); diff --git a/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs b/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs index 40ccfdc067..bea3f97f3f 100644 --- a/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs +++ b/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs @@ -2,7 +2,10 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. 
+using System; +using System.Globalization; using System.Linq; +using System.Threading; using Microsoft.ML.Data; using Microsoft.ML.TestFramework; using Microsoft.ML.TestFramework.Attributes; @@ -102,22 +105,55 @@ private void Context_Log(object sender, LoggingEventArgs e) //throw new NotImplementedException(); } - [Fact] - public void AutoFitRegressionTest() + [Theory] + [InlineData("en-US")] + [InlineData("ar-SA")] + [InlineData("pl-PL")] + public void AutoFitRegressionTest(string culture) { - var context = new MLContext(1); - var dataPath = DatasetUtil.GetMlNetGeneratedRegressionDataset(); - var columnInference = context.Auto().InferColumns(dataPath, DatasetUtil.MlNetGeneratedRegressionLabel); - var textLoader = context.Data.CreateTextLoader(columnInference.TextLoaderOptions); - var trainData = textLoader.Load(dataPath); - var validationData = context.Data.TakeRows(trainData, 20); - trainData = context.Data.SkipRows(trainData, 20); - var result = context.Auto() - .CreateRegressionExperiment(0) - .Execute(trainData, validationData, - new ColumnInformation() { LabelColumnName = DatasetUtil.MlNetGeneratedRegressionLabel }); + var originalCulture = Thread.CurrentThread.CurrentCulture; + try + { + Thread.CurrentThread.CurrentCulture = new CultureInfo(culture); + + // If users run AutoML with a different locale, sometimes + // the sweeper encounters problems when parsing some strings. + // So testing in another culture is necessary. + // Furthermore, these issues might only occur after ~70 + // iterations, so more experiment time is needed for this to + // occur. + uint experimentTime = (uint) (culture == "en-US" ? 
0 : 180); + + var experimentSettings = new RegressionExperimentSettings { MaxExperimentTimeInSeconds = experimentTime}; + if (!Environment.Is64BitProcess) + { + // LightGBM isn't available on x86 machines + experimentSettings.Trainers.Remove(RegressionTrainer.LightGbm); + } + + var context = new MLContext(1); + var dataPath = DatasetUtil.GetMlNetGeneratedRegressionDataset(); + var columnInference = context.Auto().InferColumns(dataPath, DatasetUtil.MlNetGeneratedRegressionLabel); + var textLoader = context.Data.CreateTextLoader(columnInference.TextLoaderOptions); + var trainData = textLoader.Load(dataPath); + var validationData = context.Data.TakeRows(trainData, 20); + trainData = context.Data.SkipRows(trainData, 20); + var result = context.Auto() + .CreateRegressionExperiment(experimentSettings) + .Execute(trainData, validationData, + new ColumnInformation() { LabelColumnName = DatasetUtil.MlNetGeneratedRegressionLabel }); + + Assert.True(result.RunDetails.Max(i => i.ValidationMetrics.RSquared > 0.9)); + + // Ensure experimentTime allows enough iterations to fully test the internationalization code + // If the below assertion fails, increase the experiment time so the number of iterations is met + Assert.True(culture == "en-US" || result.RunDetails.Count() >= 75, $"RunDetails.Count() = {result.RunDetails.Count()}, below 75"); - Assert.True(result.RunDetails.Max(i => i.ValidationMetrics.RSquared > 0.9)); + } + finally + { + Thread.CurrentThread.CurrentCulture = originalCulture; + } } [LightGBMFact] @@ -351,4 +387,4 @@ private TextLoader.Options GetLoaderArgsRank(string labelColumnName, string grou }; } } -} +} \ No newline at end of file