From 4397e793759f53b3f8501669a95754c9cb2ec6e6 Mon Sep 17 00:00:00 2001 From: Anipik Date: Mon, 10 Dec 2018 14:32:16 -0800 Subject: [PATCH 1/5] Added rffBenchmark --- build.proj | 2 +- build/ExternalBenchmarkDataFiles.props | 1 + test/Microsoft.ML.Benchmarks/RffTransform.cs | 41 ++++++++++++++++++++ test/Microsoft.ML.TestFramework/Datasets.cs | 6 +++ test/data/README.md | 7 ++++ 5 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 test/Microsoft.ML.Benchmarks/RffTransform.cs diff --git a/build.proj b/build.proj index de8c507de7..15fea4e309 100644 --- a/build.proj +++ b/build.proj @@ -78,7 +78,7 @@ - https://aka.ms/tlc-resources/benchmarks/%(Identity) + https://aka.ms/mlnet-resources/benchmarks/%(Identity) $(MSBuildThisFileDirectory)/test/data/external/%(Identity) diff --git a/build/ExternalBenchmarkDataFiles.props b/build/ExternalBenchmarkDataFiles.props index ad3d350d60..42df4ccd96 100644 --- a/build/ExternalBenchmarkDataFiles.props +++ b/build/ExternalBenchmarkDataFiles.props @@ -1,5 +1,6 @@ + diff --git a/test/Microsoft.ML.Benchmarks/RffTransform.cs b/test/Microsoft.ML.Benchmarks/RffTransform.cs new file mode 100644 index 0000000000..52fe281d9c --- /dev/null +++ b/test/Microsoft.ML.Benchmarks/RffTransform.cs @@ -0,0 +1,41 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using BenchmarkDotNet.Attributes; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.RunTests; +using Microsoft.ML.Runtime.Tools; +using Microsoft.ML.Trainers.Online; +using Microsoft.ML.Transforms.Projections; +using System.IO; + +namespace Microsoft.ML.Benchmarks +{ + public class RffTransformTrain + { + private string _dataPath_Digits; + + [GlobalSetup] + public void SetupTrainingSpeedTests() + { + _dataPath_Digits = Path.GetFullPath(TestDatasets.Digits.trainFilename); + + if (!File.Exists(_dataPath_Digits)) + throw new FileNotFoundException(string.Format(Errors.DatasetNotFound, _dataPath_Digits)); + } + + [Benchmark] + public void CV_Multiclass_Digits_RffTransform_OVAAveragedPerceptron() + { + string cmd = @"CV k=5 data={" + _dataPath_Digits + "}" + + " loader=TextLoader{col=Label:R4:64 col=Features:R4:0-63 sep=,}" + + " xf=RffTransform{col=FeaturesRFF:Features}" + + " xf=Concat{col=Features:FeaturesRFF}" + + " tr=OVA{p=AveragedPerceptron{iter=10}}"; + + var environment = EnvironmentFactory.CreateClassificationEnvironment(); + Maml.MainCore(environment, cmd, alwaysPrintStacktrace: false); + } + } +} diff --git a/test/Microsoft.ML.TestFramework/Datasets.cs b/test/Microsoft.ML.TestFramework/Datasets.cs index 5760df712d..f29c9d31cf 100644 --- a/test/Microsoft.ML.TestFramework/Datasets.cs +++ b/test/Microsoft.ML.TestFramework/Datasets.cs @@ -137,6 +137,12 @@ public static class TestDatasets loaderSettings = "xf=expr{col=Features expr=x:float(x>4?1:0)}" }; + public static TestDataset Digits = new TestDataset + { + name = "Digits", + trainFilename = @"external/digits.csv", + }; + public static TestDataset vw = new TestDataset { name = "vw", diff --git a/test/data/README.md b/test/data/README.md index 6a21ece35f..551ea2d0da 100644 --- a/test/data/README.md +++ b/test/data/README.md @@ -16,6 +16,13 @@ The datasets are provided under the original terms that Microsoft received such > >Original readme: https://meta.wikimedia.org/wiki/Research:Detox +### 
Digits +> This dataset is provided under [Scikit](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/datasets/data/digits.csv.gz). Redistributing the dataset "digits.csv" with attribution: +> +> Intermediate source: http://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits +> +> Original Source: C. Kaynak (1995) Methods of Combining Multiple Classifiers and Their Applications to Handwritten Digit Recognition, MSc Thesis, Institute of Graduate Studies in Science and Engineering, Bogazici University. + ### UCI Adult Dataset >Dua, D. and Karra Taniskidou, E. (2017). UCI Machine Learning Repository [https://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science. From fff3f07f1f604ed90191ec3ad43184a93420e053 Mon Sep 17 00:00:00 2001 From: Anipik Date: Mon, 10 Dec 2018 15:53:04 -0800 Subject: [PATCH 2/5] scikit removed and another reference added --- test/data/README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/test/data/README.md b/test/data/README.md index 551ea2d0da..a5e2870da4 100644 --- a/test/data/README.md +++ b/test/data/README.md @@ -17,11 +17,10 @@ The datasets are provided under the original terms that Microsoft received such >Original readme: https://meta.wikimedia.org/wiki/Research:Detox ### Digits -> This dataset is provided under [Scikit](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/datasets/data/digits.csv.gz). Redistributing the dataset "digits.csv" with attribution: +> This dataset is provided under http://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits. > -> Intermediate source: http://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits -> -> Original Source: C. Kaynak (1995) Methods of Combining Multiple Classifiers and Their Applications to Handwritten Digit Recognition, MSc Thesis, Institute of Graduate Studies in Science and Engineering, Bogazici University. 
+> References: C. Kaynak (1995) Methods of Combining Multiple Classifiers and Their Applications to Handwritten Digit Recognition, MSc Thesis, Institute of Graduate Studies in Science and Engineering, Bogazici University. +> E. Alpaydin, C. Kaynak (1998) Cascading Classifiers, Kybernetika. ### UCI Adult Dataset From ab2592768cd80b08d2ff99e87f72ac56cf6005dc Mon Sep 17 00:00:00 2001 From: Anipik Date: Thu, 13 Dec 2018 16:04:02 -0800 Subject: [PATCH 3/5] Converting to api --- test/Microsoft.ML.Benchmarks/RffTransform.cs | 34 +++++++++++++------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/test/Microsoft.ML.Benchmarks/RffTransform.cs b/test/Microsoft.ML.Benchmarks/RffTransform.cs index 52fe281d9c..adbe01b944 100644 --- a/test/Microsoft.ML.Benchmarks/RffTransform.cs +++ b/test/Microsoft.ML.Benchmarks/RffTransform.cs @@ -5,9 +5,7 @@ using BenchmarkDotNet.Attributes; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.RunTests; -using Microsoft.ML.Runtime.Tools; -using Microsoft.ML.Trainers.Online; -using Microsoft.ML.Transforms.Projections; +using Microsoft.ML.Transforms.Conversions; using System.IO; namespace Microsoft.ML.Benchmarks @@ -28,14 +26,28 @@ public void SetupTrainingSpeedTests() [Benchmark] public void CV_Multiclass_Digits_RffTransform_OVAAveragedPerceptron() { - string cmd = @"CV k=5 data={" + _dataPath_Digits + "}" + - " loader=TextLoader{col=Label:R4:64 col=Features:R4:0-63 sep=,}" + - " xf=RffTransform{col=FeaturesRFF:Features}" + - " xf=Concat{col=Features:FeaturesRFF}" + - " tr=OVA{p=AveragedPerceptron{iter=10}}"; - - var environment = EnvironmentFactory.CreateClassificationEnvironment(); - Maml.MainCore(environment, cmd, alwaysPrintStacktrace: false); + var mlContext = new MLContext(); + var reader = mlContext.Data.CreateTextReader(new TextLoader.Arguments + { + Column = new[] + { + new TextLoader.Column("Label", DataKind.R4, 64), + new TextLoader.Column("Features", DataKind.R4, new [] { new TextLoader.Range() { Min = 0, Max 
= 63 }}) + }, + HasHeader = false, + Separator = "," + }); + + var data = reader.Read(_dataPath_Digits); + var cachedTrainData = mlContext.Data.Cache(data); + + var pipeline = mlContext.Transforms.Projection.CreateRandomFourierFeatures("Features", "FeaturesRFF") + .Append(mlContext.Transforms.Concatenate("Features", "FeaturesRFF")) + .Append(new ValueToKeyMappingEstimator(mlContext, "Label")) + .AppendCacheCheckpoint(mlContext) + .Append(mlContext.MulticlassClassification.Trainers.OneVersusAll(mlContext.BinaryClassification.Trainers.AveragedPerceptron(numIterations: 10))); + + var cvResults = mlContext.MulticlassClassification.CrossValidate(cachedTrainData, pipeline, numFolds: 5); } } } From a59a1e70b180dc632c28efe192e7a787a2d7f8a7 Mon Sep 17 00:00:00 2001 From: Anipik Date: Thu, 13 Dec 2018 16:52:45 -0800 Subject: [PATCH 4/5] adding cacheCheck point to correct place --- test/Microsoft.ML.Benchmarks/RffTransform.cs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/Microsoft.ML.Benchmarks/RffTransform.cs b/test/Microsoft.ML.Benchmarks/RffTransform.cs index adbe01b944..eb04b10094 100644 --- a/test/Microsoft.ML.Benchmarks/RffTransform.cs +++ b/test/Microsoft.ML.Benchmarks/RffTransform.cs @@ -39,15 +39,14 @@ public void CV_Multiclass_Digits_RffTransform_OVAAveragedPerceptron() }); var data = reader.Read(_dataPath_Digits); - var cachedTrainData = mlContext.Data.Cache(data); var pipeline = mlContext.Transforms.Projection.CreateRandomFourierFeatures("Features", "FeaturesRFF") .Append(mlContext.Transforms.Concatenate("Features", "FeaturesRFF")) - .Append(new ValueToKeyMappingEstimator(mlContext, "Label")) .AppendCacheCheckpoint(mlContext) + .Append(new ValueToKeyMappingEstimator(mlContext, "Label")) .Append(mlContext.MulticlassClassification.Trainers.OneVersusAll(mlContext.BinaryClassification.Trainers.AveragedPerceptron(numIterations: 10))); - var cvResults = mlContext.MulticlassClassification.CrossValidate(cachedTrainData, pipeline, numFolds: 
5); + var cvResults = mlContext.MulticlassClassification.CrossValidate(data, pipeline, numFolds: 5); } } } From 4752975d8c7c96fd5614e92e5d5ef10b9f8944c5 Mon Sep 17 00:00:00 2001 From: Anipik Date: Thu, 20 Dec 2018 15:51:26 -0800 Subject: [PATCH 5/5] comment added about dataset --- test/Microsoft.ML.Benchmarks/RffTransform.cs | 2 +- test/Microsoft.ML.TestFramework/Datasets.cs | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/test/Microsoft.ML.Benchmarks/RffTransform.cs b/test/Microsoft.ML.Benchmarks/RffTransform.cs index eb04b10094..aee30ddea8 100644 --- a/test/Microsoft.ML.Benchmarks/RffTransform.cs +++ b/test/Microsoft.ML.Benchmarks/RffTransform.cs @@ -41,8 +41,8 @@ public void CV_Multiclass_Digits_RffTransform_OVAAveragedPerceptron() var data = reader.Read(_dataPath_Digits); var pipeline = mlContext.Transforms.Projection.CreateRandomFourierFeatures("Features", "FeaturesRFF") - .Append(mlContext.Transforms.Concatenate("Features", "FeaturesRFF")) .AppendCacheCheckpoint(mlContext) + .Append(mlContext.Transforms.Concatenate("Features", "FeaturesRFF")) .Append(new ValueToKeyMappingEstimator(mlContext, "Label")) .Append(mlContext.MulticlassClassification.Trainers.OneVersusAll(mlContext.BinaryClassification.Trainers.AveragedPerceptron(numIterations: 10))); diff --git a/test/Microsoft.ML.TestFramework/Datasets.cs b/test/Microsoft.ML.TestFramework/Datasets.cs index f29c9d31cf..3a1f5df3be 100644 --- a/test/Microsoft.ML.TestFramework/Datasets.cs +++ b/test/Microsoft.ML.TestFramework/Datasets.cs @@ -137,6 +137,9 @@ public static class TestDatasets loaderSettings = "xf=expr{col=Features expr=x:float(x>4?1:0)}" }; + // The data set contains images of hand-written digits. + // The input is given in the form of an 8x8 matrix where + // each element is an integer in the range 0..16 public static TestDataset Digits = new TestDataset { name = "Digits",