Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion build.proj
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@

<ItemGroup Condition="'$(IncludeBenchmarkData)' == 'true'">
<BenchmarkFile Update="@(BenchmarkFile)">
<Url>https://aka.ms/tlc-resources/benchmarks/%(Identity)</Url>
<Url>https://aka.ms/mlnet-resources/benchmarks/%(Identity)</Url>
<DestinationFile>$(MSBuildThisFileDirectory)/test/data/external/%(Identity)</DestinationFile>
</BenchmarkFile>

Expand Down
1 change: 1 addition & 0 deletions build/ExternalBenchmarkDataFiles.props
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<Project>
<ItemGroup>
<BenchmarkFile Include="digits.csv" />
<BenchmarkFile Include="MSLRWeb10KTest240kRows.tsv" />
<BenchmarkFile Include="MSLRWeb10KTrain720kRows.tsv" />
<BenchmarkFile Include="MSLRWeb10KValidate240kRows.tsv" />
Expand Down
52 changes: 52 additions & 0 deletions test/Microsoft.ML.Benchmarks/RffTransform.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using BenchmarkDotNet.Attributes;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.RunTests;
using Microsoft.ML.Transforms.Conversions;
using System.IO;

namespace Microsoft.ML.Benchmarks
{
public class RffTransformTrain
{
private string _dataPath_Digits;

/// <summary>
/// BenchmarkDotNet global setup: resolves the full path of the Digits
/// training file, stores it in <c>_dataPath_Digits</c>, and verifies that
/// the file is present on disk.
/// </summary>
/// <exception cref="FileNotFoundException">
/// Thrown when no file exists at the resolved path.
/// </exception>
[GlobalSetup]
public void SetupTrainingSpeedTests()
{
    string digitsPath = Path.GetFullPath(TestDatasets.Digits.trainFilename);

    // The field is assigned before the existence check, so it holds the
    // resolved path even when the check below fails.
    _dataPath_Digits = digitsPath;

    if (File.Exists(digitsPath))
    {
        return;
    }

    string message = string.Format(Errors.DatasetNotFound, digitsPath);
    throw new FileNotFoundException(message);
}

[Benchmark]
public void CV_Multiclass_Digits_RffTransform_OVAAveragedPerceptron()
{
var mlContext = new MLContext();
var reader = mlContext.Data.CreateTextReader(new TextLoader.Arguments
{
Column = new[]
{
new TextLoader.Column("Label", DataKind.R4, 64),
new TextLoader.Column("Features", DataKind.R4, new [] { new TextLoader.Range() { Min = 0, Max = 63 }})
},
HasHeader = false,
Separator = ","
});

var data = reader.Read(_dataPath_Digits);

var pipeline = mlContext.Transforms.Projection.CreateRandomFourierFeatures("Features", "FeaturesRFF")
.AppendCacheCheckpoint(mlContext)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AppendCacheCheckpoint [](start = 13, length = 21)

Did you try to play with swapping lines 45 and 46? I believe this may be how our auto-caching used to work.

Also, this might expose the need to specify 'prefetchColumns' to avoid overly lazy caching. See the Cache method in DataOperations.cs line 35

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you try to play with swapping lines 45 and 46? I believe this may be how our auto-caching used to work.

I tried this, but I am still getting the same result.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, this might expose the need to specify 'prefetchColumns' to avoid overly lazy caching. See the Cache method in DataOperations.cs line 35

How can we use this in this benchmark?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Zruty0: Do we have documentation on how/where/when to do caching?

.Append(mlContext.Transforms.Concatenate("Features", "FeaturesRFF"))
.Append(new ValueToKeyMappingEstimator(mlContext, "Label"))
.Append(mlContext.MulticlassClassification.Trainers.OneVersusAll(mlContext.BinaryClassification.Trainers.AveragedPerceptron(numIterations: 10)));

var cvResults = mlContext.MulticlassClassification.CrossValidate(data, pipeline, numFolds: 5);
}
}
}
9 changes: 9 additions & 0 deletions test/Microsoft.ML.TestFramework/Datasets.cs
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,15 @@ public static class TestDatasets
loaderSettings = "xf=expr{col=Features expr=x:float(x>4?1:0)}"
};

// The Digits data set contains images of hand-written digits.
// Each input is given in the form of an 8x8 matrix in which
// each element is an integer in the range 0..16.
// Only the training file is specified here; it is referenced by a path
// under the "external" data folder.
public static TestDataset Digits = new TestDataset
{
name = "Digits",
trainFilename = @"external/digits.csv",
};

public static TestDataset vw = new TestDataset
{
name = "vw",
Expand Down
6 changes: 6 additions & 0 deletions test/data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ The datasets are provided under the original terms that Microsoft received such
>
>Original readme: https://meta.wikimedia.org/wiki/Research:Detox

### Digits
> This dataset is provided under http://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits.
>
> References: C. Kaynak (1995) Methods of Combining Multiple Classifiers and Their Applications to Handwritten Digit Recognition, MSc Thesis, Institute of Graduate Studies in Science and Engineering, Bogazici University.
> E. Alpaydin, C. Kaynak (1998) Cascading Classifiers, Kybernetika.

### UCI Adult Dataset

>Dua, D. and Karra Taniskidou, E. (2017). UCI Machine Learning Repository [https://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.
Expand Down