Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Creation of components through MLContext, internalization, and renaming #2510

Merged
merged 7 commits into from Feb 13, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
@@ -0,0 +1,67 @@
using System;
using System.Linq;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;

namespace Microsoft.ML.Samples.Dynamic
{
/// <summary>
/// Sample that trains and evaluates the Prior binary classification trainer on the sentiment dataset.
/// </summary>
public class PriorTrainerSample
{
    /// <summary>
    /// Downloads the sentiment dataset, fits a text-featurization + Prior trainer pipeline on the
    /// training split, and writes the accuracy measured on the test split to the console.
    /// </summary>
    public static void Example()
    {
        // Fetch the dataset from github.com/dotnet/machinelearning.
        // This leaves a sentiment.tsv file on disk that can be opened to inspect the data.
        string sentimentPath = SamplesUtils.DatasetUtils.DownloadSentimentDataset();

        // A preview of the data.
        // Sentiment SentimentText
        // 0 " :Erm, thank you. "
        // 1 ==You're cool==

        // The MLContext is the entry point for ML.NET operations: it is used for exception tracking
        // and logging, as the catalog of available operations, and as the source of randomness.
        var mlContext = new MLContext();

        // Step 1: Read the data as an IDataView.
        // Describe the columns of the text file: their names, kinds and positions.
        var columns = new[]
        {
            new TextLoader.Column("Sentiment", DataKind.R4, 0),
            new TextLoader.Column("SentimentText", DataKind.Text, 1)
        };
        var loader = mlContext.Data.CreateTextLoader(columns: columns, hasHeader: true);

        // Load the file through the loader defined above.
        var allData = loader.Read(sentimentPath);

        // Separate the data into a training set and a test set.
        var split = mlContext.BinaryClassification.TrainTestSplit(allData);

        // Step 2: Pipeline
        // FeaturizeText turns the "SentimentText" column into a "Features" column.
        var featurizer = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText");

        // Add a data-cache step within the pipeline.
        var cachedFeaturizer = featurizer.AppendCacheCheckpoint(mlContext);

        // The Prior trainer is told to use the "Sentiment" column as the label.
        var priorTrainer = mlContext.BinaryClassification.Trainers.Prior(labelColumn: "Sentiment");
        var pipeline = cachedFeaturizer.Append(priorTrainer);

        // Step 3: Train the pipeline on the training set.
        var model = pipeline.Fit(split.TrainSet);

        // Step 4: Score the test set and compute metrics against the "Sentiment" label.
        var scoredTestSet = model.Transform(split.TestSet);
        var metrics = mlContext.BinaryClassification.Evaluate(scoredTestSet, label: "Sentiment");

        // Step 5: Inspect the output.
        Console.WriteLine($"Accuracy: {metrics.Accuracy}");

        // The Prior trainer outputs the proportion of a label in the dataset as the probability of that label.
        // In this case it means that there is a split of around 64%-36% of positive and negative labels in the dataset.
        // Expected output:
        // Accuracy: 0.647058823529412
    }
}
}
@@ -0,0 +1,67 @@
using System;
using System.Linq;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;

namespace Microsoft.ML.Samples.Dynamic
{
public class RandomTrainerSample
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

RandomTrainerSample [](start = 17, length = 19)

Zeeshan A, Shahab and I have PRs where we create a BinaryClassification folder.
Any chance you can move these two to that folder?

{
/// <summary>
/// Downloads the sentiment dataset, fits a text-featurization + Random trainer pipeline on the
/// training split, and writes the accuracy measured on the test split to the console.
/// </summary>
public static void Example()
{
    // Fetch the dataset from github.com/dotnet/machinelearning.
    // This leaves a sentiment.tsv file on disk that can be opened to inspect the data.
    string sentimentPath = SamplesUtils.DatasetUtils.DownloadSentimentDataset();

    // A preview of the data.
    // Sentiment SentimentText
    // 0 " :Erm, thank you. "
    // 1 ==You're cool==

    // The MLContext is the entry point for ML.NET operations: it is used for exception tracking
    // and logging, as the catalog of available operations, and as the source of randomness.
    // A fixed seed is passed here.
    var mlContext = new MLContext(seed: 1);

    // Step 1: Read the data as an IDataView.
    // Describe the columns of the text file: their names, kinds and positions.
    var columns = new[]
    {
        new TextLoader.Column("Sentiment", DataKind.R4, 0),
        new TextLoader.Column("SentimentText", DataKind.Text, 1)
    };
    var loader = mlContext.Data.CreateTextLoader(columns: columns, hasHeader: true);

    // Load the file through the loader defined above.
    var allData = loader.Read(sentimentPath);

    // Separate the data into a training set and a test set.
    var split = mlContext.BinaryClassification.TrainTestSplit(allData);

    // Step 2: Pipeline
    // FeaturizeText turns the "SentimentText" column into a "Features" column.
    var featurizer = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText");

    // Add a data-cache step within the pipeline.
    var cachedFeaturizer = featurizer.AppendCacheCheckpoint(mlContext);

    // The Random trainer is created with its default arguments.
    var randomTrainer = mlContext.BinaryClassification.Trainers.Random();
    var pipeline = cachedFeaturizer.Append(randomTrainer);

    // Step 3: Train the pipeline on the training set.
    var model = pipeline.Fit(split.TrainSet);

    // Step 4: Score the test set and compute metrics against the "Sentiment" label.
    var scoredTestSet = model.Transform(split.TestSet);
    var metrics = mlContext.BinaryClassification.Evaluate(scoredTestSet, label: "Sentiment");

    // Step 5: Inspect the output.
    Console.WriteLine($"Accuracy: {metrics.Accuracy}");

    // We expect an accuracy close to 0.5, as the Random trainer outputs a random prediction.
    // Regardless of the input features, the trainer will predict either positive or negative label with equal probability.
    // Expected output (close to 0.5):
    // Accuracy: 0.588235294117647
}
}
}
@@ -0,0 +1,70 @@
using System;
using Microsoft.ML.Data;

namespace Microsoft.ML.Samples.Dynamic
{
/// <summary>
/// Sample showing a custom row-to-row mapping used both as a standalone transformer
/// and as the first step of an estimator pipeline.
/// </summary>
public class CustomMappingSample
{
    /// <summary>
    /// Loads the small infert dataset, derives an IsUnderThirty column with a custom mapping,
    /// and then trains a FastTree binary classifier that uses that column as its label.
    /// </summary>
    public static void Example()
    {
        // The MLContext is the entry point for ML.NET operations. It can be used for exception
        // tracking and logging, as well as the source of randomness.
        var mlContext = new MLContext();

        // Get a small dataset as an IEnumerable and wrap it in an IDataView.
        var infertRows = SamplesUtils.DatasetUtils.GetInfertData();
        var trainingView = mlContext.Data.ReadFromEnumerable(infertRows);

        // Preview of the data.
        //
        // Age Case Education Induced Parity PooledStratum RowNum ...
        // 26 1 0-5yrs 1 6 3 1 ...
        // 42 1 0-5yrs 1 1 1 2 ...
        // 39 1 0-5yrs 2 6 4 3 ...
        // 34 1 0-5yrs 2 4 2 4 ...
        // 35 1 6-11yrs 1 3 32 5 ...

        // The custom mapping applied to every row: it reads the input Age and
        // sets IsUnderThirty on the output row.
        Action<SamplesUtils.DatasetUtils.SampleInfertData, SampleInfertDataTransformed> mapping =
            (input, output) =>
            {
                output.IsUnderThirty = input.Age < 30;
            };

        // Custom transformations can be applied to data directly or as part of a pipeline.
        // First, apply the mapping directly.
        var mappingTransformer = mlContext.Transforms.CustomMappingTransformer(mapping, null);
        var transformedData = mappingTransformer.Transform(trainingView);

        // Preview of the data.
        //
        // IsUnderThirty Age Case Education Induced Parity PooledStratum RowNum ...
        // true 26 1 0-5yrs 1 6 3 1 ...
        // false 42 1 0-5yrs 1 1 1 2 ...
        // false 39 1 0-5yrs 2 6 4 3 ...
        // false 34 1 0-5yrs 2 4 2 4 ...
        // false 35 1 6-11yrs 1 3 32 5 ...

        // Second, use the same mapping as the first estimator of a pipeline.
        var mappingStep = mlContext.Transforms.CustomMapping(mapping, null);

        // Combine the "Parity" and "Induced" columns into a single "Features" column.
        var featureSources = new[] { "Parity", "Induced" };
        var withFeatures = mappingStep.Append(
            mlContext.Transforms.Concatenate(outputColumnName: "Features", inputColumnNames: featureSources));

        // It is useful to add a caching checkpoint before a trainer that does several passes over the data.
        var cached = withFeatures.AppendCacheCheckpoint(mlContext);

        // Binary FastTree predicts the label column generated by the custom mapping in the first step.
        var fastTree = mlContext.BinaryClassification.Trainers.FastTree(labelColumn: "IsUnderThirty");
        var pipeline = cached.Append(fastTree);

        // Train the pipeline and use the resulting model to transform the same data.
        var model = pipeline.Fit(trainingView);
        transformedData = model.Transform(trainingView);
    }

    /// <summary>
    /// Row type for the transformed infertility dataset; used as the output type of the custom mapping.
    /// </summary>
    public class SampleInfertDataTransformed
    {
        public int RowNum { get; set; }
        public string Education { get; set; }
        public bool IsUnderThirty { get; set; }
        public float Parity { get; set; }
        public float Induced { get; set; }
        public float Case { get; set; }
        public float Spontaneous { get; set; }
        public float Stratum { get; set; }
        public float PooledStratum { get; set; }
    }
}
}
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
Expand Up @@ -905,7 +905,7 @@ private Schema ComputeOutputSchema()

internal const string Summary = "Loads text data file.";

public const string LoaderSignature = "TextLoader";
internal const string LoaderSignature = "TextLoader";

private const uint VerForceVectorSupported = 0x0001000A;
private const uint VersionNoMinCount = 0x0001000C;
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Data/DataLoadSave/TransformWrapper.cs
Expand Up @@ -20,7 +20,7 @@ namespace Microsoft.ML.Data
// It needs to become internal.
public sealed class TransformWrapper : ITransformer
{
public const string LoaderSignature = "TransformWrapper";
internal const string LoaderSignature = "TransformWrapper";
private const string TransformDirTemplate = "Step_{0:000}";

private readonly IHost _host;
Expand Down
12 changes: 8 additions & 4 deletions src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs
Expand Up @@ -54,6 +54,10 @@ public abstract class PredictionTransformerBase<TModel> : IPredictionTransformer
private protected ISchemaBindableMapper BindableMapper;
protected Schema TrainSchema;

/// <summary>
/// Whether a call to <see cref="GetRowToRowMapper(Schema)"/> should succeed, on an
/// appropriate schema.
/// </summary>
public bool IsRowToRowMapper => true;

/// <summary>
Expand Down Expand Up @@ -257,8 +261,8 @@ private protected GenericScorer GetGenericScorer()
public sealed class AnomalyPredictionTransformer<TModel> : SingleFeaturePredictionTransformerBase<TModel>
where TModel : class
{
public readonly string ThresholdColumn;
public readonly float Threshold;
internal readonly string ThresholdColumn;
internal readonly float Threshold;

[BestFriend]
internal AnomalyPredictionTransformer(IHostEnvironment env, TModel model, Schema inputSchema, string featureColumn,
Expand Down Expand Up @@ -326,8 +330,8 @@ private static VersionInfo GetVersionInfo()
public sealed class BinaryPredictionTransformer<TModel> : SingleFeaturePredictionTransformerBase<TModel>
where TModel : class
{
public readonly string ThresholdColumn;
public readonly float Threshold;
internal readonly string ThresholdColumn;
internal readonly float Threshold;

[BestFriend]
internal BinaryPredictionTransformer(IHostEnvironment env, TModel model, Schema inputSchema, string featureColumn,
Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.ML.FastTree/FastTreeRegression.cs
Expand Up @@ -36,7 +36,7 @@ namespace Microsoft.ML.Trainers.FastTree
public sealed partial class FastTreeRegressionTrainer
: BoostingFastTreeTrainerBase<FastTreeRegressionTrainer.Options, RegressionPredictionTransformer<FastTreeRegressionModelParameters>, FastTreeRegressionModelParameters>
{
public const string LoadNameValue = "FastTreeRegression";
internal const string LoadNameValue = "FastTreeRegression";
internal const string UserNameValue = "FastTree (Boosted Trees) Regression";
internal const string Summary = "Trains gradient boosted decision trees to fit target values using least-squares.";
internal const string ShortName = "ftr";
Expand Down Expand Up @@ -142,7 +142,7 @@ private protected override OptimizationAlgorithm ConstructOptimizationAlgorithm(
/// </summary>
/// <param name="set">The dataset</param>
/// <returns>The list of regression targets, or null if <paramref name="set"/> was null</returns>
public static float[] GetDatasetRegressionLabels(Dataset set)
internal static float[] GetDatasetRegressionLabels(Dataset set)
{
if (set == null)
return null;
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.FastTree/FastTreeTweedie.cs
Expand Up @@ -149,7 +149,7 @@ private protected override OptimizationAlgorithm ConstructOptimizationAlgorithm(
/// </summary>
/// <param name="set">The dataset</param>
/// <returns>The list of regression targets, or null if <paramref name="set"/> was null</returns>
public static float[] GetDatasetRegressionLabels(Dataset set)
internal static float[] GetDatasetRegressionLabels(Dataset set)
{
if (set == null)
return null;
Expand Down
6 changes: 3 additions & 3 deletions src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs
Expand Up @@ -75,9 +75,9 @@ private static LightGbmRankingModelParameters Create(IHostEnvironment env, Model
/// <include file='doc.xml' path='doc/members/member[@name="LightGBM"]/*' />
public sealed class LightGbmRankingTrainer : LightGbmTrainerBase<float, RankingPredictionTransformer<LightGbmRankingModelParameters>, LightGbmRankingModelParameters>
{
public const string UserName = "LightGBM Ranking";
public const string LoadNameValue = "LightGBMRanking";
public const string ShortName = "LightGBMRank";
internal const string UserName = "LightGBM Ranking";
internal const string LoadNameValue = "LightGBMRanking";
internal const string ShortName = "LightGBMRank";

public override PredictionKind PredictionKind => PredictionKind.Ranking;

Expand Down
Expand Up @@ -61,7 +61,7 @@ public sealed class DnnImageFeaturizerEstimator : IEstimator<TransformerChain<Co
/// For an example, see Microsoft.ML.DnnImageFeaturizer.ResNet18 </param>
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
public DnnImageFeaturizerEstimator(IHostEnvironment env, string outputColumnName, Func<DnnImageFeaturizerInput, EstimatorChain<ColumnCopyingTransformer>> modelFactory, string inputColumnName = null)
internal DnnImageFeaturizerEstimator(IHostEnvironment env, string outputColumnName, Func<DnnImageFeaturizerInput, EstimatorChain<ColumnCopyingTransformer>> modelFactory, string inputColumnName = null)
{
_modelChain = modelFactory(new DnnImageFeaturizerInput(outputColumnName, inputColumnName ?? outputColumnName, env, new DnnImageModelSelector()));
}
Expand Down
15 changes: 15 additions & 0 deletions src/Microsoft.ML.OnnxTransform/OnnxCatalog.cs
Expand Up @@ -2,6 +2,7 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms;

Expand Down Expand Up @@ -61,5 +62,19 @@ public static class OnnxCatalog
bool fallbackToCpu = false)
=> new OnnxScoringEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnNames, inputColumnNames, modelFile, gpuDeviceId, fallbackToCpu);

/// <summary>
/// Builds a <see cref="DnnImageFeaturizerEstimator"/>, which featurizes an image by applying a pre-trained DNN model.
/// </summary>
/// <param name="catalog">The transform's catalog.</param>
/// <param name="outputColumnName">The name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="modelFactory">An extension method on the <see cref="DnnImageModelSelector"/> that creates a chain of two
/// <see cref="OnnxScoringEstimator"/> (one for preprocessing and one with a pretrained image DNN) with specific models
/// included in a package together with that extension method.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
/// <returns>The estimator constructed from the catalog's environment and the supplied arguments.</returns>
public static DnnImageFeaturizerEstimator DnnFeaturizeImage(this TransformsCatalog catalog,
    string outputColumnName,
    Func<DnnImageFeaturizerInput, EstimatorChain<ColumnCopyingTransformer>> modelFactory,
    string inputColumnName = null)
{
    // The estimator is constructed against the environment that backs the catalog.
    var env = CatalogUtils.GetEnvironment(catalog);
    return new DnnImageFeaturizerEstimator(env, outputColumnName, modelFactory, inputColumnName);
}
}
}
Expand Up @@ -285,17 +285,17 @@ public float[] GetLatentWeights()

public sealed class FieldAwareFactorizationMachinePredictionTransformer : PredictionTransformerBase<FieldAwareFactorizationMachineModelParameters>
{
public const string LoaderSignature = "FAFMPredXfer";
internal const string LoaderSignature = "FAFMPredXfer";

/// <summary>
/// The name of the feature column used by the prediction transformer.
/// </summary>
public IReadOnlyList<string> FeatureColumns { get; }
internal IReadOnlyList<string> FeatureColumns { get; }

/// <summary>
/// The type of the feature columns.
/// </summary>
public IReadOnlyList<ColumnType> FeatureColumnTypes { get; }
internal IReadOnlyList<ColumnType> FeatureColumnTypes { get; }

private readonly string _thresholdColumn;
private readonly float _threshold;
Expand Down
Expand Up @@ -35,7 +35,7 @@ public sealed partial class LogisticRegression : LbfgsTrainerBase<LogisticRegres
BinaryPredictionTransformer<CalibratedModelParametersBase<LinearBinaryModelParameters, PlattCalibrator>>,
CalibratedModelParametersBase<LinearBinaryModelParameters, PlattCalibrator>>
{
public const string LoadNameValue = "LogisticRegression";
internal const string LoadNameValue = "LogisticRegression";
internal const string UserNameValue = "Logistic Regression";
internal const string ShortName = "lr";
internal const string Summary = "Logistic Regression is a method in statistics used to predict the probability of occurrence of an event and can "
Expand Down
Expand Up @@ -41,7 +41,7 @@ namespace Microsoft.ML.Trainers
public sealed class MulticlassLogisticRegression : LbfgsTrainerBase<MulticlassLogisticRegression.Options,
MulticlassPredictionTransformer<MulticlassLogisticRegressionModelParameters>, MulticlassLogisticRegressionModelParameters>
{
public const string LoadNameValue = "MultiClassLogisticRegression";
internal const string LoadNameValue = "MultiClassLogisticRegression";
internal const string UserNameValue = "Multi-class Logistic Regression";
internal const string ShortName = "mlr";

Expand Down