Skip to content

Commit

Permalink
review comments
Browse files Browse the repository at this point in the history
  • Loading branch information
artidoro committed Mar 18, 2019
1 parent 5ef42ab commit 9021a94
Show file tree
Hide file tree
Showing 7 changed files with 22 additions and 22 deletions.
2 changes: 1 addition & 1 deletion docs/code/MlNetCookBook.md
Expand Up @@ -376,7 +376,7 @@ var testData = mlContext.Data.LoadFromTextFile<AdultData>(testDataPath,
separatorChar: ','
);
// Calculate metrics of the model on the test data.
var metrics = mlContext.Regression.Evaluate(model.Transform(testData), label: "Target");
var metrics = mlContext.Regression.Evaluate(model.Transform(testData), labelColumnName: "Target");
```

## How do I save and load the model?
Expand Down
Expand Up @@ -14,7 +14,7 @@ public static void Example()
// Leave out 10% of the dataset for testing. Since this is a ranking problem, we must ensure that the split
// respects the GroupId column, i.e. rows with the same GroupId are either all in the train split or all in
// the test split. The samplingKeyColumn parameter in Data.TrainTestSplit is used for this purpose.
var split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumn: "GroupId");
var split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumnName: "GroupId");

// Create the Estimator pipeline. For simplicity, we will train a small tree with 4 leaves and 2 boosting iterations.
var pipeline = mlContext.Ranking.Trainers.LightGbm(
Expand Down
Expand Up @@ -17,7 +17,7 @@ public static void Example()
// Leave out 10% of the dataset for testing. Since this is a ranking problem, we must ensure that the split
// respects the GroupId column, i.e. rows with the same GroupId are either all in the train split or all in
// the test split. The samplingKeyColumn parameter in Data.TrainTestSplit is used for this purpose.
var split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumn: "GroupId");
var split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumnName: "GroupId");

// Create the Estimator pipeline. For simplicity, we will train a small tree with 4 leaves and 2 boosting iterations.
var pipeline = mlContext.Ranking.Trainers.LightGbm(
Expand Down
14 changes: 7 additions & 7 deletions src/Microsoft.ML.Data/DataLoadSave/DataOperationsCatalog.cs
Expand Up @@ -365,32 +365,32 @@ public IDataView TakeRows(IDataView input, long count)

/// <summary>
/// Split the dataset into the train set and test set according to the given fraction.
/// Respects the <paramref name="samplingKeyColumn"/> if provided.
/// Respects the <paramref name="samplingKeyColumnName"/> if provided.
/// </summary>
/// <param name="data">The dataset to split.</param>
/// <param name="testFraction">The fraction of data to go into the test set.</param>
/// <param name="samplingKeyColumn">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="samplingKeyColumn"/>,
/// <param name="samplingKeyColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="samplingKeyColumnName"/>,
/// they are guaranteed to appear in the same subset (train or test). This can be used to ensure no label leakage from the train to the test set.
/// If <see langword="null"/> no row grouping will be performed.</param>
/// <param name="seed">Seed for the random number generator used to select rows for the train-test split.</param>
public TrainTestData TrainTestSplit(IDataView data, double testFraction = 0.1, string samplingKeyColumn = null, int? seed = null)
public TrainTestData TrainTestSplit(IDataView data, double testFraction = 0.1, string samplingKeyColumnName = null, int? seed = null)
{
_env.CheckValue(data, nameof(data));
_env.CheckParam(0 < testFraction && testFraction < 1, nameof(testFraction), "Must be between 0 and 1 exclusive");
_env.CheckValueOrNull(samplingKeyColumn);
_env.CheckValueOrNull(samplingKeyColumnName);

EnsureGroupPreservationColumn(_env, ref data, ref samplingKeyColumn, seed);
EnsureGroupPreservationColumn(_env, ref data, ref samplingKeyColumnName, seed);

var trainFilter = new RangeFilter(_env, new RangeFilter.Options()
{
Column = samplingKeyColumn,
Column = samplingKeyColumnName,
Min = 0,
Max = testFraction,
Complement = true
}, data);
var testFilter = new RangeFilter(_env, new RangeFilter.Options()
{
Column = samplingKeyColumn,
Column = samplingKeyColumnName,
Min = 0,
Max = testFraction,
Complement = false
Expand Down
8 changes: 4 additions & 4 deletions src/Microsoft.ML.Data/TrainCatalog.cs
Expand Up @@ -572,24 +572,24 @@ public RegressionMetrics Evaluate(IDataView data, string labelColumnName = Defau

/// <summary>
/// Run cross-validation over <paramref name="numberOfFolds"/> folds of <paramref name="data"/>, by fitting <paramref name="estimator"/>,
/// and respecting <paramref name="samplingKeyColumname"/> if provided.
/// and respecting <paramref name="samplingKeyColumnName"/> if provided.
/// Then evaluate each sub-model against <paramref name="labelColumnName"/> and return metrics.
/// </summary>
/// <param name="data">The data to run cross-validation on.</param>
/// <param name="estimator">The estimator to fit.</param>
/// <param name="numberOfFolds">Number of cross-validation folds.</param>
/// <param name="labelColumnName">The label column (for evaluation).</param>
/// <param name="samplingKeyColumname">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="samplingKeyColumname"/>,
/// <param name="samplingKeyColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="samplingKeyColumnName"/>,
/// they are guaranteed to appear in the same subset (train or test). This can be used to ensure no label leakage from the train to the test set.
/// If <see langword="null"/> no row grouping will be performed.</param>
/// <param name="seed">Seed for the random number generator used to select rows for cross-validation folds.</param>
/// <returns>Per-fold results: metrics, models, scored datasets.</returns>
public CrossValidationResult<RegressionMetrics>[] CrossValidate(
IDataView data, IEstimator<ITransformer> estimator, int numberOfFolds = 5, string labelColumnName = DefaultColumnNames.Label,
string samplingKeyColumname = null, int? seed = null)
string samplingKeyColumnName = null, int? seed = null)
{
Environment.CheckNonEmpty(labelColumnName, nameof(labelColumnName));
var result = CrossValidateTrain(data, estimator, numberOfFolds, samplingKeyColumname, seed);
var result = CrossValidateTrain(data, estimator, numberOfFolds, samplingKeyColumnName, seed);
return result.Select(x => new CrossValidationResult<RegressionMetrics>(x.Model,
Evaluate(x.Scores, labelColumnName), x.Scores, x.Fold)).ToArray();
}
Expand Down
12 changes: 6 additions & 6 deletions src/Microsoft.ML.Recommender/RecommenderCatalog.cs
Expand Up @@ -112,26 +112,26 @@ public RegressionMetrics Evaluate(IDataView data, string labelColumnName = Defau

/// <summary>
/// Run cross-validation over <paramref name="numberOfFolds"/> folds of <paramref name="data"/>, by fitting <paramref name="estimator"/>,
/// and respecting <paramref name="samplingKeyColumn"/> if provided.
/// and respecting <paramref name="samplingKeyColumnName"/> if provided.
/// Then evaluate each sub-model against <paramref name="labelColumnName"/> and return metrics.
/// </summary>
/// <param name="data">The data to run cross-validation on.</param>
/// <param name="estimator">The estimator to fit.</param>
/// <param name="numberOfFolds">Number of cross-validation folds.</param>
/// <param name="labelColumnName">The label column (for evaluation).</param>
/// <param name="samplingKeyColumn">Optional name of the column to use as a stratification column. If two examples share the same value of the <paramref name="samplingKeyColumn"/>
/// <param name="samplingKeyColumnName">Optional name of the column to use as a stratification column. If two examples share the same value of the <paramref name="samplingKeyColumnName"/>
/// (if provided), they are guaranteed to appear in the same subset (train or test). Use this to make sure there is no label leakage from train to the test set.
/// If this optional parameter is not provided, a stratification column will be generated, and its values will be random numbers.</param>
/// <param name="seed">Optional parameter used in combination with the <paramref name="samplingKeyColumn"/>.
/// If the <paramref name="samplingKeyColumn"/> is not provided, the random numbers generated to create it, will use this seed as value.
/// <param name="seed">Optional parameter used in combination with the <paramref name="samplingKeyColumnName"/>.
/// If the <paramref name="samplingKeyColumnName"/> is not provided, the random numbers generated to create it will use this seed as their value.
/// If it is not provided, the default value will be used.</param>
/// <returns>Per-fold results: metrics, models, scored datasets.</returns>
public CrossValidationResult<RegressionMetrics>[] CrossValidate(
IDataView data, IEstimator<ITransformer> estimator, int numberOfFolds = 5, string labelColumnName = DefaultColumnNames.Label,
string samplingKeyColumn = null, int? seed = null)
string samplingKeyColumnName = null, int? seed = null)
{
Environment.CheckNonEmpty(labelColumnName, nameof(labelColumnName));
var result = CrossValidateTrain(data, estimator, numberOfFolds, samplingKeyColumn, seed);
var result = CrossValidateTrain(data, estimator, numberOfFolds, samplingKeyColumnName, seed);
return result.Select(x => new CrossValidationResult<RegressionMetrics>(x.Model, Evaluate(x.Scores, labelColumnName), x.Scores, x.Fold)).ToArray();
}
}
Expand Down
4 changes: 2 additions & 2 deletions test/Microsoft.ML.Tests/Scenarios/Api/TestApi.cs
Expand Up @@ -324,7 +324,7 @@ public void TestTrainTestSplit()
// Now let's do same thing but with presence of stratificationColumn.
// Rows with same values in this stratificationColumn should end up in same subset (train or test).
// So let's break dataset by "Workclass" column.
var stratSplit = mlContext.Data.TrainTestSplit(input, samplingKeyColumn: "Workclass");
var stratSplit = mlContext.Data.TrainTestSplit(input, samplingKeyColumnName: "Workclass");
var stratTrainWorkclass = getWorkclass(stratSplit.TrainSet);
var stratTestWorkClass = getWorkclass(stratSplit.TestSet);
// Let's get unique values for "Workclass" column from train subset.
Expand All @@ -336,7 +336,7 @@ public void TestTrainTestSplit()

// Let's do same thing, but this time we will choose different seed.
// Stratification column should still break dataset properly without same values in both subsets.
var stratSeed = mlContext.Data.TrainTestSplit(input, samplingKeyColumn:"Workclass", seed: 1000000);
var stratSeed = mlContext.Data.TrainTestSplit(input, samplingKeyColumnName:"Workclass", seed: 1000000);
var stratTrainWithSeedWorkclass = getWorkclass(stratSeed.TrainSet);
var stratTestWithSeedWorkClass = getWorkclass(stratSeed.TestSet);
// Let's get unique values for "Workclass" column from train subset.
Expand Down

0 comments on commit 9021a94

Please sign in to comment.