Skip to content

Commit

Permalink
name change for samplingKeyColumn
Browse files Browse the repository at this point in the history
  • Loading branch information
artidoro committed Mar 18, 2019
1 parent 9021a94 commit 822686e
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 37 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ public static void Example()
// Leave out 10% of the dataset for testing. Since this is a ranking problem, we must ensure that the split
// respects the GroupId column, i.e. rows with the same GroupId are either all in the train split or all in
// the test split. The partitionColumnName parameter in Data.TrainTestSplit is used for this purpose.
var split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumnName: "GroupId");
var split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1, partitionColumnName: "GroupId");

// Create the Estimator pipeline. For simplicity, we will train a small tree with 4 leaves and 2 boosting iterations.
var pipeline = mlContext.Ranking.Trainers.LightGbm(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ public static void Example()
// Leave out 10% of the dataset for testing. Since this is a ranking problem, we must ensure that the split
// respects the GroupId column, i.e. rows with the same GroupId are either all in the train split or all in
// the test split. The partitionColumnName parameter in Data.TrainTestSplit is used for this purpose.
var split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumnName: "GroupId");
var split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1, partitionColumnName: "GroupId");

// Create the Estimator pipeline. For simplicity, we will train a small tree with 4 leaves and 2 boosting iterations.
var pipeline = mlContext.Ranking.Trainers.LightGbm(
Expand Down
14 changes: 7 additions & 7 deletions src/Microsoft.ML.Data/DataLoadSave/DataOperationsCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -365,32 +365,32 @@ public IDataView TakeRows(IDataView input, long count)

/// <summary>
/// Split the dataset into the train set and test set according to the given fraction.
/// Respects the <paramref name="samplingKeyColumnName"/> if provided.
/// Respects the <paramref name="partitionColumnName"/> if provided.
/// </summary>
/// <param name="data">The dataset to split.</param>
/// <param name="testFraction">The fraction of data to go into the test set.</param>
/// <param name="samplingKeyColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="samplingKeyColumnName"/>,
/// <param name="partitionColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="partitionColumnName"/>,
/// they are guaranteed to appear in the same subset (train or test). This can be used to ensure no label leakage from the train to the test set.
/// If <see langword="null"/> no row grouping will be performed.</param>
/// <param name="seed">Seed for the random number generator used to select rows for the train-test split.</param>
public TrainTestData TrainTestSplit(IDataView data, double testFraction = 0.1, string samplingKeyColumnName = null, int? seed = null)
public TrainTestData TrainTestSplit(IDataView data, double testFraction = 0.1, string partitionColumnName = null, int? seed = null)
{
_env.CheckValue(data, nameof(data));
_env.CheckParam(0 < testFraction && testFraction < 1, nameof(testFraction), "Must be between 0 and 1 exclusive");
_env.CheckValueOrNull(samplingKeyColumnName);
_env.CheckValueOrNull(partitionColumnName);

EnsureGroupPreservationColumn(_env, ref data, ref samplingKeyColumnName, seed);
EnsureGroupPreservationColumn(_env, ref data, ref partitionColumnName, seed);

var trainFilter = new RangeFilter(_env, new RangeFilter.Options()
{
Column = samplingKeyColumnName,
Column = partitionColumnName,
Min = 0,
Max = testFraction,
Complement = true
}, data);
var testFilter = new RangeFilter(_env, new RangeFilter.Options()
{
Column = samplingKeyColumnName,
Column = partitionColumnName,
Min = 0,
Max = testFraction,
Complement = false
Expand Down
40 changes: 20 additions & 20 deletions src/Microsoft.ML.Data/TrainCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -227,48 +227,48 @@ public BinaryClassificationMetrics EvaluateNonCalibrated(IDataView data, string

/// <summary>
/// Run cross-validation over <paramref name="numberOfFolds"/> folds of <paramref name="data"/>, by fitting <paramref name="estimator"/>,
/// and respecting <paramref name="samplingKeyColumnName"/> if provided.
/// and respecting <paramref name="partitionColumnName"/> if provided.
/// Then evaluate each sub-model against <paramref name="labelColumnName"/> and return metrics.
/// </summary>
/// <param name="data">The data to run cross-validation on.</param>
/// <param name="estimator">The estimator to fit.</param>
/// <param name="numberOfFolds">Number of cross-validation folds.</param>
/// <param name="labelColumnName">The label column (for evaluation).</param>
/// <param name="samplingKeyColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="samplingKeyColumnName"/>,
/// <param name="partitionColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="partitionColumnName"/>,
/// they are guaranteed to appear in the same subset (train or test). This can be used to ensure no label leakage from the train to the test set.
/// If <see langword="null"/> no row grouping will be performed.</param>
/// <param name="seed">Seed for the random number generator used to select rows for cross-validation folds.</param>
/// <returns>Per-fold results: metrics, models, scored datasets.</returns>
public CrossValidationResult<BinaryClassificationMetrics>[] CrossValidateNonCalibrated(
IDataView data, IEstimator<ITransformer> estimator, int numberOfFolds = 5, string labelColumnName = DefaultColumnNames.Label,
string samplingKeyColumnName = null, int? seed = null)
string partitionColumnName = null, int? seed = null)
{
Environment.CheckNonEmpty(labelColumnName, nameof(labelColumnName));
var result = CrossValidateTrain(data, estimator, numberOfFolds, samplingKeyColumnName, seed);
var result = CrossValidateTrain(data, estimator, numberOfFolds, partitionColumnName, seed);
return result.Select(x => new CrossValidationResult<BinaryClassificationMetrics>(x.Model,
EvaluateNonCalibrated(x.Scores, labelColumnName), x.Scores, x.Fold)).ToArray();
}

/// <summary>
/// Run cross-validation over <paramref name="numberOfFolds"/> folds of <paramref name="data"/>, by fitting <paramref name="estimator"/>,
/// and respecting <paramref name="samplingKeyColumnName"/> if provided.
/// and respecting <paramref name="partitionColumnName"/> if provided.
/// Then evaluate each sub-model against <paramref name="labelColumnName"/> and return metrics.
/// </summary>
/// <param name="data">The data to run cross-validation on.</param>
/// <param name="estimator">The estimator to fit.</param>
/// <param name="numberOfFolds">Number of cross-validation folds.</param>
/// <param name="labelColumnName">The label column (for evaluation).</param>
/// <param name="samplingKeyColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="samplingKeyColumnName"/>,
/// <param name="partitionColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="partitionColumnName"/>,
/// they are guaranteed to appear in the same subset (train or test). This can be used to ensure no label leakage from the train to the test set.
/// If <see langword="null"/> no row grouping will be performed.</param>
/// <param name="seed">Seed for the random number generator used to select rows for cross-validation folds.</param>
/// <returns>Per-fold results: metrics, models, scored datasets.</returns>
public CrossValidationResult<CalibratedBinaryClassificationMetrics>[] CrossValidate(
IDataView data, IEstimator<ITransformer> estimator, int numberOfFolds = 5, string labelColumnName = DefaultColumnNames.Label,
string samplingKeyColumnName = null, int? seed = null)
string partitionColumnName = null, int? seed = null)
{
Environment.CheckNonEmpty(labelColumnName, nameof(labelColumnName));
var result = CrossValidateTrain(data, estimator, numberOfFolds, samplingKeyColumnName, seed);
var result = CrossValidateTrain(data, estimator, numberOfFolds, partitionColumnName, seed);
return result.Select(x => new CrossValidationResult<CalibratedBinaryClassificationMetrics>(x.Model,
Evaluate(x.Scores, labelColumnName), x.Scores, x.Fold)).ToArray();
}
Expand Down Expand Up @@ -431,23 +431,23 @@ public ClusteringMetrics Evaluate(IDataView data,

/// <summary>
/// Run cross-validation over <paramref name="numberOfFolds"/> folds of <paramref name="data"/>, by fitting <paramref name="estimator"/>,
/// and respecting <paramref name="samplingKeyColumnName"/> if provided.
/// and respecting <paramref name="partitionColumnName"/> if provided.
/// Then evaluate each sub-model against <paramref name="labelColumnName"/> and return metrics.
/// </summary>
/// <param name="data">The data to run cross-validation on.</param>
/// <param name="estimator">The estimator to fit.</param>
/// <param name="numberOfFolds">Number of cross-validation folds.</param>
/// <param name="labelColumnName">Optional label column for evaluation (clustering tasks may not always have a label).</param>
/// <param name="featuresColumnName">Optional features column for evaluation (needed for calculating Dbi metric)</param>
/// <param name="samplingKeyColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="samplingKeyColumnName"/>,
/// <param name="partitionColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="partitionColumnName"/>,
/// they are guaranteed to appear in the same subset (train or test). This can be used to ensure no label leakage from the train to the test set.
/// If <see langword="null"/> no row grouping will be performed.</param>
/// <param name="seed">Seed for the random number generator used to select rows for cross-validation folds.</param>
public CrossValidationResult<ClusteringMetrics>[] CrossValidate(
IDataView data, IEstimator<ITransformer> estimator, int numberOfFolds = 5, string labelColumnName = null, string featuresColumnName = null,
string samplingKeyColumnName = null, int? seed = null)
string partitionColumnName = null, int? seed = null)
{
var result = CrossValidateTrain(data, estimator, numberOfFolds, samplingKeyColumnName, seed);
var result = CrossValidateTrain(data, estimator, numberOfFolds, partitionColumnName, seed);
return result.Select(x => new CrossValidationResult<ClusteringMetrics>(x.Model,
Evaluate(x.Scores, labelColumnName: labelColumnName, featureColumnName: featuresColumnName), x.Scores, x.Fold)).ToArray();
}
Expand Down Expand Up @@ -505,25 +505,25 @@ public MulticlassClassificationMetrics Evaluate(IDataView data, string labelColu

/// <summary>
/// Run cross-validation over <paramref name="numberOfFolds"/> folds of <paramref name="data"/>, by fitting <paramref name="estimator"/>,
/// and respecting <paramref name="samplingKeyColumnName"/> if provided.
/// and respecting <paramref name="partitionColumnName"/> if provided.
/// Then evaluate each sub-model against <paramref name="labelColumnName"/> and return metrics.
/// </summary>
/// <param name="data">The data to run cross-validation on.</param>
/// <param name="estimator">The estimator to fit.</param>
/// <param name="numberOfFolds">Number of cross-validation folds.</param>
/// <param name="labelColumnName">The label column (for evaluation).</param>
/// <param name="samplingKeyColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="samplingKeyColumnName"/>,
/// <param name="partitionColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="partitionColumnName"/>,
/// they are guaranteed to appear in the same subset (train or test). This can be used to ensure no label leakage from the train to the test set.
/// If <see langword="null"/> no row grouping will be performed.</param>
/// <param name="seed">Seed for the random number generator used to select rows for cross-validation folds.</param>
/// <returns>Per-fold results: metrics, models, scored datasets.</returns>
public CrossValidationResult<MulticlassClassificationMetrics>[] CrossValidate(
IDataView data, IEstimator<ITransformer> estimator, int numberOfFolds = 5, string labelColumnName = DefaultColumnNames.Label,
string samplingKeyColumnName = null, int? seed = null)
string partitionColumnName = null, int? seed = null)
{
Environment.CheckNonEmpty(labelColumnName, nameof(labelColumnName));
var result = CrossValidateTrain(data, estimator, numberOfFolds, samplingKeyColumnName, seed);
var result = CrossValidateTrain(data, estimator, numberOfFolds, partitionColumnName, seed);
return result.Select(x => new CrossValidationResult<MulticlassClassificationMetrics>(x.Model,
Evaluate(x.Scores, labelColumnName), x.Scores, x.Fold)).ToArray();
}
Expand Down Expand Up @@ -572,24 +572,24 @@ public RegressionMetrics Evaluate(IDataView data, string labelColumnName = Defau

/// <summary>
/// Run cross-validation over <paramref name="numberOfFolds"/> folds of <paramref name="data"/>, by fitting <paramref name="estimator"/>,
/// and respecting <paramref name="samplingKeyColumnName"/> if provided.
/// and respecting <paramref name="partitionColumnName"/> if provided.
/// Then evaluate each sub-model against <paramref name="labelColumnName"/> and return metrics.
/// </summary>
/// <param name="data">The data to run cross-validation on.</param>
/// <param name="estimator">The estimator to fit.</param>
/// <param name="numberOfFolds">Number of cross-validation folds.</param>
/// <param name="labelColumnName">The label column (for evaluation).</param>
/// <param name="samplingKeyColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="samplingKeyColumnName"/>,
/// <param name="partitionColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="partitionColumnName"/>,
/// they are guaranteed to appear in the same subset (train or test). This can be used to ensure no label leakage from the train to the test set.
/// If <see langword="null"/> no row grouping will be performed.</param>
/// <param name="seed">Seed for the random number generator used to select rows for cross-validation folds.</param>
/// <returns>Per-fold results: metrics, models, scored datasets.</returns>
public CrossValidationResult<RegressionMetrics>[] CrossValidate(
IDataView data, IEstimator<ITransformer> estimator, int numberOfFolds = 5, string labelColumnName = DefaultColumnNames.Label,
string samplingKeyColumnName = null, int? seed = null)
string partitionColumnName = null, int? seed = null)
{
Environment.CheckNonEmpty(labelColumnName, nameof(labelColumnName));
var result = CrossValidateTrain(data, estimator, numberOfFolds, samplingKeyColumnName, seed);
var result = CrossValidateTrain(data, estimator, numberOfFolds, partitionColumnName, seed);
return result.Select(x => new CrossValidationResult<RegressionMetrics>(x.Model,
Evaluate(x.Scores, labelColumnName), x.Scores, x.Fold)).ToArray();
}
Expand Down
12 changes: 6 additions & 6 deletions src/Microsoft.ML.Recommender/RecommenderCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -112,26 +112,26 @@ public RegressionMetrics Evaluate(IDataView data, string labelColumnName = Defau

/// <summary>
/// Run cross-validation over <paramref name="numberOfFolds"/> folds of <paramref name="data"/>, by fitting <paramref name="estimator"/>,
/// and respecting <paramref name="samplingKeyColumnName"/> if provided.
/// and respecting <paramref name="partitionColumnName"/> if provided.
/// Then evaluate each sub-model against <paramref name="labelColumnName"/> and return metrics.
/// </summary>
/// <param name="data">The data to run cross-validation on.</param>
/// <param name="estimator">The estimator to fit.</param>
/// <param name="numberOfFolds">Number of cross-validation folds.</param>
/// <param name="labelColumnName">The label column (for evaluation).</param>
/// <param name="samplingKeyColumnName">Optional name of the column to use as a stratification column. If two examples share the same value of the <paramref name="samplingKeyColumnName"/>
/// <param name="partitionColumnName">Optional name of the column to use as a stratification column. If two examples share the same value of the <paramref name="partitionColumnName"/>
/// (if provided), they are guaranteed to appear in the same subset (train or test). Use this to make sure there is no label leakage from train to the test set.
/// If this optional parameter is not provided, a stratification column will be generated, and its values will be random numbers.</param>
/// <param name="seed">Optional parameter used in combination with the <paramref name="samplingKeyColumnName"/>.
/// If the <paramref name="samplingKeyColumnName"/> is not provided, the random numbers generated to create it, will use this seed as value.
/// <param name="seed">Optional parameter used in combination with the <paramref name="partitionColumnName"/>.
/// If the <paramref name="partitionColumnName"/> is not provided, this seed is used to generate the random numbers that create it.
/// If the seed is not provided either, the default value will be used.</param>
/// <returns>Per-fold results: metrics, models, scored datasets.</returns>
public CrossValidationResult<RegressionMetrics>[] CrossValidate(
IDataView data, IEstimator<ITransformer> estimator, int numberOfFolds = 5, string labelColumnName = DefaultColumnNames.Label,
string samplingKeyColumnName = null, int? seed = null)
string partitionColumnName = null, int? seed = null)
{
Environment.CheckNonEmpty(labelColumnName, nameof(labelColumnName));
var result = CrossValidateTrain(data, estimator, numberOfFolds, samplingKeyColumnName, seed);
var result = CrossValidateTrain(data, estimator, numberOfFolds, partitionColumnName, seed);
return result.Select(x => new CrossValidationResult<RegressionMetrics>(x.Model, Evaluate(x.Scores, labelColumnName), x.Scores, x.Fold)).ToArray();
}
}
Expand Down
Loading

0 comments on commit 822686e

Please sign in to comment.