Skip to content

Commit

Permalink
name change for samplingKeyColumn
Browse files Browse the repository at this point in the history
  • Loading branch information
artidoro committed Mar 18, 2019
1 parent 9021a94 commit 822686e
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 37 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ public static void Example()
// Leave out 10% of the dataset for testing. Since this is a ranking problem, we must ensure that the split
// respects the GroupId column, i.e. rows with the same GroupId are either all in the train split or all in
// the test split. The partitionColumnName parameter in Data.TrainTestSplit is used for this purpose.
var split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumnName: "GroupId");
var split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1, partitionColumnName: "GroupId");

// Create the Estimator pipeline. For simplicity, we will train a small tree with 4 leaves and 2 boosting iterations.
var pipeline = mlContext.Ranking.Trainers.LightGbm(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ public static void Example()
// Leave out 10% of the dataset for testing. Since this is a ranking problem, we must ensure that the split
// respects the GroupId column, i.e. rows with the same GroupId are either all in the train split or all in
// the test split. The partitionColumnName parameter in Data.TrainTestSplit is used for this purpose.
var split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumnName: "GroupId");
var split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1, partitionColumnName: "GroupId");

// Create the Estimator pipeline. For simplicity, we will train a small tree with 4 leaves and 2 boosting iterations.
var pipeline = mlContext.Ranking.Trainers.LightGbm(
Expand Down
14 changes: 7 additions & 7 deletions src/Microsoft.ML.Data/DataLoadSave/DataOperationsCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -365,32 +365,32 @@ public IDataView TakeRows(IDataView input, long count)

/// <summary>
/// Split the dataset into the train set and test set according to the given fraction.
/// Respects the <paramref name="samplingKeyColumnName"/> if provided.
/// Respects the <paramref name="partitionColumnName"/> if provided.
/// </summary>
/// <param name="data">The dataset to split.</param>
/// <param name="testFraction">The fraction of data to go into the test set.</param>
/// <param name="samplingKeyColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="samplingKeyColumnName"/>,
/// <param name="partitionColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="partitionColumnName"/>,
/// they are guaranteed to appear in the same subset (train or test). This can be used to ensure no label leakage from the train to the test set.
/// If <see langword="null"/> no row grouping will be performed.</param>
/// <param name="seed">Seed for the random number generator used to select rows for the train-test split.</param>
public TrainTestData TrainTestSplit(IDataView data, double testFraction = 0.1, string samplingKeyColumnName = null, int? seed = null)
public TrainTestData TrainTestSplit(IDataView data, double testFraction = 0.1, string partitionColumnName = null, int? seed = null)
{
_env.CheckValue(data, nameof(data));
_env.CheckParam(0 < testFraction && testFraction < 1, nameof(testFraction), "Must be between 0 and 1 exclusive");
_env.CheckValueOrNull(samplingKeyColumnName);
_env.CheckValueOrNull(partitionColumnName);

EnsureGroupPreservationColumn(_env, ref data, ref samplingKeyColumnName, seed);
EnsureGroupPreservationColumn(_env, ref data, ref partitionColumnName, seed);

var trainFilter = new RangeFilter(_env, new RangeFilter.Options()
{
Column = samplingKeyColumnName,
Column = partitionColumnName,
Min = 0,
Max = testFraction,
Complement = true
}, data);
var testFilter = new RangeFilter(_env, new RangeFilter.Options()
{
Column = samplingKeyColumnName,
Column = partitionColumnName,
Min = 0,
Max = testFraction,
Complement = false
Expand Down
40 changes: 20 additions & 20 deletions src/Microsoft.ML.Data/TrainCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -227,48 +227,48 @@ public BinaryClassificationMetrics EvaluateNonCalibrated(IDataView data, string

/// <summary>
/// Run cross-validation over <paramref name="numberOfFolds"/> folds of <paramref name="data"/>, by fitting <paramref name="estimator"/>,
/// and respecting <paramref name="samplingKeyColumnName"/> if provided.
/// and respecting <paramref name="partitionColumnName"/> if provided.
/// Then evaluate each sub-model against <paramref name="labelColumnName"/> and return metrics.
/// </summary>
/// <param name="data">The data to run cross-validation on.</param>
/// <param name="estimator">The estimator to fit.</param>
/// <param name="numberOfFolds">Number of cross-validation folds.</param>
/// <param name="labelColumnName">The label column (for evaluation).</param>
/// <param name="samplingKeyColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="samplingKeyColumnName"/>,
/// <param name="partitionColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="partitionColumnName"/>,
/// they are guaranteed to appear in the same subset (train or test). This can be used to ensure no label leakage from the train to the test set.
/// If <see langword="null"/> no row grouping will be performed.</param>
/// <param name="seed">Seed for the random number generator used to select rows for cross-validation folds.</param>
/// <returns>Per-fold results: metrics, models, scored datasets.</returns>
public CrossValidationResult<BinaryClassificationMetrics>[] CrossValidateNonCalibrated(
IDataView data, IEstimator<ITransformer> estimator, int numberOfFolds = 5, string labelColumnName = DefaultColumnNames.Label,
string samplingKeyColumnName = null, int? seed = null)
string partitionColumnName = null, int? seed = null)
{
Environment.CheckNonEmpty(labelColumnName, nameof(labelColumnName));
var result = CrossValidateTrain(data, estimator, numberOfFolds, samplingKeyColumnName, seed);
var result = CrossValidateTrain(data, estimator, numberOfFolds, partitionColumnName, seed);
return result.Select(x => new CrossValidationResult<BinaryClassificationMetrics>(x.Model,
EvaluateNonCalibrated(x.Scores, labelColumnName), x.Scores, x.Fold)).ToArray();
}

/// <summary>
/// Run cross-validation over <paramref name="numberOfFolds"/> folds of <paramref name="data"/>, by fitting <paramref name="estimator"/>,
/// and respecting <paramref name="samplingKeyColumnName"/> if provided.
/// and respecting <paramref name="partitionColumnName"/> if provided.
/// Then evaluate each sub-model against <paramref name="labelColumnName"/> and return metrics.
/// </summary>
/// <param name="data">The data to run cross-validation on.</param>
/// <param name="estimator">The estimator to fit.</param>
/// <param name="numberOfFolds">Number of cross-validation folds.</param>
/// <param name="labelColumnName">The label column (for evaluation).</param>
/// <param name="samplingKeyColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="samplingKeyColumnName"/>,
/// <param name="partitionColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="partitionColumnName"/>,
/// they are guaranteed to appear in the same subset (train or test). This can be used to ensure no label leakage from the train to the test set.
/// If <see langword="null"/> no row grouping will be performed.</param>
/// <param name="seed">Seed for the random number generator used to select rows for cross-validation folds.</param>
/// <returns>Per-fold results: metrics, models, scored datasets.</returns>
public CrossValidationResult<CalibratedBinaryClassificationMetrics>[] CrossValidate(
IDataView data, IEstimator<ITransformer> estimator, int numberOfFolds = 5, string labelColumnName = DefaultColumnNames.Label,
string samplingKeyColumnName = null, int? seed = null)
string partitionColumnName = null, int? seed = null)
{
Environment.CheckNonEmpty(labelColumnName, nameof(labelColumnName));
var result = CrossValidateTrain(data, estimator, numberOfFolds, samplingKeyColumnName, seed);
var result = CrossValidateTrain(data, estimator, numberOfFolds, partitionColumnName, seed);
return result.Select(x => new CrossValidationResult<CalibratedBinaryClassificationMetrics>(x.Model,
Evaluate(x.Scores, labelColumnName), x.Scores, x.Fold)).ToArray();
}
Expand Down Expand Up @@ -431,23 +431,23 @@ public ClusteringMetrics Evaluate(IDataView data,

/// <summary>
/// Run cross-validation over <paramref name="numberOfFolds"/> folds of <paramref name="data"/>, by fitting <paramref name="estimator"/>,
/// and respecting <paramref name="samplingKeyColumnName"/> if provided.
/// and respecting <paramref name="partitionColumnName"/> if provided.
/// Then evaluate each sub-model against <paramref name="labelColumnName"/> and return metrics.
/// </summary>
/// <param name="data">The data to run cross-validation on.</param>
/// <param name="estimator">The estimator to fit.</param>
/// <param name="numberOfFolds">Number of cross-validation folds.</param>
/// <param name="labelColumnName">Optional label column for evaluation (clustering tasks may not always have a label).</param>
/// <param name="featuresColumnName">Optional features column for evaluation (needed for calculating Dbi metric)</param>
/// <param name="samplingKeyColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="samplingKeyColumnName"/>,
/// <param name="partitionColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="partitionColumnName"/>,
/// they are guaranteed to appear in the same subset (train or test). This can be used to ensure no label leakage from the train to the test set.
/// If <see langword="null"/> no row grouping will be performed.</param>
/// <param name="seed">Seed for the random number generator used to select rows for cross-validation folds.</param>
public CrossValidationResult<ClusteringMetrics>[] CrossValidate(
IDataView data, IEstimator<ITransformer> estimator, int numberOfFolds = 5, string labelColumnName = null, string featuresColumnName = null,
string samplingKeyColumnName = null, int? seed = null)
string partitionColumnName = null, int? seed = null)
{
var result = CrossValidateTrain(data, estimator, numberOfFolds, samplingKeyColumnName, seed);
var result = CrossValidateTrain(data, estimator, numberOfFolds, partitionColumnName, seed);
return result.Select(x => new CrossValidationResult<ClusteringMetrics>(x.Model,
Evaluate(x.Scores, labelColumnName: labelColumnName, featureColumnName: featuresColumnName), x.Scores, x.Fold)).ToArray();
}
Expand Down Expand Up @@ -505,25 +505,25 @@ public MulticlassClassificationMetrics Evaluate(IDataView data, string labelColu

/// <summary>
/// Run cross-validation over <paramref name="numberOfFolds"/> folds of <paramref name="data"/>, by fitting <paramref name="estimator"/>,
/// and respecting <paramref name="samplingKeyColumnName"/> if provided.
/// and respecting <paramref name="partitionColumnName"/> if provided.
/// Then evaluate each sub-model against <paramref name="labelColumnName"/> and return metrics.
/// </summary>
/// <param name="data">The data to run cross-validation on.</param>
/// <param name="estimator">The estimator to fit.</param>
/// <param name="numberOfFolds">Number of cross-validation folds.</param>
/// <param name="labelColumnName">The label column (for evaluation).</param>
/// <param name="samplingKeyColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="samplingKeyColumnName"/>,
/// <param name="partitionColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="partitionColumnName"/>,
/// they are guaranteed to appear in the same subset (train or test). This can be used to ensure no label leakage from the train to the test set.
/// If <see langword="null"/> no row grouping will be performed.</param>
/// <param name="seed">Seed for the random number generator used to select rows for cross-validation folds.</param>
/// <returns>Per-fold results: metrics, models, scored datasets.</returns>
public CrossValidationResult<MulticlassClassificationMetrics>[] CrossValidate(
IDataView data, IEstimator<ITransformer> estimator, int numberOfFolds = 5, string labelColumnName = DefaultColumnNames.Label,
string samplingKeyColumnName = null, int? seed = null)
string partitionColumnName = null, int? seed = null)
{
Environment.CheckNonEmpty(labelColumnName, nameof(labelColumnName));
var result = CrossValidateTrain(data, estimator, numberOfFolds, samplingKeyColumnName, seed);
var result = CrossValidateTrain(data, estimator, numberOfFolds, partitionColumnName, seed);
return result.Select(x => new CrossValidationResult<MulticlassClassificationMetrics>(x.Model,
Evaluate(x.Scores, labelColumnName), x.Scores, x.Fold)).ToArray();
}
Expand Down Expand Up @@ -572,24 +572,24 @@ public RegressionMetrics Evaluate(IDataView data, string labelColumnName = Defau

/// <summary>
/// Run cross-validation over <paramref name="numberOfFolds"/> folds of <paramref name="data"/>, by fitting <paramref name="estimator"/>,
/// and respecting <paramref name="samplingKeyColumnName"/> if provided.
/// and respecting <paramref name="partitionColumnName"/> if provided.
/// Then evaluate each sub-model against <paramref name="labelColumnName"/> and return metrics.
/// </summary>
/// <param name="data">The data to run cross-validation on.</param>
/// <param name="estimator">The estimator to fit.</param>
/// <param name="numberOfFolds">Number of cross-validation folds.</param>
/// <param name="labelColumnName">The label column (for evaluation).</param>
/// <param name="samplingKeyColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="samplingKeyColumnName"/>,
/// <param name="partitionColumnName">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="partitionColumnName"/>,
/// they are guaranteed to appear in the same subset (train or test). This can be used to ensure no label leakage from the train to the test set.
/// If <see langword="null"/> no row grouping will be performed.</param>
/// <param name="seed">Seed for the random number generator used to select rows for cross-validation folds.</param>
/// <returns>Per-fold results: metrics, models, scored datasets.</returns>
public CrossValidationResult<RegressionMetrics>[] CrossValidate(
IDataView data, IEstimator<ITransformer> estimator, int numberOfFolds = 5, string labelColumnName = DefaultColumnNames.Label,
string samplingKeyColumnName = null, int? seed = null)
string partitionColumnName = null, int? seed = null)
{
Environment.CheckNonEmpty(labelColumnName, nameof(labelColumnName));
var result = CrossValidateTrain(data, estimator, numberOfFolds, samplingKeyColumnName, seed);
var result = CrossValidateTrain(data, estimator, numberOfFolds, partitionColumnName, seed);
return result.Select(x => new CrossValidationResult<RegressionMetrics>(x.Model,
Evaluate(x.Scores, labelColumnName), x.Scores, x.Fold)).ToArray();
}
Expand Down
12 changes: 6 additions & 6 deletions src/Microsoft.ML.Recommender/RecommenderCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -112,26 +112,26 @@ public RegressionMetrics Evaluate(IDataView data, string labelColumnName = Defau

/// <summary>
/// Run cross-validation over <paramref name="numberOfFolds"/> folds of <paramref name="data"/>, by fitting <paramref name="estimator"/>,
/// and respecting <paramref name="samplingKeyColumnName"/> if provided.
/// and respecting <paramref name="partitionColumnName"/> if provided.
/// Then evaluate each sub-model against <paramref name="labelColumnName"/> and return metrics.
/// </summary>
/// <param name="data">The data to run cross-validation on.</param>
/// <param name="estimator">The estimator to fit.</param>
/// <param name="numberOfFolds">Number of cross-validation folds.</param>
/// <param name="labelColumnName">The label column (for evaluation).</param>
/// <param name="samplingKeyColumnName">Optional name of the column to use as a stratification column. If two examples share the same value of the <paramref name="samplingKeyColumnName"/>
/// <param name="partitionColumnName">Optional name of the column to use as a stratification column. If two examples share the same value of the <paramref name="partitionColumnName"/>
/// (if provided), they are guaranteed to appear in the same subset (train or test). Use this to make sure there is no label leakage from train to the test set.
/// If this optional parameter is not provided, a stratification column will be generated, and its values will be random numbers.</param>
/// <param name="seed">Optional parameter used in combination with the <paramref name="samplingKeyColumnName"/>.
/// If the <paramref name="samplingKeyColumnName"/> is not provided, the random numbers generated to create it, will use this seed as value.
/// <param name="seed">Optional parameter used in combination with the <paramref name="partitionColumnName"/>.
/// If the <paramref name="partitionColumnName"/> is not provided, this seed is used to generate the random numbers that create it.
/// If the seed is not provided either, the default value will be used.</param>
/// <returns>Per-fold results: metrics, models, scored datasets.</returns>
public CrossValidationResult<RegressionMetrics>[] CrossValidate(
IDataView data, IEstimator<ITransformer> estimator, int numberOfFolds = 5, string labelColumnName = DefaultColumnNames.Label,
string samplingKeyColumnName = null, int? seed = null)
string partitionColumnName = null, int? seed = null)
{
Environment.CheckNonEmpty(labelColumnName, nameof(labelColumnName));
var result = CrossValidateTrain(data, estimator, numberOfFolds, samplingKeyColumnName, seed);
var result = CrossValidateTrain(data, estimator, numberOfFolds, partitionColumnName, seed);
return result.Select(x => new CrossValidationResult<RegressionMetrics>(x.Model, Evaluate(x.Scores, labelColumnName), x.Scores, x.Fold)).ToArray();
}
}
Expand Down
Loading

0 comments on commit 822686e

Please sign in to comment.