
Commit

deterministic repartitioning
Nan Zhu committed Sep 4, 2019
1 parent 573bb12 commit 155a30d
Showing 3 changed files with 103 additions and 8 deletions.
@@ -122,6 +122,25 @@ object DataUtils extends Serializable {
    }
  }

  private def repartitionRDDs(
      deterministicPartition: Boolean,
      numWorkers: Int,
      arrayOfRDDs: Array[RDD[(Int, XGBLabeledPoint)]]): Array[RDD[XGBLabeledPoint]] = {
    if (deterministicPartition) {
      arrayOfRDDs.map { rdd => rdd.partitionBy(new HashPartitioner(numWorkers)) }.map {
        rdd => rdd.map(_._2)
      }
    } else {
      arrayOfRDDs.map { rdd =>
        if (rdd.getNumPartitions != numWorkers) {
          rdd.map(_._2).repartition(numWorkers)
        } else {
          rdd.map(_._2)
        }
      }
    }
  }

  private[spark] def convertDataFrameToXGBLabeledPointRDDs(
      labelCol: Column,
      featuresCol: Column,
@@ -158,13 +177,7 @@ object DataUtils extends Serializable {
        attachPartitionKey(row, deterministicPartition, numWorkers, xgbLp)
      }
    }
-    if (deterministicPartition) {
-      arrayOfRDDs.map {rdd => rdd.partitionBy(new HashPartitioner(numWorkers))}.map {
-        rdd => rdd.map(_._2)
-      }
-    } else {
-      arrayOfRDDs.map(rdd => rdd.map(_._2).repartition(numWorkers))
-    }
+    repartitionRDDs(deterministicPartition, numWorkers, arrayOfRDDs)
  }

}
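
Why the deterministic branch uses partitionBy rather than repartition: HashPartitioner sends each record to partition key.hashCode mod numWorkers, a pure function of the Int key attached upstream by attachPartitionKey (its computation is elided from this diff), whereas a keyless repartition deals rows out round-robin from a starting offset that depends on the input's existing layout, so the same row can land on different workers across runs. A minimal sketch of that property, assuming a local Spark session (the object name, record counts, and local[4] master are illustrative, not part of the commit):

import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

// Standalone sketch: hash partitioning is a pure function of the key.
object HashPartitioningIsDeterministic {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[4]").setAppName("sketch"))
    val records = (1 to 1000).map(i => (i, s"row-$i"))

    // Same records, deliberately different initial layouts.
    val a = sc.parallelize(records, 3)
    val b = sc.parallelize(records.reverse, 7)

    // Per-partition contents after hash-partitioning into 4 parts.
    def layout(rdd: RDD[(Int, String)]): Map[Int, Set[String]] =
      rdd.partitionBy(new HashPartitioner(4))
        .mapPartitionsWithIndex((idx, it) => Iterator((idx, it.map(_._2).toSet)))
        .collect().toMap

    // Each record is placed by its key alone, so the layouts agree
    // no matter how the inputs were partitioned beforehand.
    assert(layout(a) == layout(b))
    sc.stop()
  }
}

Swapping the partitionBy call in layout for .repartition(4) can make the assertion fail whenever the two inputs start from different layouts, which is precisely the situation a job restarted from a checkpoint is in.
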
@@ -23,7 +23,7 @@ import org.apache.spark.ml.param.shared.HasWeightCol
private[spark] sealed trait XGBoostEstimatorCommon extends GeneralParams with LearningTaskParams
  with BoosterParams with ParamMapFuncs with NonParamVariables {

-  protected def needDeterministicRepartitioning: Boolean = {
+  def needDeterministicRepartitioning: Boolean = {
    getCheckpointPath.nonEmpty && getCheckpointInterval > 0
  }
}
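
Dropping protected widens access so code outside the trait hierarchy, including the new suite below, can query the flag; it turns on only when both checkpointing parameters are meaningfully set, since a job resumed from a checkpoint must rebuild exactly the partitions it had before the failure. A rough illustration of the two conditions (the path is a placeholder, and the empty-map default behavior is an assumption about the parameter defaults, not shown in this diff):

val flagged = new XGBoostClassifier(Map(
  "checkpoint_path" -> "/tmp/ckpt",   // placeholder; must be non-empty
  "checkpoint_interval" -> 2))        // must be positive
assert(flagged.needDeterministicRepartitioning)
// Assumed: with both parameters left at their defaults the flag stays off.
assert(!new XGBoostClassifier(Map()).needDeterministicRepartitioning)
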
@@ -0,0 +1,82 @@
/*
Copyright (c) 2014 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package ml.dmlc.xgboost4j.scala.spark

import org.scalatest.FunSuite

import org.apache.spark.sql.functions._

class DeterministicPartitioningSuite extends FunSuite with TmpFolderPerSuite with PerTest {

  test("perform deterministic partitioning when checkpointInterval and" +
    " checkpointPath are set (Classifier)") {
    val tmpPath = createTmpFolder("model1").toAbsolutePath.toString
    val paramMap = Map("eta" -> "1", "max_depth" -> 2,
      "objective" -> "binary:logistic", "checkpoint_path" -> tmpPath,
      "checkpoint_interval" -> 2, "num_workers" -> numWorkers)
    val xgbClassifier = new XGBoostClassifier(paramMap)
    assert(xgbClassifier.needDeterministicRepartitioning)
  }

  test("perform deterministic partitioning when checkpointInterval and" +
    " checkpointPath are set (Regressor)") {
    val tmpPath = createTmpFolder("model1").toAbsolutePath.toString
    val paramMap = Map("eta" -> "1", "max_depth" -> 2,
      "objective" -> "binary:logistic", "checkpoint_path" -> tmpPath,
      "checkpoint_interval" -> 2, "num_workers" -> numWorkers)
    val xgbRegressor = new XGBoostRegressor(paramMap)
    assert(xgbRegressor.needDeterministicRepartitioning)
  }

test("deterministic partitioning takes effect with various parts of data") {
val trainingDF = buildDataFrame(Classification.train)
// the test idea is that, we apply a chain of repartitions over trainingDFs but they
// have to produce the identical RDDs
val transformedDFs = (1 until 6).map(shuffleCount => {
var resultDF = trainingDF
for (i <- 0 until shuffleCount) {
resultDF = resultDF.repartition(numWorkers)
}
resultDF
})
val transformedRDDs = transformedDFs.map(df => DataUtils.convertDataFrameToXGBLabeledPointRDDs(
col("label"),
col("features"),
lit(1.0),
lit(Float.NaN),
None,
numWorkers,
deterministicPartition = true,
df
).head)
val resultsMaps = transformedRDDs.map(rdd => rdd.mapPartitionsWithIndex {
case (partitionIndex, labelPoints) =>
Iterator((partitionIndex, labelPoints.toList))
}.collect().toMap)
resultsMaps.foldLeft(resultsMaps.head) { case (map1, map2) =>
assert(map1.keys.toSet === map2.keys.toSet)
for ((parIdx, labeledPoints) <- map1) {
val sortedA = labeledPoints.sortBy(_.hashCode())
val sortedB = map2(parIdx).sortBy(_.hashCode())
assert(sortedA.length === sortedB.length)
assert(sortedA.indices.forall(idx =>
sortedA(idx).values.toSet === sortedB(idx).values.toSet))
}
map2
}
}
}
