Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[SW-2449] asH2OFrame Method Could Fail on a String Column Having More Than 10 Million Distinct Values (#2341) * [SW-2449] asH2OFrame Method Could Fail on a String Column Having More Than 10 Million Distinct Values * spotlessApply * Fix SupportedRDDConverterTestSuite tests * Revert changes in tests * Fix OOM in tests * Escape names * Move the whole conversion logic to H2O backend * Use ExpectedType.Categorical * typo * spotlessApply * Remove DataTypeConverterTestSuite * fix DataFrameConverterTestSuite * fix calculation of the ratio * fix ConvertCategoricalToStringColumnsTask * fix empty frames * conversion logic to separate methods * Add more tests * condition for unique columns * Add tests on one partition * Use PreviewParseWriter * spotless * fix categorical preview writer * remove irrelevant column * Virtual ice hash map * spotlessApply * Adding DKV.put and disabling tests with big datasets on external backend * spotlessApply * change test for external backend in test
- Loading branch information
Showing
12 changed files
with
441 additions
and
204 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
64 changes: 0 additions & 64 deletions
64
core/src/main/scala/ai/h2o/sparkling/backend/converters/CategoricalPreviewWriter.java
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
130 changes: 130 additions & 0 deletions
130
...st/scala/ai/h2o/sparkling/backend/converters/DataFrameConverterCategoricalTestSuite.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package ai.h2o.sparkling.backend.converters | ||
|
||
import ai.h2o.sparkling.ml.utils.SchemaUtils | ||
import ai.h2o.sparkling.{H2OFrame, SharedH2OTestContext, TestUtils} | ||
import org.apache.spark.sql.types.{StringType, StructField} | ||
import org.apache.spark.sql.{DataFrame, SparkSession} | ||
import org.junit.runner.RunWith | ||
import org.scalatest.junit.JUnitRunner | ||
import org.scalatest.FunSuite | ||
import water.parser.Categorical | ||
|
||
@RunWith(classOf[JUnitRunner])
class DataFrameConverterCategoricalTestSuite extends FunSuite with SharedH2OTestContext {

  override def createSparkSession(): SparkSession = sparkSession("local[*]")
  import spark.implicits._

  test("PUBDEV-766 H2OFrame[T_ENUM] to DataFrame[StringType]") {
    // A categorical (T_ENUM) H2O column must round-trip back to Spark as StringType.
    val df = spark.sparkContext.parallelize(Array("ONE", "ZERO", "ZERO", "ONE")).toDF("C0")
    val h2oFrame = hc.asH2OFrame(df)
    h2oFrame.convertColumnsToCategorical(Array(0))
    assert(h2oFrame.columns(0).isCategorical())

    val dataFrame = hc.asSparkFrame(h2oFrame)
    assert(dataFrame.count == h2oFrame.numberOfRows)
    assert(dataFrame.take(4)(3)(0) == "ONE")
    assert(dataFrame.schema.fields(0) match {
      case StructField("C0", StringType, false, _) => true
      case _ => false
    })

    h2oFrame.delete()
  }

  test("DataFrame[String] to H2OFrame[T_STRING] and back") {
    // All values are unique, so the column must stay a plain string column.
    val df = Seq("one", "two", "three", "four", "five", "six", "seven").toDF("Strings").repartition(3)
    val h2oFrame = hc.asH2OFrame(df)

    assertH2OFrameInvariants(df, h2oFrame)
    assert(h2oFrame.columns(0).isString())

    val resultDF = hc.asSparkFrame(h2oFrame)
    TestUtils.assertDataFramesAreIdentical(df, resultDF)
    h2oFrame.delete()
  }

  test("DataFrame[String] to H2OFrame[T_CAT] and back") {
    // Repeated values => low cardinality, so the column is auto-converted to categorical.
    val df = Seq("one", "two", "three", "one", "two", "three", "one").toDF("Strings").repartition(3)
    val h2oFrame = hc.asH2OFrame(df)

    assertH2OFrameInvariants(df, h2oFrame)
    assert(h2oFrame.columns(0).isCategorical())

    val resultDF = hc.asSparkFrame(h2oFrame)
    TestUtils.assertDataFramesAreIdentical(df, resultDF)
    h2oFrame.delete()
  }

  // External backend can go OOM in testing docker image
  if (sys.props.getOrElse("spark.ext.h2o.backend.cluster.mode", "internal") == "internal") {
    test("DataFrame[String] with more than 10M unique values in one partition to H2OFrame[T_STR] and back") {
      testDataFrameConversionWithHighNumberOfCategoricalLevels(1)
    }

    test("DataFrame[String] with more than 10M unique values in 100 partitions to H2OFrame[T_STR] and back") {
      testDataFrameConversionWithHighNumberOfCategoricalLevels(100)
    }

    /**
      * Regression test for SW-2449: a column whose distinct-value count exceeds
      * `Categorical.MAX_CATEGORICAL_COUNT` must fall back to T_STR instead of failing.
      *
      * @param numPartitions number of Spark partitions to spread the data across
      */
    def testDataFrameConversionWithHighNumberOfCategoricalLevels(numPartitions: Int): Unit = {
      // Generate ~10% more values than the categorical limit; the modulo keeps the
      // distinct-value count just above MAX_CATEGORICAL_COUNT.
      val uniqueValues = 1 to (Categorical.MAX_CATEGORICAL_COUNT * 1.1).toInt
      val values = uniqueValues.map(i => (i % (Categorical.MAX_CATEGORICAL_COUNT + 1)).toHexString)
      val rdd = sc.parallelize(values, numPartitions)

      val df = rdd.toDF("strings")
      val h2oFrame = hc.asH2OFrame(df)

      assertH2OFrameInvariants(df, h2oFrame)
      assert(h2oFrame.columns(0).isString())

      val resultDF = hc.asSparkFrame(h2oFrame)
      TestUtils.assertDataFramesAreIdentical(df, resultDF)
      h2oFrame.delete()
    }
  }

  test("DataFrame[String] with only unique values in one partition to H2OFrame[T_STR] and back") {
    testDataFrameConversionWithOnlyUniqueValues(1)
  }

  test("DataFrame[String] with only unique values in 100 partitions to H2OFrame[T_STR] and back") {
    testDataFrameConversionWithOnlyUniqueValues(100)
  }

  /**
    * Verifies that a column of entirely unique values (below the categorical limit)
    * is kept as T_STR — uniqueness, not just cardinality, prevents categorical conversion.
    *
    * @param numPartitions number of Spark partitions to spread the data across
    */
  private def testDataFrameConversionWithOnlyUniqueValues(numPartitions: Int): Unit = {
    val uniqueValues = (1 to (Categorical.MAX_CATEGORICAL_COUNT / 10)).map(_.toHexString)
    val rdd = sc.parallelize(uniqueValues, numPartitions)

    val df = rdd.toDF("strings")
    val h2oFrame = hc.asH2OFrame(df)

    assertH2OFrameInvariants(df, h2oFrame)
    assert(h2oFrame.columns(0).isString())

    val resultDF = hc.asSparkFrame(h2oFrame)
    TestUtils.assertDataFramesAreIdentical(df, resultDF)
    h2oFrame.delete()
  }

  /** Checks that row and (flattened) column counts match between the Spark and H2O frames. */
  private def assertH2OFrameInvariants(inputDF: DataFrame, df: H2OFrame): Unit = {
    assert(inputDF.count == df.numberOfRows, "Number of rows has to match")
    assert(df.numberOfColumns == SchemaUtils.flattenSchema(inputDF).length, "Number columns should match")
  }
}
Oops, something went wrong.