diff --git a/src/main/scala/com/elastacloud/spark/excel/ExcelParserOptions.scala b/src/main/scala/com/elastacloud/spark/excel/ExcelParserOptions.scala
index f52c6f4..55564e3 100644
--- a/src/main/scala/com/elastacloud/spark/excel/ExcelParserOptions.scala
+++ b/src/main/scala/com/elastacloud/spark/excel/ExcelParserOptions.scala
@@ -60,6 +60,7 @@ private[excel] class ExcelParserOptions(
   val headerRowCount: Int = parameters.getOrElse("headerRowCount", "1").toInt
   val maxRowCount: Int = parameters.getOrElse("maxRowCount", "1000").toInt
   val includeSheetName: Boolean = parameters.getOrElse("includeSheetName", "false").toBoolean
+  val nullValue: Option[String] = parameters.get("nullValue") // cell text to read as null; None leaves every string cell untouched
   val thresholdBytesForTempFiles: Int = parameters.getOrElse("thresholdBytesForTempFiles", parameters.getOrElse("maxBytesForTempFiles", "100000000")).toInt
   val schemaMatchColumnName: String = parameters.getOrElse("schemaMatchColumnName", null)
 
@@ -83,6 +84,7 @@ private[excel] object ExcelParserOptions {
     encoder.encode("headerRowCount") -> "headerRowCount",
     encoder.encode("maxRowCount") -> "maxRowCount",
     encoder.encode("includeSheetName") -> "includeSheetName",
+    encoder.encode("nullValue") -> "nullValue",
     encoder.encode("maxBytesForTempFiles") -> "maxBytesForTempFiles",
     encoder.encode("thresholdBytesForTempFiles") -> "thresholdBytesForTempFiles",
     encoder.encode("schemaMatchColumnName") -> "schemaMatchColumnName"
diff --git a/src/main/scala/com/elastacloud/spark/excel/parser/ExcelParser.scala b/src/main/scala/com/elastacloud/spark/excel/parser/ExcelParser.scala
index 04a45ab..e7ab677 100644
--- a/src/main/scala/com/elastacloud/spark/excel/parser/ExcelParser.scala
+++ b/src/main/scala/com/elastacloud/spark/excel/parser/ExcelParser.scala
@@ -292,7 +292,12 @@ private[excel] class ExcelParser(inputStream: InputStream, options: ExcelParserO
           case _ => (null, false)
         }
         case CellType.STRING => targetType match {
-          case _: StringType => (UTF8String.fromString(currentCellValue.getStringValue), true)
+          case _: StringType =>
+            val cellStringValue = UTF8String.fromString(currentCellValue.getStringValue)
+            options.nullValue match {
+              case Some(nullValue) if cellStringValue.toString.equalsIgnoreCase(nullValue) => (null, true) // cell text equals the configured null marker (case-insensitive)
+              case _ => (cellStringValue, true)
+            }
           case _ => (null, false)
         }
         case _ => (UTF8String.fromString(currentCellValue.toString), true)
diff --git a/src/test/scala/com/elastacloud/spark/excel/ExcelParserOptionsTests.scala b/src/test/scala/com/elastacloud/spark/excel/ExcelParserOptionsTests.scala
index 93b7b48..9478782 100644
--- a/src/test/scala/com/elastacloud/spark/excel/ExcelParserOptionsTests.scala
+++ b/src/test/scala/com/elastacloud/spark/excel/ExcelParserOptionsTests.scala
@@ -16,6 +16,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
     options.headerRowCount should be(1)
     options.maxRowCount should be(1000)
     options.includeSheetName should be(false)
+    options.nullValue should be(None)
     options.thresholdBytesForTempFiles should be(100000000)
     options.schemaMatchColumnName should be(null)
   }
@@ -31,6 +32,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
     options.headerRowCount should be(1)
     options.maxRowCount should be(1000)
     options.includeSheetName should be(false)
+    options.nullValue should be(None)
     options.thresholdBytesForTempFiles should be(100000000)
     options.schemaMatchColumnName should be(null)
   }
@@ -43,6 +45,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
       "headerRowCount" -> "12",
       "maxRowCount" -> "2000",
       "includeSheetName" -> "true",
+      "nullValue" -> "NA",
       "maxBytesForTempFiles" -> "10",
       "schemaMatchColumnName" -> "_isValid"
     ).asJava)
@@ -55,6 +58,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
     options.headerRowCount should be(12)
     options.maxRowCount should be(2000)
     options.includeSheetName should be(true)
+    options.nullValue should be(Some("NA"))
     options.thresholdBytesForTempFiles should be(10)
     options.schemaMatchColumnName should be("_isValid")
   }
@@ -67,6 +71,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
       "headerCount" -> "12",
       "maxRowCont" -> "2000",
       "includShetNam" -> "true",
+      "nulvalue" -> "NA",
       "macsBitesTempFiles" -> "10",
       "schemaMatchColumName" -> "_isValid"
     ).asJava)
@@ -79,6 +84,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
     exception.getMessage.contains("Invalid option 'headercount', did you mean 'headerRowCount'?") should be(true)
     exception.getMessage.contains("Invalid option 'maxrowcont', did you mean 'maxRowCount'?") should be(true)
     exception.getMessage.contains("Invalid option 'includshetnam', did you mean 'includeSheetName'?") should be(true)
+    exception.getMessage.contains("Invalid option 'nulvalue', did you mean 'nullValue'?") should be(true)
     exception.getMessage.contains("Invalid option 'macsbitestempfiles', did you mean 'maxBytesForTempFiles'") should be(true)
     exception.getMessage.contains("Invalid option 'schemamatchcolumname', did you mean 'schemaMatchColumnName'") should be(true)
   }
@@ -125,6 +131,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
     options.headerRowCount should be(1)
     options.maxRowCount should be(1000)
     options.includeSheetName should be(false)
+    options.nullValue should be(None)
     options.thresholdBytesForTempFiles should be(100000000)
     options.schemaMatchColumnName should be(null)
   }
@@ -137,6 +144,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
       "headerRowCount" -> "12",
       "maxRowCount" -> "2000",
       "includeSheetName" -> "true",
+      "nullValue" -> "NA",
       "maxBytesForTempFiles" -> "100",
       "schemaMatchColumnName" -> "_isValid"
     )
@@ -149,6 +157,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
     options.headerRowCount should be(12)
     options.maxRowCount should be(2000)
     options.includeSheetName should be(true)
+    options.nullValue should be(Some("NA"))
     options.thresholdBytesForTempFiles should be(100)
     options.schemaMatchColumnName should be("_isValid")
   }
@@ -191,6 +200,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
       "headerRowCount" -> "17",
       "maxRowCount" -> "5",
       "includeSheetName" -> "true",
+      "nullValue" -> "N/A",
       "thresholdBytesForTempFiles" -> "12",
       "schemaMatchColumnName" -> "matchesSchema"
     )
@@ -203,6 +213,7 @@ class ExcelParserOptionsTests extends AnyFlatSpec with Matchers {
     options.headerRowCount should be(17)
     options.maxRowCount should be(5)
     options.includeSheetName should be(true)
+    options.nullValue should be(Some("N/A"))
     options.thresholdBytesForTempFiles should be(12)
     options.schemaMatchColumnName should be("matchesSchema")
   }
diff --git a/src/test/scala/com/elastacloud/spark/excel/parser/ExcelParserTests.scala b/src/test/scala/com/elastacloud/spark/excel/parser/ExcelParserTests.scala
index 31442d9..b5f9606 100644
--- a/src/test/scala/com/elastacloud/spark/excel/parser/ExcelParserTests.scala
+++ b/src/test/scala/com/elastacloud/spark/excel/parser/ExcelParserTests.scala
@@ -591,4 +591,42 @@ class ExcelParserTests extends AnyFlatSpec with Matchers {
       the[ExcelParserException] thrownBy parser.getDataIterator.toList should have message "The specified schema match column is not defined as a boolean type."
     }
   }
+
+  "Specifying a null value" should "read the string value as null" in {
+    withInputStream("/Parser/SimpleWorkbook.xlsx") { inputStream =>
+      val options = new ExcelParserOptions(Map[String, String](
+        "nullValue" -> "y"
+      ))
+
+      val expectedData = Seq(
+        Vector[Any]("a".asUnsafe, 1D, "x".asUnsafe),
+        Vector[Any]("b".asUnsafe, 2D, null),
+        Vector[Any]("c".asUnsafe, 3D, "z".asUnsafe)
+      )
+
+      val parser = new ExcelParser(inputStream, options)
+      val actualData = parser.getDataIterator.toList
+
+      actualData should equal(expectedData)
+    }
+  }
+
+  it should "Handle string concatenation formulas" in {
+    withInputStream("/Parser/ConcatString.xlsx") { inputStream =>
+      val options = new ExcelParserOptions(Map[String, String](
+        "nullValue" -> "MR ADAM FOX"
+      ))
+
+      val expectedData = Seq(
+        Vector[Any]("Dr".asUnsafe, "Jennifer".asUnsafe, "Alagora".asUnsafe, "Dr Jennifer Alagora".asUnsafe),
+        Vector[Any]("Mr".asUnsafe, "Adam".asUnsafe, "Fox".asUnsafe, null),
+        Vector[Any]("Ms".asUnsafe, null, "Proctor".asUnsafe, "Ms Proctor".asUnsafe)
+      )
+
+      val parser = new ExcelParser(inputStream, options)
+
+      val actualData = parser.getDataIterator.toList
+      actualData should equal(expectedData)
+    }
+  }
 }