Skip to content

Commit

Permalink
Shortcut common type inference cases to fail fast, speed up inference (#660)
Browse files Browse the repository at this point in the history

* Shortcut to fail date/time parsing if not a date/time

* Don't use exceptions for date/time control flow in the parsing method to make inference faster

* Also shortcut int/float/double parsing where obviously not parseable
  • Loading branch information
srowen committed Sep 7, 2023
1 parent 3d76b79 commit 994e357
Showing 1 changed file with 54 additions and 31 deletions.
85 changes: 54 additions & 31 deletions src/main/scala/com/databricks/spark/xml/util/TypeCast.scala
Expand Up @@ -17,7 +17,7 @@ package com.databricks.spark.xml.util


import java.math.BigDecimal import java.math.BigDecimal
import java.sql.{Date, Timestamp} import java.sql.{Date, Timestamp}
import java.text.{NumberFormat, ParsePosition} import java.text.NumberFormat
import java.time.{Instant, LocalDate, ZoneId} import java.time.{Instant, LocalDate, ZoneId}
import java.time.format.{DateTimeFormatter, DateTimeFormatterBuilder} import java.time.format.{DateTimeFormatter, DateTimeFormatterBuilder}
import java.util.Locale import java.util.Locale
Expand All @@ -26,8 +26,6 @@ import scala.util.control.Exception._
import org.apache.spark.sql.types._ import org.apache.spark.sql.types._
import com.databricks.spark.xml.XmlOptions import com.databricks.spark.xml.XmlOptions


import java.time.temporal.TemporalQueries

/** /**
* Utility functions for type casting * Utility functions for type casting
*/ */
Expand Down Expand Up @@ -63,8 +61,14 @@ private[xml] object TypeCast {
case _: BooleanType => parseXmlBoolean(datum) case _: BooleanType => parseXmlBoolean(datum)
case dt: DecimalType => case dt: DecimalType =>
Decimal(new BigDecimal(datum.replaceAll(",", "")), dt.precision, dt.scale) Decimal(new BigDecimal(datum.replaceAll(",", "")), dt.precision, dt.scale)
case _: TimestampType => parseXmlTimestamp(datum, options) case _: TimestampType =>
case _: DateType => parseXmlDate(datum, options) parseXmlTimestamp(datum, options).getOrElse {
throw new IllegalArgumentException(s"cannot convert value $datum to Timestamp")
}
case _: DateType =>
parseXmlDate(datum, options).getOrElse {
throw new IllegalArgumentException(s"cannot convert value $datum to Date")
}
case _: StringType => datum case _: StringType => datum
case _ => throw new IllegalArgumentException(s"Unsupported type: ${castType.typeName}") case _ => throw new IllegalArgumentException(s"Unsupported type: ${castType.typeName}")
} }
Expand All @@ -85,17 +89,26 @@ private[xml] object TypeCast {
DateTimeFormatter.ISO_DATE DateTimeFormatter.ISO_DATE
) )


private def parseXmlDate(value: String, options: XmlOptions): Date = { private def parseXmlDate(value: String, options: XmlOptions): Option[Date] = {
val formatters = options.dateFormat.map(DateTimeFormatter.ofPattern). // A little shortcut to avoid trying many formatters in the common case that
map(supportedXmlDateFormatters :+ _).getOrElse(supportedXmlDateFormatters) // the input isn't a date. All built-in formats will start with a digit.
formatters.foreach { format => if (value.nonEmpty && Character.isDigit(value.head)) {
supportedXmlDateFormatters.foreach { format =>
try {
return Some(Date.valueOf(LocalDate.parse(value, format)))
} catch {
case _: Exception => // continue
}
}
}
options.dateFormat.map(DateTimeFormatter.ofPattern).foreach { format =>
try { try {
return Date.valueOf(LocalDate.parse(value, format)) return Some(Date.valueOf(LocalDate.parse(value, format)))
} catch { } catch {
case _: Exception => // continue case _: Exception => // continue
} }
} }
throw new IllegalArgumentException(s"cannot convert value $value to Date") None
} }


private val supportedXmlTimestampFormatters = Seq( private val supportedXmlTimestampFormatters = Seq(
Expand All @@ -115,12 +128,16 @@ private[xml] object TypeCast {
DateTimeFormatter.ISO_INSTANT DateTimeFormatter.ISO_INSTANT
) )


private def parseXmlTimestamp(value: String, options: XmlOptions): Timestamp = { private def parseXmlTimestamp(value: String, options: XmlOptions): Option[Timestamp] = {
supportedXmlTimestampFormatters.foreach { format => // A little shortcut to avoid trying many formatters in the common case that
try { // the input isn't a timestamp. All built-in formats will start with a digit.
return Timestamp.from(Instant.from(format.parse(value))) if (value.nonEmpty && Character.isDigit(value.head)) {
} catch { supportedXmlTimestampFormatters.foreach { format =>
case _: Exception => // continue try {
return Some(Timestamp.from(Instant.from(format.parse(value))))
} catch {
case _: Exception => // continue
}
} }
} }
options.timestampFormat.foreach { formatString => options.timestampFormat.foreach { formatString =>
Expand All @@ -138,12 +155,12 @@ private[xml] object TypeCast {
DateTimeFormatter.ofPattern(formatString).withZone(options.timezone.map(ZoneId.of).orNull) DateTimeFormatter.ofPattern(formatString).withZone(options.timezone.map(ZoneId.of).orNull)
} }
try { try {
return Timestamp.from(Instant.from(format.parse(value))) return Some(Timestamp.from(Instant.from(format.parse(value))))
} catch { } catch {
case _: Exception => // continue case _: Exception => // continue
} }
} }
throw new IllegalArgumentException(s"cannot convert value $value to Timestamp") None
} }




Expand Down Expand Up @@ -196,6 +213,12 @@ private[xml] object TypeCast {
} else { } else {
value value
} }
// A little shortcut to avoid trying many formatters in the common case that
// the input isn't a double. All built-in formats will start with a digit or period.
if (signSafeValue.isEmpty ||
!(Character.isDigit(signSafeValue.head) || signSafeValue.head == '.')) {
return false
}
// Rule out strings ending in D or F, as they will parse as double but should be disallowed // Rule out strings ending in D or F, as they will parse as double but should be disallowed
if (value.nonEmpty && (value.last match { if (value.nonEmpty && (value.last match {
case 'd' | 'D' | 'f' | 'F' => true case 'd' | 'D' | 'f' | 'F' => true
Expand All @@ -212,6 +235,11 @@ private[xml] object TypeCast {
} else { } else {
value value
} }
// A little shortcut to avoid trying many formatters in the common case that
// the input isn't a number. All built-in formats will start with a digit.
if (signSafeValue.isEmpty || !Character.isDigit(signSafeValue.head)) {
return false
}
(allCatch opt signSafeValue.toInt).isDefined (allCatch opt signSafeValue.toInt).isDefined
} }


Expand All @@ -221,25 +249,20 @@ private[xml] object TypeCast {
} else { } else {
value value
} }
// A little shortcut to avoid trying many formatters in the common case that
// the input isn't a number. All built-in formats will start with a digit.
if (signSafeValue.isEmpty || !Character.isDigit(signSafeValue.head)) {
return false
}
(allCatch opt signSafeValue.toLong).isDefined (allCatch opt signSafeValue.toLong).isDefined
} }


private[xml] def isTimestamp(value: String, options: XmlOptions): Boolean = { private[xml] def isTimestamp(value: String, options: XmlOptions): Boolean = {
try { parseXmlTimestamp(value, options).nonEmpty
parseXmlTimestamp(value, options)
true
} catch {
case _: IllegalArgumentException => false
}
} }


private[xml] def isDate(value: String, options: XmlOptions): Boolean = { private[xml] def isDate(value: String, options: XmlOptions): Boolean = {
try { parseXmlDate(value, options).nonEmpty
parseXmlDate(value, options)
true
} catch {
case _: IllegalArgumentException => false
}
} }


private[xml] def signSafeToLong(value: String, options: XmlOptions): Long = { private[xml] def signSafeToLong(value: String, options: XmlOptions): Long = {
Expand Down

0 comments on commit 994e357

Please sign in to comment.