Skip to content

Commit

Permalink
Handle character elements mixed text/node elements (#416)
Browse files Browse the repository at this point in the history
* Exploratory change to handle _first_ character child as _VALUE in mixed text/node children

* Different take: ignore character elements
  • Loading branch information
srowen authored Nov 6, 2019
1 parent 5c2af9a commit cc977ef
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -155,10 +155,23 @@ private[xml] object StaxXmlParser extends Serializable {
// For `ArrayType`, it needs to return the type of element. The values are merged later.
convertTo(c.getData, st, options)
case (c: Characters, st: StructType) =>
// This case can happen when the current data type is inferred as `StructType`
// due to `valueTag` for elements having attributes but no child.
val dt = st.filter(_.name == options.valueTag).head.dataType
convertTo(c.getData, dt, options)
// If a value tag is present, this can be an attribute-only element whose value is in that
// value tag field. Or, it can be a mixed-type element with both some character elements
// and other complex structure. Character elements are ignored.
val attributesOnly = st.fields.forall { f =>
f.name == options.valueTag || f.name.startsWith(options.attributePrefix)
}
if (attributesOnly) {
// If everything else is an attribute column, there's no complex structure.
// Just return the value of the character element
val dt = st.find(_.name == options.valueTag).get.dataType
convertTo(c.getData, dt, options)
} else {
// Otherwise, ignore this character element, and continue parsing the following complex
// structure
parser.next
convertObject(parser, st, options)
}
case (c: Characters, dt: DataType) =>
convertTo(c.getData, dt, options)
case (e: XMLEvent, dt: DataType) =>
Expand Down
15 changes: 13 additions & 2 deletions src/main/scala/com/databricks/spark/xml/util/InferSchema.scala
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,19 @@ private[xml] object InferSchema {
case _ => inferField(parser, options)
}
case c: Characters if !c.isWhiteSpace =>
// This means data exists
inferFrom(c.getData, options)
// This could be the characters of a character-only element, or could have mixed
// characters and other complex structure
val characterType = inferFrom(c.getData, options)
parser.nextEvent()
parser.peek match {
case _: StartElement =>
// Some more elements follow; so ignore the characters.
// Use the schema of the rest
inferObject(parser, options).asInstanceOf[StructType]
case _ =>
// That's all, just the character-only body; use that as the type
characterType
}
case e: XMLEvent =>
throw new IllegalArgumentException(s"Failed to parse data with unexpected event $e")
}
Expand Down
5 changes: 5 additions & 0 deletions src/test/resources/mixed_children.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="UTF-8"?>
<root>
<foo> issue <bar> lorem </bar> text ignored </foo>
<missing> ipsum </missing>
</root>
5 changes: 5 additions & 0 deletions src/test/resources/mixed_children_2.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="UTF-8"?>
<root>
<foo> 3.0 <bar> lorem </bar> text ignored <baz><bing>2</bing></baz> text ignored </foo>
<missing> ipsum </missing>
</root>
22 changes: 22 additions & 0 deletions src/test/scala/com/databricks/spark/xml/XmlSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ final class XmlSuite extends FunSuite with BeforeAndAfterAll {
private val selfClosingTag = resDir + "self-closing-tag.xml"
private val textColumn = resDir + "textColumn.xml"
private val processing = resDir + "processing.xml"
private val mixedChildren = resDir + "mixed_children.xml"
private val mixedChildren2 = resDir + "mixed_children_2.xml"

private val booksTag = "book"
private val booksRootTag = "books"
Expand Down Expand Up @@ -1055,4 +1057,24 @@ final class XmlSuite extends FunSuite with BeforeAndAfterAll {
assert(processingDF.count() === 1)
}

// Reads mixed_children.xml with `root` as the row tag and checks the first row:
// the text around <bar> inside <foo> is dropped, and the text-only <missing>
// element is read as a plain string.
test("test mixed text and element children") {
  val firstRow = spark.read
    .option("rowTag", "root")
    .option("inferSchema", true)
    .xml(mixedChildren)
    .head()

  // Column 0 is a nested row; its only value is the body of <bar>.
  val nestedValues = firstRow.getAs[Row](0).toSeq
  assert(nestedValues === Seq(" lorem "))

  // Column 1 carries the untrimmed character data of <missing>.
  val missingText = firstRow.getString(1)
  assert(missingText === " ipsum ")
}

// Reads mixed_children_2.xml with `root` as the row tag. <foo> mixes free text
// with a simple child (<bar>) and a nested child (<baz><bing>); the test checks
// that both children and the sibling <missing> element are still readable.
test("test mixed text and complex element children") {
  val parsed = spark.read
    .option("rowTag", "root")
    .option("inferSchema", true)
    .xml(mixedChildren2)

  // Projects a single (possibly nested) column and returns the first row of the result.
  def firstRowOf(column: String) = parsed.select(column).head()

  assert(firstRowOf("foo.bar").getString(0) === " lorem ")
  assert(firstRowOf("foo.baz.bing").getLong(0) === 2)
  assert(firstRowOf("missing").getString(0) === " ipsum ")
}

}

0 comments on commit cc977ef

Please sign in to comment.