Support Scala 2.13 / Spark 3.2; drop Scala 2.11 / Spark 2 support (#564)
srowen committed Oct 21, 2021
1 parent c7ac4d5 commit a17f473
Showing 7 changed files with 24 additions and 21 deletions.
@@ -1,4 +1,4 @@
-name: Spark 2 / Java 8
+name: Spark 3.2 / Java 11 / Scala 2.13
 on:
   push:
     branches: [master]
@@ -12,6 +12,6 @@ jobs:
     - name: Set up Java, SBT
       uses: olafurpg/setup-scala@v11
       with:
-        java-version: 'adopt@1.8'
+        java-version: 'adopt@1.11'
     - name: Build and test
-      run: sbt -Dspark.testVersion=2.4.8 ++2.11.12 clean scalastyle test:scalastyle mimaReportBinaryIssues test
+      run: sbt -Dspark.testVersion=3.2.0 ++2.13.5 clean test
@@ -1,4 +1,4 @@
-name: Spark 3 / Java 11
+name: Spark 3 / Java 8 / Scala 2.12
 on:
   push:
     branches: [master]
@@ -12,8 +12,8 @@ jobs:
     - name: Set up Java, SBT
       uses: olafurpg/setup-scala@v11
       with:
-        java-version: 'adopt@1.11'
+        java-version: 'adopt@1.8'
     - name: Build and test
-      run: sbt -Dspark.testVersion=3.1.2 ++2.12.10 clean scalastyle test:scalastyle mimaReportBinaryIssues coverage test coverageReport
+      run: sbt -Dspark.testVersion=3.0.3 ++2.12.10 clean scalastyle test:scalastyle mimaReportBinaryIssues coverage test coverageReport
     - name: Check code coverage
-      run: bash <(curl -s https://codecov.io/bash)
+      run: bash <(curl -s https://codecov.io/bash)
10 changes: 5 additions & 5 deletions README.md
@@ -7,7 +7,7 @@ The structure and test tools are mostly copied from [CSV Data Source for Spark](
 
 - This package supports to process format-free XML files in a distributed way, unlike JSON datasource in Spark restricts in-line JSON format.
 
-- Compatible with Spark 2.4.x and 3.x, with Scala 2.12. Scala 2.11 support with Spark 2.4.x is deprecated.
+- Compatible with Spark 3.0 and later with Scala 2.12, and also Spark 3.2 and later with Scala 2.12 or 2.13. Scala 2.11 and Spark 2 support ended with version 0.13.0.
 
 ## Linking
 
@@ -16,15 +16,15 @@ You can link against this library in your program at the following coordinates:
 ```
 groupId: com.databricks
 artifactId: spark-xml_2.12
-version: 0.13.0
+version: 0.14.0
 ```
 
 ## Using with Spark shell
 
 This package can be added to Spark using the `--packages` command line option. For example, to include it when starting the spark shell:
 
 ```
-$SPARK_HOME/bin/spark-shell --packages com.databricks:spark-xml_2.12:0.13.0
+$SPARK_HOME/bin/spark-shell --packages com.databricks:spark-xml_2.12:0.14.0
 ```
 
 ## Features
@@ -399,7 +399,7 @@ Automatically infer schema (data types)
 ```R
 library(SparkR)
 
-sparkR.session("local[4]", sparkPackages = c("com.databricks:spark-xml_2.12:0.13.0"))
+sparkR.session("local[4]", sparkPackages = c("com.databricks:spark-xml_2.12:0.14.0"))
 
 df <- read.df("books.xml", source = "xml", rowTag = "book")
 
@@ -411,7 +411,7 @@ You can manually specify schema:
 ```R
 library(SparkR)
 
-sparkR.session("local[4]", sparkPackages = c("com.databricks:spark-xml_2.12:0.13.0"))
+sparkR.session("local[4]", sparkPackages = c("com.databricks:spark-xml_2.12:0.14.0"))
 customSchema <- structType(
   structField("_id", "string"),
   structField("author", "string"),
6 changes: 3 additions & 3 deletions build.sbt
@@ -1,16 +1,16 @@
 name := "spark-xml"
 
-version := "0.13.0"
+version := "0.14.0"
 
 organization := "com.databricks"
 
 scalaVersion := "2.12.10"
 
-crossScalaVersions := Seq("2.11.12", "2.12.10")
+crossScalaVersions := Seq("2.12.10", "2.13.5")
 
 scalacOptions := Seq("-unchecked", "-deprecation")
 
-val sparkVersion = sys.props.get("spark.testVersion").getOrElse("2.4.8")
+val sparkVersion = sys.props.get("spark.testVersion").getOrElse("3.2.0")
 
 // To avoid packaging it, it's Provided below
 autoScalaLibrary := false
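
Taken together with the two workflow changes above, these settings cover the new support matrix from a single build definition: sbt's `++<version>` switch selects the Scala cross-version from `crossScalaVersions`, and `-Dspark.testVersion` overrides the default Spark release used for tests. The CI jobs in this commit run:

```
sbt -Dspark.testVersion=3.0.3 ++2.12.10 clean scalastyle test:scalastyle mimaReportBinaryIssues coverage test coverageReport
sbt -Dspark.testVersion=3.2.0 ++2.13.5 clean test
```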
@@ -57,4 +57,7 @@ case class XmlDataToCatalyst(
     case _: StructType => Seq(StringType)
     case ArrayType(_: StructType, _) => Seq(ArrayType(StringType))
   }
+
+  // Overrides, in Spark 3.2.0+
+  protected def withNewChildInternal(newChild: Expression): XmlDataToCatalyst = copy(newChild)
 }
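
For background, Spark 3.2 reworked Catalyst's tree-transformation API so that a unary expression must be able to rebuild itself around a rewritten child via `withNewChildInternal`. Below is a minimal sketch of that pattern with an illustrative expression name (not this repository's code); the method is left without the `override` modifier, presumably so the same source also compiles against Spark 3.0/3.1, where the member does not exist yet.

```scala
import org.apache.spark.sql.catalyst.expressions.{Expression, UnaryExpression}
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.types.DataType

// Illustrative unary expression that simply passes its child's value through.
case class PassThrough(child: Expression) extends UnaryExpression with CodegenFallback {
  override def dataType: DataType = child.dataType
  override protected def nullSafeEval(input: Any): Any = input

  // On Spark 3.2+ this implements the new abstract member that the planner calls
  // when it substitutes a transformed child; no `override`, so the same source
  // still compiles against Spark 3.0/3.1.
  protected def withNewChildInternal(newChild: Expression): PassThrough =
    copy(child = newChild)
}
```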
@@ -66,7 +66,7 @@ private[xml] object StaxXmlGenerator {
       writer.writeAttribute(name.substring(options.attributePrefix.length), v.toString)
 
     // For ArrayType, we just need to write each as XML element.
-    case (ArrayType(ty, _), v: Seq[_]) =>
+    case (ArrayType(ty, _), v: scala.collection.Seq[_]) =>
       v.foreach { e =>
         writeChildElement(name, ty, e)
       }
@@ -101,7 +101,7 @@
     // this case only can happen when we convert a normal [[DataFrame]] to XML file.
     // When [[ArrayType]] has [[ArrayType]] as elements, it is confusing what is element name
     // for XML file. Now, it is "item" but this might have to be according the parent field name.
-    case (ArrayType(ty, _), v: Seq[_]) =>
+    case (ArrayType(ty, _), v: scala.collection.Seq[_]) =>
      v.foreach { e =>
        writeChild("item", ty, e)
      }
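
The switch from `Seq` to `scala.collection.Seq` in these patterns matters on Scala 2.13, where the unqualified `Seq` now means `scala.collection.immutable.Seq`, while Spark often hands array column values to the writer as mutable-backed collections (e.g. `WrappedArray`/`ArraySeq`). A small, self-contained illustration (not repository code):

```scala
import scala.collection.mutable

object SeqMatchDemo {
  def main(args: Array[String]): Unit = {
    // Stand-in for the kind of mutable-backed Seq Spark can produce for an array column.
    val v: Any = mutable.ArraySeq(1, 2, 3)

    // Scala 2.12: `Seq` is scala.collection.Seq, so this prints true.
    // Scala 2.13: `Seq` is scala.collection.immutable.Seq, so this prints false.
    println(v.isInstanceOf[Seq[_]])

    // scala.collection.Seq covers both mutable and immutable implementations
    // on both Scala versions, so this prints true everywhere.
    println(v.isInstanceOf[scala.collection.Seq[_]])
  }
}
```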
8 changes: 4 additions & 4 deletions src/test/scala/com/databricks/spark/xml/XmlSuite.scala
@@ -890,17 +890,17 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll {
     val resultsOne = spark.read
       .option("treatEmptyValuesAsNulls", "true")
       .xml(resDir + "gps-empty-field.xml")
-    assert(resultsOne.selectExpr("extensions.TrackPointExtension").head.getStruct(0) !== null)
+    assert(resultsOne.selectExpr("extensions.TrackPointExtension").head().getStruct(0) !== null)
     assert(resultsOne.selectExpr("extensions.TrackPointExtension")
-      .head.getStruct(0)(0) === null)
+      .head().getStruct(0)(0) === null)
     // Is the behavior below consistent? see line above.
-    assert(resultsOne.selectExpr("extensions.TrackPointExtension.hr").head.getStruct(0) === null)
+    assert(resultsOne.selectExpr("extensions.TrackPointExtension.hr").head().getStruct(0) === null)
     assert(resultsOne.collect().length === 2)
 
     val resultsTwo = spark.read
       .option("nullValue", "2013-01-24T06:18:43Z")
       .xml(resDir + "gps-empty-field.xml")
-    assert(resultsTwo.selectExpr("time").head.getStruct(0) === null)
+    assert(resultsTwo.selectExpr("time").head().getStruct(0) === null)
     assert(resultsTwo.collect().length === 2)
   }
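
The `.head` → `.head()` edits address Scala 2.13's deprecation of "auto-application": `Dataset.head` is declared with an empty parameter list (`def head(): T`), and 2.13 warns when such a Scala-defined method is called without its parentheses. A tiny illustration (not repository code):

```scala
object AutoApplicationDemo {
  // Declared with an empty parameter list, like Dataset.head().
  class Box(value: Int) {
    def head(): Int = value
  }

  def main(args: Array[String]): Unit = {
    val b = new Box(42)
    println(b.head())   // fine on both Scala 2.12 and 2.13
    // println(b.head)  // compiles, but Scala 2.13 emits an auto-application
    //                  // deprecation warning (and Scala 3 rejects it)
  }
}
```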
