Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added support for saving with a QuoteMode #254

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Expand Up @@ -57,6 +57,7 @@ When reading files the API accepts several options:
* `comment`: skip lines beginning with this character. Default is `"#"`. Disable comments by setting this to `null`.
* `codec`: compression codec to use when saving to file. Should be the fully qualified name of a class implementing `org.apache.hadoop.io.compress.CompressionCodec` or one of case-insensitive shorten names (`bzip2`, `gzip`, `lz4`, and `snappy`). Defaults to no compression when a codec is not specified.
* `nullValue`: specify a string that indicates a null value; any fields matching this string will be set as nulls in the DataFrame
* `quoteMode`: when to quote fields (`ALL`, `MINIMAL` (default), `NON_NUMERIC`, `NONE`), see [Quote Modes](https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/QuoteMode.html)

The package also supports saving simple (non-nested) DataFrames. When saving you can specify the delimiter and whether we should generate a header row for the table. See the following examples for more details.

Expand Down
11 changes: 10 additions & 1 deletion src/main/scala/com/databricks/spark/csv/package.scala
Expand Up @@ -15,7 +15,7 @@
*/
package com.databricks.spark

import org.apache.commons.csv.CSVFormat
import org.apache.commons.csv.{CSVFormat, QuoteMode}
import org.apache.hadoop.io.compress.CompressionCodec

import org.apache.spark.sql.{DataFrame, SQLContext}
Expand Down Expand Up @@ -121,11 +121,19 @@ package object csv {
throw new Exception("Quotation cannot be more than one character.")
}

val quoteModeString = parameters.getOrElse("quoteMode", "MINIMAL")
val quoteMode: QuoteMode = if (quoteModeString == null) {
null
} else {
QuoteMode.valueOf(quoteModeString.toUpperCase)
}

val nullValue = parameters.getOrElse("nullValue", "null")

val csvFormat = defaultCsvFormat
.withDelimiter(delimiterChar)
.withQuote(quoteChar)
.withQuoteMode(quoteMode)
.withEscape(escapeChar)
.withSkipHeaderRecord(false)
.withNullString(nullValue)
Expand All @@ -141,6 +149,7 @@ package object csv {
val csvFormat = defaultCsvFormat
.withDelimiter(delimiterChar)
.withQuote(quoteChar)
.withQuoteMode(quoteMode)
.withEscape(escapeChar)
.withSkipHeaderRecord(false)
.withNullString(nullValue)
Expand Down
45 changes: 45 additions & 0 deletions src/test/scala/com/databricks/spark/csv/CsvSuite.scala
Expand Up @@ -442,6 +442,51 @@ abstract class AbstractCsvSuite extends FunSuite with BeforeAndAfterAll {
assert(carsCopy.collect.map(_.toString).toSet == cars.collect.map(_.toString).toSet)
}

test("DSL save with a quoteMode") {
  // Create temp directory
  TestUtils.deleteRecursively(new File(tempEmptyDir))
  new File(tempEmptyDir).mkdirs()
  val copyFilePath = tempEmptyDir + "cars-copy.csv"

  val cars = sqlContext.csvFile(carsFile, parserLib = parserLib)
  cars.saveAsCsvFile(copyFilePath, Map("header" -> "true", "quoteMode" -> "ALL"))

  // A round-trip alone cannot detect whether quoting happened, because quotes are
  // stripped when the copy is parsed back in. Inspect the raw output instead:
  // with QuoteMode.ALL every record (header included) must contain quote characters.
  val partFiles = new File(copyFilePath).listFiles.filter(_.getName.startsWith("part-"))
  val rawLines = partFiles.flatMap(f => scala.io.Source.fromFile(f).getLines())
  assert(rawLines.nonEmpty)
  rawLines.filter(_.nonEmpty).foreach { line =>
    assert(line.contains("\""), s"expected quoted output under QuoteMode.ALL but got: $line")
  }

  val carsCopy = sqlContext.csvFile(copyFilePath + "/")

  assert(carsCopy.count == cars.count)
  assert(carsCopy.collect.map(_.toString).toSet == cars.collect.map(_.toString).toSet)
}

test("DSL save with non numeric quoteMode") {
  // Start from a clean scratch directory.
  TestUtils.deleteRecursively(new File(tempEmptyDir))
  new File(tempEmptyDir).mkdirs()
  val outputPath = tempEmptyDir + "cars-copy.csv"

  // Write the cars dataset back out, quoting only non-numeric fields
  // (quoteMode is case-insensitive on the save path).
  val original = sqlContext.csvFile(carsFile, parserLib = parserLib)
  original.saveAsCsvFile(outputPath, Map("header" -> "true", "quoteMode" -> "non_numeric"))

  // Reading the copy back must yield exactly the same rows as the original.
  val roundTripped = sqlContext.csvFile(outputPath + "/")

  assert(roundTripped.count == original.count)
  assert(roundTripped.collect.map(_.toString).toSet == original.collect.map(_.toString).toSet)
}

test("DSL save with null quoteMode") {
  // Start from a clean scratch directory.
  TestUtils.deleteRecursively(new File(tempEmptyDir))
  new File(tempEmptyDir).mkdirs()
  val outputPath = tempEmptyDir + "cars-copy.csv"

  // A null quoteMode option must be accepted and leave the format's quoting unset.
  val original = sqlContext.csvFile(carsFile, parserLib = parserLib)
  original.saveAsCsvFile(outputPath, Map("header" -> "true", "quoteMode" -> null))

  // Reading the copy back must yield exactly the same rows as the original.
  val roundTripped = sqlContext.csvFile(outputPath + "/")

  assert(roundTripped.count == original.count)
  assert(roundTripped.collect.map(_.toString).toSet == original.collect.map(_.toString).toSet)
}

test("DSL save with a compression codec") {
// Create temp directory
TestUtils.deleteRecursively(new File(tempEmptyDir))
Expand Down