Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Configure escaping scalar #13

Merged
merged 5 commits into from
Mar 26, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
11 changes: 6 additions & 5 deletions Sources/Active/Reader/Reader.swift
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,8 @@ extension CSVReader {
}

// 4. If the unicode scalar retrieved is the escaping scalar, an escaped field is awaiting parsing.
if scalar == self.settings.escapingScalar {
let field = try self.parseEscapedField(rowIndex: rowIndex)
if let escapingScalar = self.settings.escapingScalar, scalar == escapingScalar {
let field = try self.parseEscapedField(rowIndex: rowIndex, escaping: escapingScalar)
result.append(field.value)
if field.isAtEnd { break loop }
// 5. If the field delimiter is encountered, an implicit empty field has been defined.
Expand Down Expand Up @@ -233,21 +233,22 @@ extension CSVReader {
///
/// When this function is executed, the quote opening the "escaped field" has already been read.
/// - parameter rowIndex: The index of the row being parsed.
/// - parameter escapingScalar: The unicode scalar escaping character to use.
/// - throws: `CSVError<CSVReader>` exclusively.
/// - returns: The parsed field and whether the row/file ending characters have been found.
private func parseEscapedField(rowIndex: Int) throws -> (value: String, isAtEnd: Bool) {
private func parseEscapedField(rowIndex: Int, escaping escapingScalar: Unicode.Scalar) throws -> (value: String, isAtEnd: Bool) {
var field: String.UnicodeScalarView = .init()
var reachedRowsEnd = false

fieldLoop: while true {
// 1. Retrieve a scalar (if not there, it means EOF). This case is not allowed without closing the escaping field first.
guard let scalar = try self.buffer.next() ?? self.decoder() else { throw Error.invalidEOF(rowIndex: rowIndex) }
// 2. If the retrieved scalar is not a quote (i.e. "), just store it and continue parsing.
guard scalar == self.settings.escapingScalar else { field.append(scalar); continue fieldLoop }
guard scalar == escapingScalar else { field.append(scalar); continue fieldLoop }
// 3. If the retrieved scalar was a quote, retrieve the following scalar and check if it is EOF. If so, the field has finished and also the row and the file.
guard var followingScalar = try self.buffer.next() ?? self.decoder() else { reachedRowsEnd = true; break fieldLoop }
// 4. If the second retrieved scalar is another quote, the data is escaping a single quote scalar (quotes are escaped with other quotes).
guard followingScalar != self.settings.escapingScalar else { field.append(self.settings.escapingScalar); continue fieldLoop }
guard followingScalar != escapingScalar else { field.append(escapingScalar); continue fieldLoop }
// 5. Once this point is reached, the field has been properly escaped.
if !self.settings.trimCharacters.isEmpty {
// 6. Trim any character after the quote if necessary.
Expand Down
18 changes: 17 additions & 1 deletion Sources/Active/Reader/ReaderConfiguration.swift
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ extension CSVReader {
public var headerStrategy: Strategy.Header
/// Trims the given characters at the beginning and end of each row, and between fields.
public var trimStrategry: CharacterSet
/// The strategy for escaping quoted fields.
public var escapingStrategy: Strategy.Escaping
/// The encoding used to identify the underlying data or `nil` if you want the CSV reader to try to figure it out.
///
/// If no encoding is provided and the input data doesn't contain a Byte Order Marker (BOM), UTF8 is presumed.
Expand All @@ -24,6 +26,7 @@ extension CSVReader {
self.delimiters = (field: ",", row: "\n")
self.headerStrategy = .none
self.trimStrategry = .init()
self.escapingStrategy = .doubleQuote
self.encoding = nil
self.presample = false
}
Expand All @@ -38,7 +41,7 @@ extension CSVReader {
/// The characters set to be trimmed at the beginning and ending of each field.
let trimCharacters: CharacterSet
/// The unicode scalar used as encapsulator and escaping character (when printed two times).
let escapingScalar: Unicode.Scalar = "\""
let escapingScalar: Unicode.Scalar?

/// Creates the immutable reader settings from the user provided configuration values.
/// - parameter configuration: The configuration values provided by the API user.
Expand All @@ -61,6 +64,12 @@ extension CSVReader {
}
// 2. Set the trim characters set.
self.trimCharacters = configuration.trimStrategry
// 3. Set the escaping scalar.
self.escapingScalar = configuration.escapingStrategy.scalar
// 4. Ensure trim character set does not include escaping scalar
if let escapingScalar = escapingScalar, trimCharacters.contains(escapingScalar) {
throw Error.invalidTrimCharacter(escapingScalar: escapingScalar, trimCharacters: trimCharacters)
}
}
}
}
Expand All @@ -74,4 +83,11 @@ fileprivate extension CSVReader.Error {
help: "Set different delimiters for field and rows.",
userInfo: ["Delimiter": delimiter])
}

/// Error raised when the trim character set contains the escaping scalar.
/// - parameter escapingScalar: The escaping scalar that was found within the trim character set.
/// - parameter trimCharacters: The character set configured for trimming.
/// - returns: A `CSVError<CSVReader>` of `.invalidConfiguration` type.
static func invalidTrimCharacter(escapingScalar: Unicode.Scalar, trimCharacters: CharacterSet) -> CSVError<CSVReader> {
  return .init(
    .invalidConfiguration,
    reason: "The trim characters set can not include the escaping scalar.",
    help: "Remove the escaping scalar from the trim characters set.",
    userInfo: [
      "Escaping scalar": escapingScalar,
      "Trim characters": trimCharacters
    ]
  )
}
}
25 changes: 20 additions & 5 deletions Sources/Active/Writer/Writer.swift
Original file line number Diff line number Diff line change
Expand Up @@ -181,28 +181,36 @@ extension CSVWriter {
var result: [Unicode.Scalar]

if field.isEmpty {
result = .init(repeating: escapingScalar, count: 2)
if let escapingScalar = escapingScalar {
result = .init(repeating: escapingScalar, count: 2)
} else {
result = []
}
} else {
let input: [Unicode.Scalar] = .init(field.unicodeScalars)
result = .init()
result.reserveCapacity(input.count + 2)
var (index, needsEscaping) = (0, false)
var index = 0
var needsEscaping: Unicode.Scalar?

while index < input.endIndex {
let scalar = input[index]

if scalar == escapingScalar {
needsEscaping = true
needsEscaping = scalar
} else if self.isFieldDelimiter(input, &index, &result) || self.isRowDelimiter(input, &index, &result) {
needsEscaping = true
needsEscaping = scalar
continue
}

index += 1
result.append(scalar)
}

if needsEscaping {
if let needsEscaping = needsEscaping {
guard let escapingScalar = escapingScalar else {
throw Error.unescapedDelimiter(needsEscaping)
}
result.insert(escapingScalar, at: result.startIndex)
result.append(escapingScalar)
}
Expand All @@ -220,6 +228,13 @@ extension CSVWriter {
}

fileprivate extension CSVWriter.Error {
/// Error raised when a field contains a scalar that requires escaping, but no escaping scalar has been configured.
/// - parameter delimiter: The unicode scalar that triggered the need for escaping.
/// - returns: A `CSVError<CSVWriter>` of `.invalidInput` type.
static func unescapedDelimiter(_ delimiter: Unicode.Scalar) -> CSVError<CSVWriter> {
  return .init(
    .invalidInput,
    reason: "A field cannot include a delimiter if escaping strategy is disabled.",
    help: "Remove delimiter from field or set an escaping strategy.",
    userInfo: ["Invalid character": delimiter]
  )
}
/// Error raised when a field is about to be written and it overflows the expected number of fields per row.
static func fieldOverflow(expectedFields: Int) -> CSVError<CSVWriter> {
.init(.invalidOperation,
Expand Down
6 changes: 5 additions & 1 deletion Sources/Active/Writer/WriterConfiguration.swift
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ extension CSVWriter {
public struct Configuration {
/// The field and row delimiters.
public var delimiters: Delimiter.Pair
/// The strategy for escaping quoted fields.
public var escapingStrategy: Strategy.Escaping
/// The row of headers to write at the beginning of the CSV data.
///
/// If empty, no row will be written.
Expand All @@ -19,6 +21,7 @@ extension CSVWriter {
/// Designated initializer setting the default values.
public init() {
self.delimiters = (field: ",", row: "\n")
self.escapingStrategy = .doubleQuote
self.headers = .init()
self.encoding = nil
self.bomStrategy = .convention
Expand Down Expand Up @@ -53,7 +56,7 @@ extension CSVWriter {
/// Boolean indicating whether the received CSV contains a header row or not.
let headers: [String]
/// The unicode scalar used as encapsulator and escaping character (when printed two times).
let escapingScalar: Unicode.Scalar = "\""
let escapingScalar: Unicode.Scalar?
/// The encoding used to identify the underlying data.
let encoding: String.Encoding

Expand All @@ -71,6 +74,7 @@ extension CSVWriter {
self.delimiters = (.init(field), .init(row))
}
// 2. Copy all other values.
self.escapingScalar = configuration.escapingStrategy.scalar
self.headers = configuration.headers
self.encoding = encoding
}
Expand Down
26 changes: 26 additions & 0 deletions Sources/Strategy.swift
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,32 @@ public enum Strategy {
}
}

/// The strategy for escaping quoted fields.
///
/// A strategy can be spelled as `nil` (escaping disabled) or as a unicode scalar literal
/// (that scalar becomes the escaping character).
public enum Escaping: ExpressibleByNilLiteral, ExpressibleByUnicodeScalarLiteral {
  /// CSV delimiters can not be escaped.
  case none
  /// Delimiters found within a pair of the given scalar are ignored.
  case scalar(Unicode.Scalar)

  /// Escape double quoted values.
  public static let doubleQuote: Self = "\""

  /// Creates the `.none` strategy from a `nil` literal.
  public init(nilLiteral: ()) {
    self = .none
  }

  /// Creates a `.scalar` strategy wrapping the given unicode scalar literal.
  /// - parameter value: The unicode scalar to use as escaping character.
  public init(unicodeScalarLiteral value: Unicode.Scalar) {
    self = .scalar(value)
  }

  /// The escaping scalar carried by this strategy, or `nil` when the strategy is `.none`.
  var scalar: Unicode.Scalar? {
    guard case .scalar(let wrapped) = self else { return nil }
    return wrapped
  }
}

/// The strategy to use for non-standard floating-point values (IEEE 754 infinity and NaN).
public enum NonConformingFloat {
/// Throw upon encountering non-conforming values. This is the default strategy.
Expand Down
22 changes: 13 additions & 9 deletions Tests/CodableCSVTests/ActiveTests/ReaderTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ extension ReaderTests {
let fieldDelimiters: [Delimiter.Field] = [",", ";", "\t", "|", "||", "|-|"]
let headerStrategy: [Strategy.Header] = [.none, .firstLine, /*.unknown*/]
let trimStrategy: [CharacterSet] = [.init(), .whitespaces]
let escapingStrategy: [Strategy.Escaping] = [.none, .doubleQuote]
let presamples: [Bool] = [true, false]
// The data used for testing.
let (headers, content) = (TestData.headers, TestData.content)
Expand Down Expand Up @@ -126,15 +127,18 @@ extension ReaderTests {
var toTrim = t
if f.rawValue.count == 1, t.contains(f.rawValue.first!) { toTrim.remove(f.rawValue.first!) }
if r.rawValue.count == 1, t.contains(r.rawValue.first!) { toTrim.remove(r.rawValue.first!) }

for p in presamples {
var c = CSVReader.Configuration()
c.delimiters = pair
c.headerStrategy = h
c.trimStrategry = toTrim
c.presample = p

XCTAssertNoThrow(try work(c, encoded))

for e in escapingStrategy {
for p in presamples {
var c = CSVReader.Configuration()
c.delimiters = pair
c.headerStrategy = h
c.trimStrategry = toTrim
c.escapingStrategy = e
c.presample = p

XCTAssertNoThrow(try work(c, encoded))
}
}
}
}
Expand Down
18 changes: 11 additions & 7 deletions Tests/CodableCSVTests/ActiveTests/WriterTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ extension WriterTests {
// The configuration values to be tested.
let rowDelimiters: [Delimiter.Row] = ["\n", "\r", "\r\n", "**~**"]
let fieldDelimiters: [Delimiter.Field] = [",", ";", "\t", "|", "||", "|-|"]
let escapingStrategy: [Strategy.Escaping] = [.none, .doubleQuote]
let encodings: [String.Encoding] = [.utf8, .utf16LittleEndian, .utf16BigEndian, .utf16LittleEndian, .utf32BigEndian]
// The data used for testing.
let headers = TestData.headers
Expand All @@ -69,13 +70,16 @@ extension WriterTests {
let pair: Delimiter.Pair = (f, r)
let sample = TestData.toCSV(input, delimiters: pair)

for encoding in encodings {
var c = CSVWriter.Configuration()
c.delimiters = pair
c.headers = headers
c.encoding = encoding
c.bomStrategy = .never
try work(c, sample)
for escaping in escapingStrategy {
for encoding in encodings {
var c = CSVWriter.Configuration()
c.delimiters = pair
c.escapingStrategy = escaping
c.headers = headers
c.encoding = encoding
c.bomStrategy = .never
try work(c, sample)
}
}
}
}
Expand Down