diff --git a/README.md b/README.md index 4942d51..9a515b6 100644 --- a/README.md +++ b/README.md @@ -174,6 +174,29 @@ A `CSVReader` parses CSV data from a given input (`String`, `Data`, `URL`, or `I Loading all data into memory may provide faster iteration for small to medium size files, since you get rid of the overhead of managing an `InputStream`. +- `lastFieldDelimiterStrategy` (default `.parse`) indicates whether extra field delimiters found in the last field of a row are parsed as delimiters (`.parse`) or kept as part of the field's value (`.ignore`). + + This can sometimes be helpful when parsing unquoted CSV, e.g. an address field that contains data such as 'Street, City, Region'. + + For example: + ```swift + $0.lastFieldDelimiterStrategy = .ignore + ``` + + The following **unquoted** text: + ``` + Header1, Header2 + A, B, C, D + ``` + + is parsed as though it were the following quoted text: + ``` + Header1, Header2 + "A", "B, C, D" + ``` + + + The configuration values are set during initialization and can be passed to the `CSVReader` instance through a structure or with a convenience closure syntax: ```swift diff --git a/sources/imperative/reader/Reader.swift b/sources/imperative/reader/Reader.swift index ee0d9ff..dc58f13 100644 --- a/sources/imperative/reader/Reader.swift +++ b/sources/imperative/reader/Reader.swift @@ -109,12 +109,14 @@ extension CSVReader { loop: while true { let result: [String]? + do { result = try self._parseLine(rowIndex: self.count.rows) } catch let error { self.status = .failed(error as! CSVError) throw error } + // If no fields were parsed, the EOF has been reached. guard let fields = result else { self.status = .finished @@ -184,7 +186,8 @@ extension CSVReader { break loop // 7. If a regular character is encountered, an "unescaped field" is awaiting parsing. 
} else { - let field = try self._parseUnescapedField(starting: scalar, rowIndex: rowIndex) + let isLastField = self.count.fields > 0 && self.count.fields == result.count + 1 + let field = try self._parseUnescapedField(starting: scalar, rowIndex: rowIndex, isLastField: isLastField) result.append(field.value) if field.isAtEnd { break loop } } @@ -198,7 +201,7 @@ extension CSVReader { /// - parameter rowIndex: The index of the row being parsed. /// - throws: `CSVError` exclusively. /// - returns: The parsed field and whether the row/file ending characters have been found. - private func _parseUnescapedField(starting: Unicode.Scalar, rowIndex: Int) throws -> (value: String, isAtEnd: Bool) { + private func _parseUnescapedField(starting: Unicode.Scalar, rowIndex: Int, isLastField: Bool) throws -> (value: String, isAtEnd: Bool) { var reachedRowsEnd = false self._fieldBuffer.append(starting) @@ -213,9 +216,14 @@ extension CSVReader { if scalar == self._settings.escapingScalar { throw Error._invalidUnescapedField(rowIndex: rowIndex) // 4. If the field delimiter is encountered, return the already parsed characters. + } else if try self._isFieldDelimiter(scalar) { - reachedRowsEnd = false - break fieldLoop + if configuration.lastFieldDelimiterStrategy == .parse || !isLastField { + reachedRowsEnd = false + break fieldLoop + } else { // if last field then treat delimiter as a regular scalar + self._fieldBuffer.append(scalar) + } // 5. If the row delimiter is encountered, return the already parsed characters. 
} else if try self._isRowDelimiter(scalar) { reachedRowsEnd = true diff --git a/sources/imperative/reader/ReaderConfiguration.swift b/sources/imperative/reader/ReaderConfiguration.swift index 1e8920c..be26396 100644 --- a/sources/imperative/reader/ReaderConfiguration.swift +++ b/sources/imperative/reader/ReaderConfiguration.swift @@ -15,6 +15,8 @@ extension CSVReader { public var headerStrategy: Strategy.Header /// Trims the given characters at the beginning and end of each row, and between fields. public var trimStrategy: CharacterSet + /// Indicates whether field delimiters found in the last field of a row are parsed as delimiters or kept as part of the field's value (useful for some unquoted CSV). + public var lastFieldDelimiterStrategy: Strategy.lastFieldDelimiterStrategy /// Boolean indicating whether the data/file/string should be completely parsed at reader's initialization. public var presample: Bool @@ -24,6 +26,7 @@ extension CSVReader { self.delimiters = (field: ",", row: "\n") self.escapingStrategy = .doubleQuote self.headerStrategy = .none + self.lastFieldDelimiterStrategy = .parse self.trimStrategy = CharacterSet() self.presample = false } @@ -49,3 +52,20 @@ extension Strategy { } } } + + +extension Strategy { + public enum lastFieldDelimiterStrategy: ExpressibleByNilLiteral, ExpressibleByBooleanLiteral { + + case parse + case ignore + + public init(nilLiteral: ()) { + self = .parse + } + + public init(booleanLiteral value: BooleanLiteralType) { + self = (value) ? .ignore : .parse + } + } +} diff --git a/tests/MyCustomTests.swift b/tests/MyCustomTests.swift new file mode 100644 index 0000000..a729909 --- /dev/null +++ b/tests/MyCustomTests.swift @@ -0,0 +1,31 @@ +// +// MyCustomTests.swift +// +// +// Created by Jon Lidgard on 17/04/2022. 
+//
+
+import XCTest
+import CodableCSV
+
+/// Tests for the reader's `lastFieldDelimiterStrategy` configuration value.
+class MyCustomTests: XCTestCase {
+    override func setUp() {
+        self.continueAfterFailure = false
+    }
+}
+extension MyCustomTests {
+    /// With `.ignore`, extra field delimiters in the last field must stay in that field's value.
+    func testIgnoreExtraDelimitersInLastField() throws {
+        let input = """
+        a,b
+        §A,BC, DE
+        """
+        let result = try CSVReader.decode(input: input) {
+            $0.headerStrategy = .firstLine
+            $0.lastFieldDelimiterStrategy = .ignore
+        }
+        XCTAssertEqual(result.headers, ["a", "b"])
+        XCTAssertEqual(result.rows, [["§A", "BC, DE"]])
+    }
+}