From bfeae9dbee2335efdc0a62248dd87f04f215c8d2 Mon Sep 17 00:00:00 2001 From: Jon Lidgard Date: Mon, 18 Apr 2022 17:01:42 +0100 Subject: [PATCH 1/2] Added ability to ignore delimiters in last field --- .github/ISSUE_TEMPLATE/bug_report.md | 4 +- .github/ISSUE_TEMPLATE/question.md | 4 +- .github/workflows/tests.yml | 8 +-- CodableCSV.podspec | 6 +- README.md | 27 ++++++++- docs/assets/badges/Apple.svg | 2 +- sources/Strategy.swift | 2 +- sources/Utils.swift | 2 +- sources/imperative/reader/Reader.swift | 60 +++++++++++-------- .../reader/ReaderConfiguration.swift | 20 +++++++ .../reader/internal/ReaderInternals.swift | 2 +- tests/MyCustomTests.swift | 31 ++++++++++ 12 files changed, 125 insertions(+), 43 deletions(-) create mode 100644 tests/MyCustomTests.swift diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 4130ad5..3dcfd98 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -19,8 +19,8 @@ Steps to reproduce the behavior: A clear and concise description of what you expected to happen. ## System - - OS: [e.g. macOS 11.2, iOS 14.4, Ubuntu 20.04] - - CodableCSV: [e.g. 0.6.6] + - OS: [e.g. macOS 11.5, iOS 14.7, Ubuntu 20.04] + - CodableCSV: [e.g. 0.6.7] You can check this in your SPM `Package.swift` file (or `Package.resolved` file). Alternatively, go to Xcode's Source Control Navigator (`⌘+2`) and click on `CodableCSV`. ## Additional context diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md index b4b0e3f..69027a3 100644 --- a/.github/ISSUE_TEMPLATE/question.md +++ b/.github/ISSUE_TEMPLATE/question.md @@ -15,6 +15,6 @@ Add any other context about the question here (or delete this section if it is u ## System Delete section if not applicable - - OS: [e.g. macOS 11.2, iOS 14.4, Ubuntu 20.04] - - CodableCSV: [e.g. 0.6.6] + - OS: [e.g. macOS 11.5, iOS 14.7, Ubuntu 20.04] + - CodableCSV: [e.g. 
0.6.7] You can check this in your SPM `Package.swift` file (or `Package.resolved` file). Alternatively, go to Xcode's Source Control Navigator (`⌘+2`) and click on `CodableCSV`. diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3b8b6ae..37fff61 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -8,9 +8,9 @@ jobs: steps: - uses: actions/checkout@v2 - name: Build - run: swift build -v -c release + run: swift build -v -c debug - name: Run tests - run: swift test -v -c release --filter CodableCSVTests + run: swift test -v -c debug --filter CodableCSVTests unittests_on_Ubuntu: name: Unit tests on Ubuntu @@ -20,6 +20,6 @@ jobs: steps: - uses: actions/checkout@v2 - name: Build - run: swift build -v -c release + run: swift build -v -c debug - name: Run tests - run: swift test -v -c release --filter CodableCSVTests --enable-test-discovery + run: swift test -v -c debug --filter CodableCSVTests --enable-test-discovery diff --git a/CodableCSV.podspec b/CodableCSV.podspec index 318adff..72d5f8d 100644 --- a/CodableCSV.podspec +++ b/CodableCSV.podspec @@ -1,6 +1,6 @@ Pod::Spec.new do |s| s.name = "CodableCSV" - s.version = "0.6.6" + s.version = "0.6.7" s.summary = "Read and write CSV files row-by-row or through Swift's Codable interface." s.description = <<-DESC CodableCSV offers imperative and declarative ways to read and write CSV files. It is extensively configurable and is capable of reading multiple types of entries and write to many outputs. 
@@ -8,11 +8,11 @@ Pod::Spec.new do |s| s.homepage = "https://github.com/dehesa/CodableCSV" s.license = { :type => "MIT", :file => "LICENSE" } s.author = { "Marcos Sánchez-Dehesa Carballo" => "san.dehesa@gmail.com" } - s.ios.deployment_target = "9.0" + s.ios.deployment_target = "11.0" s.osx.deployment_target = "10.10" s.watchos.deployment_target = "2.0" s.tvos.deployment_target = "9.0" - s.swift_version = ["5.1", "5.2", "5.3", "5.4", "5.5"] + s.swift_version = ["5.3", "5.4", "5.5"] s.source = { :git => "https://github.com/dehesa/CodableCSV.git", :tag => "#{s.version}" } s.source_files = "sources", "sources/**/*.swift" diff --git a/README.md b/README.md index 4ecad6d..9a515b6 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ You can choose to add the library through SPM or Cocoapods: let package = Package( /* Your package name, supported platforms, and generated products go here */ dependencies: [ - .package(url: "https://github.com/dehesa/CodableCSV.git", from: "0.6.6") + .package(url: "https://github.com/dehesa/CodableCSV.git", from: "0.6.7") ], targets: [ .target(name: /* Your target name here */, dependencies: ["CodableCSV"]) @@ -48,7 +48,7 @@ You can choose to add the library through SPM or Cocoapods: - [Cocoapods](https://cocoapods.org). ``` - pod 'CodableCSV', '~> 0.6.6' + pod 'CodableCSV', '~> 0.6.7' ```

@@ -174,6 +174,29 @@ A `CSVReader` parses CSV data from a given input (`String`, `Data`, `URL`, or `I Loading all data into memory may provide faster iteration for small to medium size files, since you get rid of the overhead of managing an `InputStream`. +- `lastFieldDelimiterStrategy` (default `.parse`) indicates whether to ignore extra delimiters in the last field. + + This can sometimes be helpful when parsing unquoted CSV, e.g. an address field that contains data such as 'Street, City, Region'. + + For example: + ```swift + $0.lastFieldDelimiterStrategy = .ignore + ``` + + The following **unquoted** text: + ``` + Header1, Header2 + A, B, C, D + ``` + + would result in it being parsed as though it was the following quoted text: + ``` + Header1, Header2 + "A", "B, C, D" + ``` + + + The configuration values are set during initialization and can be passed to the `CSVReader` instance through a structure or with a convenience closure syntax: ```swift diff --git a/docs/assets/badges/Apple.svg b/docs/assets/badges/Apple.svg index e8ca4f7..328117e 100644 --- a/docs/assets/badges/Apple.svg +++ b/docs/assets/badges/Apple.svg @@ -1 +1 @@ -+2watchOS+9tvOS+8iOS+10.10macOS \ No newline at end of file ++2watchOS+9tvOS+7iOS+10.10macOS \ No newline at end of file diff --git a/sources/Strategy.swift b/sources/Strategy.swift index 4719811..fa5e70a 100644 --- a/sources/Strategy.swift +++ b/sources/Strategy.swift @@ -8,7 +8,7 @@ public enum Strategy { case scalar(Unicode.Scalar) /// Escape double quoted values. - @_transparent public static var doubleQuote: Self { .scalar(#"""#) } + @_transparent public static var doubleQuote: Self { .scalar("\"") } public init(nilLiteral: ()) { self = .none diff --git a/sources/Utils.swift b/sources/Utils.swift index feb93f8..a1cfdb7 100644 --- a/sources/Utils.swift +++ b/sources/Utils.swift @@ -3,7 +3,7 @@ import Foundation extension InputStream { /// Stream for reading from stdin. 
public static var standardInput: InputStream { - Self(fileAtPath: "/dev/stdin")! + InputStream(fileAtPath: "/dev/stdin")! } } diff --git a/sources/imperative/reader/Reader.swift b/sources/imperative/reader/Reader.swift index 4f158f1..dc58f13 100644 --- a/sources/imperative/reader/Reader.swift +++ b/sources/imperative/reader/Reader.swift @@ -52,7 +52,7 @@ public final class CSVReader: IteratorProtocol, Sequence { guard !headers.isEmpty else { throw Error._invalidEmptyHeader() } self.headers = headers self.count = (rows: 1, fields: headers.count) - // case .unknown: #warning("TODO") +// case .unknown: #warning("TODO") } } @@ -109,12 +109,14 @@ extension CSVReader { loop: while true { let result: [String]? + do { result = try self._parseLine(rowIndex: self.count.rows) } catch let error { self.status = .failed(error as! CSVError) throw error } + // If no fields were parsed, the EOF has been reached. guard let fields = result else { self.status = .finished @@ -159,9 +161,9 @@ extension CSVReader { // 2. Try to retrieve a scalar (if there is none, we reached the EOF). guard let scalar = try self._scalarBuffer.next() ?? self._decoder() else { switch result.isEmpty { - // 2.A. If no fields has been parsed, return nil. + // 2.A. If no fields has been parsed, return nil. case true: return nil - // 2.B. If there were previous fields, the EOF counts as en empty field (since there was no row delimiter previously). + // 2.B. If there were previous fields, the EOF counts as en empty field (since there was no row delimiter previously). case false: result.append(""); break loop } } @@ -175,16 +177,17 @@ extension CSVReader { let field = try self._parseEscapedField(rowIndex: rowIndex, escaping: escapingScalar) result.append(field.value) if field.isAtEnd { break loop } - // 5. If the field delimiter is encountered, an implicit empty field has been defined. + // 5. If the field delimiter is encountered, an implicit empty field has been defined. 
} else if try self._isFieldDelimiter(scalar) { result.append("") - // 6. If the row delimiter is encountered, an implicit empty field has been defined (for rows that already have content). + // 6. If the row delimiter is encountered, an implicit empty field has been defined (for rows that already have content). } else if try self._isRowDelimiter(scalar) { result.append("") break loop - // 7. If a regular character is encountered, an "unescaped field" is awaiting parsing. + // 7. If a regular character is encountered, an "unescaped field" is awaiting parsing. } else { - let field = try self._parseUnescapedField(starting: scalar, rowIndex: rowIndex) + let isLastField = self.count.fields > 0 && self.count.fields == result.count + 1 + let field = try self._parseUnescapedField(starting: scalar, rowIndex: rowIndex, isLastField: isLastField) result.append(field.value) if field.isAtEnd { break loop } } @@ -198,33 +201,38 @@ extension CSVReader { /// - parameter rowIndex: The index of the row being parsed. /// - throws: `CSVError` exclusively. /// - returns: The parsed field and whether the row/file ending characters have been found. - private func _parseUnescapedField(starting: Unicode.Scalar, rowIndex: Int) throws -> (value: String, isAtEnd: Bool) { + private func _parseUnescapedField(starting: Unicode.Scalar, rowIndex: Int, isLastField: Bool) throws -> (value: String, isAtEnd: Bool) { var reachedRowsEnd = false self._fieldBuffer.append(starting) // 1. This loop continue parsing a unescaped field till the field end is reached. - fieldLoop: while true { - // 2. Try to retrieve an scalar (if not, it is the EOF). - guard let scalar = try self._scalarBuffer.next() ?? self._decoder() else { - reachedRowsEnd = true - break fieldLoop - } - // 3. A escaping scalar cannot appear on unescaped fields. If one is encountered, an error is thrown. - if scalar == self._settings.escapingScalar { - throw Error._invalidUnescapedField(rowIndex: rowIndex) + fieldLoop: while true { + // 2. 
Try to retrieve an scalar (if not, it is the EOF). + guard let scalar = try self._scalarBuffer.next() ?? self._decoder() else { + reachedRowsEnd = true + break fieldLoop + } + // 3. A escaping scalar cannot appear on unescaped fields. If one is encountered, an error is thrown. + if scalar == self._settings.escapingScalar { + throw Error._invalidUnescapedField(rowIndex: rowIndex) // 4. If the field delimiter is encountered, return the already parsed characters. - } else if try self._isFieldDelimiter(scalar) { - reachedRowsEnd = false - break fieldLoop + + } else if try self._isFieldDelimiter(scalar) { + if configuration.lastFieldDelimiterStrategy == .parse || !isLastField { + reachedRowsEnd = false + break fieldLoop + } else { // if last field then treat delimiter as a regular scalar + self._fieldBuffer.append(scalar) + } // 5. If the row delimiter is encountered, return the already parsed characters. - } else if try self._isRowDelimiter(scalar) { - reachedRowsEnd = true - break fieldLoop + } else if try self._isRowDelimiter(scalar) { + reachedRowsEnd = true + break fieldLoop // 6. If it is a regular unicode scalar, just store it and continue parsing. - } else { - self._fieldBuffer.append(scalar) + } else { + self._fieldBuffer.append(scalar) + } } - } // 7. Once the end has been reached, a field look-back (starting from the end) is performed to check if there are trim characters. if self._settings.isTrimNeeded { while let lastScalar = self._fieldBuffer.last, self._settings.trimCharacters.contains(lastScalar) { diff --git a/sources/imperative/reader/ReaderConfiguration.swift b/sources/imperative/reader/ReaderConfiguration.swift index 1e8920c..be26396 100644 --- a/sources/imperative/reader/ReaderConfiguration.swift +++ b/sources/imperative/reader/ReaderConfiguration.swift @@ -15,6 +15,8 @@ extension CSVReader { public var headerStrategy: Strategy.Header /// Trims the given characters at the beginning and end of each row, and between fields. 
public var trimStrategy: CharacterSet + /// Ignores delimiters in last field - useful for parsing some unquoted types of CSV + public var lastFieldDelimiterStrategy: Strategy.lastFieldDelimiterStrategy /// Boolean indicating whether the data/file/string should be completely parsed at reader's initialization. public var presample: Bool @@ -24,6 +26,7 @@ extension CSVReader { self.delimiters = (field: ",", row: "\n") self.escapingStrategy = .doubleQuote self.headerStrategy = .none + self.lastFieldDelimiterStrategy = .parse self.trimStrategy = CharacterSet() self.presample = false } @@ -49,3 +52,20 @@ extension Strategy { } } } + + +extension Strategy { + public enum lastFieldDelimiterStrategy: ExpressibleByNilLiteral, ExpressibleByBooleanLiteral { + + case parse + case ignore + + public init(nilLiteral: ()) { + self = .parse + } + + public init(booleanLiteral value: BooleanLiteralType) { + self = (value) ? .ignore : .parse + } + } +} diff --git a/sources/imperative/reader/internal/ReaderInternals.swift b/sources/imperative/reader/internal/ReaderInternals.swift index 95c0c79..5f4d252 100644 --- a/sources/imperative/reader/internal/ReaderInternals.swift +++ b/sources/imperative/reader/internal/ReaderInternals.swift @@ -30,7 +30,7 @@ extension CSVReader: Failable { public static func errorDescription(for failure: Error) -> String { switch failure { case .invalidConfiguration: return "Invalid configuration" - // case .inferenceFailure: return "Inference failure" +// case .inferenceFailure: return "Inference failure" case .invalidInput: return "Invalid input" case .streamFailure: return "Stream failure" } diff --git a/tests/MyCustomTests.swift b/tests/MyCustomTests.swift new file mode 100644 index 0000000..a729909 --- /dev/null +++ b/tests/MyCustomTests.swift @@ -0,0 +1,31 @@ +// +// TestIgnoreDelimitersInLastField.swift +// +// +// Created by Jon Lidgard on 17/04/2022. 
+// + +import XCTest +import CodableCSV + + +class MyCustomTests: XCTestCase { + override func setUp() { + self.continueAfterFailure = false + } +} + + +extension MyCustomTests { + + func testIgnoreExtraDelimitersInLastField() throws { + let input = """ + a,b + §A,BC, DE + """ + XCTAssertNoThrow(try CSVReader.decode(input: input) { + $0.headerStrategy = .firstLine + $0.lastFieldDelimiterStrategy = .ignore + }) + } +} From d89ab527b7a1ab477970e6ab2c918c2b3c056635 Mon Sep 17 00:00:00 2001 From: Jon Lidgard Date: Mon, 18 Apr 2022 18:00:51 +0100 Subject: [PATCH 2/2] Merge with upstream --- sources/imperative/reader/Reader.swift | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sources/imperative/reader/Reader.swift b/sources/imperative/reader/Reader.swift index a1ef20f..dc58f13 100644 --- a/sources/imperative/reader/Reader.swift +++ b/sources/imperative/reader/Reader.swift @@ -216,7 +216,6 @@ extension CSVReader { if scalar == self._settings.escapingScalar { throw Error._invalidUnescapedField(rowIndex: rowIndex) // 4. If the field delimiter is encountered, return the already parsed characters. -<<<<<<< HEAD } else if try self._isFieldDelimiter(scalar) { if configuration.lastFieldDelimiterStrategy == .parse || !isLastField { @@ -225,11 +224,6 @@ extension CSVReader { } else { // if last field then treat delimiter as a regular scalar self._fieldBuffer.append(scalar) } -======= - } else if try self._isFieldDelimiter(scalar) { - reachedRowsEnd = false - break fieldLoop ->>>>>>> upstream/master // 5. If the row delimiter is encountered, return the already parsed characters. } else if try self._isRowDelimiter(scalar) { reachedRowsEnd = true