Skip to content

Commit

Permalink
Fix whitespace as a separator in CSV processor (#67045) (#67050)
Browse files Browse the repository at this point in the history
This change fixes problem when using space or tab as a separator in CSV processor - we check if current character is separator before we check if it is whitespace.

This also improves tests to always check all combinations of separators and quotes.

Closes #67013
  • Loading branch information
probakowski committed Jan 5, 2021
1 parent 6175e18 commit fdd2a9c
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -127,15 +127,15 @@ private boolean processUnquoted() {
char c = currentChar();
if (c == LF || c == CR || c == quote) {
throw new IllegalArgumentException("Illegal character inside unquoted field at " + currentIndex);
} else if (trim && isWhitespace(c)) {
spaceCount++;
} else if (c == separator) {
state = State.START;
if (setField(currentIndex - spaceCount)) {
return true;
}
startIndex = currentIndex + 1;
return false;
} else if (trim && isWhitespace(c)) {
spaceCount++;
} else {
spaceCount = 0;
}
Expand Down Expand Up @@ -163,20 +163,20 @@ private boolean processQuotedEnd() {
boolean shouldSetField = true;
for (; currentIndex < length; currentIndex++) {
c = currentChar();
if (isWhitespace(c)) {
if (shouldSetField) {
if (setField(currentIndex - 1)) {
return true;
}
shouldSetField = false;
}
} else if (c == separator) {
if (c == separator) {
if (shouldSetField && setField(currentIndex - 1)) {
return true;
}
startIndex = currentIndex + 1;
state = State.START;
return false;
} else if (isWhitespace(c)) {
if (shouldSetField) {
if (setField(currentIndex - 1)) {
return true;
}
shouldSetField = false;
}
} else {
throw new IllegalArgumentException("character '" + c + "' after quoted field at " + currentIndex);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,33 +24,36 @@
import org.elasticsearch.ingest.IngestDocument;
import org.elasticsearch.ingest.RandomDocumentPicks;
import org.elasticsearch.test.ESTestCase;
import org.junit.Before;

import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.Map;
import java.util.stream.Collectors;

public class CsvProcessorTests extends ESTestCase {

private static final Character[] SEPARATORS = new Character[]{',', ';', '|', '.'};
private static final Character[] SEPARATORS = new Character[]{',', ';', '|', '.', '\t'};
private static final String[] QUOTES = new String[]{"'", "\"", ""};
private final String quote;
private char separator;
private final char separator;


public CsvProcessorTests(@Name("quote") String quote) {
public CsvProcessorTests(@Name("quote") String quote, @Name("separator") char separator) {
this.quote = quote;
this.separator = separator;
}

@ParametersFactory
public static Iterable<Object[]> parameters() {
return Arrays.asList(new Object[]{"'"}, new Object[]{"\""}, new Object[]{""});
}

@Before
public void setup() {
separator = randomFrom(SEPARATORS);
LinkedList<Object[]> list = new LinkedList<>();
for (Character separator : SEPARATORS) {
for (String quote : QUOTES) {
list.add(new Object[]{quote, separator});
}
}
return list;
}

public void testExactNumberOfFields() {
Expand Down

0 comments on commit fdd2a9c

Please sign in to comment.