From 62d626072689fcf76008e814ce62f82f7c209554 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 21 Apr 2016 00:01:10 +0100 Subject: [PATCH] BUG: Parse custom terminator with whitespace delimiter Addresses BUG issue part of gh-12912. Closes gh-12912. --- doc/source/whatsnew/v0.18.1.txt | 1 + pandas/io/tests/test_parsers.py | 9 ++ pandas/src/parser/tokenizer.c | 259 +++++++++++++++++++++++++++++++- 3 files changed, 265 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 821f093083026..d386f32d35195 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -302,6 +302,7 @@ Bug Fixes - Bug in ``value_counts`` when ``normalize=True`` and ``dropna=True`` where nulls still contributed to the normalized count (:issue:`12558`) - Bug in ``Panel.fillna()`` ignoring ``inplace=True`` (:issue:`12633`) - Bug in ``read_csv`` when specifying ``names``, ```usecols``, and ``parse_dates`` simultaneously with the C engine (:issue:`9755`) +- Bug in ``read_csv`` when specifying ``delim_whitespace=True`` and ``lineterminator`` simultaneously with the C engine (:issue:`12912`) - Bug in ``Series.rename``, ``DataFrame.rename`` and ``DataFrame.rename_axis`` not treating ``Series`` as mappings to relabel (:issue:`12623`). 
- Clean in ``.rolling.min`` and ``.rolling.max`` to enhance dtype handling (:issue:`12373`) - Bug in ``groupby`` where complex types are coerced to float (:issue:`12902`)
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index ab6103f0f523c..1fab316d80ae6 100755
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -3878,6 +3878,15 @@ def test_buffer_rd_bytes(self):
             except Exception as e:
                 pass
 
+    def test_delim_whitespace_custom_terminator(self):
+        # See gh-12912: delim_whitespace=True combined with a custom lineterminator
+        data = """a b c~1 2 3~4 5 6~7 8 9"""  # rows separated by '~'
+        df = self.read_csv(StringIO(data), lineterminator='~',
+                           delim_whitespace=True)
+        expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+                             columns=['a', 'b', 'c'])
+        tm.assert_frame_equal(df, expected)
+
 
 class TestCParserHighMemory(CParserTests, CompressionTests, tm.TestCase):
     engine = 'c'
diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c
index a75ce2bde80e6..060dba820ea8d 100644
--- a/pandas/src/parser/tokenizer.c
+++ b/pandas/src/parser/tokenizer.c
@@ -1641,6 +1641,251 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
     return 0;
 }
 
+// custom line terminator: tokenizer for whitespace-separated fields whose rows end at self->lineterminator
+int tokenize_whitespace_customterm(parser_t *self, size_t line_limit)  // line_limit is not referenced by name below; presumably read by END_LINE() -- TODO confirm
+{  // returns 0 after scanning the buffer (or via the linelimit label), -1 on allocation failure or the strict-quoting error
+    int i, slen, start_lines;
+    long maxstreamsize;
+    char c;
+    char *stream;
+    char *buf = self->data + self->datapos;  // read cursor, starts at offset self->datapos in self->data
+
+    start_lines = self->lines;  // not referenced by name below; presumably used inside the END_LINE() macro -- TODO confirm
+
+    if (make_stream_space(self, self->datalen - self->datapos) < 0) {  // argument is the number of bytes left to scan
+        self->error_msg = "out of memory";  // NOTE(review): string literal here, malloc'ed buffer in the strict-quoting error below -- whoever frees error_msg must cope with both
+        return -1;
+    }
+
+    stream = self->stream + self->stream_len;  // stream/slen/maxstreamsize are not referenced by name below;
+    slen = self->stream_len;  // presumably the PUSH_CHAR/END_FIELD/END_LINE/_TOKEN_CLEANUP macros use them -- TODO confirm
+    maxstreamsize = self->stream_cap;
+
+    TRACE(("%s\n", buf));
+
+    for (i = self->datapos; i < self->datalen; ++i)  // one iteration per input byte; i and buf advance together
+    {
+        // next character in file
+        c = *buf++;
+
+        TRACE(("tokenize_whitespace_customterm - Iter: %d Char: %c Line %d field_count %d, state %d\n",
+               i, c, self->file_lines + 1, self->line_fields[self->lines],
+               self->state));
+
+        switch(self->state) {  // dispatch on the current parser state; several cases fall through on purpose
+
+        case SKIP_LINE:  // ignore every character until the terminator, then END_LINE()
+            if (c == self->lineterminator) {
+                END_LINE();
+            }
+            break;
+
+        case WHITESPACE_LINE:  // entered from START_RECORD when the line began with whitespace and skip_empty_lines is set
+            if (c == self->lineterminator) {
+                self->file_lines++;  // whitespace-only line: counted in file_lines, END_LINE() is not called
+                self->state = START_RECORD;
+                break;
+            }
+            // fall through (into EAT_WHITESPACE with the same character c)
+
+        case EAT_WHITESPACE:  // consuming a run of whitespace
+            if (c == self->lineterminator) {
+                END_LINE();
+                self->state = START_RECORD;
+                break;
+            } else if (!IS_WHITESPACE(c)) {
+                self->state = START_FIELD;
+                // fall through to subsequent state (the next case label is START_RECORD, so its checks run on c first)
+            } else {
+                // if whitespace char, keep slurping
+                break;
+            }
+
+        case START_RECORD:  // also reached by fall-through from EAT_WHITESPACE / WHITESPACE_LINE
+            // start of record
+            if (skip_this_line(self, self->file_lines)) {
+                self->state = SKIP_LINE;
+                if (c == self->lineterminator) {
+                    END_LINE();
+                }
+                break;
+            } else if (c == self->lineterminator) {
+                if (self->skip_empty_lines) {
+                    self->file_lines++;  // empty line: counted in file_lines, END_LINE() is not called
+                } else {
+                    END_LINE();
+                }
+                break;
+            } else if (IS_WHITESPACE(c)) {
+                if (self->skip_empty_lines)
+                    self->state = WHITESPACE_LINE;
+                else
+                    self->state = EAT_WHITESPACE;
+                break;
+            } else if (c == self->commentchar) {  // NOTE(review): also taken when a comment char follows a field via the EAT_WHITESPACE fall-through;
+                self->state = EAT_LINE_COMMENT;  // EAT_LINE_COMMENT never calls END_LINE(), so confirm that path is intended
+                break;
+            } else {
+                // nominal character - handle as START_FIELD
+                self->state = START_FIELD;
+            }
+            // fall through
+
+        case START_FIELD:
+            // expecting field
+            if (c == self->lineterminator) {
+                END_FIELD();
+                END_LINE();
+            } else if (c == self->quotechar &&
+                       self->quoting != QUOTE_NONE) {
+                // start quote field
+                self->state = IN_QUOTED_FIELD;
+            } else if (c == self->escapechar) {
+                // possible escaped character
+                self->state = ESCAPED_CHAR;
+            } else if (IS_WHITESPACE(c)) {
+                self->state = EAT_WHITESPACE;  // no END_FIELD() here: whitespace before a field does not emit a field
+            } else if (c == self->commentchar) {
+                END_FIELD();
+                self->state = EAT_COMMENT;
+            } else {
+                // begin new unquoted field
+                if (self->quoting == QUOTE_NONNUMERIC)
+                    self->numeric_field = 1;
+
+                PUSH_CHAR(c);
+                self->state = IN_FIELD;
+            }
+            break;
+
+        case EAT_LINE_COMMENT:  // entered from START_RECORD on commentchar; discards characters up to the terminator
+            if (c == self->lineterminator) {
+                self->file_lines++;  // line is counted in file_lines but END_LINE() is not called
+                self->state = START_RECORD;
+            }
+            break;
+
+        case ESCAPED_CHAR:  // character after escapechar is stored unconditionally
+            PUSH_CHAR(c);
+            self->state = IN_FIELD;
+            break;
+
+        case IN_FIELD:
+            // in unquoted field
+            if (c == self->lineterminator) {
+                END_FIELD();
+                END_LINE();
+            } else if (c == self->escapechar) {
+                // possible escaped character
+                self->state = ESCAPED_CHAR;
+            } else if (IS_WHITESPACE(c)) {
+                // end of field (end of line not reached yet)
+                END_FIELD();
+                self->state = EAT_WHITESPACE;
+            } else if (c == self->commentchar) {
+                END_FIELD();
+                self->state = EAT_COMMENT;
+            } else {
+                // normal character - save in field
+                PUSH_CHAR(c);
+            }
+            break;
+
+        case IN_QUOTED_FIELD:
+            // in quoted field (neither whitespace nor the line terminator is tested here, so both are stored like any other character)
+            if (c == self->escapechar) {
+                // possible escape character
+                self->state = ESCAPE_IN_QUOTED_FIELD;
+            } else if (c == self->quotechar &&
+                       self->quoting != QUOTE_NONE) {
+                if (self->doublequote) {
+                    // double quote - " represented by ""
+                    self->state = QUOTE_IN_QUOTED_FIELD;
+                }
+                else {
+                    // end of quote part of field (following characters are handled by IN_FIELD)
+                    self->state = IN_FIELD;
+                }
+            } else {
+                // normal character - save in field
+                PUSH_CHAR(c);
+            }
+            break;
+
+        case ESCAPE_IN_QUOTED_FIELD:  // character after escapechar inside quotes is stored unconditionally
+            PUSH_CHAR(c);
+            self->state = IN_QUOTED_FIELD;
+            break;
+
+        case QUOTE_IN_QUOTED_FIELD:
+            // double quote - seen a quote in an quoted field
+            if (self->quoting != QUOTE_NONE && c == self->quotechar) {
+                // save "" as "
+                PUSH_CHAR(c);
+                self->state = IN_QUOTED_FIELD;
+            } else if (IS_WHITESPACE(c)) {
+                // end of field (end of line not reached yet)
+                END_FIELD();
+                self->state = EAT_WHITESPACE;
+            } else if (c == self->lineterminator) {
+                END_FIELD();
+                END_LINE();
+            } else if (!self->strict) {
+                PUSH_CHAR(c);  // non-strict mode: keep the stray character and continue as an unquoted field
+                self->state = IN_FIELD;
+            } else {
+                self->error_msg = (char*) malloc(50);  // NOTE(review): malloc result is not checked before sprintf writes through it
+                sprintf(self->error_msg, "'%c' expected after '%c'",
+                        self->delimiter, self->quotechar);  // NOTE(review): reports self->delimiter although this tokenizer never compares c against it
+                goto parsingerror;
+            }
+            break;
+
+        case EAT_CRNL:  // NOTE(review): no statement in this function assigns EAT_CRNL; confirm whether this case is reachable
+            if (c == self->lineterminator) {
+                END_LINE();
+            } else if (IS_WHITESPACE(c)){
+                // Handle \r-delimited files
+                END_LINE_STATE(EAT_WHITESPACE);
+            } else {
+                /* XXX
+                 * first character of a new record--need to back up and reread
+                 * to handle properly...
+                 */
+                i--; buf--; // back up one character (HACK!)
+                END_LINE_STATE(START_RECORD);
+            }
+            break;
+
+        case EAT_COMMENT:  // entered after a field when commentchar is seen; discards up to the terminator, then END_LINE()
+            if (c == self->lineterminator) {
+                END_LINE();
+            }
+            break;
+
+        default:  // any other state value: the character is ignored
+            break;
+        }
+    }
+
+    _TOKEN_CLEANUP();
+
+    TRACE(("Finished tokenizing input\n"))
+
+    return 0;
+
+parsingerror:  // reached by the goto in QUOTE_IN_QUOTED_FIELD (strict mode)
+    i++;  // presumably so _TOKEN_CLEANUP() treats the current character as consumed -- TODO confirm
+    _TOKEN_CLEANUP();
+
+    return -1;
+
+linelimit:  // no visible goto targets this label; presumably END_LINE() jumps here once line_limit is reached -- TODO confirm
+    i++;
+    _TOKEN_CLEANUP();
+
+    return 0;
+}
 
 static int parser_handle_eof(parser_t *self) {
     TRACE(("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state))
@@ -1851,11 +2096,17 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) {
     int start_lines = self->lines;
 
     if (self->delim_whitespace) {
-        tokenize_bytes = tokenize_whitespace;
-    } else if (self->lineterminator == '\0') {
-        tokenize_bytes = tokenize_delimited;
+        if (self->lineterminator == '\0') {  // '\0' selects the tokenizer without custom-terminator handling
+            tokenize_bytes = tokenize_whitespace;
+        } else {
+            tokenize_bytes = tokenize_whitespace_customterm;
+        }
     } else {
-        tokenize_bytes = tokenize_delim_customterm;
+        if (self->lineterminator == '\0') {  // same '\0' test as the whitespace branch above
+            tokenize_bytes = tokenize_delimited;
+        } else {
+            tokenize_bytes = tokenize_delim_customterm;
+        }
     }
 
     if (self->state == FINISHED) {