Permalink
Browse files

Adjusting the parser to fail faster when possible.

  • Loading branch information...
1 parent 6b8d803 commit 876a849599ac42e3d8b2581ecb09bdf994bab3d0 bbazzarrakk committed Sep 11, 2008
Showing with 37 additions and 6 deletions.
  1. +3 −1 CHANGELOG
  2. +17 −4 lib/faster_csv.rb
  3. +17 −1 test/tc_speed.rb
View
@@ -2,7 +2,7 @@
Below is a complete listing of changes for each revision of FasterCSV.
-== 1.2.4
+== 1.4.0
* Added encoding support patch from Michael Reinsch.
* Improved inspect() messages for better IRb support.
@@ -13,6 +13,8 @@ Below is a complete listing of changes for each revision of FasterCSV.
issue where line-ending translation can cause an off-by-one error in seeking
back to a non-zero starting position after auto-discovery for
<tt>:row_sep</tt> as suggested by Robert Battle.
+* Improved the parser to fail faster when fed some forms of invalid CSV that can
+ be detected without reading ahead.
== 1.2.3
View
@@ -75,7 +75,7 @@
#
class FasterCSV
# The version of the installed library.
- VERSION = "1.2.4".freeze
+ VERSION = "1.4.0".freeze
#
# A FasterCSV::Row is part Array and part Hash. It retains an order for the
@@ -1556,7 +1556,7 @@ def shift
# add another read to the line
line += @io.gets(@row_sep) rescue return nil
# copy the line so we can chop it up in parsing
- parse = line.dup
+ parse = line.dup
parse.sub!(@parsers[:line_end], "")
#
@@ -1631,7 +1631,7 @@ def shift
break csv
end
# if we're not empty?() but at eof?(), a quoted field wasn't closed; likewise, if
# what remains matches the :bad_field parser, reading ahead cannot fix the row...
- if @io.eof?
+ if @io.eof? or parse =~ @parsers[:bad_field]
raise MalformedCSVError, "Unclosed quoted field on line #{lineno + 1}."
end
# otherwise, we need to loop and pull some more data to complete the row
@@ -1771,14 +1771,27 @@ def init_parsers(options)
:leading_fields => Regexp.new("\\A(?:#{esc_col_sep})+", nil, @encoding),
# The Primary Parser
:csv_row => Regexp.new(<<-END_PARSER, Regexp::EXTENDED, @encoding),
- \\G(?:^|#{esc_col_sep}) # anchor the match
+ \\G(?:\\A|#{esc_col_sep}) # anchor the match
(?: #{esc_quote}( (?>[^#{esc_quote}]*) # find quoted fields
(?> #{esc_quote*2}
[^#{esc_quote}]* )* )#{esc_quote}
| # ... or ...
([^#{esc_quote}#{esc_col_sep}]*) # unquoted fields
)
+ (?=#{esc_col_sep}|\\z) # ensure we are at field's end
END_PARSER
+ # a test for unescaped quotes
+ :bad_field => Regexp.new(<<-END_BAD, Regexp::EXTENDED, @encoding),
+ \\A#{esc_col_sep}? # starts with an optional comma
+ (?: #{esc_quote} (?>[^#{esc_quote}]*) # an extra quote
+ (?> #{esc_quote*2}
+ [^#{esc_quote}]* )*
+ #{esc_quote}[^#{esc_quote}]
+ | # ... or ...
+ [^#{esc_quote}#{esc_col_sep}]+
+ #{esc_quote} # unescaped quote
+ )
+ END_BAD
# safer than chomp!()
:line_end => Regexp.new("#{esc_row_sep}\\z", nil, @encoding)
}
View
@@ -6,12 +6,14 @@
# Copyright 2005 Gray Productions. All rights reserved.
require "test/unit"
+require "timeout"
require "faster_csv"
require "csv"
class TestFasterCSVSpeed < Test::Unit::TestCase
- PATH = File.join(File.dirname(__FILE__), "test_data.csv")
+ PATH = File.join(File.dirname(__FILE__), "test_data.csv")
+ BIG_DATA = "123456789\n" * 1024
def test_that_we_are_doing_the_same_work
FasterCSV.open(PATH) do |csv|
@@ -36,4 +38,18 @@ def test_speed_vs_csv
assert(faster_csv_time < csv_time / 3)
end
+
+ def test_the_parse_fails_fast_when_it_can_for_unquoted_fields
+ data = 'valid,fields,bad start"' + BIG_DATA
+ assert_raise(FasterCSV::MalformedCSVError) do
+ Timeout.timeout(0.2) { FasterCSV.parse(data) }
+ end
+ end
+
+ def test_the_parse_fails_fast_when_it_can_for_unescaped_quotes
+ data = 'valid,fields,"bad start"unescaped' + BIG_DATA
+ assert_raise(FasterCSV::MalformedCSVError) do
+ Timeout.timeout(0.2) { FasterCSV.parse(data) }
+ end
+ end
end

0 comments on commit 876a849

Please sign in to comment.