-
-
Notifications
You must be signed in to change notification settings - Fork 1.6k
/
lexer.cr
151 lines (137 loc) · 3.91 KB
/
lexer.cr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
require "csv"
# A CSV lexer lets you consume a CSV token by token. You can use this to efficiently
# parse a CSV without the need to allocate intermediate arrays.
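#
# The lexer reuses a single `Token` instance, and only `Cell` tokens update its
# value: for `Newline` and `Eof` tokens, `value` still holds the previous cell's
# content, as the example below shows.
#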
# ```
# require "csv"
#
# lexer = CSV::Lexer.new "one,two\nthree"
# lexer.next_token # => CSV::Token(@kind=Cell, @value="one")
# lexer.next_token # => CSV::Token(@kind=Cell, @value="two")
# lexer.next_token # => CSV::Token(@kind=Newline, @value="two")
# lexer.next_token # => CSV::Token(@kind=Cell, @value="three")
# lexer.next_token # => CSV::Token(@kind=Eof, @value="three")
# ```
abstract class CSV::Lexer
  # Creates a CSV lexer from a `String`.
  def self.new(string : String, separator = DEFAULT_SEPARATOR, quote_char = DEFAULT_QUOTE_CHAR)
    StringBased.new(string, separator, quote_char)
  end

  # Creates a CSV lexer from an `IO`.
  def self.new(io : IO, separator = DEFAULT_SEPARATOR, quote_char = DEFAULT_QUOTE_CHAR)
    IOBased.new(io, separator, quote_char)
  end

  # Returns the current `Token`.
  #
  # The same instance is reused and mutated by every call to `next_token`.
  getter token : Token

  # Returns the character that separates cells.
  getter separator : Char

  # Returns the character that delimits quoted cells.
  getter quote_char : Char

  # Whether the last character read through `next_char` was '\r'. It lets
  # `next_char` count a "\r\n" sequence as a single line break. It is declared
  # with a default here (rather than only in `initialize`) so that it is always
  # initialized, whatever initializer a subclass defines.
  @previous_char_was_cr = false

  # :nodoc:
  def initialize(@separator : Char = DEFAULT_SEPARATOR, @quote_char : Char = DEFAULT_QUOTE_CHAR)
    @token = Token.new
    @buffer = IO::Memory.new
    @column_number = 1
    @line_number = 1

    # Set when a separator is directly followed by a newline or the end of
    # input: the next call to `next_token` must emit the trailing empty cell.
    @last_empty_column = false

    # When the lexer finds \n or \r it produces a newline token
    # but it doesn't eagerly consume the next token. It does this
    # so that if a CSV is streamed from STDIN or from a socket
    # the parser will produce a row as soon as a newline is reached,
    # without having to wait for more content.
    @last_was_slash_r = false
    @last_was_slash_n = false
  end

  # Rewinds this lexer to the beginning.
  #
  # Only the position counters and pending-state flags kept by this class are
  # reset here; the current `token` is left untouched.
  def rewind : Nil
    @column_number = 1
    @line_number = 1
    @last_empty_column = false
    @last_was_slash_r = false
    @last_was_slash_n = false
    @previous_char_was_cr = false
  end

  private abstract def consume_unquoted_cell
  private abstract def next_char_no_column_increment
  private abstract def current_char

  # Returns the next `Token` in this CSV.
  #
  # The returned object is the lexer's own `token`, updated in place. Its
  # `value` is only assigned for `Cell` tokens; `Newline` and `Eof` tokens keep
  # whatever value the previous cell left there.
  #
  # Raises `CSV::MalformedCSVError` if a quoted cell is malformed.
  def next_token : CSV::Token
    # A separator was the last thing on the line (or in the input): emit the
    # empty cell that follows it before looking at the newline / end.
    if @last_empty_column
      @last_empty_column = false
      @token.kind = Token::Kind::Cell
      @token.value = ""
      return @token
    end

    # The previous token was a newline whose character(s) were deliberately
    # left unconsumed; skip past them now.
    if @last_was_slash_r
      # Treat "\r\n" as one newline: if '\n' follows the '\r', skip it too.
      if next_char == '\n'
        next_char
      end
      @last_was_slash_r = false
    elsif @last_was_slash_n
      next_char
      @last_was_slash_n = false
    end

    case current_char
    when '\0'
      @token.kind = Token::Kind::Eof
    when @separator
      # A separator where a cell should start means that cell is empty.
      @token.kind = Token::Kind::Cell
      @token.value = ""
      check_last_empty_column
    when '\r'
      @token.kind = Token::Kind::Newline
      @last_was_slash_r = true
    when '\n'
      @token.kind = Token::Kind::Newline
      @last_was_slash_n = true
    when @quote_char
      @token.kind = Token::Kind::Cell
      @token.value = consume_quoted_cell
    else
      @token.kind = Token::Kind::Cell
      @token.value = consume_unquoted_cell
    end

    @token
  end

  # Reads a quoted cell, starting with the current char on the opening quote,
  # and returns its content with doubled quote chars collapsed into one.
  #
  # Raises `CSV::MalformedCSVError` if the input ends before the closing quote,
  # or if the closing quote is followed by anything other than a separator,
  # a newline or the end of the input.
  private def consume_quoted_cell
    @buffer.clear
    while true
      case char = next_char
      when '\0'
        raise "Unclosed quote"
      when @quote_char
        # A quote char either closes the cell or, when doubled, stands for a
        # literal quote char; the following char tells which.
        case next_char
        when @separator
          check_last_empty_column
          break
        when '\r', '\n', '\0'
          break
        when @quote_char
          @buffer << @quote_char
        else
          raise "Expecting comma, newline or end, not #{current_char.inspect}"
        end
      else
        @buffer << char
      end
    end
    @buffer.to_s
  end

  # Moves past a separator and, if a newline or the end of the input follows
  # it, records that an empty trailing cell must still be emitted.
  private def check_last_empty_column
    case next_char
    when '\r', '\n', '\0'
      @last_empty_column = true
    else
      # not empty
    end
  end

  # Reads the next char, keeping the line and column counters used in error
  # messages up to date, and returns it.
  private def next_char
    @column_number += 1
    char = next_char_no_column_increment
    if char.in?('\n', '\r')
      @column_number = 0
      # "\r\n" is a single line break: the '\r' already advanced the line
      # counter, so a '\n' that directly follows it must not count again.
      @line_number += 1 unless char == '\n' && @previous_char_was_cr
    end
    @previous_char_was_cr = char == '\r'
    char
  end

  # Shadows the top-level `raise` inside the lexer so that every error is a
  # `CSV::MalformedCSVError` carrying the current line and column numbers.
  private def raise(msg)
    ::raise CSV::MalformedCSVError.new(msg, @line_number, @column_number)
  end
end