-
-
Notifications
You must be signed in to change notification settings - Fork 1.6k
/
lexer.cr
143 lines (131 loc) · 3.53 KB
/
lexer.cr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
require "csv"
# A CSV lexer lets you consume a CSV token by token. You can use this to efficiently
# parse a CSV without the need to allocate intermediate arrays.
#
# ```
# require "csv"
#
# lexer = CSV::Lexer.new "one,two\nthree"
# lexer.next_token # => CSV::Token(@kind=Cell, @value="one")
# lexer.next_token # => CSV::Token(@kind=Cell, @value="two")
# lexer.next_token # => CSV::Token(@kind=Newline, @value="two")
# lexer.next_token # => CSV::Token(@kind=Cell, @value="three")
# lexer.next_token # => CSV::Token(@kind=Eof, @value="three")
# ```
abstract class CSV::Lexer
  # Creates a CSV lexer from a `String`.
  def self.new(string : String, separator = DEFAULT_SEPARATOR, quote_char = DEFAULT_QUOTE_CHAR)
    StringBased.new(string, separator, quote_char)
  end

  # Creates a CSV lexer from an `IO`.
  def self.new(io : IO, separator = DEFAULT_SEPARATOR, quote_char = DEFAULT_QUOTE_CHAR)
    IOBased.new(io, separator, quote_char)
  end

  # Returns the current `Token`.
  #
  # The lexer keeps a single token that `#next_token` updates in place.
  # Only `Cell` tokens get a fresh `value`; `Newline` and `Eof` tokens
  # leave the value of the previous cell untouched.
  getter token : Token

  # Returns the character that separates two cells of a row.
  getter separator : Char

  # Returns the character that opens and closes a quoted cell.
  getter quote_char : Char

  # :nodoc:
  def initialize(@separator : Char = DEFAULT_SEPARATOR, @quote_char : Char = DEFAULT_QUOTE_CHAR)
    @token = Token.new
    @buffer = IO::Memory.new
    @column_number = 1
    @line_number = 1
    @last_empty_column = false
  end

  # Rewinds this lexer to the beginning.
  #
  # This base implementation only resets the position bookkeeping (line,
  # column and the pending trailing-empty-cell flag).
  # NOTE(review): repositioning the underlying input is presumably done by
  # the concrete lexers, which are not visible here - confirm.
  def rewind
    @column_number = 1
    @line_number = 1
    @last_empty_column = false
  end

  # Consumes an unquoted cell starting at the current character and returns
  # its contents.
  private abstract def consume_unquoted_cell

  # Advances the input by one character and returns it, without touching the
  # line/column bookkeeping (that is done by `#next_char`).
  private abstract def next_char_no_column_increment

  # Returns the character at the current position without consuming it.
  # `'\0'` is treated as the end of the input throughout this class.
  private abstract def current_char

  # Returns the next `Token` in this CSV.
  #
  # Raises `CSV::MalformedCSVError` if a quoted cell is not closed, or if its
  # closing quote is followed by anything other than a separator, a line
  # break or the end of the input.
  def next_token
    # A separator right before a line break / end of input was seen by the
    # previous call: emit the trailing empty cell it implies.
    if @last_empty_column
      @last_empty_column = false
      @token.kind = Token::Kind::Cell
      @token.value = ""
      return @token
    end

    case current_char
    when '\0'
      @token.kind = Token::Kind::Eof
    when @separator
      # Nothing between the previous position and this separator: empty cell.
      @token.kind = Token::Kind::Cell
      @token.value = ""
      check_last_empty_column
    when '\r'
      # Treat "\r\n" as a single line break by also skipping the "\n".
      following = next_char
      following = next_char if following == '\n'
      # A line break that ends the input is reported as Eof, not Newline.
      @token.kind = following == '\0' ? Token::Kind::Eof : Token::Kind::Newline
    when '\n'
      @token.kind = next_char == '\0' ? Token::Kind::Eof : Token::Kind::Newline
    when @quote_char
      @token.kind = Token::Kind::Cell
      @token.value = consume_quoted_cell
    else
      @token.kind = Token::Kind::Cell
      @token.value = consume_unquoted_cell
    end

    @token
  end

  # Consumes a quoted cell whose opening quote is the current character and
  # returns its unescaped contents.
  #
  # A doubled quote character inside the cell stands for one literal quote.
  # Raises `CSV::MalformedCSVError` on an unterminated cell or on an
  # unexpected character after the closing quote.
  private def consume_quoted_cell
    @buffer.clear

    while true
      case char = next_char
      when '\0'
        raise "Unclosed quote"
      when @quote_char
        # Either the closing quote or the first half of an escaped quote;
        # the following character decides.
        case next_char
        when @separator
          check_last_empty_column
          break
        when '\r', '\n', '\0'
          break
        when @quote_char
          @buffer << @quote_char
        else
          raise "Expecting comma, newline or end, not #{current_char.inspect}"
        end
      else
        @buffer << char
      end
    end

    @buffer.to_s
  end

  # Steps past the separator at the current position and, if a line break or
  # the end of the input follows directly, records that one more empty cell
  # has to be emitted by the next `#next_token` call.
  private def check_last_empty_column
    case next_char
    when '\r', '\n', '\0'
      @last_empty_column = true
    else
      # not empty
    end
  end

  # Advances to the next character, keeps `@line_number` / `@column_number`
  # up to date and returns the character.
  #
  # The column is reset to 0 on a line-break character so that the first
  # character of the following line ends up in column 1.
  private def next_char
    # Needed to recognise the "\n" of a "\r\n" pair below.
    previous = current_char

    @column_number += 1
    char = next_char_no_column_increment

    case char
    when '\r'
      @column_number = 0
      @line_number += 1
    when '\n'
      @column_number = 0
      # "\r\n" is a single line break: the line was already counted when the
      # "\r" was read, so counting the "\n" too would report every error
      # after a CRLF line ending one line too far down.
      @line_number += 1 unless previous == '\r'
    end

    char
  end

  # Raises a `CSV::MalformedCSVError` carrying *msg* together with the
  # current line and column. Shadows the top-level `raise` inside the lexer.
  private def raise(msg)
    ::raise CSV::MalformedCSVError.new(msg, @line_number, @column_number)
  end
end