forked from gwenn/yacr
/
reader.go
472 lines (447 loc) · 12.1 KB
/
reader.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package yacr is yet another CSV reader (and writer) with small memory usage.
package yacr
import (
"bufio"
"bytes"
"encoding"
"fmt"
"io"
"reflect"
"strconv"
)
// Reader reads CSV data field by field
// (compatible with RFC 4180, extended with the option of using a separator other than ",").
// It embeds a *bufio.Scanner: successive calls to Scan step through the fields,
// skipping the separator or newline between them.
// EndOfRecord tells whether the most recent field was terminated by a line break.
type Reader struct {
*bufio.Scanner
sep byte // values separator
quoted bool // whether values may be quoted (when they contain a separator or a newline)
guess bool // when set, the separator is guessed once from the first data handed to the split function
eor bool // true when the most recent field was terminated by a newline (not a separator)
lineno int // current line number (not record number)
Trim bool // trim spaces (unquoted values only). Breaks the RFC 4180 rule: "Spaces are considered part of a field and should not be ignored."
Comment byte // character marking the start of a line comment. When set (not 0), a line starting with it yields no field.
Lazy bool // whether quoted values may contain an unescaped quote that is not followed by a separator or a newline
Headers map[string]int // 1-based column index by header name
}
// DefaultReader builds a "standard" CSV reader on top of rd:
// the separator is a comma, quoted values are supported and
// no separator guessing takes place.
func DefaultReader(rd io.Reader) *Reader {
	const (
		comma     = ','
		quoted    = true
		guessing  = false
	)
	return NewReader(rd, comma, quoted, guessing)
}
// NewReader returns a new CSV scanner reading from r.
// sep is the values separator. When quoted is false, values must not contain
// a separator or a newline. When guess is true, the separator is guessed from
// the data and sep is only used as a fallback.
func NewReader(r io.Reader, sep byte, quoted, guess bool) *Reader {
	reader := &Reader{
		Scanner: bufio.NewScanner(r),
		sep:     sep,
		quoted:  quoted,
		guess:   guess,
		eor:     true, // nothing read yet: we are at the start of a record
		lineno:  1,
		// Trim, Comment, Lazy and Headers keep their zero values.
	}
	reader.Split(reader.ScanField)
	return reader
}
// ScanHeaders reads the current line as the header line and (re)builds Headers,
// mapping each header name to its 1-based column index.
// It returns the underlying scanner error, if any.
func (s *Reader) ScanHeaders() error {
	headers := make(map[string]int)
	s.Headers = headers
	column := 0
	for s.Scan() {
		column++
		headers[s.Text()] = column
		if s.eor { // last field of the header line
			break
		}
	}
	return s.Err()
}
// ScanRecordByName decodes the fields of one line by name (name1, value1, ...).
// Specified names must match Headers.
//
// args must hold an even number of elements: each string name is followed by
// the destination its column is decoded into (see ScanRecord for the result).
// An error is returned, without reading anything, when the number of arguments
// is odd, when a name is not a string, when a name is not present in Headers,
// or when Headers maps a name to an index lower than 1.
func (s *Reader) ScanRecordByName(args ...interface{}) (int, error) {
	if len(args)%2 != 0 {
		return 0, fmt.Errorf("expected an even number of arguments: %d", len(args))
	}
	// Size the destinations by the highest column index rather than by
	// len(s.Headers): duplicated header names collapse into a single map entry,
	// so the map may be smaller than the highest index it contains.
	width := 0
	for _, index := range s.Headers {
		if index > width {
			width = index
		}
	}
	values := make([]interface{}, width)
	for i := 0; i < len(args); i += 2 {
		name, ok := args[i].(string)
		if !ok {
			return 0, fmt.Errorf("non-string field name at %d: %T", i, args[i])
		}
		index, ok := s.Headers[name]
		if !ok {
			return 0, fmt.Errorf("unknown field name: %s", name)
		}
		if index < 1 { // Headers is exported and may have been filled by hand
			return 0, fmt.Errorf("invalid index %d for field name: %s", index, name)
		}
		values[index-1] = args[i+1]
	}
	return s.ScanRecord(values...)
}
// ScanRecord decodes the fields of one line into values.
// Empty lines are skipped.
// It works like fmt.Scan or database/sql Rows.Scan.
// It returns (0, nil) on EOF, (*, err) on error
// and (n >= 1, nil) on success, where n is the number of fields found on the line
// (n may be lower or greater than len(values); extra fields are consumed and ignored).
//
//	var n int
//	var err error
//	for {
//		values := make([]string, N)
//		if n, err = s.ScanRecord(&values[0] /*, &values[1], ...*/); err != nil || n == 0 {
//			break // or error handling
//		} else if n > N {
//			n = N // ignore extra values
//		}
//		for _, value := range values[0:n] {
//			// ...
//		}
//	}
//	if err != nil {
//		// error handling
//	}
func (s *Reader) ScanRecord(values ...interface{}) (int, error) {
	last := len(values) - 1
	for idx := 0; idx <= last; idx++ {
		if !s.Scan() {
			return idx, s.Err()
		}
		if idx == 0 {
			// Skip empty lines: an empty field that also ends its record.
			for s.eor && len(s.Bytes()) == 0 {
				if !s.Scan() {
					return 0, s.Err()
				}
			}
		}
		if err := s.value(values[idx], true); err != nil {
			return idx + 1, err
		}
		if s.eor && idx != last { // the line holds fewer fields than requested
			return idx + 1, nil
		}
	}
	// Consume the fields left on the line, counting them.
	count := len(values)
	for !s.eor {
		if !s.Scan() {
			return count, s.Err()
		}
		count++
	}
	return count, nil
}
// ScanValue moves to the next field and decodes its content into value.
// When no field is available, the scanner error (nil on EOF) is returned.
// The decoded value may point to data that will be overwritten by a subsequent call to Scan.
func (s *Reader) ScanValue(value interface{}) error {
	if s.Scan() {
		return s.value(value, false)
	}
	return s.Err()
}
// Value decodes the content of the current field into value.
// The decoded value may point to data that will be overwritten by a subsequent call to Scan.
func (s *Reader) Value(value interface{}) error {
	const copied = false // hand out the scanner's bytes as is
	return s.value(value, copied)
}
// value decodes the current field into the destination pointed to by value.
// A nil destination is ignored. Pointers to string, int, int32, int64, bool,
// float64 and []byte are handled directly, as are encoding.TextUnmarshaler
// implementations; anything else goes through scanReflect.
// When copied is true, a *[]byte destination receives its own copy of the
// field's bytes instead of a slice of the scanner's buffer.
func (s *Reader) value(value interface{}, copied bool) error {
	switch dst := value.(type) {
	case nil:
		return nil
	case *string:
		*dst = s.Text()
		return nil
	case *int:
		n, err := strconv.Atoi(s.Text())
		*dst = n
		return err
	case *int32:
		n, err := strconv.ParseInt(s.Text(), 10, 32)
		*dst = int32(n)
		return err
	case *int64:
		n, err := strconv.ParseInt(s.Text(), 10, 64)
		*dst = n
		return err
	case *bool:
		b, err := strconv.ParseBool(s.Text())
		*dst = b
		return err
	case *float64:
		f, err := strconv.ParseFloat(s.Text(), 64)
		*dst = f
		return err
	case *[]byte:
		raw := s.Bytes()
		if !copied {
			*dst = raw
			return nil
		}
		dup := make([]byte, len(raw))
		copy(dup, raw)
		*dst = dup
		return nil
	case encoding.TextUnmarshaler:
		return dst.UnmarshalText(s.Bytes())
	default:
		return s.scanReflect(value)
	}
}
// scanReflect decodes the current field into *v using reflection.
// v must be a non-nil pointer to a string, a signed or unsigned integer,
// a bool or a float; any other destination yields an "unsupported type" error.
// On a parse error the destination is left untouched and the error is returned.
func (s *Reader) scanReflect(v interface{}) (err error) {
	ptr := reflect.ValueOf(v)
	if ptr.Kind() != reflect.Ptr || ptr.IsNil() {
		return fmt.Errorf("unsupported type %T", v)
	}
	target := ptr.Elem()
	switch target.Kind() {
	case reflect.String:
		target.SetString(s.Text())
		return nil
	case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
		n, perr := strconv.ParseInt(s.Text(), 10, target.Type().Bits())
		if perr != nil {
			return perr
		}
		target.SetInt(n)
		return nil
	case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr:
		n, perr := strconv.ParseUint(s.Text(), 10, target.Type().Bits())
		if perr != nil {
			return perr
		}
		target.SetUint(n)
		return nil
	case reflect.Bool:
		b, perr := strconv.ParseBool(s.Text())
		if perr != nil {
			return perr
		}
		target.SetBool(b)
		return nil
	case reflect.Float32, reflect.Float64:
		f, perr := strconv.ParseFloat(s.Text(), target.Type().Bits())
		if perr != nil {
			return perr
		}
		target.SetFloat(f)
		return nil
	}
	return fmt.Errorf("unsupported type: %T", v)
}
// LineNumber returns the current line number (not the record number).
// Counting starts at 1; the split function advances it as it consumes newlines,
// including the ones found inside quoted fields.
func (s *Reader) LineNumber() int {
return s.lineno
}
// EndOfRecord returns true when the most recent field has been terminated by a newline
// or by the end of input (not by a separator).
// It also returns true before anything has been scanned.
func (s *Reader) EndOfRecord() bool {
return s.eor
}
// Sep returns the values separator in use: the one given to NewReader,
// or the guessed one once guessing has taken place and found a candidate.
func (s *Reader) Sep() byte {
return s.sep
}
// SkipRecords skips n records (header lines for instance).
// It stops early, returning the scanner error (nil on EOF), when the input
// runs out before n records have been skipped.
func (s *Reader) SkipRecords(n int) error {
	for skipped := 0; skipped != n; {
		if !s.Scan() {
			return s.Err()
		}
		if s.eor { // the field just read closes a record
			skipped++
		}
	}
	return nil
}
// ScanField implements bufio.SplitFunc for CSV.
// Lexing is adapted from the csv_read_one_field function in the SQLite3 shell sources.
// It keeps calling scanField over the remaining data until a token is produced,
// an error occurs or more data is needed, so that spans yielding no token
// (line comments) are consumed in the same call.
func (s *Reader) ScanField(data []byte, atEOF bool) (advance int, token []byte, err error) {
	for {
		n, field, scanErr := s.scanField(data, atEOF)
		advance += n
		if scanErr != nil || n == 0 || field != nil {
			return advance, field, scanErr
		}
		// Bytes were consumed without producing a token: go on with what is left.
		data = data[n:]
	}
}
// scanField lexes at most one field (or one line comment) at the start of data.
//
// It returns the number of bytes consumed and the field content (without the
// enclosing quotes and with escaped quotes unescaped for a quoted field).
// A line comment consumes bytes but yields a nil token. (0, nil, nil) asks the
// caller for more data, or signals the end when atEOF is true and nothing is left.
// s.eor and s.lineno are updated according to what terminated the field.
//
// Errors are returned for an unescaped quote inside a quoted field (unless
// s.Lazy is set) and for a quoted field that is still open at EOF.
func (s *Reader) scanField(data []byte, atEOF bool) (advance int, token []byte, err error) {
	if atEOF && len(data) == 0 && s.eor {
		return 0, nil, nil
	}
	if s.guess {
		s.guess = false
		if b := guess(data); b > 0 {
			s.sep = b
		}
	}
	if s.quoted && len(data) > 0 && data[0] == '"' { // quoted field (may contain separator, newline and escaped quote)
		startLineno := s.lineno
		escapedQuotes := 0
		strict := true
		// pc and ppc are the previous two bytes; pc is reset to 0 after an
		// escaped quote so that this quote is not mistaken for a closing one.
		var pc, ppc byte
		// Scan until the separator or newline following the closing quote (and ignore escaped quotes)
		for i := 1; i < len(data); i++ {
			c := data[i]
			if c == '\n' {
				s.lineno++
			} else if c == '"' {
				if pc == c || pc == '\\' { // escaped quote
					pc = 0
					escapedQuotes++
					continue
				}
			}
			if pc == '"' && c == s.sep {
				s.eor = false
				return i + 1, unescapeQuotes(data[1:i-1], escapedQuotes, strict), nil
			} else if pc == '"' && c == '\n' {
				s.eor = true
				return i + 1, unescapeQuotes(data[1:i-1], escapedQuotes, strict), nil
			} else if c == '\n' && pc == '\r' && ppc == '"' {
				s.eor = true
				return i + 1, unescapeQuotes(data[1:i-2], escapedQuotes, strict), nil
			}
			if pc == '"' && c != '\r' {
				if s.Lazy {
					strict = false
				} else {
					return 0, nil, fmt.Errorf("unescaped %c character at line %d", pc, s.lineno)
				}
			}
			ppc = pc
			pc = c
		}
		if atEOF {
			// Only an unescaped quote closes the field: pc is 0 when the last
			// byte is an escaped quote, which must not be taken for the closing one.
			if pc == '"' {
				s.eor = true
				return len(data), unescapeQuotes(data[1:len(data)-1], escapedQuotes, strict), nil
			}
			// If we're at EOF, we have a non-terminated field.
			return 0, nil, fmt.Errorf("non-terminated quoted field between lines %d and %d", startLineno, s.lineno)
		}
		// More data is needed: the same bytes will be scanned again from the
		// start, so forget the newlines counted so far to avoid counting them twice.
		s.lineno = startLineno
	} else if s.eor && s.Comment != 0 && len(data) > 0 && data[0] == s.Comment { // line comment
		for i, c := range data {
			if c == '\n' {
				s.lineno++
				return i + 1, nil, nil
			}
		}
		if atEOF {
			return len(data), nil, nil
		}
	} else { // unquoted field
		// Scan until separator or newline, marking end of field.
		for i, c := range data {
			if c == s.sep {
				s.eor = false
				if s.Trim {
					return i + 1, trim(data[0:i]), nil
				}
				return i + 1, data[0:i], nil
			} else if c == '\n' {
				s.lineno++
				s.eor = true
				end := i
				if i > 0 && data[i-1] == '\r' { // drop the CR of a CRLF line ending
					end = i - 1
				}
				if s.Trim {
					return i + 1, trim(data[0:end]), nil
				}
				return i + 1, data[0:end], nil
			}
		}
		// If we're at EOF, we have a final field. Return it.
		if atEOF {
			s.eor = true
			if s.Trim {
				return len(data), trim(data), nil
			}
			return len(data), data, nil
		}
	}
	// Request more data.
	return 0, nil, nil
}
// unescapeQuotes removes, in place, the escape character ('"' or '\\') that
// precedes each escaped quote in b and returns the shortened slice.
// count is the number of escaped quotes found by the lexer; when it is 0,
// b is returned untouched.
//
// An escape character is only dropped when it is actually followed by a quote,
// so a backslash followed by anything else is kept as is, and no byte past the
// end of b is ever read. strict is kept for compatibility with callers: the
// look-ahead makes both modes behave the same.
func unescapeQuotes(b []byte, count int, strict bool) []byte {
	if count == 0 {
		return b
	}
	j := 0
	for i := 0; i < len(b); i++ {
		if (b[i] == '"' || b[i] == '\\') && i+1 < len(b) && b[i+1] == '"' {
			i++ // skip the escape character, keep the quote
		}
		b[j] = b[i]
		j++
	}
	return b[:j]
}
// guess returns the most frequent candidate separator (',', ';', '\t', '|' or ':')
// found in data, or 0 when data contains none of them.
// The whole of data is examined, not only its first line.
// When several candidates are equally frequent, the one listed first wins,
// which keeps the result deterministic.
func guess(data []byte) byte {
	// Candidates in priority order.
	candidates := [...]byte{',', ';', '\t', '|', ':'}
	var counts [256]uint
	for _, b := range data {
		counts[b]++
	}
	var (
		sep  byte
		best uint
	)
	for _, c := range candidates {
		if counts[c] > best {
			sep, best = c, counts[c]
		}
	}
	return sep
}
// trim strips leading and trailing white space from s.
// bytes.TrimSpace may return nil; an empty sub-slice of s is returned instead
// in that case.
func trim(s []byte) []byte {
	if trimmed := bytes.TrimSpace(s); trimmed != nil {
		return trimmed
	}
	return s[:0]
}
// IsNumber tells whether the current token is a number (isNum)
// and, if so, whether it is a real rather than an integer (isReal).
// Only works for single-byte encodings (ASCII, ISO-8859-1) and UTF-8.
func (s *Reader) IsNumber() (isNum bool, isReal bool) {
	token := s.Bytes()
	return IsNumber(token)
}
// isDigit reports whether c is an ASCII decimal digit ('0' to '9').
// Only works for single-byte encodings (ASCII, ISO-8859-1) and UTF-8.
func isDigit(c byte) bool {
	return '0' <= c && c <= '9'
}
// IsNumber tells whether s is a decimal number (isNum) and, if so, whether it
// is a real rather than an integer (isReal).
// Accepted forms are an optional sign, digits with an optional decimal point
// (at least one digit overall) and an optional exponent ("e" or "E", optional
// sign, at least one digit). Hexadecimal and octal notations are not supported.
// A number with a decimal point or an exponent is reported as real.
// Only works for single-byte encodings (ASCII, ISO-8859-1) and UTF-8.
func IsNumber(s []byte) (isNum bool, isReal bool) {
	n := len(s)
	if n == 0 {
		return false, false
	}
	pos := 0
	if s[0] == '+' || s[0] == '-' { // sign
		pos = 1
	}
	// Integer part (optional).
	seenDigit := false
	for pos < n && isDigit(s[pos]) {
		seenDigit = true
		pos++
	}
	if pos == n { // integer "[-+]?\d*"
		return seenDigit, false
	}
	// Fractional part: the digits after the point are optional.
	if s[pos] == '.' {
		pos++
		for pos < n && isDigit(s[pos]) {
			seenDigit = true
			pos++
		}
	}
	if pos == n { // real "[-+]?\d*\.\d*", but "[-+]?\." is not a number
		return seenDigit, seenDigit
	}
	// Exponent.
	if s[pos] == 'e' || s[pos] == 'E' {
		pos++
		if !seenDigit || pos == n { // neither "[-+]?\.?e" nor "[-+]?\d*\.?\d*e" is a number
			return false, false
		}
		if s[pos] == '+' || s[pos] == '-' { // sign
			pos++
		}
		if pos == n || !isDigit(s[pos]) { // at least one digit expected
			return false, false
		}
		for pos++; pos < n && isDigit(s[pos]); pos++ {
		}
	}
	if pos == n {
		return true, true
	}
	// Trailing garbage.
	return false, false
}