This repository has been archived by the owner on Jul 11, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 42
/
csv.py
127 lines (104 loc) · 3.48 KB
/
csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# -*- coding: utf-8 -*-
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
import csv
import six
from itertools import chain
from codecs import iterencode
from ..parser import Parser
from .. import helpers
from .. import config
# Module API
class CSVParser(Parser):
    """Parser to parse CSV data format.

    Characters are obtained from the given loader, the CSV dialect is
    sniffed from a sample of the first lines (falling back to
    ``csv.excel`` when sniffing fails), and rows are exposed lazily as
    extended rows of the form ``(row_number, None, values)``.
    """

    # Public

    # Names of the csv dialect options this parser declares.
    # NOTE(review): presumably read by the caller to route keyword
    # arguments to this parser - confirm against the Parser consumers.
    options = [
        'delimiter',
        'doublequote',
        'escapechar',
        'quotechar',
        'quoting',
        'skipinitialspace',
        'lineterminator'
    ]

    def __init__(self, loader, force_parse=False, **options):
        """Store the loader and the dialect options.

        Arguments:
            loader: object whose ``load(source, encoding=...)`` returns the
                character stream to parse.
            force_parse: stored on the instance; not read anywhere else in
                this class.
            **options: attributes to set on the csv dialect (they override
                whatever the sniffer detects).
        """

        # Make bytes: on PY2 text option values are coerced to native `str`
        if six.PY2:
            for key, value in options.items():
                if isinstance(value, six.string_types):
                    options[key] = str(value)

        # Set attributes
        self.__loader = loader
        self.__options = options
        self.__force_parse = force_parse
        self.__extended_rows = None
        self.__encoding = None
        self.__chars = None

    @property
    def closed(self):
        """True if no stream has been opened or the stream is closed."""
        return self.__chars is None or self.__chars.closed

    def open(self, source, encoding=None):
        """Load `source` through the loader and prepare the row iterator.

        Any previously opened stream is closed first. The effective
        encoding is taken from the loaded stream's `encoding` attribute
        when it has one, otherwise from the `encoding` argument.
        """
        self.close()
        self.__chars = self.__loader.load(source, encoding=encoding)
        detected = getattr(self.__chars, 'encoding', encoding)
        # str.lower() returns a new string, so the result has to be stored;
        # a falsy encoding (e.g. None) is kept untouched.
        self.__encoding = detected.lower() if detected else detected
        self.reset()

    def close(self):
        """Close the underlying stream if it is open."""
        if not self.closed:
            self.__chars.close()

    def reset(self):
        """Reset the stream via `helpers.reset_stream` and restart rows."""
        helpers.reset_stream(self.__chars)
        self.__extended_rows = self.__iter_extended_rows()

    @property
    def encoding(self):
        """Encoding determined by the last `open` call (None before it)."""
        return self.__encoding

    @property
    def extended_rows(self):
        """Generator of `(row_number, None, values)` tuples (None before open)."""
        return self.__extended_rows

    # Private

    def __iter_extended_rows(self):
        """Yield `(row_number, None, values)` for every parsed CSV row.

        Row numbers start at 1; the second element is always None.
        """

        # For PY2 encode/decode
        if six.PY2:
            # Reader requires utf-8 encoded stream
            encoded = iterencode(self.__chars, 'utf-8')
            sample, dialect = self.__prepare_dialect(encoded)
            # The sample lines were already consumed from the stream, so
            # they are chained back in front of the remaining lines.
            items = csv.reader(chain(sample, encoded), dialect=dialect)
            for row_number, item in enumerate(items, start=1):
                values = [value.decode('utf-8') for value in item]
                yield (row_number, None, values)

        # For PY3 use chars
        else:
            sample, dialect = self.__prepare_dialect(self.__chars)
            items = csv.reader(chain(sample, self.__chars), dialect=dialect)
            for row_number, item in enumerate(items, start=1):
                yield (row_number, None, list(item))

    def __prepare_dialect(self, stream):
        """Consume a sample from `stream` and derive the csv dialect.

        Returns:
            tuple: `(sample, dialect)` where `sample` is the list of lines
            taken from the stream (at most `config.CSV_SAMPLE_LINES`, the
            caller must re-chain them) and `dialect` is a csv dialect class
            with the user options applied on top.
        """

        # Get sample
        sample = []
        for line in stream:
            sample.append(line)
            if len(sample) >= config.CSV_SAMPLE_LINES:
                break

        # Get dialect
        try:
            separator = b'' if six.PY2 else ''
            delimiter = self.__options.get('delimiter', ',\t;|')
            dialect = csv.Sniffer().sniff(separator.join(sample), delimiter)
            if not dialect.escapechar:
                dialect.doublequote = True
        except csv.Error:
            # Sniffing failed: fall back to a fresh subclass of csv.excel so
            # the setattr calls below do not mutate csv.excel itself.
            class dialect(csv.excel):
                pass

        # Explicit user options always win over the detected values
        for key, value in self.__options.items():
            setattr(dialect, key, value)
        return sample, dialect