Merge branch 'master' of github.com:okfn/goodtables

frictionlessdata · Mar 28, 2016 · 6e0b115
2 parents 035f8a2 + f999a2b
commit 6e0b115
Show file tree

Hide file tree

Showing 6 changed files with 73 additions and 57 deletions.
diff --git a/examples/test_schema.json b/examples/test_schema.json
@@ -4,19 +4,28 @@
             "name": "id",
             "title": "ID",
             "type": "integer",
-            "description": "The id."
+            "description": "The id.",
+            "constraints": {
+                "required": true
+            }
         },
         {
             "name": "name",
             "title": "Name",
             "type": "string",
-            "description": "The name."
+            "description": "The name.",
+            "constraints": {
+                "required": true
+            }
         },
         {
             "name": "age",
             "title": "Age",
             "type": "integer",
-            "description": "The age."
+            "description": "The age.",
+            "constraints": {
+                "required": true
+            }
         }
     ],
     "primaryKey": "id"

diff --git a/goodtables/compat.py b/goodtables/compat.py
@@ -8,6 +8,7 @@
 import tempfile
 import io
 import csv
+from itertools import islice, chain
 
 
 _ver = sys.version_info
@@ -30,11 +31,21 @@
     def csv_reader(data, dialect=csv.excel, **kwargs):
         """Read text stream (unicode on Py2.7) as CSV."""
 
+        first_lines = list(islice(data, 10))
+        try:
+            dialect = csv.Sniffer().sniff(''.join(first_lines))
+            dialect.delimiter = dialect.delimiter.encode('utf-8')
+            dialect.quotechar = dialect.quotechar.encode('utf-8')
+        except csv.Error:
+            dialect = csv.excel
+
         def iterenc_utf8(data):
             for line in data:
                 yield line.encode('utf-8')
 
-        reader = csv.reader(iterenc_utf8(data), dialect=dialect, **kwargs)
+        iter = chain(first_lines, data)
+        iter = iterenc_utf8(iter)
+        reader = csv.reader(iter, dialect=dialect, **kwargs)
         for row in reader:
             yield [str(cell, 'utf-8') for cell in row]
 
@@ -43,13 +54,26 @@ def iterenc_utf8(data):
     from urllib import parse
     from urllib.request import urlopen
     from urllib.error import HTTPError
-    csv_reader = csv.reader
     builtin_str = str
     str = str
     bytes = bytes
     basestring = (str, bytes)
     numeric_types = (int, float)
 
+    def csv_reader(data, **kwargs):
+        def line_iterator(data):
+            for line in data:
+                yield line
+        iter = line_iterator(data)
+        first_lines = list(islice(iter, 10))
+        try:
+            dialect = csv.Sniffer().sniff(''.join(first_lines))
+        except csv.Error:
+            dialect = csv.excel
+        iter = chain(first_lines, iter)
+        return csv.reader(iter, dialect, **kwargs)
+
+
 
 def to_bytes(textstring):
     """Convert a text string to a byte string"""

diff --git a/goodtables/datatable/datatable.py b/goodtables/datatable/datatable.py
@@ -62,9 +62,9 @@ def replay(self):
 
     def extract(self, headers=None):
         """Extract headers and values from the data stream."""
-        headers = headers or self.get_headers(self.stream)
-        values = compat.csv_reader(self.stream)
-        return headers, values
+        reader = compat.csv_reader(self.stream)
+        headers = headers or self.get_headers(self.stream, reader)
+        return headers, reader
 
     def get_sample(self, row_limit):
         """Get a sample of data, as a CSV reader, up to a max of `row_limit`."""
@@ -118,15 +118,16 @@ def to_textstream(self, data_source):
 
                 return textstream
 
-        elif compat.parse.urlparse(data_source).scheme in self.REMOTE_SCHEMES:
+        elif isinstance(data_source, compat.str) and \
+                        compat.parse.urlparse(data_source).scheme in self.REMOTE_SCHEMES:
 
             stream = self._stream_from_url(data_source)
             self.encoding = self._detect_stream_encoding(stream)
             textstream = self._decode_to_textstream(stream, self.encoding, textstream)
 
             return textstream
 
-        elif isinstance(data_source, compat.str) and not \
+        elif (isinstance(data_source, compat.str) or isinstance(data_source, compat.bytes)) and not \
                 os.path.exists(data_source):
 
             self.encoding = self._detect_stream_encoding(data_source)
@@ -190,10 +191,11 @@ def excel_data_source(self, data_source):
         out.seek(0)
         return out
 
-    def get_headers(self, stream):
+    def get_headers(self, stream, reader = None):
         """Get headers from stream."""
 
-        reader = compat.csv_reader(stream)
+        if reader is None:
+            reader = compat.csv_reader(stream)
         for index, line in enumerate(reader):
             if index == self.header_index:
                 headers = line
@@ -225,10 +227,9 @@ def _detect_stream_encoding(self, stream):
             return self.passed_encoding
 
         if isinstance(stream, compat.str):
-            if isinstance(stream, compat.bytes):
-                sample = stream[:sample_length]
-            else:
-                sample = compat.to_bytes(stream)[:sample_length]
+            sample = compat.to_bytes(stream)[:sample_length]
+        elif isinstance(stream, compat.bytes):
+            sample = stream[:sample_length]
         else:
             sample = stream.read(sample_length)
             stream.seek(0)
@@ -243,7 +244,9 @@ def _detect_stream_encoding(self, stream):
     def _decode_to_textstream(self, stream, encoding, textstream):
         """Return a textstream in `self.DEFAULT_ENCODING`"""
 
-        if isinstance(stream, compat.str):
+        if isinstance(stream, compat.bytes):
+            stream = codecs.iterdecode([stream], encoding, self.decode_strategy)
+        elif isinstance(stream, compat.str):
             _stream = io.StringIO()
             _stream.write(stream)
             stream = _stream
@@ -252,7 +255,6 @@ def _decode_to_textstream(self, stream, encoding, textstream):
             stream = codecs.iterdecode(stream, encoding, self.decode_strategy)
 
         try:
-
             for line in stream:
                 recoded = line.encode(self.DEFAULT_ENCODING).decode(self.DEFAULT_ENCODING)
                 textstream.write(recoded)

diff --git a/goodtables/processors/schema.py b/goodtables/processors/schema.py
@@ -35,14 +35,14 @@
     'schema_004': {
         'id': 'schema_004',
         'name': 'Required Field',
-        'msg': 'Column {0} is a required field, but no value can be found in row {1}.',
+        'msg': 'Column {0} is a required field, but it contains no value.',
         'help': '',
         'help_edit': ''
     },
     'schema_005': {
         'id': 'schema_005',
         'name': 'Non-Required Field (Empty/Null)',
-        'msg': 'Column {0} is a non-required field, and has a null value in row {1}.',
+        'msg': 'Column {0} is a non-required field, and has a null value.',
         'help': '',
         'help_edit': ''
     },
@@ -221,7 +221,8 @@ def run_row(self, headers, index, row):
                         # CONSTRAINTS
                         constraints = self.schema.get_constraints(column_name)
 
-                        if constraints['required'] is True and \
+                        if constraints is not None and \
+                           constraints.get('required', False) is True and \
                            (column_value in self.schema.NULL_VALUES):
 
                             valid = False
@@ -230,7 +231,7 @@ def run_row(self, headers, index, row):
                                 self.name,
                                 self.RESULT_CATEGORY_ROW,
                                 self.RESULT_LEVEL_ERROR,
-                                _type['msg'].format(column_name, index),
+                                _type['msg'].format(column_name),
                                 _type['id'],
                                 _type['name'],
                                 row,
@@ -244,7 +245,8 @@ def run_row(self, headers, index, row):
                             if self.fail_fast:
                                 return valid, headers, index, row
 
-                        if constraints['required'] is False and \
+                        if constraints is not None and \
+                           constraints.get('required', False) is False and \
                            (column_value in self.schema.NULL_VALUES) and \
                            self.result_level == self.RESULT_LEVEL_INFO:
                             # add info result
@@ -253,7 +255,7 @@ def run_row(self, headers, index, row):
                                 self.name,
                                 self.RESULT_CATEGORY_ROW,
                                 self.RESULT_LEVEL_INFO,
-                                _type['msg'].format(column_name, index),
+                                _type['msg'].format(column_name),
                                 _type['id'],
                                 _type['name'],
                                 row,

diff --git a/goodtables/processors/structure.py b/goodtables/processors/structure.py
@@ -11,35 +11,35 @@
     'structure_001': {
         'id': 'structure_001',
         'name': 'Missing Header',
-        'msg': 'Column {0} is missing a header.',
+        'msg': 'Headers column is empty.',
         'help': '',
         'help_edit': ''
     },
     'structure_002': {
         'id': 'structure_002',
         'name': 'Duplicate Header',
-        'msg': 'The header in column {0} was found to have duplicates.',
+        'msg': 'A header column is duplicated.',
         'help': '',
         'help_edit': ''
     },
     'structure_003': {
         'id': 'structure_003',
         'name': 'Defective Row',
-        'msg': 'Row {0} is defective: the dimensions are incorrect compared to headers.',
+        'msg': 'The row dimensions are incorrect compared to headers.',
         'help': '',
         'help_edit': ''
     },
     'structure_004': {
         'id': 'structure_004',
         'name': 'Duplicate Row',
-        'msg': 'Row {0} duplicates the following rows which have already been seen: {1}.',
+        'msg': 'The exact same row has been seen before (a duplicate).',
         'help': '',
         'help_edit': ''
     },
     'structure_005': {
         'id': 'structure_005',
         'name': 'Empty Row',
-        'msg': 'Row {0} is empty.',
+        'msg': 'Row is empty.',
         'help': '',
         'help_edit': ''
     }
@@ -95,7 +95,7 @@ def run_header(self, headers, header_index=0):
                         self.name,
                         self.RESULT_CATEGORY_HEADER,
                         self.RESULT_LEVEL_ERROR,
-                        _type['msg'].format(index),
+                        _type['msg'],
                         _type['id'],
                         _type['name'],
                         headers,
@@ -124,7 +124,7 @@ def run_header(self, headers, header_index=0):
                         self.name,
                         self.RESULT_CATEGORY_HEADER,
                         self.RESULT_LEVEL_ERROR,
-                        _type['msg'].format(dupe[0]),
+                        _type['msg'],
                         _type['id'],
                         _type['name'],
                         headers,
@@ -165,7 +165,7 @@ def run_row(self, headers, index, row):
                         self.name,
                         self.RESULT_CATEGORY_ROW,
                         self.RESULT_LEVEL_ERROR,
-                        _type['msg'].format(index, previous_instances),
+                        _type['msg'],
                         _type['id'],
                         _type['name'],
                         row,
@@ -192,7 +192,7 @@ def run_row(self, headers, index, row):
                     self.name,
                     self.RESULT_CATEGORY_ROW,
                     self.RESULT_LEVEL_ERROR,
-                    _type['msg'].format(index),
+                    _type['msg'],
                     _type['id'],
                     _type['name'],
                     row,
@@ -206,7 +206,7 @@ def run_row(self, headers, index, row):
 
         # check if row is defective
         if not self.ignore_defective_rows:
-            if len(headers) < len(row):
+            if len(headers) != len(row):
 
                 valid = False
                 is_defective = True
@@ -215,28 +215,7 @@ def run_row(self, headers, index, row):
                     self.name,
                     self.RESULT_CATEGORY_ROW,
                     self.RESULT_LEVEL_ERROR,
-                    _type['msg'].format(index),
-                    _type['id'],
-                    _type['name'],
-                    row,
-                    index,
-                    row_name
-                )
-
-                self.report.write(entry)
-                if self.fail_fast:
-                    return valid, headers, index, row
-
-            elif len(headers) < len(row):
-
-                valid = False
-                is_defective = True
-                _type = RESULTS['structure_003']
-                entry = self.make_entry(
-                    self.name,
-                    self.RESULT_CATEGORY_ROW,
-                    self.RESULT_LEVEL_ERROR,
-                    _type['msg'].format(index),
+                    _type['msg'],
                     _type['id'],
                     _type['name'],
                     row,

diff --git a/setup.py b/setup.py
@@ -27,7 +27,7 @@
     name='goodtables',
     description='Validate and process tabular data.',
     long_description='Validate and process tabular data.',
-    version='0.6.1',
+    version='0.6.2',
     author='Open Knowledge Foundation',
     author_email='info@okfn.org',
     url='http://okfn.org',