Skip to content

Commit

Permalink
Merge branch 'master' of github.com:okfn/goodtables
Browse files Browse the repository at this point in the history
  • Loading branch information
pwalsh committed Mar 28, 2016
2 parents 035f8a2 + f999a2b commit 6e0b115
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 57 deletions.
15 changes: 12 additions & 3 deletions examples/test_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,28 @@
"name": "id",
"title": "ID",
"type": "integer",
"description": "The id."
"description": "The id.",
"constraints": {
"required": true
}
},
{
"name": "name",
"title": "Name",
"type": "string",
"description": "The name."
"description": "The name.",
"constraints": {
"required": true
}
},
{
"name": "age",
"title": "Age",
"type": "integer",
"description": "The age."
"description": "The age.",
"constraints": {
"required": true
}
}
],
"primaryKey": "id"
Expand Down
28 changes: 26 additions & 2 deletions goodtables/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import tempfile
import io
import csv
from itertools import islice, chain


_ver = sys.version_info
Expand All @@ -30,11 +31,21 @@
def csv_reader(data, dialect=csv.excel, **kwargs):
    """Read text stream (unicode on Py2.7) as CSV.

    The dialect is sniffed from the first lines of `data`. Each line is
    encoded to UTF-8 before it reaches `csv.reader`, and every parsed cell
    is decoded back to text before the row is yielded.

    :param data: iterable of text lines (a file-like object or a sequence).
    :param dialect: dialect used when sniffing fails; defaults to `csv.excel`.
    :param kwargs: extra formatting parameters forwarded to `csv.reader`.
    :returns: generator yielding each row as a list of text cells.
    """

    # Take ONE iterator over the input. Slicing `data` itself and then
    # chaining `data` again would replay the sampled lines whenever `data`
    # is re-iterable (e.g. a list) rather than a one-shot stream.
    lines = iter(data)
    sample = list(islice(lines, 10))

    try:
        sniffed = csv.Sniffer().sniff(''.join(sample))
    except csv.Error:
        # Could not determine a dialect from the sample: keep the caller's
        # dialect instead of silently discarding it.
        sniffed = dialect
    else:
        # The reader below is fed UTF-8 encoded lines, so the sniffed
        # delimiter and quote character are encoded the same way.
        sniffed.delimiter = sniffed.delimiter.encode('utf-8')
        sniffed.quotechar = sniffed.quotechar.encode('utf-8')

    def iterenc_utf8(source):
        for line in source:
            yield line.encode('utf-8')

    # Re-attach the sampled lines in front of the unread remainder so that
    # no row is lost to the sniffing step.
    encoded = iterenc_utf8(chain(sample, lines))
    reader = csv.reader(encoded, dialect=sniffed, **kwargs)
    for row in reader:
        # NOTE(review): `str` is expected to be this module's text-type alias
        # (presumably `unicode` on Python 2) - defined outside this view.
        yield [str(cell, 'utf-8') for cell in row]

Expand All @@ -43,13 +54,26 @@ def iterenc_utf8(data):
from urllib import parse
from urllib.request import urlopen
from urllib.error import HTTPError
csv_reader = csv.reader
builtin_str = str
str = str
bytes = bytes
basestring = (str, bytes)
numeric_types = (int, float)

def csv_reader(data, dialect=csv.excel, **kwargs):
    """Read a text stream as CSV, sniffing the dialect from its first lines.

    :param data: iterable of text lines (a file-like object or a sequence).
    :param dialect: dialect used when sniffing fails; defaults to `csv.excel`.
        Accepting it here mirrors the signature of the Python 2 variant and
        stops a `dialect=` keyword from colliding with the positional
        dialect passed to `csv.reader`.
    :param kwargs: extra formatting parameters forwarded to `csv.reader`.
    :returns: a `csv.reader` over every line of `data`, sample included.
    """
    # One shared iterator: the sample is consumed from it and the remainder
    # is read from it, so nothing is skipped or replayed.
    lines = iter(data)
    sample = list(islice(lines, 10))
    try:
        dialect = csv.Sniffer().sniff(''.join(sample))
    except csv.Error:
        # Sniffing failed (e.g. empty or irregular sample): keep the
        # caller-supplied dialect.
        pass
    # Put the sampled lines back in front of the unread remainder.
    return csv.reader(chain(sample, lines), dialect, **kwargs)



def to_bytes(textstring):
"""Convert a text string to a byte string"""
Expand Down
28 changes: 15 additions & 13 deletions goodtables/datatable/datatable.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,9 @@ def replay(self):

def extract(self, headers=None):
"""Extract headers and values from the data stream."""
headers = headers or self.get_headers(self.stream)
values = compat.csv_reader(self.stream)
return headers, values
reader = compat.csv_reader(self.stream)
headers = headers or self.get_headers(self.stream, reader)
return headers, reader

def get_sample(self, row_limit):
"""Get a sample of data, as a CSV reader, up to a max of `row_limit`."""
Expand Down Expand Up @@ -118,15 +118,16 @@ def to_textstream(self, data_source):

return textstream

elif compat.parse.urlparse(data_source).scheme in self.REMOTE_SCHEMES:
elif isinstance(data_source, compat.str) and \
compat.parse.urlparse(data_source).scheme in self.REMOTE_SCHEMES:

stream = self._stream_from_url(data_source)
self.encoding = self._detect_stream_encoding(stream)
textstream = self._decode_to_textstream(stream, self.encoding, textstream)

return textstream

elif isinstance(data_source, compat.str) and not \
elif (isinstance(data_source, compat.str) or isinstance(data_source, compat.bytes)) and not \
os.path.exists(data_source):

self.encoding = self._detect_stream_encoding(data_source)
Expand Down Expand Up @@ -190,10 +191,11 @@ def excel_data_source(self, data_source):
out.seek(0)
return out

def get_headers(self, stream):
def get_headers(self, stream, reader = None):
"""Get headers from stream."""

reader = compat.csv_reader(stream)
if reader is None:
reader = compat.csv_reader(stream)
for index, line in enumerate(reader):
if index == self.header_index:
headers = line
Expand Down Expand Up @@ -225,10 +227,9 @@ def _detect_stream_encoding(self, stream):
return self.passed_encoding

if isinstance(stream, compat.str):
if isinstance(stream, compat.bytes):
sample = stream[:sample_length]
else:
sample = compat.to_bytes(stream)[:sample_length]
sample = compat.to_bytes(stream)[:sample_length]
elif isinstance(stream, compat.bytes):
sample = stream[:sample_length]
else:
sample = stream.read(sample_length)
stream.seek(0)
Expand All @@ -243,7 +244,9 @@ def _detect_stream_encoding(self, stream):
def _decode_to_textstream(self, stream, encoding, textstream):
"""Return a textstream in `self.DEFAULT_ENCODING`"""

if isinstance(stream, compat.str):
if isinstance(stream, compat.bytes):
stream = codecs.iterdecode([stream], encoding, self.decode_strategy)
elif isinstance(stream, compat.str):
_stream = io.StringIO()
_stream.write(stream)
stream = _stream
Expand All @@ -252,7 +255,6 @@ def _decode_to_textstream(self, stream, encoding, textstream):
stream = codecs.iterdecode(stream, encoding, self.decode_strategy)

try:

for line in stream:
recoded = line.encode(self.DEFAULT_ENCODING).decode(self.DEFAULT_ENCODING)
textstream.write(recoded)
Expand Down
14 changes: 8 additions & 6 deletions goodtables/processors/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,14 @@
'schema_004': {
'id': 'schema_004',
'name': 'Required Field',
'msg': 'Column {0} is a required field, but no value can be found in row {1}.',
'msg': 'Column {0} is a required field, but it contains no value.',
'help': '',
'help_edit': ''
},
'schema_005': {
'id': 'schema_005',
'name': 'Non-Required Field (Empty/Null)',
'msg': 'Column {0} is a non-required field, and has a null value in row {1}.',
'msg': 'Column {0} is a non-required field, and has a null value.',
'help': '',
'help_edit': ''
},
Expand Down Expand Up @@ -221,7 +221,8 @@ def run_row(self, headers, index, row):
# CONSTRAINTS
constraints = self.schema.get_constraints(column_name)

if constraints['required'] is True and \
if constraints is not None and \
constraints.get('required', False) is True and \
(column_value in self.schema.NULL_VALUES):

valid = False
Expand All @@ -230,7 +231,7 @@ def run_row(self, headers, index, row):
self.name,
self.RESULT_CATEGORY_ROW,
self.RESULT_LEVEL_ERROR,
_type['msg'].format(column_name, index),
_type['msg'].format(column_name),
_type['id'],
_type['name'],
row,
Expand All @@ -244,7 +245,8 @@ def run_row(self, headers, index, row):
if self.fail_fast:
return valid, headers, index, row

if constraints['required'] is False and \
if constraints is not None and \
constraints.get('required', False) is False and \
(column_value in self.schema.NULL_VALUES) and \
self.result_level == self.RESULT_LEVEL_INFO:
# add info result
Expand All @@ -253,7 +255,7 @@ def run_row(self, headers, index, row):
self.name,
self.RESULT_CATEGORY_ROW,
self.RESULT_LEVEL_INFO,
_type['msg'].format(column_name, index),
_type['msg'].format(column_name),
_type['id'],
_type['name'],
row,
Expand Down
43 changes: 11 additions & 32 deletions goodtables/processors/structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,35 +11,35 @@
'structure_001': {
'id': 'structure_001',
'name': 'Missing Header',
'msg': 'Column {0} is missing a header.',
'msg': 'Headers column is empty.',
'help': '',
'help_edit': ''
},
'structure_002': {
'id': 'structure_002',
'name': 'Duplicate Header',
'msg': 'The header in column {0} was found to have duplicates.',
'msg': 'A header column is duplicated.',
'help': '',
'help_edit': ''
},
'structure_003': {
'id': 'structure_003',
'name': 'Defective Row',
'msg': 'Row {0} is defective: the dimensions are incorrect compared to headers.',
'msg': 'The row dimensions are incorrect compared to headers.',
'help': '',
'help_edit': ''
},
'structure_004': {
'id': 'structure_004',
'name': 'Duplicate Row',
'msg': 'Row {0} duplicates the following rows which have already been seen: {1}.',
'msg': 'The exact same row has been seen before (a duplicate).',
'help': '',
'help_edit': ''
},
'structure_005': {
'id': 'structure_005',
'name': 'Empty Row',
'msg': 'Row {0} is empty.',
'msg': 'Row is empty.',
'help': '',
'help_edit': ''
}
Expand Down Expand Up @@ -95,7 +95,7 @@ def run_header(self, headers, header_index=0):
self.name,
self.RESULT_CATEGORY_HEADER,
self.RESULT_LEVEL_ERROR,
_type['msg'].format(index),
_type['msg'],
_type['id'],
_type['name'],
headers,
Expand Down Expand Up @@ -124,7 +124,7 @@ def run_header(self, headers, header_index=0):
self.name,
self.RESULT_CATEGORY_HEADER,
self.RESULT_LEVEL_ERROR,
_type['msg'].format(dupe[0]),
_type['msg'],
_type['id'],
_type['name'],
headers,
Expand Down Expand Up @@ -165,7 +165,7 @@ def run_row(self, headers, index, row):
self.name,
self.RESULT_CATEGORY_ROW,
self.RESULT_LEVEL_ERROR,
_type['msg'].format(index, previous_instances),
_type['msg'],
_type['id'],
_type['name'],
row,
Expand All @@ -192,7 +192,7 @@ def run_row(self, headers, index, row):
self.name,
self.RESULT_CATEGORY_ROW,
self.RESULT_LEVEL_ERROR,
_type['msg'].format(index),
_type['msg'],
_type['id'],
_type['name'],
row,
Expand All @@ -206,7 +206,7 @@ def run_row(self, headers, index, row):

# check if row is defective
if not self.ignore_defective_rows:
if len(headers) < len(row):
if len(headers) != len(row):

valid = False
is_defective = True
Expand All @@ -215,28 +215,7 @@ def run_row(self, headers, index, row):
self.name,
self.RESULT_CATEGORY_ROW,
self.RESULT_LEVEL_ERROR,
_type['msg'].format(index),
_type['id'],
_type['name'],
row,
index,
row_name
)

self.report.write(entry)
if self.fail_fast:
return valid, headers, index, row

elif len(headers) < len(row):

valid = False
is_defective = True
_type = RESULTS['structure_003']
entry = self.make_entry(
self.name,
self.RESULT_CATEGORY_ROW,
self.RESULT_LEVEL_ERROR,
_type['msg'].format(index),
_type['msg'],
_type['id'],
_type['name'],
row,
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
name='goodtables',
description='Validate and process tabular data.',
long_description='Validate and process tabular data.',
version='0.6.1',
version='0.6.2',
author='Open Knowledge Foundation',
author_email='info@okfn.org',
url='http://okfn.org',
Expand Down

0 comments on commit 6e0b115

Please sign in to comment.