Sort output, refactor.
ctberthiaume committed May 22, 2018
1 parent fc9c8ba commit 9764f74
Showing 7 changed files with 107 additions and 65 deletions.
6 changes: 6 additions & 0 deletions .gitignore
@@ -0,0 +1,6 @@
.DS_Store
*.egg-info/
build/
dist/
.ipynb_checkpoints
Pipfile.lock
4 changes: 4 additions & 0 deletions build.sh
@@ -1,7 +1,11 @@
 #!/bin/sh
 
+# Install pur and other dev packages
+pipenv install --dev || exit
+# Update requirements.txt
+pipenv run pur -r requirements.txt || exit
 
 # Build for PyPI
 python setup.py sdist || exit
 python setup.py bdist_wheel --universal || exit
1 change: 1 addition & 0 deletions opedia_dataset_validator/__init__.py
@@ -1,5 +1,6 @@
 from __future__ import absolute_import
 from ._version import __version__
 from . import cli
+from . import error
 from . import printer
 from . import validator
2 changes: 1 addition & 1 deletion opedia_dataset_validator/_version.py
@@ -1 +1 @@
-__version__ = '0.3.1'
+__version__ = '0.3.2'
50 changes: 50 additions & 0 deletions opedia_dataset_validator/error.py
@@ -0,0 +1,50 @@
import six
import sys

empty_str_val = '-'
empty_row_val = -1

defaults = {
    'sheet': empty_str_val,
    'column': empty_str_val,
    'row': empty_row_val,
    'message': empty_str_val,
    'value': empty_str_val,
}

def error(e):
    new_e = defaults.copy()
    new_e.update(e)
    if new_e['row'] != empty_row_val:
        # Add two for header row and 1-based counting
        new_e['row'] += 2
    return new_e


def error_sort_key(e):
    return [e[k] for k in ['sheet', 'column', 'row', 'message']]


def filter_first_seen(errors):
    errors_seen = set()
    filtered = []
    for e in errors:
        key = (e['sheet'], e['column'], e['message'])
        if key in errors_seen:
            continue
        errors_seen.add(key)
        filtered.append(e)
    return filtered


def stringify(e):
    e = e.copy()
    if isinstance(e['row'], six.integer_types) and e['row'] == empty_row_val:
        e['row'] = empty_str_val
    if (sys.version_info > (3, 0)):
        _str = str
    else:
        _str = unicode
    for k in e:
        e[k] = _str(e[k])
    return e
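
A minimal sketch of how these helpers compose, assuming the package layout in this commit; the raw error dicts below are invented for illustration:

from opedia_dataset_validator import error

# Hypothetical validator output: 0-based data rows, normalized by error()
raw = [
    error.error({'message': 'invalid value', 'row': 4,
                 'column': 'lat', 'sheet': 'data'}),
    error.error({'message': 'invalid value', 'row': 1,
                 'column': 'lat', 'sheet': 'data'}),
]

ordered = sorted(raw, key=error.error_sort_key)  # by sheet, column, row, message
unique = error.filter_first_seen(ordered)        # first (sheet, column, message) only
as_text = [error.stringify(e) for e in unique]   # every field coerced to a string
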
40 changes: 10 additions & 30 deletions opedia_dataset_validator/printer.py
@@ -1,44 +1,24 @@
 import click
 import sys
+from . import error
 
 def print_tsv_errors(errors, fh, print_all=True, header=True, print_value=False):
-    fill_error_fields(errors)
-    fix_row_number(errors)
-    errors_output = []
-    errors_seen = set()
-    for e in errors:
-        key = (e['sheet'], e['column'], e['message'])
-        if key in errors_seen and not print_all:
-            continue
-        errors_seen.add(key)
-        errors_output.append(e)
+    errors = sorted(errors, key=error.error_sort_key)
+
+    if not print_all:
+        errors = error.filter_first_seen(errors)
+
     if print_value:
         outkeys = ['sheet', 'column', 'row', 'value', 'message']
     else:
         outkeys = ['sheet', 'column', 'row', 'message']
-        for e in errors_output:
-            _ = e.pop('value', None)
 
     outlines = []
     if header:
         outlines.append('#%s' % '\t'.join(outkeys))
 
-    for e in errors_output:
+    for e in errors:
         # Convert all values to unicode strings and concatenate before output
-        if (sys.version_info > (3, 0)):
-            outlines.append('\t'.join([str(e[k]) for k in outkeys]))
-        else:
-            outlines.append('\t'.join([unicode(e[k]) for k in outkeys]))
-    fh.write('\n'.join(outlines) + '\n')
-
-
-def fill_error_fields(errors):
-    for e in errors:
-        for k in ['column', 'message', 'row', 'sheet', 'value']:
-            e.setdefault(k, None)
-
-
-def fix_row_number(errors):
-    for e in errors:
-        if e.get('row', None) is not None:
-            e['row'] += 2 # add two for header row and 1-based counting
+        e = error.stringify(e)
+        outlines.append('\t'.join([e[k] for k in outkeys]))
+    fh.write('\n'.join(outlines) + '\n')
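
A hedged usage sketch of the new code path; the error dict is hand-built for illustration:

import sys
from opedia_dataset_validator import error, printer

errs = [error.error({'message': 'missing required field', 'row': 0,
                     'column': 'time', 'sheet': 'data'})]
# Writes a '#sheet<TAB>column<TAB>row<TAB>message' header plus one TSV row per error
printer.print_tsv_errors(errs, sys.stdout, print_all=False)
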
69 changes: 35 additions & 34 deletions opedia_dataset_validator/validator.py
@@ -1,4 +1,5 @@
 from __future__ import unicode_literals
+from .error import error
 from io import open
 import arrow
 import os
@@ -37,21 +38,21 @@ def validate_column_datetimes(series, colspec, sheet):
         try:
             dt = arrow.get(val, colspec['format'])
         except ValueError as e:
-            errors.append({
+            errors.append(error({
                 'message': 'error in datetime string: %s' % e,
                 'value': val,
                 'row': idx,
                 'column': series.name,
                 'sheet': sheet
-            })
+            }))
         except arrow.parser.ParserError as e:
-            errors.append({
+            errors.append(error({
                 'message': 'invalid datetime string - should match %s' % colspec['format'],
                 'value': val,
                 'row': idx,
                 'column': series.name,
                 'sheet': sheet
-            })
+            }))
 
     return errors
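
For reference, a sketch of the two failure modes handled here; the format string is an assumption for illustration, not necessarily what colspec['format'] holds:

import arrow

fmt = 'YYYY-MM-DDTHH:mm:ss'              # assumed format, for illustration
arrow.get('2018-05-22T10:00:00', fmt)    # parses cleanly
arrow.get('22/05/2018', fmt)             # raises arrow.parser.ParserError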

@@ -69,36 +70,36 @@ def validate_column_floats(series, colspec, sheet):
     # Flag NaN as errors
     nonnumeric_errors = series[pd.isna(converted)]
     for idx, val in nonnumeric_errors.iteritems():
-        errors.append({
+        errors.append(error({
             'message': 'invalid value',
             'value': val,
             'row': idx,
             'column': series.name,
             'sheet': sheet
-        })
+        }))
     # Check range
     min_errors = None
     max_errors = None
     if colspec.get('min', False):
         min_errors = series[converted < colspec['min']]
         for idx, val in min_errors.iteritems():
-            errors.append({
+            errors.append(error({
                 'message': 'value less than minimum of {}'.format(colspec['min']),
                 'value': val,
                 'row': idx,
                 'column': series.name,
                 'sheet': sheet
-            })
+            }))
     if colspec.get('max', False):
         max_errors = series[converted > colspec['max']]
         for idx, val in max_errors.iteritems():
-            errors.append({
+            errors.append(error({
                 'message': 'value greater than maximum of {}'.format(colspec['max']),
                 'value': val,
                 'row': idx,
                 'column': series.name,
                 'sheet': sheet
-            })
+            }))
 
     return errors
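
Here `converted` comes from a numeric coercion just above this hunk; a sketch of the idea, assuming something like pd.to_numeric(series, errors='coerce'):

import pandas as pd

series = pd.Series(['1.5', 'abc', '2.0'], name='depth')
converted = pd.to_numeric(series, errors='coerce')  # 'abc' becomes NaN
print(series[pd.isna(converted)])                   # flags index 1, value 'abc'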

@@ -120,12 +121,12 @@ def validate_column_generic(series, colspec, sheet):
     # valid values. Flag as errors.
     empty_errors = series[series.str.len() == 0]
     for idx, val in empty_errors.iteritems():
-        errors.append({
+        errors.append(error({
             'message': 'missing required field',
             'row': idx,
             'column': series.name,
             'sheet': sheet
-        })
+        }))
     # Now remove empty cells
     series = series[series.str.len() > 0]
     if na is not None:
@@ -144,13 +145,13 @@ def validate_column_strings(series, colspec, sheet):
     if colspec.get('max', False):
         maxlen_errors = series[series.str.len() >= colspec['max']]
         for idx, val in maxlen_errors.iteritems():
-            errors.append({
+            errors.append(error({
                 'message': 'string length > %d' % colspec['max'],
                 'value': val,
                 'row': idx,
                 'column': series.name,
                 'sheet': sheet
-            })
+            }))
 
     return errors

@@ -161,28 +162,28 @@ def validate_filename(input_path):
     filename_re = re.compile(r'^(?P<shortname>.+)_(?P<date>[^_]+)_(?P<version>[^_]+)\.xlsx$')
     m = filename_re.match(fn)
     if not m:
-        errors.append({
+        errors.append(error({
             'message': 'filename does not match format <dataset_short_name>_<dataset_release_date>_v<dataset_version>.xlsx',
             'value': fn
-        })
+        }))
     else:
         try:
             dt = arrow.get(m.group('date'), spec['file_date'])
         except ValueError as e:
-            errors.append({
+            errors.append(error({
                 'message': 'error in filename datetime string: %s' % e,
                 'value': m.group('date')
-            })
+            }))
         except arrow.parser.ParserError as e:
-            errors.append({
+            errors.append(error({
                 'message': 'date in filename must be in %s format' % spec['file_date'],
                 'value': m.group('date')
-            })
+            }))
         if not re.match(r'^v.+$', m.group('version')):
-            errors.append({
+            errors.append(error({
                 'message': 'version string in filename must start with "v"',
                 'value': fn
-            })
+            }))
     return errors
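
An illustrative check of the filename pattern with a made-up name:

import re

filename_re = re.compile(
    r'^(?P<shortname>.+)_(?P<date>[^_]+)_(?P<version>[^_]+)\.xlsx$')
m = filename_re.match('amt28_2018-05-22_v1.0.xlsx')  # hypothetical filename
print(m.group('shortname'), m.group('date'), m.group('version'))
# -> amt28 2018-05-22 v1.0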


@@ -197,9 +198,9 @@ def validate_sheet_data(wb):
     required_columns = list(spec['columns']['data'].keys())
     if len(df.columns.tolist()) < len(required_columns) or \
             df.columns.tolist()[0:len(required_columns)] != required_columns:
-        errors.append({
+        errors.append(error({
             'message': 'the first %d columns of "%s" worksheet should be %s' % (len(required_columns), spec['sheets']['data'], required_columns)
-        })
+        }))
         return errors
     # Validate cells for required columns
     for colname, colspec in spec['columns']['data'].items():
@@ -215,15 +216,15 @@ def validate_sheet_data(wb):
     extra_defined = set(vars_defined).difference(set(vars_found))
     extra_found = set(vars_found).difference(set(vars_defined))
     if extra_defined:
-        errors.append({
+        errors.append(error({
             'message': 'some data variables were defined in the "%s" worksheet but were not found in the "%s" worksheet' % (spec['sheets']['vars'], spec['sheets']['data']),
             'value': ', '.join(extra_defined)
-        })
+        }))
     if extra_found:
-        errors.append({
+        errors.append(error({
             'message': 'some data variables were found in the "%s" worksheet but were not defined in the "%s" worksheet' % (spec['sheets']['data'], spec['sheets']['vars']),
             'value': ', '.join(extra_found)
-        })
+        }))
 
     # Now validate the actual data only on the condition of
     # proper missing values.
@@ -247,10 +248,10 @@ def validate_sheet_metadata(wb):
     required_columns = list(spec['columns']['metadata'].keys())
     df = wb[spec['sheets']['metadata']]
     if df.columns.tolist() != required_columns:
-        errors.append({
+        errors.append(error({
             'message': 'incorrect set or order of columns in the "%s" worksheet, expected %s' % (spec['sheets']['metadata'], required_columns),
             'value': str(df.columns.tolist())
-        })
+        }))
         return errors
 
     # Validate cells
@@ -269,10 +270,10 @@ def validate_sheet_vars(wb):
     required_columns = list(spec['columns']['vars'].keys())
     df = wb[spec['sheets']['vars']]
     if df.columns.tolist() != required_columns:
-        errors.append({
+        errors.append(error({
             'message': 'incorrect set or order of columns in "%s" worksheet, expected %s' % (spec['sheets']['vars'], required_columns),
             'value': str(df.columns.tolist())
-        })
+        }))
         return errors
 
     # Validate cells
@@ -287,10 +288,10 @@ def validate_all_sheets_present(wb):
     errors = []
     sheets = [spec['sheets']['data'], spec['sheets']['metadata'], spec['sheets']['vars']]
     if list(wb.keys()) != sheets:
-        errors.append({
+        errors.append(error({
             'message': 'spreadsheet should contain 3 worksheets: %s' % sheets,
             'value': str(list(wb.keys()))
-        })
+        }))
     return errors
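
These validators take `wb` as a dict of DataFrames keyed by worksheet name; a sketch of driving one directly, assuming the workbook is loaded with pandas (the file name is hypothetical, and the CLI may load it differently):

import pandas as pd
from opedia_dataset_validator import validator

# sheet_name=None returns {sheet name: DataFrame} for every worksheet
wb = pd.read_excel('example_2018-05-22_v1.0.xlsx', sheet_name=None)
for e in validator.validate_all_sheets_present(wb):
    print(e['sheet'], e['column'], e['row'], e['message'])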


