First pass at accumulating errors when normalizing a table (PANDA).

onyxfish committed Sep 30, 2011
1 parent b12f9dd commit 4b36724f9f3557aa3e6234df3c62f5172187b1df
Showing with 94 additions and 18 deletions.
  1. +11 −0 csvkit/exceptions.py
  2. +35 −10 csvkit/typeinference.py
  3. +48 −8 tests/test_typeinference.py
csvkit/exceptions.py
@@ -74,3 +74,14 @@ def __init__(self, index, value, normal_type):
         self.normal_type = normal_type
         msg = 'Unable to convert "%s" to type %s (at index %i)' % (value, normal_type, index)
         super(InvalidValueForTypeException, self).__init__(msg)
+
+class InvalidValueForTypeListException(CustomException):
+    """
+    Exception raised when one or more InvalidValueForTypeExceptions
+    have been raised while accumulating errors.
+    """
+    def __init__(self, errors):
+        self.errors = errors
+        msg = 'Encountered errors converting values in %i columns' % len(errors)
+        super(InvalidValueForTypeListException, self).__init__(msg)
+
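For context, a minimal sketch (not part of this commit) of the structure the new exception carries: errors is a dict keyed by column index, and each value is the InvalidValueForTypeException raised for that column. It is written against the Python 2 constructors shown above; the sample values are made up.

from csvkit.exceptions import InvalidValueForTypeException, InvalidValueForTypeListException

# Hypothetical failures: column 1 choked on 'x' at row 0, column 3 on 'oops' at row 2.
errors = {
    1: InvalidValueForTypeException(0, 'x', int),
    3: InvalidValueForTypeException(2, 'oops', float),
}
exc = InvalidValueForTypeListException(errors)

for column_index, error in exc.errors.items():
    print 'column %i: %r is not %s' % (column_index, error.value, error.normal_type)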
csvkit/typeinference.py
@@ -5,7 +5,7 @@
from dateutil.parser import parse
-from exceptions import InvalidValueForTypeException
+from exceptions import InvalidValueForTypeException, InvalidValueForTypeListException
NULL_VALUES = ('na', 'n/a', 'none', 'null', '.')
TRUE_VALUES = ('yes', 'y', 'true', 't')
@@ -171,23 +171,48 @@ def normalize_column_type(l, normal_type=None):
     # Don't know what they are, so they must just be strings
     return unicode, [x if x != '' else None for x in l]
-def normalize_table(rows, column_count):
+def normalize_table(rows, normal_types=None, accumulate_errors=False):
     """
     Given a sequence of sequences, normalize the lot.
+
+    Optionally accepts a normal_types parameter which is a list of
+    types that the columns must normalize to.
     """
-    data_columns = [[] for x in range(column_count)]
+    data_columns = []
+    column_count = 0
+    row_count = 0
     for row in rows:
+        while column_count < len(row):
+            data_columns.append([None] * row_count)
+            column_count += 1
+
         for data_column, value in zip(data_columns, row):
             data_column.append(value)
-    normal_types = []
-    normal_columns= []
+        row_count += 1
+
+    new_normal_types = []
+    new_normal_columns = []
+    errors = {}
-    for column in data_columns:
-        t, c = normalize_column_type(column)
-        normal_types.append(t)
-        normal_columns.append(c)
+    for i, column in enumerate(data_columns):
+        try:
+            if normal_types:
+                t, c = normalize_column_type(column, normal_types[i])
+            else:
+                t, c = normalize_column_type(column)
+
+            new_normal_types.append(t)
+            new_normal_columns.append(c)
+        except InvalidValueForTypeException, e:
+            if not accumulate_errors:
+                raise
+
+            errors[i] = e
-    return normal_types, normal_columns
+    if errors:
+        raise InvalidValueForTypeListException(errors)
+
+    return new_normal_types, new_normal_columns
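A minimal usage sketch of the new accumulate_errors path (not part of the commit; it assumes the csvkit modules from this commit are importable, and the sample rows and forced column types are illustrative only):

from csvkit import typeinference
from csvkit.exceptions import InvalidValueForTypeListException

rows = [
    ['a', '1', '2.1'],
    ['b', 'x', '4.1'],  # 'x' cannot be coerced to int
]

try:
    # Force each column to a known type and collect every failing column
    # instead of raising on the first one.
    types, columns = typeinference.normalize_table(
        rows, [unicode, int, float], accumulate_errors=True)
except InvalidValueForTypeListException, e:
    # e.errors maps column index -> the InvalidValueForTypeException for that column.
    for column_index, error in e.errors.items():
        print 'column %i: could not convert %r (row %i)' % (column_index, error.value, error.index)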
tests/test_typeinference.py
@@ -6,7 +6,7 @@
 from csvkit import typeinference
-from csvkit.exceptions import InvalidValueForTypeException
+from csvkit.exceptions import InvalidValueForTypeException, InvalidValueForTypeListException
 class TestNormalizeType(unittest.TestCase):
     def test_nulls(self):
@@ -151,28 +151,68 @@ def test_datetimes_and_dates_coerce(self):
     def test_datetimes_and_times(self):
         self.assertEqual((unicode, ['Jan 1, 2008 at 4:40 AM', '2010-01-27T03:45:00', '16:14:45', None]), typeinference.normalize_column_type(['Jan 1, 2008 at 4:40 AM', '2010-01-27T03:45:00', '16:14:45', '']))
-
     def test_normalize_table(self):
         expected_types = [unicode, int, float, NoneType]
         data = [
             ['a','1','2.1', ''],
-            ['b', '5', '4.1', ''],
+            ['b', '5', '4.1'],
             ['c', '100', '100.9999', ''],
             ['d', '2', '5.3', '']
         ]
-        column_count = len(expected_types)
-        types, columns = typeinference.normalize_table(data, column_count)
+        types, columns = typeinference.normalize_table(data)
-        self.assertEqual(column_count, len(types))
-        self.assertEqual(column_count, len(columns))
+        self.assertEqual(4, len(types))
+        self.assertEqual(4, len(columns))
         for i, tup in enumerate(zip(columns, types, expected_types)):
             c, t, et = tup
             self.assertEqual(et, t)
             for row, normalized in zip(data, c):
                 if t is NoneType:
                     self.assertTrue(normalized is None)
-                    self.assertEqual('', row[i])
                 else:
                     self.assertEqual(t(row[i]), normalized)
+    def test_normalize_table_known_types(self):
+        normal_types = [unicode, int, float, NoneType]
+        data = [
+            ['a','1','2.1', ''],
+            ['b', '5', '4.1'],
+            ['c', '100', '100.9999', ''],
+            ['d', '2', '5.3', '']
+        ]
+        types, columns = typeinference.normalize_table(data, normal_types)
+
+        self.assertEqual(4, len(types))
+        self.assertEqual(4, len(columns))
+
+        for i, tup in enumerate(zip(columns, types, normal_types)):
+            c, t, et = tup
+            self.assertEqual(et, t)
+            for row, normalized in zip(data, c):
+                if t is NoneType:
+                    self.assertTrue(normalized is None)
+                else:
+                    self.assertEqual(t(row[i]), normalized)
+
+    def test_normalize_table_known_types_invalid(self):
+        normal_types = [bool, int, int, NoneType]
+        data = [
+            ['a','1','2.1', ''],
+            ['b', '5', '4.1'],
+            ['c', '100', '100.9999', ''],
+            ['d', '2', '5.3', '']
+        ]
+
+        try:
+            typeinference.normalize_table(data, normal_types, accumulate_errors=True)
+            self.assertEqual(True, False)
+        except InvalidValueForTypeListException, e:
+            self.assertEqual(len(e.errors), 2)
+            self.assertEqual(e.errors[0].index, 0)
+            self.assertEqual(e.errors[0].value, 'a')
+            self.assertEqual(e.errors[0].normal_type, bool)
+            self.assertEqual(e.errors[2].index, 0)
+            self.assertEqual(e.errors[2].value, '2.1')
+            self.assertEqual(e.errors[2].normal_type, int)
+
