Skip to content

Commit

Permalink
Merge pull request #208 from martindurant/object_convert_error
Browse files Browse the repository at this point in the history
Better error messages when conversion fails on write
  • Loading branch information
martindurant committed Sep 5, 2017
2 parents 1067adc + 4d7e265 commit a1f88f4
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 24 deletions.
19 changes: 17 additions & 2 deletions fastparquet/test/test_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,8 +518,7 @@ def test_auto_null(tempdir):
test_cols = list(set(df) - set(object_cols)) + ['d']
fn = os.path.join(tmp, "test.parq")

with pytest.raises((TypeError, AttributeError)):
## TODO: this should be a nicer error?
with pytest.raises(ValueError):
write(fn, df, has_nulls=False)

write(fn, df, has_nulls=True)
Expand Down Expand Up @@ -860,3 +859,19 @@ def test_consolidate_cats(tempdir):
writer.consolidate_categories(pf.fmd)
assert 5 == json.loads(pf.fmd.key_value_metadata[0].value)['columns'][0][
'metadata']['num_categories']


def test_bad_object_encoding(tempdir):
    """Check that a failed object-column conversion raises a helpful ValueError.

    Two cases are exercised, each writing a single-column frame with
    ``file_scheme='hive'``:

    * a bytes value written with ``object_encoding='utf8'`` -- the error is
      expected to mention the ``UTF8`` encoding, the word "bytes" and the
      quoted column name;
    * a column mixing integers and a string written with
      ``object_encoding='int'`` -- the error is expected to mention the
      ``INT64`` type, the word "primitive" and the quoted column name.
    """
    df = pd.DataFrame({'a': [b'00']})
    with pytest.raises(ValueError) as e:
        write(tempdir, df, file_scheme='hive', object_encoding='utf8')
    # Inspect the exception itself (``e.value``): ``e`` is pytest's
    # ExceptionInfo wrapper, whose own string form is not the error message.
    message = str(e.value)
    assert "UTF8" in message
    assert "bytes" in message
    assert '"a"' in message

    df = pd.DataFrame({'a': [0, "hello", 0]})
    with pytest.raises(ValueError) as e:
        write(tempdir, df, file_scheme='hive', object_encoding='int')
    message = str(e.value)
    assert "INT64" in message
    assert "primitive" in message
    assert '"a"' in message
58 changes: 36 additions & 22 deletions fastparquet/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,24 +163,31 @@ def convert(data, se):
elif "S" in str(dtype)[:2] or "U" in str(dtype)[:2]:
out = data.values
elif dtype == "O":
if converted_type == parquet_thrift.ConvertedType.UTF8:
out = array_encode_utf8(data)
elif converted_type is None:
if type in revmap:
out = data.values.astype(revmap[type], copy=False)
elif type == parquet_thrift.Type.BOOLEAN:
padded = np.lib.pad(data.values, (0, 8 - (len(data) % 8)),
'constant', constant_values=(0, 0))
out = np.packbits(padded.reshape(-1, 8)[:, ::-1].ravel())
else:
out = data.values
elif converted_type == parquet_thrift.ConvertedType.JSON:
out = np.array([json.dumps(x).encode('utf8') for x in data],
dtype="O")
elif converted_type == parquet_thrift.ConvertedType.BSON:
out = data.map(tobson).values
if type == parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY:
out = out.astype('S%i' % se.type_length)
try:
if converted_type == parquet_thrift.ConvertedType.UTF8:
out = array_encode_utf8(data)
elif converted_type is None:
if type in revmap:
out = data.values.astype(revmap[type], copy=False)
elif type == parquet_thrift.Type.BOOLEAN:
padded = np.lib.pad(data.values, (0, 8 - (len(data) % 8)),
'constant', constant_values=(0, 0))
out = np.packbits(padded.reshape(-1, 8)[:, ::-1].ravel())
else:
out = data.values
elif converted_type == parquet_thrift.ConvertedType.JSON:
out = np.array([json.dumps(x).encode('utf8') for x in data],
dtype="O")
elif converted_type == parquet_thrift.ConvertedType.BSON:
out = data.map(tobson).values
if type == parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY:
out = out.astype('S%i' % se.type_length)
except Exception as e:
ct = parquet_thrift.ConvertedType._VALUES_TO_NAMES[
converted_type] if converted_type is not None else None
raise ValueError('Error converting column "%s" to bytes using '
'encoding %s. Original error: '
'%s' % (data.name, ct, e))
elif converted_type == parquet_thrift.ConvertedType.TIMESTAMP_MICROS:
out = np.empty(len(data), 'int64')
time_shift(data.values.view('int64'), out)
Expand Down Expand Up @@ -425,10 +432,17 @@ def write_column(f, data, selement, compression=None):
num_nulls = len(data) - data.count()
definition_data, data = make_definitions(data, num_nulls == 0)
if data.dtype.kind == "O" and not is_categorical_dtype(data.dtype):
if selement.type == parquet_thrift.Type.INT64:
data = data.astype(int)
elif selement.type == parquet_thrift.Type.BOOLEAN:
data = data.astype(bool)
try:
if selement.type == parquet_thrift.Type.INT64:
data = data.astype(int)
elif selement.type == parquet_thrift.Type.BOOLEAN:
data = data.astype(bool)
except ValueError as e:
t = parquet_thrift.Type._VALUES_TO_NAMES[selement.type]
raise ValueError('Error converting column "%s" to primitive '
'type %s. Original error: '
'%s' % (data.name, t, e))

else:
definition_data = b""
num_nulls = 0
Expand Down

0 comments on commit a1f88f4

Please sign in to comment.