Skip to content

Commit

Permalink
Merge pull request #56 from martindurant/faster_encoding
Browse files Browse the repository at this point in the history
Don't use pandas' .str.encode()
  • Loading branch information
martindurant committed Dec 22, 2016
2 parents 23ff89c + f9ee53b commit 3a57c83
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 14 deletions.
8 changes: 8 additions & 0 deletions fastparquet/converted_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,14 @@ def convert(data, se):
return data.astype(np.uint32)
elif ctype == parquet_thrift.ConvertedType.UINT_64:
return data.astype(np.uint64)
elif ctype == parquet_thrift.ConvertedType.INT_8:
return data.astype(np.int8)
elif ctype == parquet_thrift.ConvertedType.INT_16:
return data.astype(np.int16)
elif ctype == parquet_thrift.ConvertedType.INT_32:
return data.astype(np.int32)
elif ctype == parquet_thrift.ConvertedType.INT_64:
return data.astype(np.int64)
elif ctype == parquet_thrift.ConvertedType.JSON:
if isinstance(data, list) or data.dtype != "O":
out = np.empty(len(data), dtype="O")
Expand Down
18 changes: 9 additions & 9 deletions fastparquet/test/test_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,33 +411,33 @@ def test_naive_index(tempdir):


def test_text_convert(tempdir):
df = pd.DataFrame({'a': ['a'] * 100,
df = pd.DataFrame({'a': ['π'] * 100,
'b': [b'a'] * 100})
fn = os.path.join(tempdir, 'tmp.parq')

write(fn, df, fixed_text={'a': 1, 'b': 2})
write(fn, df, fixed_text={'a': 2, 'b': 1})
pf = ParquetFile(fn)
assert pf.schema[1].type == parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY
assert pf.schema[1].type_length == 1
assert pf.schema[1].type_length == 2
assert pf.schema[2].type == parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY
assert pf.schema[2].type_length == 2
assert pf.statistics['max']['a'] == ['a']
assert pf.schema[2].type_length == 1
assert pf.statistics['max']['a'] == ['π']
df2 = pf.to_pandas()
tm.assert_frame_equal(df, df2, check_categorical=False)

write(fn, df)
pf = ParquetFile(fn)
assert pf.schema[1].type == parquet_thrift.Type.BYTE_ARRAY
assert pf.schema[2].type == parquet_thrift.Type.BYTE_ARRAY
assert pf.statistics['max']['a'] == ['a']
assert pf.statistics['max']['a'] == ['π']
df2 = pf.to_pandas()
tm.assert_frame_equal(df, df2, check_categorical=False)

write(fn, df, fixed_text={'a': 1})
write(fn, df, fixed_text={'a': 2})
pf = ParquetFile(fn)
assert pf.schema[1].type == parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY
assert pf.schema[2].type == parquet_thrift.Type.BYTE_ARRAY
assert pf.statistics['max']['a'] == ['a']
assert pf.statistics['max']['a'] == ['π']
df2 = pf.to_pandas()
tm.assert_frame_equal(df, df2, check_categorical=False)

Expand Down Expand Up @@ -472,7 +472,7 @@ def test_auto_null(tempdir):
df['e'] = df['d'].astype('category')
fn = os.path.join(tmp, "test.parq")

with pytest.raises(TypeError):
with pytest.raises((TypeError, AttributeError)):
## TODO: this should be a nicer error?
write(fn, df, has_nulls=False)

Expand Down
11 changes: 6 additions & 5 deletions fastparquet/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,12 +98,12 @@ def find_type(data, fixed_text=None, object_encoding=None):
if fixed_text:
width = fixed_text
type = parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY
elif str(dtype).startswith("datetime64"):
elif dtype.kind == "M":
type, converted_type, width = (parquet_thrift.Type.INT64,
parquet_thrift.ConvertedType.TIMESTAMP_MICROS, None)
if hasattr(dtype, 'tz') and str(dtype.tz) != 'UTC':
warnings.warn('Coercing datetimes to UTC')
elif str(dtype).startswith("timedelta64"):
elif dtype.kind == "m":
type, converted_type, width = (parquet_thrift.Type.INT64,
parquet_thrift.ConvertedType.TIME_MICROS, None)
else:
Expand Down Expand Up @@ -133,15 +133,16 @@ def convert(data, se):
out = data.values
elif dtype == "O":
if converted_type == parquet_thrift.ConvertedType.UTF8:
out = data.str.encode('utf8').values
out = np.array([x.encode('utf8') for x in data], dtype="O")
elif converted_type is None:
out = data.values
elif converted_type == parquet_thrift.ConvertedType.JSON:
out = data.map(json.dumps).str.encode('utf8').values
out = np.array([json.dumps(x).encode('utf8') for x in data],
dtype="O")
elif converted_type == parquet_thrift.ConvertedType.BSON:
out = data.map(tobson).values
if type == parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY:
out = data.values.astype('S%i' % se.type_length)
out = out.astype('S%i' % se.type_length)
elif converted_type == parquet_thrift.ConvertedType.TIMESTAMP_MICROS:
out = np.empty(len(data), 'int64')
time_shift(data.values.view('int64'), out)
Expand Down

0 comments on commit 3a57c83

Please sign in to comment.