Merge pull request #56 from martindurant/faster_encoding

Don't use pandas' .str.encode()
dask · Dec 22, 2016 · 3a57c83 · 3a57c83
2 parents 23ff89c + f9ee53b
commit 3a57c83
Show file tree

Hide file tree

Showing 3 changed files with 23 additions and 14 deletions.
diff --git a/fastparquet/converted_types.py b/fastparquet/converted_types.py
@@ -137,6 +137,14 @@ def convert(data, se):
         return data.astype(np.uint32)
     elif ctype == parquet_thrift.ConvertedType.UINT_64:
         return data.astype(np.uint64)
+    elif ctype == parquet_thrift.ConvertedType.INT_8:
+        return data.astype(np.int8)
+    elif ctype == parquet_thrift.ConvertedType.INT_16:
+        return data.astype(np.int16)
+    elif ctype == parquet_thrift.ConvertedType.INT_32:
+        return data.astype(np.int32)
+    elif ctype == parquet_thrift.ConvertedType.INT_64:
+        return data.astype(np.int64)
     elif ctype == parquet_thrift.ConvertedType.JSON:
         if isinstance(data, list) or data.dtype != "O":
             out = np.empty(len(data), dtype="O")

diff --git a/fastparquet/test/test_output.py b/fastparquet/test/test_output.py
@@ -411,33 +411,33 @@ def test_naive_index(tempdir):
 
 
 def test_text_convert(tempdir):
-    df = pd.DataFrame({'a': ['a'] * 100,
+    df = pd.DataFrame({'a': ['π'] * 100,
                        'b': [b'a'] * 100})
     fn = os.path.join(tempdir, 'tmp.parq')
 
-    write(fn, df, fixed_text={'a': 1, 'b': 2})
+    write(fn, df, fixed_text={'a': 2, 'b': 1})
     pf = ParquetFile(fn)
     assert pf.schema[1].type == parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY
-    assert pf.schema[1].type_length == 1
+    assert pf.schema[1].type_length == 2
     assert pf.schema[2].type == parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY
-    assert pf.schema[2].type_length == 2
-    assert pf.statistics['max']['a'] == ['a']
+    assert pf.schema[2].type_length == 1
+    assert pf.statistics['max']['a'] == ['π']
     df2 = pf.to_pandas()
     tm.assert_frame_equal(df, df2, check_categorical=False)
 
     write(fn, df)
     pf = ParquetFile(fn)
     assert pf.schema[1].type == parquet_thrift.Type.BYTE_ARRAY
     assert pf.schema[2].type == parquet_thrift.Type.BYTE_ARRAY
-    assert pf.statistics['max']['a'] == ['a']
+    assert pf.statistics['max']['a'] == ['π']
     df2 = pf.to_pandas()
     tm.assert_frame_equal(df, df2, check_categorical=False)
 
-    write(fn, df, fixed_text={'a': 1})
+    write(fn, df, fixed_text={'a': 2})
     pf = ParquetFile(fn)
     assert pf.schema[1].type == parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY
     assert pf.schema[2].type == parquet_thrift.Type.BYTE_ARRAY
-    assert pf.statistics['max']['a'] == ['a']
+    assert pf.statistics['max']['a'] == ['π']
     df2 = pf.to_pandas()
     tm.assert_frame_equal(df, df2, check_categorical=False)
 
@@ -472,7 +472,7 @@ def test_auto_null(tempdir):
     df['e'] = df['d'].astype('category')
     fn = os.path.join(tmp, "test.parq")
 
-    with pytest.raises(TypeError):
+    with pytest.raises((TypeError, AttributeError)):
         ## TODO: this should be a nicer error?
         write(fn, df, has_nulls=False)
 

diff --git a/fastparquet/writer.py b/fastparquet/writer.py
@@ -98,12 +98,12 @@ def find_type(data, fixed_text=None, object_encoding=None):
         if fixed_text:
             width = fixed_text
             type = parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY
-    elif str(dtype).startswith("datetime64"):
+    elif dtype.kind == "M":
         type, converted_type, width = (parquet_thrift.Type.INT64,
                                        parquet_thrift.ConvertedType.TIMESTAMP_MICROS, None)
         if hasattr(dtype, 'tz') and str(dtype.tz) != 'UTC':
             warnings.warn('Coercing datetimes to UTC')
-    elif str(dtype).startswith("timedelta64"):
+    elif dtype.kind == "m":
         type, converted_type, width = (parquet_thrift.Type.INT64,
                                        parquet_thrift.ConvertedType.TIME_MICROS, None)
     else:
@@ -133,15 +133,16 @@ def convert(data, se):
         out = data.values
     elif dtype == "O":
         if converted_type == parquet_thrift.ConvertedType.UTF8:
-            out = data.str.encode('utf8').values
+            out = np.array([x.encode('utf8') for x in data], dtype="O")
         elif converted_type is None:
             out = data.values
         elif converted_type == parquet_thrift.ConvertedType.JSON:
-            out = data.map(json.dumps).str.encode('utf8').values
+            out = np.array([json.dumps(x).encode('utf8') for x in data],
+                           dtype="O")
         elif converted_type == parquet_thrift.ConvertedType.BSON:
             out = data.map(tobson).values
         if type == parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY:
-            out = data.values.astype('S%i' % se.type_length)
+            out = out.astype('S%i' % se.type_length)
     elif converted_type == parquet_thrift.ConvertedType.TIMESTAMP_MICROS:
         out = np.empty(len(data), 'int64')
         time_shift(data.values.view('int64'), out)