Adds int32 support for object encoding (#268)

* Adds int32 support for object encoding
* Add test for int32 object encoding
dask · Jan 10, 2018 · 4507685 · 4507685
1 parent c2e6a39
commit 4507685
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 3 deletions.
diff --git a/fastparquet/test/test_output.py b/fastparquet/test/test_output.py
@@ -891,3 +891,11 @@ def test_bad_object_encoding(tempdir):
     assert "INT64" in str(e)
     assert "primitive" in str(e)
     assert '"a"' in str(e)
+
+def test_object_encoding_int32(tempdir):
+    df = pd.DataFrame({'a': ['15', None, '2']})
+    fn = os.path.join(tempdir, 'temp.parq')
+    write(fn, df, object_encoding={'a': 'int32'})
+    pf = ParquetFile(fn)
+    assert pf._schema[1].type == parquet_thrift.Type.INT32
+    assert not pf.schema.is_required('a')
diff --git a/fastparquet/writer.py b/fastparquet/writer.py
@@ -67,7 +67,7 @@ def find_type(data, fixed_text=None, object_encoding=None, times='int64'):
     fixed_text: int or None
         For str and bytes, the fixed-string length to use. If None, object
         column will remain variable length.
-    object_encoding: None or infer|bytes|utf8|json|bson|bool|int|float
+    object_encoding: None or infer|bytes|utf8|json|bson|bool|int|int32|float
         How to encode object type into bytes. If None, bytes is assumed;
         if 'infer', type is guessed from 10 first non-null values.
     times: 'int64'|'int96'
@@ -111,12 +111,15 @@ def find_type(data, fixed_text=None, object_encoding=None, times='int64'):
         elif object_encoding == 'int':
             type, converted_type, width = (parquet_thrift.Type.INT64, None,
                                            64)
+        elif object_encoding == 'int32':
+            type, converted_type, width = (parquet_thrift.Type.INT32, None,
+                                           32)
         elif object_encoding == 'float':
             type, converted_type, width = (parquet_thrift.Type.DOUBLE, None,
                                            64)
         else:
             raise ValueError('Object encoding (%s) not one of '
-                             'infer|utf8|bytes|json|bson|bool|int|float' %
+                             'infer|utf8|bytes|json|bson|bool|int|int32|float' %
                              object_encoding)
         if fixed_text:
             width = fixed_text
@@ -762,7 +765,7 @@ def write(filename, data, row_group_offsets=50000000,
         and the schema must match the input data.
     object_encoding: str or {col: type}
         For object columns, this gives the data type, so that the values can
-        be encoded to bytes. Possible values are bytes|utf8|json|bson|bool|int,
+        be encoded to bytes. Possible values are bytes|utf8|json|bson|bool|int|int32,
         where bytes is assumed if not specified (i.e., no conversion). The
         special value 'infer' will cause the type to be guessed from the first
         ten non-null values.