Maybe fixed

dask · Aug 17, 2023 · f653346 · f653346
1 parent a5138a0
commit f653346
Show file tree

Hide file tree

Showing 11 changed files with 67 additions and 32 deletions.
diff --git a/docs/source/api.rst b/docs/source/api.rst
@@ -23,3 +23,9 @@ API
 .. autofunction:: write
 
 .. autofunction:: update_file_custom_metadata
+
+
+.. raw:: html
+
+    <script data-goatcounter="https://distdatacats.goatcounter.com/count"
+        async src="//gc.zgo.at/count.js"></script>
diff --git a/docs/source/details.rst b/docs/source/details.rst
@@ -256,3 +256,9 @@ Dask and Pandas fully support calling ``fastparquet`` directly, with the functio
 Please see their relevant docstrings. Remote filesystems are supported by using
 a URL with a "protocol://" specifier and any ``storage_options`` to be passed to
 the file system implementation.
+
+
+.. raw:: html
+
+    <script data-goatcounter="https://distdatacats.goatcounter.com/count"
+        async src="//gc.zgo.at/count.js"></script>
diff --git a/docs/source/filesystems.rst b/docs/source/filesystems.rst
@@ -32,3 +32,9 @@ Similarly, providing an open function and another to make any necessary director
          row_group_offsets=[0, 500], open_with=myopen, mkdirs=noop)
 
 (In the case of s3, no intermediate directories need to be created)
+
+
+.. raw:: html
+
+    <script data-goatcounter="https://distdatacats.goatcounter.com/count"
+        async src="//gc.zgo.at/count.js"></script>
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -97,3 +97,9 @@ Index
 1. :ref:`genindex`
 1. :ref:`modindex`
 1. :ref:`search`
+
+
+.. raw:: html
+
+    <script data-goatcounter="https://distdatacats.goatcounter.com/count"
+        async src="//gc.zgo.at/count.js"></script>
diff --git a/docs/source/install.rst b/docs/source/install.rst
@@ -71,3 +71,9 @@ This will produce a ``build/html/`` subdirectory, where the entry point is
 ``index.html``.
 
 
+
+
+.. raw:: html
+
+    <script data-goatcounter="https://distdatacats.goatcounter.com/count"
+        async src="//gc.zgo.at/count.js"></script>
diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst
@@ -87,3 +87,9 @@ Further options that may be of interest are:
     write('outdir.parq', df, row_group_offsets=[0, 10000, 20000],
           compression='GZIP', file_scheme='hive')
 
+
+
+.. raw:: html
+
+    <script data-goatcounter="https://distdatacats.goatcounter.com/count"
+        async src="//gc.zgo.at/count.js"></script>
diff --git a/docs/source/releasenotes.rst b/docs/source/releasenotes.rst
@@ -188,3 +188,9 @@ NB: minor versions up to 0.6.3 fix build issues
    consolidate of many data files.
 
 .. _cramjam: https://github.com/milesgranger/pyrus-cramjam
+
+
+.. raw:: html
+
+    <script data-goatcounter="https://distdatacats.goatcounter.com/count"
+        async src="//gc.zgo.at/count.js"></script>
diff --git a/fastparquet/converted_types.py b/fastparquet/converted_types.py
@@ -56,11 +56,11 @@ def tobson(x):
     parquet_thrift.ConvertedType.INT_16: np.dtype("int16"),
     parquet_thrift.ConvertedType.INT_32: np.dtype('int32'),
     parquet_thrift.ConvertedType.INT_64: np.dtype('int64'),
-    parquet_thrift.ConvertedType.TIME_MILLIS: np.dtype('<m8[ns]'),
+    parquet_thrift.ConvertedType.TIME_MILLIS: np.dtype('<m8[ms]'),
     parquet_thrift.ConvertedType.DATE: np.dtype('<M8[ns]'),
-    parquet_thrift.ConvertedType.TIMESTAMP_MILLIS: np.dtype('<M8[ns]'),
-    parquet_thrift.ConvertedType.TIME_MICROS: np.dtype('<m8[ns]'),
-    parquet_thrift.ConvertedType.TIMESTAMP_MICROS: np.dtype('<M8[ns]')
+    parquet_thrift.ConvertedType.TIMESTAMP_MILLIS: np.dtype('<M8[ms]'),
+    parquet_thrift.ConvertedType.TIME_MICROS: np.dtype('<m8[us]'),
+    parquet_thrift.ConvertedType.TIMESTAMP_MICROS: np.dtype('<M8[us]')
 }
 nullable = {
     np.dtype('int8'): pd.Int8Dtype(),
@@ -115,6 +115,8 @@ def typemap(se, md=None):
             return simple[se.type]
         else:
             return np.dtype("S%i" % se.type_length)
+    if md and "time" in md.get("numpy_type", ""):
+        return np.dtype(md["numpy_type"])
     if se.converted_type in complex:
         return complex[se.converted_type]
     return np.dtype("O")
@@ -145,7 +147,7 @@ def converts_inplace(se):
     return False
 
 
-def convert(data, se, timestamp96=True):
+def convert(data, se, timestamp96=True, dtype=None):
     """Convert known types from primitive to rich.
 
     Parameters
@@ -157,6 +159,7 @@ def convert(data, se, timestamp96=True):
     ctype = se.converted_type
     if se.type == parquet_thrift.Type.INT96 and timestamp96:
         data2 = data.view([('ns', 'i8'), ('day', 'i4')])
+        # TODO: this should be ms unit, now that we can?
         return ((data2['day'] - 2440588) * 86400000000000 +
                 data2['ns']).view('M8[ns]')
     if se.logicalType is not None and se.logicalType.TIMESTAMP is not None:
@@ -179,6 +182,7 @@ def convert(data, se, timestamp96=True):
             # NB: general but slow method
             # could optimize when data.dtype.itemsize <= 8
             # TODO: easy cythonize (but rare)
+            # TODO: extension point for pandas-decimal (no conversion needed)
             return np.array([
                 int.from_bytes(
                     data.data[i:i + 1], byteorder='big', signed=True
@@ -189,21 +193,16 @@ def convert(data, se, timestamp96=True):
         data = data * DAYS_TO_MILLIS
         return data.view('datetime64[ns]')
     elif ctype == parquet_thrift.ConvertedType.TIME_MILLIS:
-        out = data.astype('int64', copy=False)
-        time_shift(out.view("int64"), 1000000)
-        return out.view('timedelta64[ns]')
+        # this was not covered by new pandas time units
+        data = data.astype('int64', copy=False)
+        time_shift(data, 1000000)
+        return data.view('timedelta64[ns]')
     elif ctype == parquet_thrift.ConvertedType.TIMESTAMP_MILLIS:
-        out = data
-        time_shift(data.view("int64"), 1000000)
-        return out.view('datetime64[ns]')
+        return data.view('datetime64[ms]')
     elif ctype == parquet_thrift.ConvertedType.TIME_MICROS:
-        out = data
-        time_shift(data.view("int64"))
-        return out.view('timedelta64[ns]')
+        return data.view('timedelta64[us]')
     elif ctype == parquet_thrift.ConvertedType.TIMESTAMP_MICROS:
-        out = data
-        time_shift(data.view("int64"))
-        return out.view('datetime64[ns]')
+        return data.view('datetime64[us]')
     elif ctype == parquet_thrift.ConvertedType.UINT_8:
         # TODO: return strided views?
         #  data.view('uint8')[::data.itemsize].view(out_dtype)

diff --git a/fastparquet/core.py b/fastparquet/core.py
@@ -548,7 +548,7 @@ def read_col(column, schema_helper, infile, use_cat=False,
             if d and not use_cat:
                 part[defi == max_defi] = dic[val]
             elif not use_cat:
-                part[defi == max_defi] = convert(val, se)
+                part[defi == max_defi] = convert(val, se, dtype=assign.dtype)
             else:
                 part[defi == max_defi] = val
         else:
@@ -557,7 +557,7 @@ def read_col(column, schema_helper, infile, use_cat=False,
                 piece = piece._data
             if use_cat and not d:
                 # only possible for multi-index
-                val = convert(val, se)
+                val = convert(val, se, dtype=assign.dtype)
                 try:
                     i = pd.Categorical(val)
                 except:
@@ -567,7 +567,7 @@ def read_col(column, schema_helper, infile, use_cat=False,
             elif d and not use_cat:
                 piece[:] = dic[val]
             elif not use_cat:
-                piece[:] = convert(val, se)
+                piece[:] = convert(val, se, dtype=assign.dtype)
             else:
                 piece[:] = val
 

diff --git a/fastparquet/writer.py b/fastparquet/writer.py
@@ -303,8 +303,12 @@ def convert(data, se):
 
     elif converted_type == parquet_thrift.ConvertedType.TIME_MICROS:
         # TODO: shift inplace
-        out = np.empty(len(data), 'int64')
-        time_shift(data.values.view('int64'), out)
+        if data.dtype == "m8[ns]":
+            out = np.empty(len(data), 'int64')
+            time_shift(data.values.view('int64'), out)
+        else:
+            # assuming ms or us
+            out = data.values
     elif type == parquet_thrift.Type.INT96 and dtype.kind == 'M':
         ns_per_day = (24 * 3600 * 1000000000)
         day = data.values.view('int64') // ns_per_day + 2440588

diff --git a/readthedocs.yml b/readthedocs.yml