If timezones are in pandas metadata, assign columns as required (#285)

* If timezones are in pandas metadata, assign columns as required. Fixes #257
* fixes
dask · Jan 23, 2018 · bbf6d68 · bbf6d68
1 parent 96b8b0e
commit bbf6d68
Show file tree

Hide file tree

Showing 4 changed files with 76 additions and 13 deletions.
diff --git a/fastparquet/api.py b/fastparquet/api.py
@@ -409,8 +409,13 @@ def to_pandas(self, columns=None, categories=None, filters=[],
     def pre_allocate(self, size, columns, categories, index):
         if categories is None:
             categories = self.categories
+        tz = None  # stays None when there is no 'pandas' metadata key
+        if 'pandas' in self.key_value_metadata:
+            md = json.loads(self.key_value_metadata['pandas'])['columns']
+            tz = {c['name']: c['metadata']['timezone'] for c in md  # name->tz
+                  if (c.get('metadata', {}) or {}).get('timezone', None)}
         return _pre_allocate(size, columns, categories, index, self.cats,
-                             self._dtypes(categories))
+                             self._dtypes(categories), tz)
 
     @property
     def count(self):
@@ -484,7 +489,7 @@ def __str__(self):
     __repr__ = __str__
 
 
-def _pre_allocate(size, columns, categories, index, cs, dt):
+def _pre_allocate(size, columns, categories, index, cs, dt, tz=None):
     cols = [c for c in columns if index != c]
     categories = categories or {}
     cats = cs.copy()
@@ -501,7 +506,7 @@ def get_type(name):
     cols.extend(cs)
     dtypes.extend(['category'] * len(cs))
     df, views = dataframe.empty(dtypes, size, cols=cols, index_name=index,
-                                index_type=index_type, cats=cats)
+                                index_type=index_type, cats=cats, timezones=tz)
     if index and re.match(r'__index_level_\d+__', index):
         df.index.name = None
     return df, views

diff --git a/fastparquet/dataframe.py b/fastparquet/dataframe.py
@@ -1,14 +1,13 @@
 import numpy as np
-from pandas.core.index import _ensure_index, CategoricalIndex, Index
-from pandas.core.internals import BlockManager, _block_shape
-from pandas import Categorical
-from pandas.core.frame import DataFrame
-from pandas.core.index import RangeIndex, Index
+from pandas.core.index import CategoricalIndex, RangeIndex, Index
+from pandas.core.internals import BlockManager
+from pandas import Categorical, DataFrame
 from pandas.api.types import is_categorical_dtype
 from .util import STR_TYPE
 
 
-def empty(types, size, cats=None, cols=None, index_type=None, index_name=None):
+def empty(types, size, cats=None, cols=None, index_type=None, index_name=None,
+          timezones=None):
     """
     Create empty DataFrame to assign into
 
@@ -27,6 +26,9 @@ def empty(types, size, cats=None, cols=None, index_type=None, index_name=None):
         is missing, will assume 16-bit integers (a reasonable default).
     cols: list of labels
         assigned column names, including categorical ones.
+    timezones: dict {col: timezone_str}
+        for timestamp type columns, apply this timezone to the pandas series;
+        the numpy view will be UTC.
 
     Returns
     -------
@@ -36,6 +38,7 @@ def empty(types, size, cats=None, cols=None, index_type=None, index_name=None):
     """
     df = DataFrame()
     views = {}
+    timezones = timezones or {}
 
     cols = cols if cols is not None else range(cols)
     if isinstance(types, STR_TYPE):
@@ -55,6 +58,8 @@ def empty(types, size, cats=None, cols=None, index_type=None, index_name=None):
                                            fastpath=True)
         else:
             df[str(col)] = np.empty(0, dtype=t)
+            if df[str(col)].dtype.kind == "M" and str(col) in timezones:
+                df[str(col)] = df[str(col)].dt.tz_localize(timezones[str(col)])
 
     if index_type is not None and index_type is not False:
         if index_name is None:
@@ -70,7 +75,7 @@ def empty(types, size, cats=None, cols=None, index_type=None, index_name=None):
                         fastpath=True)
             else:  # explicit labels list
                 c = Categorical([], categories=cats[index_name],
-                                           fastpath=True)
+                                fastpath=True)
             print(cats, index_name, c)
             vals = np.empty(size, dtype=c.codes.dtype)
             index = CategoricalIndex(c)
@@ -92,11 +97,17 @@ def empty(types, size, cats=None, cols=None, index_type=None, index_name=None):
             code = np.zeros(shape=size, dtype=block.values.codes.dtype)
             values = Categorical(values=code, categories=categories,
                                  fastpath=True)
+            new_block = block.make_block_same_class(values=values)
+        elif getattr(block.dtype, 'tz', None):
+            new_shape = (size, )
+            values = np.empty(shape=new_shape, dtype=block.values.values.dtype)
+            new_block = block.make_block_same_class(
+                    values=values, dtype=block.values.dtype)
         else:
             new_shape = (block.values.shape[0], size)
             values = np.empty(shape=new_shape, dtype=block.values.dtype)
+            new_block = block.make_block_same_class(values=values)
 
-        new_block = block.make_block_same_class(values=values)
         blocks.append(new_block)
 
     # create block manager
@@ -113,6 +124,8 @@ def empty(types, size, cats=None, cols=None, index_type=None, index_name=None):
             if is_categorical_dtype(dtype):
                 views[col] = block.values._codes
                 views[col+'-catdef'] = block.values
+            elif getattr(block.dtype, 'tz', None):
+                views[col] = block.values.values
             else:
                 views[col] = block.values[i]
 

diff --git a/fastparquet/test/test_dataframe.py b/fastparquet/test/test_dataframe.py
@@ -23,3 +23,50 @@ def test_empty():
                       cols=['i4', 'i8', 'f8_1', 'f8_2', 'O'])
     assert df.shape == (n, 5)
     assert len(views) == 5
+
+
+def test_timestamps():
+    """empty() localizes only datetime columns named in ``timezones``."""
+    z = 'US/Eastern'
+    # single column
+    df, views = empty('M8', 100, cols=['t'])
+    assert df.t.dt.tz is None
+    assert views['t'].dtype.kind == "M"
+
+    df, views = empty('M8', 100, cols=['t'], timezones={'t': z})
+    assert df.t.dt.tz.zone == z
+    assert views['t'].dtype.kind == "M"
+
+    # one time column, one normal
+    df, views = empty('M8,i', 100, cols=['t', 'i'], timezones={'t': z})
+    assert df.t.dt.tz.zone == z
+    assert views['t'].dtype.kind == "M"
+    assert views['i'].dtype.kind == 'i'
+
+    # no effect of timezones= on non-time column
+    df, views = empty('M8,i', 100, cols=['t', 'i'], timezones={'t': z, 'i': z})
+    assert df.t.dt.tz.zone == z
+    assert df.i.dtype.kind == 'i'
+    assert views['t'].dtype.kind == "M"
+    assert views['i'].dtype.kind == 'i'
+
+    # multi-timezones
+    z2 = 'US/Central'
+    df, views = empty('M8,M8', 100, cols=['t1', 't2'], timezones={'t1': z,
+                                                                  't2': z})
+    assert df.t1.dt.tz.zone == z
+    assert df.t2.dt.tz.zone == z
+
+    df, views = empty('M8,M8', 100, cols=['t1', 't2'], timezones={'t1': z})
+    assert df.t1.dt.tz.zone == z
+    assert df.t2.dt.tz is None
+
+    df, views = empty('M8,M8', 100, cols=['t1', 't2'], timezones={'t1': z,
+                                                                  't2': 'UTC'})
+    assert df.t1.dt.tz.zone == z
+    assert df.t2.dt.tz.zone == 'UTC'
+
+    df, views = empty('M8,M8', 100, cols=['t1', 't2'], timezones={'t1': z,
+                                                                  't2': z2})
+    assert df.t1.dt.tz.zone == z
+    assert df.t2.dt.tz.zone == z2
diff --git a/fastparquet/test/test_output.py b/fastparquet/test/test_output.py
@@ -187,8 +187,6 @@ def test_datetime_roundtrip(tempdir, df, capsys):
         assert "UTC" in str(w.list[0].message)
 
     df2 = r.to_pandas()
-    if 'x' in df:
-        df['x'] = df.x.dt.tz_convert(None)
 
     pd.util.testing.assert_frame_equal(df, df2, check_categorical=False)