Skip to content

Commit

Permalink
If timezones are in pandas metadata, assign columns as required (#285)
Browse files Browse the repository at this point in the history
* If timezones are in pandas metadata, assign columns as required

Fixes #257

* fixes
  • Loading branch information
martindurant committed Jan 23, 2018
1 parent 96b8b0e commit bbf6d68
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 13 deletions.
11 changes: 8 additions & 3 deletions fastparquet/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,8 +409,13 @@ def to_pandas(self, columns=None, categories=None, filters=[],
def pre_allocate(self, size, columns, categories, index):
if categories is None:
categories = self.categories
tz = None
if 'pandas' in self.key_value_metadata:
md = json.loads(self.key_value_metadata['pandas'])['columns']
tz = {c['name']: c['metadata']['timezone'] for c in md
if (c.get('metadata', {}) or {}).get('timezone', None)}
return _pre_allocate(size, columns, categories, index, self.cats,
self._dtypes(categories))
self._dtypes(categories), tz)

@property
def count(self):
Expand Down Expand Up @@ -484,7 +489,7 @@ def __str__(self):
__repr__ = __str__


def _pre_allocate(size, columns, categories, index, cs, dt):
def _pre_allocate(size, columns, categories, index, cs, dt, tz=None):
cols = [c for c in columns if index != c]
categories = categories or {}
cats = cs.copy()
Expand All @@ -501,7 +506,7 @@ def get_type(name):
cols.extend(cs)
dtypes.extend(['category'] * len(cs))
df, views = dataframe.empty(dtypes, size, cols=cols, index_name=index,
index_type=index_type, cats=cats)
index_type=index_type, cats=cats, timezones=tz)
if index and re.match(r'__index_level_\d+__', index):
df.index.name = None
return df, views
Expand Down
29 changes: 21 additions & 8 deletions fastparquet/dataframe.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
import numpy as np
from pandas.core.index import _ensure_index, CategoricalIndex, Index
from pandas.core.internals import BlockManager, _block_shape
from pandas import Categorical
from pandas.core.frame import DataFrame
from pandas.core.index import RangeIndex, Index
from pandas.core.index import CategoricalIndex, RangeIndex, Index
from pandas.core.internals import BlockManager
from pandas import Categorical, DataFrame
from pandas.api.types import is_categorical_dtype
from .util import STR_TYPE


def empty(types, size, cats=None, cols=None, index_type=None, index_name=None):
def empty(types, size, cats=None, cols=None, index_type=None, index_name=None,
timezones=None):
"""
Create empty DataFrame to assign into
Expand All @@ -27,6 +26,9 @@ def empty(types, size, cats=None, cols=None, index_type=None, index_name=None):
is missing, will assume 16-bit integers (a reasonable default).
cols: list of labels
assigned column names, including categorical ones.
timezones: dict {col: timezone_str}
for timestamp type columns, apply this timezone to the pandas series;
the numpy view will be UTC.
Returns
-------
Expand All @@ -36,6 +38,7 @@ def empty(types, size, cats=None, cols=None, index_type=None, index_name=None):
"""
df = DataFrame()
views = {}
timezones = timezones or {}

cols = cols if cols is not None else range(cols)
if isinstance(types, STR_TYPE):
Expand All @@ -55,6 +58,8 @@ def empty(types, size, cats=None, cols=None, index_type=None, index_name=None):
fastpath=True)
else:
df[str(col)] = np.empty(0, dtype=t)
if df[str(col)].dtype.kind == "M" and str(col) in timezones:
df[str(col)] = df[str(col)].dt.tz_localize(timezones[str(col)])

if index_type is not None and index_type is not False:
if index_name is None:
Expand All @@ -70,7 +75,7 @@ def empty(types, size, cats=None, cols=None, index_type=None, index_name=None):
fastpath=True)
else: # explicit labels list
c = Categorical([], categories=cats[index_name],
fastpath=True)
fastpath=True)
print(cats, index_name, c)
vals = np.empty(size, dtype=c.codes.dtype)
index = CategoricalIndex(c)
Expand All @@ -92,11 +97,17 @@ def empty(types, size, cats=None, cols=None, index_type=None, index_name=None):
code = np.zeros(shape=size, dtype=block.values.codes.dtype)
values = Categorical(values=code, categories=categories,
fastpath=True)
new_block = block.make_block_same_class(values=values)
elif getattr(block.dtype, 'tz', None):
new_shape = (size, )
values = np.empty(shape=new_shape, dtype=block.values.values.dtype)
new_block = block.make_block_same_class(
values=values, dtype=block.values.dtype)
else:
new_shape = (block.values.shape[0], size)
values = np.empty(shape=new_shape, dtype=block.values.dtype)
new_block = block.make_block_same_class(values=values)

new_block = block.make_block_same_class(values=values)
blocks.append(new_block)

# create block manager
Expand All @@ -113,6 +124,8 @@ def empty(types, size, cats=None, cols=None, index_type=None, index_name=None):
if is_categorical_dtype(dtype):
views[col] = block.values._codes
views[col+'-catdef'] = block.values
elif getattr(block.dtype, 'tz', None):
views[col] = block.values.values
else:
views[col] = block.values[i]

Expand Down
47 changes: 47 additions & 0 deletions fastparquet/test/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,50 @@ def test_empty():
cols=['i4', 'i8', 'f8_1', 'f8_2', 'O'])
assert df.shape == (n, 5)
assert len(views) == 5


def test_timestamps():
    """Check that ``empty`` applies per-column timezones to datetime columns.

    For each combination the pandas column should carry the requested
    timezone (or none), while the corresponding entry in ``views`` stays a
    plain numpy array of the expected kind.  Non-datetime columns must be
    unaffected by an entry in ``timezones``.
    """
    z = 'US/Eastern'

    # single column, no timezone requested
    df, views = empty('M8', 100, cols=['t'])
    assert df.t.dt.tz is None
    assert views['t'].dtype.kind == "M"

    # single column with a timezone
    df, views = empty('M8', 100, cols=['t'], timezones={'t': z})
    assert df.t.dt.tz.zone == z
    assert views['t'].dtype.kind == "M"

    # one time column, one normal
    df, views = empty('M8,i', 100, cols=['t', 'i'], timezones={'t': z})
    assert df.t.dt.tz.zone == z
    assert views['t'].dtype.kind == "M"
    assert views['i'].dtype.kind == 'i'

    # no effect of timezones= on non-time column
    df, views = empty('M8,i', 100, cols=['t', 'i'],
                      timezones={'t': z, 'i': z})
    assert df.t.dt.tz.zone == z
    assert df.i.dtype.kind == 'i'
    assert views['t'].dtype.kind == "M"
    assert views['i'].dtype.kind == 'i'

    # multi-timezones
    z2 = 'US/Central'

    # both columns share the same zone
    df, views = empty('M8,M8', 100, cols=['t1', 't2'],
                      timezones={'t1': z, 't2': z})
    assert df.t1.dt.tz.zone == z
    assert df.t2.dt.tz.zone == z

    # only one of the two columns is localized
    df, views = empty('M8,M8', 100, cols=['t1', 't2'], timezones={'t1': z})
    assert df.t1.dt.tz.zone == z
    assert df.t2.dt.tz is None

    # explicit UTC alongside a non-UTC zone
    df, views = empty('M8,M8', 100, cols=['t1', 't2'],
                      timezones={'t1': z, 't2': 'UTC'})
    assert df.t1.dt.tz.zone == z
    assert df.t2.dt.tz.zone == 'UTC'

    # two different non-UTC zones
    df, views = empty('M8,M8', 100, cols=['t1', 't2'],
                      timezones={'t1': z, 't2': z2})
    assert df.t1.dt.tz.zone == z
    assert df.t2.dt.tz.zone == z2
2 changes: 0 additions & 2 deletions fastparquet/test/test_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,8 +187,6 @@ def test_datetime_roundtrip(tempdir, df, capsys):
assert "UTC" in str(w.list[0].message)

df2 = r.to_pandas()
if 'x' in df:
df['x'] = df.x.dt.tz_convert(None)

pd.util.testing.assert_frame_equal(df, df2, check_categorical=False)

Expand Down

0 comments on commit bbf6d68

Please sign in to comment.