Skip to content

Commit

Permalink
Fix datetime issue (#58)
Browse files Browse the repository at this point in the history
* Schema overwrite, and fix actions

* update

* update

* add one more test

* fix action

* fix datetime type handling in pyarrow and pandas

* update

* update
  • Loading branch information
goodwanghan committed Nov 6, 2020
1 parent c0cd69f commit be558f9
Show file tree
Hide file tree
Showing 7 changed files with 73 additions and 17 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ pip install triad

## Release History

### 0.4.5
* Fixed pandas -> arrow datetime conversion issue

### 0.4.4
* Improved FileSystem compatibility with Windows
* Added overwrite expression for Schema class
Expand Down
9 changes: 8 additions & 1 deletion tests/collections/test_schema.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import math
from collections import OrderedDict
from datetime import datetime
from datetime import datetime, date

import pandas as pd
import pyarrow as pa
Expand Down Expand Up @@ -33,6 +33,13 @@ def test_schema_init():
assert 0 == len(Schema(""))


def test_schema_datetime():
    """A pandas frame holding a datetime and a date yields ``a:datetime,b:date``."""
    row = [datetime(2020, 1, 1, 2, 3, 4, 5), date(2020, 2, 2)]
    frame = pd.DataFrame([row], columns=["a", "b"])
    inferred = Schema(frame)
    assert inferred == "a:datetime,b:date"


def test_schema_properties():
s = Schema("a:int,b:str")
assert ["a", "b"] == s.names
Expand Down
33 changes: 32 additions & 1 deletion tests/utils/test_pandas_like.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,20 @@ def test_to_schema():
df = df.astype(dtype={"x": np.int32, "y": np.dtype("str")})
assert list(pa.Schema.from_pandas(df)) == list(PD_UTILS.to_schema(df))

# timestamp test
df = pd.DataFrame(
[[datetime(2020, 1, 1, 2, 3, 4, 5), date(2020, 2, 2)]], columns=["a", "b"]
)
assert list(expression_to_schema("a:datetime,b:date")) == list(
PD_UTILS.to_schema(df)
)

# test index
df = pd.DataFrame([[3.0, 2], [2.0, 3]], columns=["x", "y"])
df = df.sort_values(["x"])
assert list(pa.Schema.from_pandas(df)) == list(PD_UTILS.to_schema(df))
assert list(pa.Schema.from_pandas(df, preserve_index=False)) == list(
PD_UTILS.to_schema(df)
)
df.index.name = "x"
raises(ValueError, lambda: PD_UTILS.to_schema(df))
df = df.reset_index(drop=True)
Expand Down Expand Up @@ -86,6 +96,27 @@ def test_as_array_iterable():
assert isinstance(df.as_array()[0][1], int)


def test_as_array_iterable_datetime():
    """Check the value types produced by ``as_array_iterable`` for datetime/date.

    The first row is extracted twice with ``type_safe=True``: once without a
    schema and once with the explicit schema ``a:datetime,b:date``. Both rows
    must agree in value and exact type, column ``a`` must be a ``datetime``,
    column ``b`` must be a ``date``, and neither may be a ``pd.Timestamp``.
    """
    frame = pd.DataFrame(
        [[datetime(2020, 1, 1, 2, 3, 4, 5), date(2020, 2, 2)]], columns=["a", "b"]
    )
    inferred_row = list(PD_UTILS.as_array_iterable(frame, type_safe=True))[0]
    explicit_schema = expression_to_schema("a:datetime,b:date")
    explicit_row = list(
        PD_UTILS.as_array_iterable(frame, schema=explicit_schema, type_safe=True)
    )[0]

    # Exact type comparison (not isinstance) so that subclasses of the
    # expected type are rejected as well.
    for pos, expected_type in enumerate((datetime, date)):
        plain, typed = inferred_row[pos], explicit_row[pos]
        assert plain == typed
        assert not isinstance(plain, pd.Timestamp)
        assert type(plain) == expected_type
        assert type(plain) == type(typed)


def test_nested():
# data = [[dict(b=[30, "40"])]]
# s = expression_to_schema("a:{a:str,b:[int]}")
Expand Down
7 changes: 5 additions & 2 deletions tests/utils/test_pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,10 @@ def test__type_to_expression():
assert "timestamp(ns,America/New_York)" == _type_to_expression(
pa.timestamp("ns", "America/New_York")
)
assert "timestamp(s)" == _type_to_expression(pa.timestamp("s"))
assert "datetime" == _type_to_expression(pa.timestamp("s"))
assert "datetime" == _type_to_expression(pa.timestamp("ns"))
assert "datetime" == _type_to_expression(pa.timestamp("ms"))
assert "datetime" == _type_to_expression(pa.timestamp("us"))
assert "decimal(5)" == _type_to_expression(pa.decimal128(5))
assert "decimal(5,2)" == _type_to_expression(pa.decimal128(5, 2))
assert "bytes" == _type_to_expression(pa.binary())
Expand Down Expand Up @@ -153,7 +156,7 @@ def test_get_eq_func():
for n in [None, float("nan"), float("inf"), float("-inf")]:
assert not get_eq_func(t)(None, 1.1)
assert get_eq_func(t)(None, None)
for t in [pa.timestamp("ns")]:
for t in [pa.timestamp("ns"), pa.timestamp("us")]:
for n in [None, pd.NaT]:
assert not get_eq_func(t)(datetime(2020, 1, 1, 0), datetime(2020, 1, 1, 1))
assert not get_eq_func(t)(n, datetime(2020, 1, 1, 1))
Expand Down
34 changes: 23 additions & 11 deletions triad/utils/pandas_like.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import numpy as np
import pandas as pd
import pyarrow as pa
from triad.utils.pyarrow import apply_schema, to_pandas_dtype
from triad.utils.assertion import assert_or_throw
from triad.utils.pyarrow import TRIAD_DEFAULT_TIMESTAMP, apply_schema, to_pandas_dtype

T = TypeVar("T", bound=Any)
_DEFAULT_JOIN_KEYS: List[str] = []
Expand Down Expand Up @@ -94,18 +95,29 @@ def to_schema(self, df: T) -> pa.Schema:
or pd.UInt64Index and without a name, otherwise, `ValueError` will raise.
"""
self.ensure_compatible(df)
if df.columns.dtype != "object":
raise ValueError("Pandas dataframe must have named schema")
if isinstance(df, pd.DataFrame) and len(df.index) > 0:
return pa.Schema.from_pandas(df)
assert_or_throw(
df.columns.dtype == "object",
ValueError("Pandas dataframe must have named schema"),
)

def get_fields() -> Iterable[pa.Field]:
if isinstance(df, pd.DataFrame) and len(df.index) > 0:
yield from pa.Schema.from_pandas(df, preserve_index=False)
else:
for i in range(df.shape[1]):
tp = df.dtypes[i]
if tp == np.dtype("object") or tp == np.dtype(str):
t = pa.string()
else:
t = pa.from_numpy_dtype(tp)
yield pa.field(df.columns[i], t)

fields: List[pa.Field] = []
for i in range(df.shape[1]):
tp = df.dtypes[i]
if tp == np.dtype("object") or tp == np.dtype(str):
t = pa.string()
for field in get_fields():
if pa.types.is_timestamp(field.type):
fields.append(pa.field(field.name, TRIAD_DEFAULT_TIMESTAMP))
else:
t = pa.from_numpy_dtype(tp)
fields.append(pa.field(df.columns[i], t))
fields.append(field)
return pa.schema(fields)

def enforce_type(self, df: T, schema: pa.Schema, null_safe: bool = False) -> T:
Expand Down
2 changes: 1 addition & 1 deletion triad/utils/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,7 @@ def _type_to_expression(dt: pa.DataType) -> str:
return _TYPE_EXPRESSION_R_MAPPING[dt]
if isinstance(dt, pa.TimestampType):
if dt.tz is None:
return f"timestamp({dt.unit})"
return "datetime"
else:
return f"timestamp({dt.unit},{dt.tz})"
if isinstance(dt, pa.Decimal128Type):
Expand Down
2 changes: 1 addition & 1 deletion triad_version/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.4.4"
__version__ = "0.4.5"

0 comments on commit be558f9

Please sign in to comment.