Fix datetime issue (#58)

* Schema overwrite, and fix actions
* update
* update
* add one more test
* fix action
* fix datetime type handling in pyarrow and pandas
* update
* update
fugue-project · Nov 6, 2020 · be558f9 · be558f9
1 parent c0cd69f
commit be558f9
Show file tree

Hide file tree

Showing 7 changed files with 73 additions and 17 deletions.
diff --git a/README.md b/README.md
@@ -19,6 +19,9 @@ pip install triad
 
 ## Release History
 
+### 0.4.5
+* Fixed pandas -> arrow datetime conversion issue
+
 ### 0.4.4
 * Improved FileSystem compatibility with Windows
 * Add overwrite expression for Schema class

diff --git a/tests/collections/test_schema.py b/tests/collections/test_schema.py
@@ -1,6 +1,6 @@
 import math
 from collections import OrderedDict
-from datetime import datetime
+from datetime import datetime, date
 
 import pandas as pd
 import pyarrow as pa
@@ -33,6 +33,13 @@ def test_schema_init():
     assert 0 == len(Schema(""))
 
 
+def test_schema_datetime():
+    df = pd.DataFrame(
+        [[datetime(2020, 1, 1, 2, 3, 4, 5), date(2020, 2, 2)]], columns=["a", "b"]
+    )
+    assert Schema(df) == "a:datetime,b:date"
+
+
 def test_schema_properties():
     s = Schema("a:int,b:str")
     assert ["a", "b"] == s.names

diff --git a/tests/utils/test_pandas_like.py b/tests/utils/test_pandas_like.py
@@ -33,10 +33,20 @@ def test_to_schema():
     df = df.astype(dtype={"x": np.int32, "y": np.dtype("str")})
     assert list(pa.Schema.from_pandas(df)) == list(PD_UTILS.to_schema(df))
 
+    # timestamp test
+    df = pd.DataFrame(
+        [[datetime(2020, 1, 1, 2, 3, 4, 5), date(2020, 2, 2)]], columns=["a", "b"]
+    )
+    assert list(expression_to_schema("a:datetime,b:date")) == list(
+        PD_UTILS.to_schema(df)
+    )
+
     # test index
     df = pd.DataFrame([[3.0, 2], [2.0, 3]], columns=["x", "y"])
     df = df.sort_values(["x"])
-    assert list(pa.Schema.from_pandas(df)) == list(PD_UTILS.to_schema(df))
+    assert list(pa.Schema.from_pandas(df, preserve_index=False)) == list(
+        PD_UTILS.to_schema(df)
+    )
     df.index.name = "x"
     raises(ValueError, lambda: PD_UTILS.to_schema(df))
     df = df.reset_index(drop=True)
@@ -86,6 +96,27 @@ def test_as_array_iterable():
     assert isinstance(df.as_array()[0][1], int)
 
 
+def test_as_array_iterable_datetime():
+    df = pd.DataFrame(
+        [[datetime(2020, 1, 1, 2, 3, 4, 5), date(2020, 2, 2)]], columns=["a", "b"]
+    )
+    v1 = list(PD_UTILS.as_array_iterable(df, type_safe=True))[0]
+    v2 = list(
+        PD_UTILS.as_array_iterable(
+            df, schema=expression_to_schema("a:datetime,b:date"), type_safe=True
+        )
+    )[0]
+    assert v1[0] == v2[0]
+    assert not isinstance(v1[0], pd.Timestamp)
+    assert type(v1[0]) == datetime
+    assert type(v1[0]) == type(v2[0])
+
+    assert v1[1] == v2[1]
+    assert not isinstance(v1[1], pd.Timestamp)
+    assert type(v1[1]) == date
+    assert type(v1[1]) == type(v2[1])
+
+
 def test_nested():
     # data = [[dict(b=[30, "40"])]]
     # s = expression_to_schema("a:{a:str,b:[int]}")

diff --git a/tests/utils/test_pyarrow.py b/tests/utils/test_pyarrow.py
@@ -74,7 +74,10 @@ def test__type_to_expression():
     assert "timestamp(ns,America/New_York)" == _type_to_expression(
         pa.timestamp("ns", "America/New_York")
     )
-    assert "timestamp(s)" == _type_to_expression(pa.timestamp("s"))
+    assert "datetime" == _type_to_expression(pa.timestamp("s"))
+    assert "datetime" == _type_to_expression(pa.timestamp("ns"))
+    assert "datetime" == _type_to_expression(pa.timestamp("ms"))
+    assert "datetime" == _type_to_expression(pa.timestamp("us"))
     assert "decimal(5)" == _type_to_expression(pa.decimal128(5))
     assert "decimal(5,2)" == _type_to_expression(pa.decimal128(5, 2))
     assert "bytes" == _type_to_expression(pa.binary())
@@ -153,7 +156,7 @@ def test_get_eq_func():
         for n in [None, float("nan"), float("inf"), float("-inf")]:
             assert not get_eq_func(t)(None, 1.1)
             assert get_eq_func(t)(None, None)
-    for t in [pa.timestamp("ns")]:
+    for t in [pa.timestamp("ns"), pa.timestamp("us")]:
         for n in [None, pd.NaT]:
             assert not get_eq_func(t)(datetime(2020, 1, 1, 0), datetime(2020, 1, 1, 1))
             assert not get_eq_func(t)(n, datetime(2020, 1, 1, 1))

diff --git a/triad/utils/pandas_like.py b/triad/utils/pandas_like.py
@@ -4,7 +4,8 @@
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-from triad.utils.pyarrow import apply_schema, to_pandas_dtype
+from triad.utils.assertion import assert_or_throw
+from triad.utils.pyarrow import TRIAD_DEFAULT_TIMESTAMP, apply_schema, to_pandas_dtype
 
 T = TypeVar("T", bound=Any)
 _DEFAULT_JOIN_KEYS: List[str] = []
@@ -94,18 +95,29 @@ def to_schema(self, df: T) -> pa.Schema:
         or pd.UInt64Index and without a name, otherwise, `ValueError` will raise.
         """
         self.ensure_compatible(df)
-        if df.columns.dtype != "object":
-            raise ValueError("Pandas dataframe must have named schema")
-        if isinstance(df, pd.DataFrame) and len(df.index) > 0:
-            return pa.Schema.from_pandas(df)
+        assert_or_throw(
+            df.columns.dtype == "object",
+            ValueError("Pandas dataframe must have named schema"),
+        )
+
+        def get_fields() -> Iterable[pa.Field]:
+            if isinstance(df, pd.DataFrame) and len(df.index) > 0:
+                yield from pa.Schema.from_pandas(df, preserve_index=False)
+            else:
+                for i in range(df.shape[1]):
+                    tp = df.dtypes[i]
+                    if tp == np.dtype("object") or tp == np.dtype(str):
+                        t = pa.string()
+                    else:
+                        t = pa.from_numpy_dtype(tp)
+                    yield pa.field(df.columns[i], t)
+
         fields: List[pa.Field] = []
-        for i in range(df.shape[1]):
-            tp = df.dtypes[i]
-            if tp == np.dtype("object") or tp == np.dtype(str):
-                t = pa.string()
+        for field in get_fields():
+            if pa.types.is_timestamp(field.type):
+                fields.append(pa.field(field.name, TRIAD_DEFAULT_TIMESTAMP))
             else:
-                t = pa.from_numpy_dtype(tp)
-            fields.append(pa.field(df.columns[i], t))
+                fields.append(field)
         return pa.schema(fields)
 
     def enforce_type(self, df: T, schema: pa.Schema, null_safe: bool = False) -> T:

diff --git a/triad/utils/pyarrow.py b/triad/utils/pyarrow.py
@@ -309,7 +309,7 @@ def _type_to_expression(dt: pa.DataType) -> str:
         return _TYPE_EXPRESSION_R_MAPPING[dt]
     if isinstance(dt, pa.TimestampType):
         if dt.tz is None:
-            return f"timestamp({dt.unit})"
+            return "datetime"
         else:
             return f"timestamp({dt.unit},{dt.tz})"
     if isinstance(dt, pa.Decimal128Type):

diff --git a/triad_version/__init__.py b/triad_version/__init__.py
@@ -1 +1 @@
-__version__ = "0.4.4"
+__version__ = "0.4.5"