Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NaN, NaT -> None for pyarrow and pandas output #38

Merged
merged 13 commits into from
May 27, 2020
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ pip install triad

## Release History

### 0.3.5
* Change pyarrow and pandas `type_safe` output to be consistent with pyarrow (`None` for `pd.NaT`, `NaN`, etc.)

### 0.3.4
* Add general FileSystem

Expand Down
53 changes: 23 additions & 30 deletions tests/utils/test_pandas_like.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def test_as_array_iterable():
assert [["a", 1]] == df.as_array()
assert [["a", 1]] == df.as_array(["a", "b"])
assert [[1, "a"]] == df.as_array(["b", "a"])
assert [[1, "a"]] == df.as_array(["b", "a"], null_schema=True)

# prevent pandas auto type casting
df = DF([[1.0, 1.1]], "a:double,b:int")
Expand All @@ -70,15 +71,13 @@ def test_as_array_iterable():
assert isinstance(df.as_array()[0][1], int)

df = DF([[pd.Timestamp("2020-01-01"), 1.1]], "a:datetime,b:int")
df.native["a"] = pd.to_datetime(df.native["a"])
assert [[datetime(2020, 1, 1), 1]] == df.as_array()
assert isinstance(df.as_array()[0][0], datetime)
assert isinstance(df.as_array()[0][1], int)
assert isinstance(df.as_array(type_safe=True)[0][0], datetime)
assert isinstance(df.as_array(type_safe=True)[0][1], int)

df = DF([[pd.NaT, 1.1]], "a:datetime,b:int")
df.native["a"] = pd.to_datetime(df.native["a"])
assert isinstance(df.as_array()[0][0], datetime)
assert isinstance(df.as_array()[0][1], int)
assert df.as_array(type_safe=True)[0][0] is None
assert isinstance(df.as_array(type_safe=True)[0][1], int)

df = DF([[1.0, 1.1]], "a:double,b:int")
assert [[1.0, 1]] == df.as_array(type_safe=True)
Expand All @@ -87,35 +86,36 @@ def test_as_array_iterable():


def test_nested():
    """Nested struct and list types should round-trip through as_array
    with type_safe=True, coercing inner values to the schema types."""
    # A list of structs: the missing "a" key becomes None, "40" is cast to int.
    data = [[[json.dumps(dict(b=[30, "40"]))]]]
    df = DF(data, "a:[{a:str,b:[int]}]")
    a = df.as_array(type_safe=True)
    assert [[[dict(a=None, b=[30, 40])]]] == a

    # A plain list of ints: the string "1" is cast to int.
    data = [[json.dumps(["1", 2])]]
    df = DF(data, "a:[int]")
    a = df.as_array(type_safe=True)
    assert [[[1, 2]]] == a


def test_nan_none():
df = DF([[None, None]], "b:str,c:double", True)
assert df.native.iloc[0, 0] is None
arr = df.as_array(null_safe=True)[0]
arr = df.as_array(type_safe=True)[0]
assert arr[0] is None
assert math.isnan(arr[1])
assert arr[1] is None

df = DF([[None, None]], "b:int,c:bool", True)
arr = df.as_array(type_safe=True)[0]
assert np.isnan(arr[0]) # TODO: this will cause inconsistent behavior cross engine
assert np.isnan(arr[1]) # TODO: this will cause inconsistent behavior cross engine

df = DF([["a", 1.1], [None, None]], "b:str,c:double", True)
arr = df.as_array()[1]
assert arr[0] is None
assert math.isnan(arr[1])
assert arr[1] is None

df = DF([], "b:str,c:double", True)
assert len(df.as_array()) == 0
Expand Down Expand Up @@ -156,19 +156,12 @@ def __init__(self, data, schema, enforce=False):
s = expression_to_schema(schema)
df = pd.DataFrame(data, columns=s.names)
self.native = PD_UTILS.enforce_type(df, s, enforce)
self.schema = s

def as_array(self, cols=None, type_safe=False, null_schema=False):
    """Return the dataframe rows as a list of lists.

    :param cols: subset of column names to output, None for all columns
    :param type_safe: whether to enforce the types in the stored schema;
        if False, the original dataframe values are returned
    :param null_schema: if True, pass no schema to the iterable so the
        schema is inferred from the dataframe instead
    :return: list of rows, each row a list of values
    """
    # Use the schema captured at construction time unless the caller
    # explicitly asks for schema inference (null_schema=True).
    schema = None if null_schema else self.schema
    return list(
        PD_UTILS.as_array_iterable(
            self.native, schema=schema, columns=cols, type_safe=type_safe
        )
    )
16 changes: 9 additions & 7 deletions tests/utils/test_pyarrow_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,9 @@ def test_convert_to_double():
_test_convert("1.1", "double", 1.1)
_test_convert(pd.NaT, "double", None)
_assert_raise(pdt, "double")
_test_convert(FLOAT_NAN, "double", FLOAT_NAN)
_test_convert("nan", "double", FLOAT_NAN)
_test_convert("NaN", "double", FLOAT_NAN)
_test_convert(FLOAT_NAN, "double", None)
_test_convert("nan", "double", None)
_test_convert("NaN", "double", None)
_test_convert(FLOAT_INF, "double", FLOAT_INF)
_test_convert("inf", "double", FLOAT_INF)
_test_convert("INF", "double", FLOAT_INF)
Expand All @@ -95,6 +95,8 @@ def test_convert_to_bool():
pdt = pd.Timestamp("2020-01-01T02:03:04")

_test_convert(None, "bool", None)
_test_convert(True, "bool", True)
_test_convert(False, "bool", False)
_test_convert("true", "bool", True)
_test_convert("True", "bool", True)
_test_convert("false", "bool", False)
Expand All @@ -109,11 +111,11 @@ def test_convert_to_datetime():
pdt = pd.Timestamp("2020-01-01T02:03:04")
dt = datetime(2020, 1, 1, 2, 3, 4)
d = date(2020, 1, 1)
_test_convert(None, "datetime", pd.NaT)
_test_convert(None, "datetime", None)
_assert_raise("1", "datetime")
_test_convert("2020-01-01 02:03:04", "datetime", dt)
_test_convert("2020-01-01", "datetime", datetime(2020, 1, 1))
_test_convert(pd.NaT, "datetime", pd.NaT)
_test_convert(pd.NaT, "datetime", None)
_test_convert(pdt, "datetime", dt)
assert isinstance(_to_pydatetime(pdt), datetime)
assert not isinstance(_to_pydatetime(pdt), pd.Timestamp)
Expand All @@ -126,11 +128,11 @@ def test_convert_to_date():
pdt = pd.Timestamp("2020-01-01T02:03:04")
dt = datetime(2020, 1, 1, 2, 3, 4)
d = date(2020, 1, 1)
_test_convert(None, "date", pd.NaT)
_test_convert(None, "date", None)
_assert_raise("1", "date")
_test_convert("2020-01-01 02:03:04", "date", d)
_test_convert("2020-01-01", "date", d)
_test_convert(pd.NaT, "date", pd.NaT)
_test_convert(pd.NaT, "date", None)
_test_convert(pdt, "date", d)
assert isinstance(_to_pydate(pdt), date)
assert not isinstance(_to_pydate(pdt), pd.Timestamp)
Expand Down
2 changes: 1 addition & 1 deletion triad/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.3.4"
__version__ = "0.3.5"
42 changes: 27 additions & 15 deletions triad/utils/pandas_like.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,41 +17,53 @@ def empty(self, df: T) -> bool:
"""
return len(df.index) == 0

def as_arrow(self, df: T, schema: Optional[pa.Schema] = None) -> pa.Table:
    """Convert a pandas-like dataframe to a pyarrow table.

    :param df: pandas like dataframe
    :param schema: schema used to construct the pyarrow table; None
        (the default) lets pyarrow infer it from the dataframe
    :return: pyarrow table
    """
    # preserve_index/safe mirror the library's lenient conversion policy.
    return pa.Table.from_pandas(
        df, schema=schema, preserve_index=False, safe=False
    )

def as_array_iterable(
self,
df: T,
schema: Optional[pa.Schema] = None,
columns: Optional[List[str]] = None,
type_safe: bool = False,
null_safe: bool = False,
) -> Iterable[List[Any]]:
"""Convert pandas like dataframe to iterable of rows in the format of list.

:param df: pandas like dataframe
:param schema: columns and types for output.
Leave it None to return all columns in original type
:param type_safe: whether to enforce the types in schema, if not, it will
:param schema: schema of the input. With None, it will infer the schema,
it can't infer wrong schema for nested types, so try to be explicit
:param columns: columns to output, None for all columns
:param type_safe: whether to enforce the types in schema, if False, it will
return the original values from the dataframe
:param null_safe: whether to ensure returning null for nan or null values in
columns with type int, bool and string
:return: iterable of rows, each row is a list

:Notice:
* `null_safe` by default is False, for non pandas dataframe, setting it to
True may cause errors
* If there are nested types in schema, the conversion can be a lot slower
If there are nested types in schema, the conversion can be slower
"""
if self.empty(df):
return
if schema is None:
schema = self.to_schema(df)
else:
df = df[schema.names]
orig = self.to_schema(df)
if not orig.equals(schema):
df = self.enforce_type(df, schema, null_safe)
if not type_safe or all(not pa.types.is_nested(x) for x in schema.types):
if columns is not None:
df = df[columns]
schema = pa.schema([schema.field(n) for n in columns])
if not type_safe:
for arr in df.itertuples(index=False, name=None):
yield list(arr)
elif all(not pa.types.is_nested(x) for x in schema.types):
p = self.as_arrow(df, schema)
d = p.to_pydict()
cols = [d[n] for n in schema.names]
for arr in zip(*cols):
yield list(arr)
else:
# If schema has nested types, the conversion will be much slower
for arr in apply_schema(
Expand Down
31 changes: 17 additions & 14 deletions triad/utils/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,10 +388,10 @@ def _to_pynone(obj: Any) -> Any:


def _to_pyint(obj: Any) -> Any:
if obj is None or isinstance(obj, int):
return obj
if obj != obj: # NaN
if obj is None or obj != obj: # NaN
return None
if isinstance(obj, int):
return obj
return as_type(obj, int)


Expand All @@ -402,41 +402,44 @@ def _to_pystr(obj: Any) -> Any:


def _to_pybool(obj: Any) -> Any:
if obj is None or isinstance(obj, bool):
return obj
if obj != obj: # NaN
if obj is None or obj != obj: # NaN
return None
if isinstance(obj, bool):
return obj
return as_type(obj, bool)


def _to_pyfloat(obj: Any) -> Any:
if obj is None or isinstance(obj, float):
return obj
if obj != obj: # NaN
if obj is None or obj != obj: # NaN
return None
return as_type(obj, float)
if isinstance(obj, float):
return obj
obj = as_type(obj, float)
return None if obj != obj else obj


def _to_pydatetime(obj: Any) -> Any:
if obj is None or obj is pd.NaT:
return pd.NaT
return None
if isinstance(obj, pd.Timestamp):
return obj.to_pydatetime()
if isinstance(obj, datetime):
return obj
return as_type(obj, datetime)
obj = as_type(obj, datetime)
return None if obj != obj else obj


def _to_pydate(obj: Any) -> Any:
if obj is None or obj is pd.NaT:
return pd.NaT
return None
if isinstance(obj, pd.Timestamp):
return obj.to_pydatetime().date()
if isinstance(obj, datetime):
return obj.date()
if isinstance(obj, date):
return obj
return as_type(obj, datetime).date()
obj = as_type(obj, datetime).date()
return None if obj != obj else obj


def _assert_pytype(pytype: type, obj: Any) -> Any:
Expand Down