Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NaN, NaT -> None for pyarrow and pandas output #38

Merged
merged 13 commits into from
May 27, 2020
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ pip install triad

## Release History

### 0.3.5
* Change pyarrow and pandas `type_safe` output to be consistent with pyarrow (`None` for `pd.NaT`, `NaN`, etc.)

### 0.3.4
* Add general FileSystem

Expand Down
53 changes: 23 additions & 30 deletions tests/utils/test_pandas_like.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def test_as_array_iterable():
assert [["a", 1]] == df.as_array()
assert [["a", 1]] == df.as_array(["a", "b"])
assert [[1, "a"]] == df.as_array(["b", "a"])
assert [[1, "a"]] == df.as_array(["b", "a"], null_schema=True)

# prevent pandas auto type casting
df = DF([[1.0, 1.1]], "a:double,b:int")
Expand All @@ -70,15 +71,13 @@ def test_as_array_iterable():
assert isinstance(df.as_array()[0][1], int)

df = DF([[pd.Timestamp("2020-01-01"), 1.1]], "a:datetime,b:int")
df.native["a"] = pd.to_datetime(df.native["a"])
assert [[datetime(2020, 1, 1), 1]] == df.as_array()
assert isinstance(df.as_array()[0][0], datetime)
assert isinstance(df.as_array()[0][1], int)
assert isinstance(df.as_array(type_safe=True)[0][0], datetime)
assert isinstance(df.as_array(type_safe=True)[0][1], int)

df = DF([[pd.NaT, 1.1]], "a:datetime,b:int")
df.native["a"] = pd.to_datetime(df.native["a"])
assert isinstance(df.as_array()[0][0], datetime)
assert isinstance(df.as_array()[0][1], int)
assert df.as_array(type_safe=True)[0][0] is None
assert isinstance(df.as_array(type_safe=True)[0][1], int)

df = DF([[1.0, 1.1]], "a:double,b:int")
assert [[1.0, 1]] == df.as_array(type_safe=True)
Expand All @@ -87,35 +86,36 @@ def test_as_array_iterable():


def test_nested():
    """Nested struct and list types should round-trip through as_array
    with type_safe=True, coercing inner values to the schema types."""
    # A list of structs: the missing "a" key becomes None, "40" is cast to int.
    data = [[[json.dumps(dict(b=[30, "40"]))]]]
    df = DF(data, "a:[{a:str,b:[int]}]")
    a = df.as_array(type_safe=True)
    assert [[[dict(a=None, b=[30, 40])]]] == a

    # A plain list of ints: the string "1" is cast to int.
    data = [[json.dumps(["1", 2])]]
    df = DF(data, "a:[int]")
    a = df.as_array(type_safe=True)
    assert [[[1, 2]]] == a


def test_nan_none():
df = DF([[None, None]], "b:str,c:double", True)
assert df.native.iloc[0, 0] is None
arr = df.as_array(null_safe=True)[0]
arr = df.as_array(type_safe=True)[0]
assert arr[0] is None
assert math.isnan(arr[1])
assert arr[1] is None

df = DF([[None, None]], "b:int,c:bool", True)
arr = df.as_array(type_safe=True)[0]
assert np.isnan(arr[0]) # TODO: this will cause inconsistent behavior cross engine
assert np.isnan(arr[1]) # TODO: this will cause inconsistent behavior cross engine

df = DF([["a", 1.1], [None, None]], "b:str,c:double", True)
arr = df.as_array()[1]
assert arr[0] is None
assert math.isnan(arr[1])
assert arr[1] is None

df = DF([], "b:str,c:double", True)
assert len(df.as_array()) == 0
Expand Down Expand Up @@ -156,19 +156,12 @@ def __init__(self, data, schema, enforce=False):
s = expression_to_schema(schema)
df = pd.DataFrame(data, columns=s.names)
self.native = PD_UTILS.enforce_type(df, s, enforce)
self.schema = s

def as_array(self, cols=None, type_safe=False, null_schema=False):
    """Return the dataframe rows as a list of lists.

    :param cols: subset of column names to output, None for all columns
    :param type_safe: whether to enforce the types in the stored schema;
        if False, the original dataframe values are returned
    :param null_schema: if True, pass no schema to the iterable so the
        schema is inferred from the dataframe instead
    :return: list of rows, each row a list of values
    """
    # Use the schema captured at construction time unless the caller
    # explicitly asks for schema inference (null_schema=True).
    schema = None if null_schema else self.schema
    return list(
        PD_UTILS.as_array_iterable(
            self.native, schema=schema, columns=cols, type_safe=type_safe
        )
    )
16 changes: 9 additions & 7 deletions tests/utils/test_pyarrow_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,9 @@ def test_convert_to_double():
_test_convert("1.1", "double", 1.1)
_test_convert(pd.NaT, "double", None)
_assert_raise(pdt, "double")
_test_convert(FLOAT_NAN, "double", FLOAT_NAN)
_test_convert("nan", "double", FLOAT_NAN)
_test_convert("NaN", "double", FLOAT_NAN)
_test_convert(FLOAT_NAN, "double", None)
_test_convert("nan", "double", None)
_test_convert("NaN", "double", None)
_test_convert(FLOAT_INF, "double", FLOAT_INF)
_test_convert("inf", "double", FLOAT_INF)
_test_convert("INF", "double", FLOAT_INF)
Expand All @@ -95,6 +95,8 @@ def test_convert_to_bool():
pdt = pd.Timestamp("2020-01-01T02:03:04")

_test_convert(None, "bool", None)
_test_convert(True, "bool", True)
_test_convert(False, "bool", False)
_test_convert("true", "bool", True)
_test_convert("True", "bool", True)
_test_convert("false", "bool", False)
Expand All @@ -109,11 +111,11 @@ def test_convert_to_datetime():
pdt = pd.Timestamp("2020-01-01T02:03:04")
dt = datetime(2020, 1, 1, 2, 3, 4)
d = date(2020, 1, 1)
_test_convert(None, "datetime", pd.NaT)
_test_convert(None, "datetime", None)
_assert_raise("1", "datetime")
_test_convert("2020-01-01 02:03:04", "datetime", dt)
_test_convert("2020-01-01", "datetime", datetime(2020, 1, 1))
_test_convert(pd.NaT, "datetime", pd.NaT)
_test_convert(pd.NaT, "datetime", None)
_test_convert(pdt, "datetime", dt)
assert isinstance(_to_pydatetime(pdt), datetime)
assert not isinstance(_to_pydatetime(pdt), pd.Timestamp)
Expand All @@ -126,11 +128,11 @@ def test_convert_to_date():
pdt = pd.Timestamp("2020-01-01T02:03:04")
dt = datetime(2020, 1, 1, 2, 3, 4)
d = date(2020, 1, 1)
_test_convert(None, "date", pd.NaT)
_test_convert(None, "date", None)
_assert_raise("1", "date")
_test_convert("2020-01-01 02:03:04", "date", d)
_test_convert("2020-01-01", "date", d)
_test_convert(pd.NaT, "date", pd.NaT)
_test_convert(pd.NaT, "date", None)
_test_convert(pdt, "date", d)
assert isinstance(_to_pydate(pdt), date)
assert not isinstance(_to_pydate(pdt), pd.Timestamp)
Expand Down
2 changes: 1 addition & 1 deletion triad/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.3.4"
__version__ = "0.3.5"
42 changes: 27 additions & 15 deletions triad/utils/pandas_like.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,41 +17,53 @@ def empty(self, df: T) -> bool:
"""
return len(df.index) == 0

def as_arrow(self, df: T, schema: Optional[pa.Schema] = None) -> pa.Table:
    """Convert a pandas-like dataframe to a pyarrow table.

    :param df: pandas like dataframe
    :param schema: schema used to construct the pyarrow table; None
        (the default) lets pyarrow infer it from the dataframe
    :return: pyarrow table
    """
    # preserve_index/safe mirror the library's lenient conversion policy.
    return pa.Table.from_pandas(
        df, schema=schema, preserve_index=False, safe=False
    )

def as_array_iterable(
self,
df: T,
schema: Optional[pa.Schema] = None,
columns: Optional[List[str]] = None,
type_safe: bool = False,
null_safe: bool = False,
) -> Iterable[List[Any]]:
"""Convert pandas like dataframe to iterable of rows in the format of list.

:param df: pandas like dataframe
:param schema: columns and types for output.
Leave it None to return all columns in original type
:param type_safe: whether to enforce the types in schema, if not, it will
:param schema: schema of the input. With None, it will infer the schema,
it can't infer wrong schema for nested types, so try to be explicit
:param columns: columns to output, None for all columns
:param type_safe: whether to enforce the types in schema, if False, it will
return the original values from the dataframe
:param null_safe: whether to ensure returning null for nan or null values in
columns with type int, bool and string
:return: iterable of rows, each row is a list

:Notice:
* `null_safe` by default is False, for non pandas dataframe, setting it to
True may cause errors
* If there are nested types in schema, the conversion can be a lot slower
If there are nested types in schema, the conversion can be slower
"""
if self.empty(df):
return
if schema is None:
schema = self.to_schema(df)
else:
df = df[schema.names]
orig = self.to_schema(df)
if not orig.equals(schema):
df = self.enforce_type(df, schema, null_safe)
if not type_safe or all(not pa.types.is_nested(x) for x in schema.types):
if columns is not None:
df = df[columns]
schema = pa.schema([schema.field(n) for n in columns])
if not type_safe:
for arr in df.itertuples(index=False, name=None):
yield list(arr)
elif all(not pa.types.is_nested(x) for x in schema.types):
p = self.as_arrow(df, schema)
d = p.to_pydict()
cols = [d[n] for n in schema.names]
for arr in zip(*cols):
yield list(arr)
else:
# If schema has nested types, the conversion will be much slower
for arr in apply_schema(
Expand Down
31 changes: 17 additions & 14 deletions triad/utils/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,10 +388,10 @@ def _to_pynone(obj: Any) -> Any:


def _to_pyint(obj: Any) -> Any:
if obj is None or isinstance(obj, int):
return obj
if obj != obj: # NaN
if obj is None or obj != obj: # NaN
return None
if isinstance(obj, int):
return obj
return as_type(obj, int)


Expand All @@ -402,41 +402,44 @@ def _to_pystr(obj: Any) -> Any:


def _to_pybool(obj: Any) -> Any:
if obj is None or isinstance(obj, bool):
return obj
if obj != obj: # NaN
if obj is None or obj != obj: # NaN
return None
if isinstance(obj, bool):
return obj
return as_type(obj, bool)


def _to_pyfloat(obj: Any) -> Any:
if obj is None or isinstance(obj, float):
return obj
if obj != obj: # NaN
if obj is None or obj != obj: # NaN
return None
return as_type(obj, float)
if isinstance(obj, float):
return obj
obj = as_type(obj, float)
return None if obj != obj else obj


def _to_pydatetime(obj: Any) -> Any:
if obj is None or obj is pd.NaT:
return pd.NaT
return None
if isinstance(obj, pd.Timestamp):
return obj.to_pydatetime()
if isinstance(obj, datetime):
return obj
return as_type(obj, datetime)
obj = as_type(obj, datetime)
return None if obj != obj else obj


def _to_pydate(obj: Any) -> Any:
if obj is None or obj is pd.NaT:
return pd.NaT
return None
if isinstance(obj, pd.Timestamp):
return obj.to_pydatetime().date()
if isinstance(obj, datetime):
return obj.date()
if isinstance(obj, date):
return obj
return as_type(obj, datetime).date()
obj = as_type(obj, datetime).date()
return None if obj != obj else obj


def _assert_pytype(pytype: type, obj: Any) -> Any:
Expand Down