Skip to content

Commit

Permalink
pandas like utils handle str->bool (#60)
Browse files Browse the repository at this point in the history
  • Loading branch information
goodwanghan committed Nov 28, 2020
1 parent be558f9 commit 87fe33a
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 10 deletions.
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,49 +19,67 @@ pip install triad

## Release History

### 0.4.6

* Improve pandas-like utils `enforce_type` method to handle str -> bool

### 0.4.5

* Fixed pandas -> arrow datetime conversion issue

### 0.4.4

* Improved FileSystem compatibility with Windows
* Add overwrite expression for Schema class
* Fixed github actions

### 0.4.3

* Refactored `str_to_type`, `str_to_instance` and `to_function` to use `eval`

### 0.4.2

* Fix a bug in pandas-like `safe_groupby_apply`

### 0.4.1

* Improvement on group by apply
* Improvement on environment setup

### 0.4.0

* Prepare for Fugue open source

### 0.3.8

* Change to Apache 2.0 license

### 0.3.7

* Add pyarrow binary type support

### 0.3.6

* Add `transform` to Schema class

### 0.3.5

* Change pyarrow and pandas `type_safe` output to be consistent with pyarrow (`None` for `pd.NaT`, `nan`, etc.)

### 0.3.4

* Add general FileSystem

### 0.3.3

* Add thread-safe cloudpicklable RunOnce class

### 0.3.2

* Extracted `TRIAD_DEFAULT_TIMESTAMP` as a constant

### <=0.3.1

* Open sourced and docs are ready
* Added basic utility functions
* Types and schema are based on pyarrow
Expand Down
14 changes: 14 additions & 0 deletions tests/utils/test_pandas_like.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,20 @@ def test_nan_none():
assert len(df.as_array()) == 0


def test_boolean_enforce():
    """Check type-safe output of a ``c:bool`` column for several input kinds.

    Each case builds a ``DF`` with the schema ``b:int,c:bool`` and asserts
    that ``as_array(type_safe=True)`` returns the expected rows.
    """
    schema = "b:int,c:bool"
    cases = [
        # Native booleans; the null entry must come back as None.
        (
            [[1, True], [2, False], [3, None]],
            [[1, True], [2, False], [3, None]],
        ),
        # Integers 1/0 are expected to become True/False.
        (
            [[1, 1], [2, 0]],
            [[1, True], [2, False]],
        ),
        # Mixed-case strings are expected to be parsed; null stays None.
        (
            [[1, "trUe"], [2, "False"], [3, None]],
            [[1, True], [2, False], [3, None]],
        ),
    ]
    for rows, expected in cases:
        frame = DF(rows, schema, True)
        result = frame.as_array(type_safe=True)
        assert expected == result


def test_fillna_default():
df = pd.DataFrame([["a"], [None]], columns=["x"])
s = PD_UTILS.fillna_default(df["x"])
Expand Down
30 changes: 21 additions & 9 deletions triad/utils/pandas_like.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,9 @@ def get_fields() -> Iterable[pa.Field]:
fields.append(field)
return pa.schema(fields)

def enforce_type(self, df: T, schema: pa.Schema, null_safe: bool = False) -> T:
def enforce_type( # noqa: C901
self, df: T, schema: pa.Schema, null_safe: bool = False
) -> T:
"""Enforce the pandas like dataframe to comply with `schema`.
:param df: pandas like dataframe
Expand All @@ -134,6 +136,8 @@ def enforce_type(self, df: T, schema: pa.Schema, null_safe: bool = False) -> T:
each value in the column is either None or an integer, however, due to the
behavior of pandas like dataframes, the type of the columns may
no longer be `int64`
This method does not enforce struct and list types
"""
if self.empty(df):
return df
Expand All @@ -142,14 +146,22 @@ def enforce_type(self, df: T, schema: pa.Schema, null_safe: bool = False) -> T:
for v in schema:
s = df[v.name]
if pa.types.is_string(v.type):
ns = s[s.isnull()].index.tolist()
s = s.astype(str)
s.iloc[ns] = None
elif pa.types.is_integer(v.type) or pa.types.is_boolean(v.type):
ns = s[s.isnull()].index.tolist()
s = s.fillna(0).astype(v.type.to_pandas_dtype())
s.iloc[ns] = None
elif not pa.types.is_struct(v.type):
ns = s.isnull()
s = s.astype(str).mask(ns, None)
elif pa.types.is_boolean(v.type):
ns = s.isnull()
if pd.api.types.is_string_dtype(s.dtype):
try:
s = s.str.lower() == "true"
except AttributeError:
s = s.fillna(0).astype(bool)
else:
s = s.fillna(0).astype(bool)
s = s.mask(ns, None)
elif pa.types.is_integer(v.type):
ns = s.isnull()
s = s.fillna(0).astype(v.type.to_pandas_dtype()).mask(ns, None)
elif not pa.types.is_struct(v.type) and not pa.types.is_list(v.type):
s = s.astype(v.type.to_pandas_dtype())
df[v.name] = s
return df
Expand Down
2 changes: 1 addition & 1 deletion triad_version/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.4.5"
__version__ = "0.4.6"

0 comments on commit 87fe33a

Please sign in to comment.