Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix .loc of dataframe with nullable boolean dtype #8368

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 4 additions & 3 deletions dask/dataframe/indexing.py
Expand Up @@ -4,6 +4,7 @@

import numpy as np
import pandas as pd
from pandas.api.types import is_bool_dtype

from ..array.core import Array
from ..base import tokenize
Expand Down Expand Up @@ -114,14 +115,14 @@ def _loc(self, iindexer, cindexer):
return self._loc_slice(iindexer, cindexer)
elif isinstance(iindexer, (list, np.ndarray)):
return self._loc_list(iindexer, cindexer)
elif is_series_like(iindexer) and iindexer.dtype != bool:
elif is_series_like(iindexer) and not is_bool_dtype(iindexer.dtype):
return self._loc_list(iindexer.values, cindexer)
else:
# element should raise KeyError
return self._loc_element(iindexer, cindexer)
else:
if isinstance(iindexer, (list, np.ndarray)) or (
is_series_like(iindexer) and iindexer.dtype != bool
is_series_like(iindexer) and not is_bool_dtype(iindexer.dtype)
):
# applying map_partitions to each partition
# results in duplicated NaN rows
Expand All @@ -148,7 +149,7 @@ def _maybe_partial_time_string(self, iindexer):
return iindexer

def _loc_series(self, iindexer, cindexer):
if iindexer.dtype != bool:
if not is_bool_dtype(iindexer.dtype):
raise KeyError(
"Cannot index with non-boolean dask Series. Try passing computed "
"values instead (e.g. ``ddf.loc[iindexer.compute()]``)"
Expand Down
11 changes: 11 additions & 0 deletions dask/dataframe/tests/test_indexing.py
Expand Up @@ -671,3 +671,14 @@ def test_iloc_out_of_order_selection():
assert a1.name == "C"
assert b1.name == "A"
assert c1.name == "B"


def test_pandas_nullable_boolean_data_type():
    """Masking with a nullable ``boolean`` dask Series must match pandas.

    The mask uses pandas' extension ``"boolean"`` dtype and contains a
    ``pd.NA`` entry; both ``[]`` and ``.loc`` indexing are compared
    against the equivalent pandas operation.
    """
    values = pd.Series([0, 1, 2])
    mask = pd.Series([True, False, pd.NA], dtype="boolean")

    # Single-partition dask counterparts of the pandas objects above.
    dask_values = dd.from_pandas(values, npartitions=1)
    dask_mask = dd.from_pandas(mask, npartitions=1)

    assert_eq(dask_values[dask_mask], values[mask])
    assert_eq(dask_values.loc[dask_mask], values.loc[mask])