Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix the input check in Series. #1814

Merged
merged 5 commits into from
Oct 5, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
67 changes: 21 additions & 46 deletions databricks/koalas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@
from databricks.koalas.ml import corr
from databricks.koalas.utils import (
combine_frames,
is_name_like_tuple,
is_name_like_value,
name_like_string,
same_anchor,
scol_for,
Expand Down Expand Up @@ -1900,10 +1902,7 @@ def clip(self, lower: Union[float, int] = None, upper: Union[float, int] = None)
return self

def drop(
self,
labels=None,
index: Union[str, Tuple[str, ...], List[str], List[Tuple[str, ...]]] = None,
level=None,
self, labels=None, index: Union[Any, Tuple, List[Any], List[Tuple]] = None, level=None
):
"""
Return Series with specified index labels removed.
Expand Down Expand Up @@ -2015,47 +2014,31 @@ def drop(
return first_series(self._drop(labels=labels, index=index, level=level))

def _drop(
self,
labels=None,
index: Union[str, Tuple[str, ...], List[str], List[Tuple[str, ...]]] = None,
level=None,
self, labels=None, index: Union[Any, Tuple, List[Any], List[Tuple]] = None, level=None
):
level_param = level
if labels is not None:
if index is not None:
raise ValueError("Cannot specify both 'labels' and 'index'")
return self._drop(index=labels, level=level)
if index is not None:
internal = self._internal
if not isinstance(index, (str, tuple, list)):
raise ValueError("'index' type should be one of str, list, tuple")
if level is None:
level = 0
if level >= len(internal.index_spark_columns):
raise ValueError("'level' should be less than the number of indexes")

if isinstance(index, str):
index = [(index,)] # type: ignore
elif isinstance(index, tuple):
if is_name_like_tuple(index): # type: ignore
index = [index]
else:
if not (
all((isinstance(idxes, str) for idxes in index))
or all((isinstance(idxes, tuple) for idxes in index))
):
raise ValueError(
"If the given index is a list, it "
"should only contains names as strings, "
"or a list of tuples that contain "
"index names as strings"
)
new_index = []
for idxes in index:
if isinstance(idxes, tuple):
new_index.append(idxes)
else:
new_index.append((idxes,))
index = new_index
elif is_name_like_value(index):
index = [(index,)]
elif all(is_name_like_value(idxes, allow_tuple=False) for idxes in index):
index = [(idex,) for idex in index]
elif not all(is_name_like_tuple(idxes) for idxes in index):
raise ValueError(
"If the given index is a list, it "
"should only contains names as all tuples or all non tuples "
"that contain index names"
)

drop_index_scols = []
for idxes in index:
Expand All @@ -2065,14 +2048,11 @@ def _drop(
for lvl, idx in enumerate(idxes, level)
]
except IndexError:
if level_param is None:
raise KeyError(
"Key length ({}) exceeds index depth ({})".format(
len(internal.index_spark_columns), len(idxes)
)
raise KeyError(
"Key length ({}) exceeds index depth ({})".format(
len(internal.index_spark_columns), len(idxes)
)
else:
return self._kdf[[self.name]]
)
drop_index_scols.append(reduce(lambda x, y: x & y, index_scols))

cond = ~reduce(lambda x, y: x | y, drop_index_scols)
Expand Down Expand Up @@ -3753,15 +3733,10 @@ def pop(self, item):
(b, falcon, speed) 1.0
dtype: float64
"""
if not isinstance(item, (str, tuple)):
if not is_name_like_value(item):
raise ValueError("'key' should be string or tuple that contains strings")
if isinstance(item, str):
if not is_name_like_tuple(item):
item = (item,)
if not all(isinstance(index, str) for index in item):
raise ValueError(
"'key' should have index names as only strings "
"or a tuple that contain index names as only strings"
)
if len(self._internal._index_map) < len(item):
raise KeyError(
"Key length ({}) exceeds index depth ({})".format(
Expand Down
39 changes: 22 additions & 17 deletions databricks/koalas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1146,37 +1146,51 @@ def test_aggregate(self):
def test_drop(self):
pser = pd.Series([10, 20, 15, 30, 45], name="x")
kser = ks.Series(pser)

self.assert_eq(kser.drop(1), pser.drop(1))
self.assert_eq(kser.drop([1, 4]), pser.drop([1, 4]))

msg = "Need to specify at least one of 'labels' or 'index'"
with self.assertRaisesRegex(ValueError, msg):
kser.drop()
self.assertRaises(KeyError, lambda: kser.drop((0, 1)))

# For MultiIndex
midx = pd.MultiIndex(
[["lama", "cow", "falcon"], ["speed", "weight", "length"]],
[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
)
kser = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
pser = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
kser = ks.from_pandas(pser)

self.assert_eq(kser.drop("lama"), pser.drop("lama"))
self.assert_eq(kser.drop(labels="weight", level=1), pser.drop(labels="weight", level=1))
self.assert_eq(kser.drop(("lama", "weight")), pser.drop(("lama", "weight")))
self.assert_eq(
kser.drop([("lama", "speed"), ("falcon", "weight")]),
pser.drop([("lama", "speed"), ("falcon", "weight")]),
)
self.assert_eq(kser.drop({"lama": "speed"}), pser.drop({"lama": "speed"}))

msg = "'level' should be less than the number of indexes"
with self.assertRaisesRegex(ValueError, msg):
kser.drop(labels="weight", level=2)

msg = (
"If the given index is a list, it "
"should only contains names as strings, "
"or a list of tuples that contain "
"index names as strings"
"should only contains names as all tuples or all non tuples "
"that contain index names"
)
with self.assertRaisesRegex(ValueError, msg):
kser.drop(["lama", ["cow", "falcon"]])
msg = "'index' type should be one of str, list, tuple"
with self.assertRaisesRegex(ValueError, msg):
kser.drop({"lama": "speed"})

msg = "Cannot specify both 'labels' and 'index'"
with self.assertRaisesRegex(ValueError, msg):
kser.drop("lama", index="cow")

msg = r"'Key length \(2\) exceeds index depth \(3\)'"
with self.assertRaisesRegex(KeyError, msg):
kser.drop(("lama", "speed", "x"))
self.assert_eq(kser.drop(("lama", "speed", "x"), level=1), kser)
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@itholic Do you remember the reason of this behavior?

Copy link
Contributor

@itholic itholic Oct 5, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm... I don't remember clearly but I'd say maybe It seems that there's no special reason for it - I think I just had not enough understood at the time of working on this.😅 -
Anyway, it seems that this case should be fixed to raise an error.


def test_pop(self):
midx = pd.MultiIndex(
Expand All @@ -1193,15 +1207,6 @@ def test_pop(self):
self.assert_eq(kser, pser)
self.assert_eq(kdf, pdf)

msg = "'key' should be string or tuple that contains strings"
with self.assertRaisesRegex(ValueError, msg):
kser.pop(0)
msg = (
"'key' should have index names as only strings "
"or a tuple that contain index names as only strings"
)
with self.assertRaisesRegex(ValueError, msg):
kser.pop(("lama", 0))
msg = r"'Key length \(3\) exceeds index depth \(2\)'"
with self.assertRaisesRegex(KeyError, msg):
kser.pop(("lama", "speed", "x"))
Expand Down