From 6da920c61a458883d7aaade76ecde2f7d5c1b666 Mon Sep 17 00:00:00 2001 From: harupy Date: Fri, 8 Nov 2019 12:28:41 +0900 Subject: [PATCH 01/10] Fix _column_op --- databricks/koalas/base.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/databricks/koalas/base.py b/databricks/koalas/base.py index 4647f7051e..8b512ef122 100644 --- a/databricks/koalas/base.py +++ b/databricks/koalas/base.py @@ -57,6 +57,12 @@ def wrapper(self, *args): args = [arg._scol if isinstance(arg, IndexOpsMixin) else arg for arg in args] scol = f(self._scol, *args) + # If f is a logistic operator, fill NULL with False + log_ops = ['eq', 'ne', 'lt', 'le', 'ge', 'gt'] + is_log_op = any(f == getattr(spark.Column, f'__{log_op}__') for log_op in log_ops) + if is_log_op: + scol = F.when(scol.isNull(), False).otherwise(scol) + return self._with_new_scol(scol) else: # Different DataFrame anchors From 7e51addfcf6398d2b56fd6c4496064d0b0f71ae8 Mon Sep 17 00:00:00 2001 From: harupy Date: Fri, 8 Nov 2019 15:21:28 +0900 Subject: [PATCH 02/10] Handle not-equal operator --- databricks/koalas/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/databricks/koalas/base.py b/databricks/koalas/base.py index 8b512ef122..d9e2d550d2 100644 --- a/databricks/koalas/base.py +++ b/databricks/koalas/base.py @@ -61,7 +61,8 @@ def wrapper(self, *args): log_ops = ['eq', 'ne', 'lt', 'le', 'ge', 'gt'] is_log_op = any(f == getattr(spark.Column, f'__{log_op}__') for log_op in log_ops) if is_log_op: - scol = F.when(scol.isNull(), False).otherwise(scol) + filler = f != spark.Column.__ne__ + scol = F.when(scol.isNull(), filler).otherwise(scol) return self._with_new_scol(scol) else: From 0b9229e3fdf66edfd7447a0d00783e984626bfdc Mon Sep 17 00:00:00 2001 From: harupy Date: Fri, 8 Nov 2019 15:23:17 +0900 Subject: [PATCH 03/10] Fix --- databricks/koalas/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/databricks/koalas/base.py b/databricks/koalas/base.py index d9e2d550d2..eb24b3d401 100644 --- a/databricks/koalas/base.py +++ b/databricks/koalas/base.py @@ -61,7 +61,7 @@ def wrapper(self, *args): log_ops = ['eq', 'ne', 'lt', 'le', 'ge', 'gt'] is_log_op = any(f == getattr(spark.Column, f'__{log_op}__') for log_op in log_ops) if is_log_op: - filler = f != spark.Column.__ne__ + filler = f == spark.Column.__ne__ scol = F.when(scol.isNull(), filler).otherwise(scol) return self._with_new_scol(scol) From e27d6aab4eea96ff6618d479c9876d6328fd3347 Mon Sep 17 00:00:00 2001 From: harupy Date: Fri, 8 Nov 2019 16:03:50 +0900 Subject: [PATCH 04/10] Fix doctests --- databricks/koalas/frame.py | 42 ++++++++++++++++---------------- databricks/koalas/series.py | 48 ++++++++++++++++++------------------- 2 files changed, 45 insertions(+), 45 deletions(-) diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py index 8b7a715d1f..2094d9713f 100644 --- a/databricks/koalas/frame.py +++ b/databricks/koalas/frame.py @@ -749,11 +749,11 @@ def eq(self, other): ... index=['a', 'b', 'c', 'd'], columns=['a', 'b']) >>> df.eq(1) - a b - a True True - b False None - c False True - d False None + a b + a True True + b False False + c False True + d False False """ return self == other @@ -770,9 +770,9 @@ def gt(self, other): >>> df.gt(2) a b a False False - b False None + b False False c True False - d True None + d True False """ return self > other @@ -785,11 +785,11 @@ def ge(self, other): ... index=['a', 'b', 'c', 'd'], columns=['a', 'b']) >>> df.ge(1) - a b - a True True - b True None - c True True - d True None + a b + a True True + b True False + c True True + d True False """ return self >= other @@ -804,9 +804,9 @@ def lt(self, other): >>> df.lt(1) a b a False False - b False None + b False False c False False - d False None + d False False """ return self < other @@ -819,11 +819,11 @@ def le(self, other): ... index=['a', 'b', 'c', 'd'], columns=['a', 'b']) >>> df.le(2) - a b - a True True - b True None - c False True - d False None + a b + a True True + b True False + c False True + d False False """ return self <= other @@ -838,9 +838,9 @@ def ne(self, other): >>> df.ne(1) a b a False False - b True None + b True True c True False - d True None + d True True """ return self != other diff --git a/databricks/koalas/series.py b/databricks/koalas/series.py index 53200c8c86..6cc74a9d1c 100644 --- a/databricks/koalas/series.py +++ b/databricks/koalas/series.py @@ -539,11 +539,11 @@ def eq(self, other): Name: a, dtype: bool >>> df.b.eq(1) - a True - b None - c True - d None - Name: b, dtype: object + a True + b False + c True + d False + Name: b, dtype: bool """ return (self == other).rename(self.name) @@ -566,10 +566,10 @@ def gt(self, other): >>> df.b.gt(1) a False - b None + b False c False - d None - Name: b, dtype: object + d False + Name: b, dtype: bool """ return (self > other).rename(self.name) @@ -590,10 +590,10 @@ def ge(self, other): >>> df.b.ge(2) a False - b None + b False c False - d None - Name: b, dtype: object + d False + Name: b, dtype: bool """ return (self >= other).rename(self.name) @@ -613,11 +613,11 @@ def lt(self, other): Name: a, dtype: bool >>> df.b.lt(2) - a True - b None - c True - d None - Name: b, dtype: object + a True + b False + c True + d False + Name: b, dtype: bool """ return (self < other).rename(self.name) @@ -637,11 +637,11 @@ def le(self, other): Name: a, dtype: bool >>> df.b.le(2) - a True - b None - c True - d None - Name: b, dtype: object + a True + b False + c True + d False + Name: b, dtype: bool """ return (self <= other).rename(self.name) @@ -662,10 +662,10 @@ def ne(self, other): >>> df.b.ne(1) a False - b None + b True c False - d None - Name: b, dtype: object + d True + Name: b, dtype: bool """ return (self != other).rename(self.name) From da80958be1b06f7e32c4c76bfcebe4af80ea39d3 Mon Sep 17 00:00:00 2001 From: harupy Date: Fri, 8 Nov 2019 16:56:10 +0900 Subject: [PATCH 05/10] Remove f-string --- databricks/koalas/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/databricks/koalas/base.py b/databricks/koalas/base.py index eb24b3d401..6d6493994a 100644 --- a/databricks/koalas/base.py +++ b/databricks/koalas/base.py @@ -59,7 +59,8 @@ def wrapper(self, *args): # If f is a logistic operator, fill NULL with False log_ops = ['eq', 'ne', 'lt', 'le', 'ge', 'gt'] - is_log_op = any(f == getattr(spark.Column, f'__{log_op}__') for log_op in log_ops) + is_log_op = any(f == getattr(spark.Column, '__{}__'.format(log_op)) + for log_op in log_ops) if is_log_op: filler = f == spark.Column.__ne__ scol = F.when(scol.isNull(), filler).otherwise(scol) From e1b93fb493f6c10174a9d5bea6d199ae636933c4 Mon Sep 17 00:00:00 2001 From: harupy Date: Wed, 20 Nov 2019 02:43:49 +0900 Subject: [PATCH 06/10] Fix for __or__ and __and__ --- databricks/koalas/base.py | 9 ++++++++- databricks/koalas/tests/test_series.py | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/databricks/koalas/base.py b/databricks/koalas/base.py index 6d6493994a..4aee603fe4 100644 --- a/databricks/koalas/base.py +++ b/databricks/koalas/base.py @@ -57,14 +57,21 @@ def wrapper(self, *args): args = [arg._scol if isinstance(arg, IndexOpsMixin) else arg for arg in args] scol = f(self._scol, *args) - # If f is a logistic operator, fill NULL with False + # If f is a logistic operator, fill NULL log_ops = ['eq', 'ne', 'lt', 'le', 'ge', 'gt'] is_log_op = any(f == getattr(spark.Column, '__{}__'.format(log_op)) for log_op in log_ops) + if is_log_op: filler = f == spark.Column.__ne__ scol = F.when(scol.isNull(), filler).otherwise(scol) + elif f == spark.Column.__or__: + scol = F.when(self._scol.isNull() | scol.isNull(), False).otherwise(scol) + + elif f == spark.Column.__and__: + scol = F.when(scol.isNull(), False).otherwise(scol) + return self._with_new_scol(scol) else: # Different DataFrame anchors diff --git a/databricks/koalas/tests/test_series.py b/databricks/koalas/tests/test_series.py index e5fa79e596..a02641bfa4 100644 --- a/databricks/koalas/tests/test_series.py +++ b/databricks/koalas/tests/test_series.py @@ -177,6 +177,26 @@ def test_values_property(self): with self.assertRaises(NotImplementedError, msg=msg): kser.values + def test_or(self): + pdf = pd.DataFrame({ + 'left': [True, False, True, False, np.nan, np.nan, True, False, np.nan], + 'right': [True, False, False, True, True, False, np.nan, np.nan, np.nan] + }) + kdf = ks.from_pandas(pdf) + + self.assert_eq(pdf['left'] | pdf['right'], + kdf['left'] | kdf['right']) + + def test_and(self): + pdf = pd.DataFrame({ + 'left': [True, False, True, False, np.nan, np.nan, True, False, np.nan], + 'right': [True, False, False, True, True, False, np.nan, np.nan, np.nan] + }) + kdf = ks.from_pandas(pdf) + + self.assert_eq(pdf['left'] & pdf['right'], + kdf['left'] & kdf['right']) + def test_to_numpy(self): pser = pd.Series([1, 2, 3, 4, 5, 6, 7], name='x') From 473a1a8cacd8b19462591d04317f2df7ec1369db Mon Sep 17 00:00:00 2001 From: harupy Date: Wed, 20 Nov 2019 02:58:16 +0900 Subject: [PATCH 07/10] Fix --- databricks/koalas/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/databricks/koalas/base.py b/databricks/koalas/base.py index 4aee603fe4..5646066160 100644 --- a/databricks/koalas/base.py +++ b/databricks/koalas/base.py @@ -57,7 +57,7 @@ def wrapper(self, *args): args = [arg._scol if isinstance(arg, IndexOpsMixin) else arg for arg in args] scol = f(self._scol, *args) - # If f is a logistic operator, fill NULL + # check if f is a logistic operator log_ops = ['eq', 'ne', 'lt', 'le', 'ge', 'gt'] is_log_op = any(f == getattr(spark.Column, '__{}__'.format(log_op)) for log_op in log_ops) From 93e6505fd5d0f296d180c1ad5ac3da86c7783d21 Mon Sep 17 00:00:00 2001 From: Harutaka Kawamura Date: Wed, 20 Nov 2019 13:27:03 +0900 Subject: [PATCH 08/10] Fix comments --- databricks/koalas/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/databricks/koalas/base.py b/databricks/koalas/base.py index 5646066160..9b76dd800b 100644 --- a/databricks/koalas/base.py +++ b/databricks/koalas/base.py @@ -197,7 +197,7 @@ def __rfloordiv__(self, other): __pow__ = _column_op(spark.Column.__pow__) __rpow__ = _column_op(spark.Column.__rpow__) - # logistic operators + # comparison operators __eq__ = _column_op(spark.Column.__eq__) __ne__ = _column_op(spark.Column.__ne__) __lt__ = _column_op(spark.Column.__lt__) From 880c4dd934cec118eb921ebac09cb2d7f4324c89 Mon Sep 17 00:00:00 2001 From: Harutaka Kawamura Date: Wed, 20 Nov 2019 13:37:05 +0900 Subject: [PATCH 09/10] Fix comments --- databricks/koalas/base.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/databricks/koalas/base.py b/databricks/koalas/base.py index 9b76dd800b..6d11d07565 100644 --- a/databricks/koalas/base.py +++ b/databricks/koalas/base.py @@ -57,12 +57,12 @@ def wrapper(self, *args): args = [arg._scol if isinstance(arg, IndexOpsMixin) else arg for arg in args] scol = f(self._scol, *args) - # check if f is a logistic operator - log_ops = ['eq', 'ne', 'lt', 'le', 'ge', 'gt'] - is_log_op = any(f == getattr(spark.Column, '__{}__'.format(log_op)) - for log_op in log_ops) + # check if `f` is a comparison operator + comp_ops = ['eq', 'ne', 'lt', 'le', 'ge', 'gt'] + is_comp_op = any(f == getattr(spark.Column, '__{}__'.format(comp_op)) + for comp_op in comp_ops) - if is_log_op: + if is_comp_op: filler = f == spark.Column.__ne__ scol = F.when(scol.isNull(), filler).otherwise(scol) From 000d3fdc74fd2850fb7c3ce4e691b19b658bba8d Mon Sep 17 00:00:00 2001 From: Harutaka Kawamura Date: Wed, 20 Nov 2019 13:47:11 +0900 Subject: [PATCH 10/10] Fix indent --- databricks/koalas/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/databricks/koalas/base.py b/databricks/koalas/base.py index 6d11d07565..f44058c09e 100644 --- a/databricks/koalas/base.py +++ b/databricks/koalas/base.py @@ -60,7 +60,7 @@ def wrapper(self, *args): # check if `f` is a comparison operator comp_ops = ['eq', 'ne', 'lt', 'le', 'ge', 'gt'] is_comp_op = any(f == getattr(spark.Column, '__{}__'.format(comp_op)) - for comp_op in comp_ops) + for comp_op in comp_ops) if is_comp_op: filler = f == spark.Column.__ne__