From 1e514775e868d46fa3b1ddeebbfb239034226a1c Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Fri, 25 Dec 2020 23:33:38 -0800 Subject: [PATCH] Use Python type name instead of Spark's in error messages. (#1985) Addressing https://github.com/databricks/koalas/pull/1980#discussion_r547607957 to add pandas dtypes. --- databricks/koalas/frame.py | 6 ++- databricks/koalas/generic.py | 55 ++++++++++++++++++----- databricks/koalas/series.py | 22 +++++++-- databricks/koalas/tests/test_dataframe.py | 4 +- databricks/koalas/tests/test_series.py | 10 +++-- databricks/koalas/tests/test_stats.py | 12 +++-- 6 files changed, 84 insertions(+), 25 deletions(-) diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py index b2b4ac7017..6cf11f2ca6 100644 --- a/databricks/koalas/frame.py +++ b/databricks/koalas/frame.py @@ -10198,7 +10198,11 @@ def quantile(spark_column, spark_type): if isinstance(spark_type, (BooleanType, NumericType)): return SF.percentile_approx(spark_column.cast(DoubleType()), q, accuracy) else: - raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString())) + raise TypeError( + "Could not convert {} ({}) to numeric".format( + spark_type_to_pandas_dtype(spark_type), spark_type.simpleString() + ) + ) if isinstance(q, list): # First calculate the percentiles from all columns and map it to each `quantiles` diff --git a/databricks/koalas/generic.py b/databricks/koalas/generic.py index 81b3853474..c1ef4141a6 100644 --- a/databricks/koalas/generic.py +++ b/databricks/koalas/generic.py @@ -44,7 +44,7 @@ from databricks.koalas.indexing import AtIndexer, iAtIndexer, iLocIndexer, LocIndexer from databricks.koalas.internal import InternalFrame from databricks.koalas.spark import functions as SF -from databricks.koalas.typedef import Scalar +from databricks.koalas.typedef import Scalar, spark_type_to_pandas_dtype from databricks.koalas.utils import ( is_name_like_tuple, is_name_like_value, @@ -1133,7 +1133,11 @@ def mean(spark_column, spark_type): if isinstance(spark_type, BooleanType): spark_column = spark_column.cast(LongType()) elif not isinstance(spark_type, NumericType): - raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString())) + raise TypeError( + "Could not convert {} ({}) to numeric".format( + spark_type_to_pandas_dtype(spark_type), spark_type.simpleString() + ) + ) return F.mean(spark_column) return self._reduce_for_stat_function( @@ -1208,7 +1212,11 @@ def sum(spark_column, spark_type): if isinstance(spark_type, BooleanType): spark_column = spark_column.cast(LongType()) elif not isinstance(spark_type, NumericType): - raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString())) + raise TypeError( + "Could not convert {} ({}) to numeric".format( + spark_type_to_pandas_dtype(spark_type), spark_type.simpleString() + ) + ) return F.coalesce(F.sum(spark_column), F.lit(0)) return self._reduce_for_stat_function( @@ -1294,7 +1302,11 @@ def prod(spark_column, spark_type): if isinstance(spark_type, IntegralType): scol = F.round(scol).cast(LongType()) else: - raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString())) + raise TypeError( + "Could not convert {} ({}) to numeric".format( + spark_type_to_pandas_dtype(spark_type), spark_type.simpleString() + ) + ) return F.coalesce(scol, F.lit(1)) @@ -1345,7 +1357,11 @@ def skew(spark_column, spark_type): if isinstance(spark_type, BooleanType): spark_column = spark_column.cast(LongType()) elif not isinstance(spark_type, NumericType): - raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString())) + raise TypeError( + "Could not convert {} ({}) to numeric".format( + spark_type_to_pandas_dtype(spark_type), spark_type.simpleString() + ) + ) return F.skewness(spark_column) return self._reduce_for_stat_function( @@ -1394,7 +1410,11 @@ def kurtosis(spark_column, spark_type): if isinstance(spark_type, BooleanType): spark_column = spark_column.cast(LongType()) elif not isinstance(spark_type, NumericType): - raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString())) + raise TypeError( + "Could not convert {} ({}) to numeric".format( + spark_type_to_pandas_dtype(spark_type), spark_type.simpleString() + ) + ) return F.kurtosis(spark_column) return self._reduce_for_stat_function( @@ -1633,7 +1653,11 @@ def std(spark_column, spark_type): if isinstance(spark_type, BooleanType): spark_column = spark_column.cast(LongType()) elif not isinstance(spark_type, NumericType): - raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString())) + raise TypeError( + "Could not convert {} ({}) to numeric".format( + spark_type_to_pandas_dtype(spark_type), spark_type.simpleString() + ) + ) if ddof == 0: return F.stddev_pop(spark_column) else: @@ -1703,7 +1727,11 @@ def var(spark_column, spark_type): if isinstance(spark_type, BooleanType): spark_column = spark_column.cast(LongType()) elif not isinstance(spark_type, NumericType): - raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString())) + raise TypeError( + "Could not convert {} ({}) to numeric".format( + spark_type_to_pandas_dtype(spark_type), spark_type.simpleString() + ) + ) if ddof == 0: return F.var_pop(spark_column) else: @@ -1807,7 +1835,11 @@ def median(spark_column, spark_type): if isinstance(spark_type, (BooleanType, NumericType)): return SF.percentile_approx(spark_column.cast(DoubleType()), 0.5, accuracy) else: - raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString())) + raise TypeError( + "Could not convert {} ({}) to numeric".format( + spark_type_to_pandas_dtype(spark_type), spark_type.simpleString() + ) + ) return self._reduce_for_stat_function( median, name="median", numeric_only=numeric_only, axis=axis @@ -1885,7 +1917,10 @@ def abs(kser): return kser.spark.transform(F.abs) else: raise TypeError( - "bad operand type for abs(): {}".format(kser.spark.data_type.simpleString()) + "bad operand type for abs(): {} ({})".format( + spark_type_to_pandas_dtype(kser.spark.data_type), + kser.spark.data_type.simpleString(), + ) ) return self._apply_series_op(abs) diff --git a/databricks/koalas/series.py b/databricks/koalas/series.py index dcd728469a..3060905912 100644 --- a/databricks/koalas/series.py +++ b/databricks/koalas/series.py @@ -82,7 +82,13 @@ from databricks.koalas.spark import functions as SF from databricks.koalas.spark.accessors import SparkSeriesMethods from databricks.koalas.strings import StringMethods -from databricks.koalas.typedef import infer_return_type, SeriesType, ScalarType, Scalar +from databricks.koalas.typedef import ( + infer_return_type, + spark_type_to_pandas_dtype, + SeriesType, + ScalarType, + Scalar, +) # This regular expression pattern is complied and defined here to avoid to compile the same @@ -3302,7 +3308,9 @@ def quantile(spark_column, spark_type): return SF.percentile_approx(spark_column.cast(DoubleType()), q, accuracy) else: raise TypeError( - "Could not convert {} to numeric".format(spark_type.simpleString()) + "Could not convert {} ({}) to numeric".format( + spark_type_to_pandas_dtype(spark_type), spark_type.simpleString() + ) ) return self._reduce_for_stat_function(quantile, name="quantile") @@ -5703,7 +5711,10 @@ def _cumsum(self, skipna, part_cols=()): kser = kser.spark.transform(lambda scol: scol.cast(LongType())) elif not isinstance(kser.spark.data_type, NumericType): raise TypeError( - "Could not convert {} to numeric".format(kser.spark.data_type.simpleString()) + "Could not convert {} ({}) to numeric".format( + spark_type_to_pandas_dtype(kser.spark.data_type), + kser.spark.data_type.simpleString(), + ) ) return kser._cum(F.sum, skipna, part_cols) @@ -5731,7 +5742,10 @@ def _cumprod(self, skipna, part_cols=()): scol = F.round(scol).cast(LongType()) else: raise TypeError( - "Could not convert {} to numeric".format(self.spark.data_type.simpleString()) + "Could not convert {} ({}) to numeric".format( + spark_type_to_pandas_dtype(self.spark.data_type), + self.spark.data_type.simpleString(), + ) ) return self._with_new_scol(scol) diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py index 2af9e63056..dfda6b4f5b 100644 --- a/databricks/koalas/tests/test_dataframe.py +++ b/databricks/koalas/tests/test_dataframe.py @@ -4283,9 +4283,9 @@ def test_quantile(self): self.assert_eq(kdf.quantile(0.5), pd.Series(name=0.5)) self.assert_eq(kdf.quantile([0.25, 0.5, 0.75]), pd.DataFrame(index=[0.25, 0.5, 0.75])) - with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"): + with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"): kdf.quantile(0.5, numeric_only=False) - with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"): + with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"): kdf.quantile([0.25, 0.5, 0.75], numeric_only=False) def test_pct_change(self): diff --git a/databricks/koalas/tests/test_series.py b/databricks/koalas/tests/test_series.py index 4c802ccd3b..4c1062148c 100644 --- a/databricks/koalas/tests/test_series.py +++ b/databricks/koalas/tests/test_series.py @@ -1267,9 +1267,9 @@ def test_quantile(self): with self.assertRaisesRegex(ValueError, "q must be a float or an array of floats;"): ks.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q=["a"]) - with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"): + with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"): ks.Series(["a", "b", "c"]).quantile() - with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"): + with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"): ks.Series(["a", "b", "c"]).quantile([0.25, 0.5, 0.75]) def test_idxmax(self): @@ -2228,9 +2228,11 @@ def test_product(self): kser = ks.from_pandas(pser) self.assert_eq(pser.prod(min_count=1), kser.prod(min_count=1)) - with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"): + with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"): ks.Series(["a", "b", "c"]).prod() - with self.assertRaisesRegex(TypeError, "Could not convert timestamp to numeric"): + with self.assertRaisesRegex( + TypeError, "Could not convert datetime64\\[ns\\] \\(timestamp\\) to numeric" + ): ks.Series([pd.Timestamp("2016-01-01") for _ in range(3)]).prod() def test_hasnans(self): diff --git a/databricks/koalas/tests/test_stats.py b/databricks/koalas/tests/test_stats.py index 58d43bb269..8b6b208256 100644 --- a/databricks/koalas/tests/test_stats.py +++ b/databricks/koalas/tests/test_stats.py @@ -140,9 +140,13 @@ def test_abs(self): self.assert_eq(kdf[["B", "C"]].abs(), pdf[["B", "C"]].abs()) self.assert_eq(kdf[["E"]].abs(), pdf[["E"]].abs()) - with self.assertRaisesRegex(TypeError, "bad operand type for abs\\(\\): string"): + with self.assertRaisesRegex( + TypeError, "bad operand type for abs\\(\\): object \\(string\\)" + ): kdf.abs() - with self.assertRaisesRegex(TypeError, "bad operand type for abs\\(\\): string"): + with self.assertRaisesRegex( + TypeError, "bad operand type for abs\\(\\): object \\(string\\)" + ): kdf.D.abs() def test_axis_on_dataframe(self): @@ -331,8 +335,8 @@ def test_numeric_only_unsupported(self): pdf[["i", "b"]].sum(numeric_only=False).astype(int), ) - with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"): + with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"): kdf.sum(numeric_only=False) - with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"): + with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"): kdf.s.sum()