Skip to content

Commit

Permalink
Use Python type name instead of Spark's in error messages. (#1985)
Browse files Browse the repository at this point in the history
Addressing #1980 (comment) to add pandas dtypes.
  • Loading branch information
ueshin committed Dec 26, 2020
1 parent 5f27857 commit 1e51477
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 25 deletions.
6 changes: 5 additions & 1 deletion databricks/koalas/frame.py
Expand Up @@ -10198,7 +10198,11 @@ def quantile(spark_column, spark_type):
if isinstance(spark_type, (BooleanType, NumericType)):
return SF.percentile_approx(spark_column.cast(DoubleType()), q, accuracy)
else:
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)

if isinstance(q, list):
# First calculate the percentiles from all columns and map it to each `quantiles`
Expand Down
55 changes: 45 additions & 10 deletions databricks/koalas/generic.py
Expand Up @@ -44,7 +44,7 @@
from databricks.koalas.indexing import AtIndexer, iAtIndexer, iLocIndexer, LocIndexer
from databricks.koalas.internal import InternalFrame
from databricks.koalas.spark import functions as SF
from databricks.koalas.typedef import Scalar
from databricks.koalas.typedef import Scalar, spark_type_to_pandas_dtype
from databricks.koalas.utils import (
is_name_like_tuple,
is_name_like_value,
Expand Down Expand Up @@ -1133,7 +1133,11 @@ def mean(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)
return F.mean(spark_column)

return self._reduce_for_stat_function(
Expand Down Expand Up @@ -1208,7 +1212,11 @@ def sum(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)
return F.coalesce(F.sum(spark_column), F.lit(0))

return self._reduce_for_stat_function(
Expand Down Expand Up @@ -1294,7 +1302,11 @@ def prod(spark_column, spark_type):
if isinstance(spark_type, IntegralType):
scol = F.round(scol).cast(LongType())
else:
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)

return F.coalesce(scol, F.lit(1))

Expand Down Expand Up @@ -1345,7 +1357,11 @@ def skew(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)
return F.skewness(spark_column)

return self._reduce_for_stat_function(
Expand Down Expand Up @@ -1394,7 +1410,11 @@ def kurtosis(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)
return F.kurtosis(spark_column)

return self._reduce_for_stat_function(
Expand Down Expand Up @@ -1633,7 +1653,11 @@ def std(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)
if ddof == 0:
return F.stddev_pop(spark_column)
else:
Expand Down Expand Up @@ -1703,7 +1727,11 @@ def var(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)
if ddof == 0:
return F.var_pop(spark_column)
else:
Expand Down Expand Up @@ -1807,7 +1835,11 @@ def median(spark_column, spark_type):
if isinstance(spark_type, (BooleanType, NumericType)):
return SF.percentile_approx(spark_column.cast(DoubleType()), 0.5, accuracy)
else:
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)

return self._reduce_for_stat_function(
median, name="median", numeric_only=numeric_only, axis=axis
Expand Down Expand Up @@ -1885,7 +1917,10 @@ def abs(kser):
return kser.spark.transform(F.abs)
else:
raise TypeError(
"bad operand type for abs(): {}".format(kser.spark.data_type.simpleString())
"bad operand type for abs(): {} ({})".format(
spark_type_to_pandas_dtype(kser.spark.data_type),
kser.spark.data_type.simpleString(),
)
)

return self._apply_series_op(abs)
Expand Down
22 changes: 18 additions & 4 deletions databricks/koalas/series.py
Expand Up @@ -82,7 +82,13 @@
from databricks.koalas.spark import functions as SF
from databricks.koalas.spark.accessors import SparkSeriesMethods
from databricks.koalas.strings import StringMethods
from databricks.koalas.typedef import infer_return_type, SeriesType, ScalarType, Scalar
from databricks.koalas.typedef import (
infer_return_type,
spark_type_to_pandas_dtype,
SeriesType,
ScalarType,
Scalar,
)


# This regular expression pattern is compiled and defined here to avoid compiling the same
Expand Down Expand Up @@ -3302,7 +3308,9 @@ def quantile(spark_column, spark_type):
return SF.percentile_approx(spark_column.cast(DoubleType()), q, accuracy)
else:
raise TypeError(
"Could not convert {} to numeric".format(spark_type.simpleString())
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)

return self._reduce_for_stat_function(quantile, name="quantile")
Expand Down Expand Up @@ -5703,7 +5711,10 @@ def _cumsum(self, skipna, part_cols=()):
kser = kser.spark.transform(lambda scol: scol.cast(LongType()))
elif not isinstance(kser.spark.data_type, NumericType):
raise TypeError(
"Could not convert {} to numeric".format(kser.spark.data_type.simpleString())
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(kser.spark.data_type),
kser.spark.data_type.simpleString(),
)
)
return kser._cum(F.sum, skipna, part_cols)

Expand Down Expand Up @@ -5731,7 +5742,10 @@ def _cumprod(self, skipna, part_cols=()):
scol = F.round(scol).cast(LongType())
else:
raise TypeError(
"Could not convert {} to numeric".format(self.spark.data_type.simpleString())
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(self.spark.data_type),
self.spark.data_type.simpleString(),
)
)

return self._with_new_scol(scol)
Expand Down
4 changes: 2 additions & 2 deletions databricks/koalas/tests/test_dataframe.py
Expand Up @@ -4283,9 +4283,9 @@ def test_quantile(self):
self.assert_eq(kdf.quantile(0.5), pd.Series(name=0.5))
self.assert_eq(kdf.quantile([0.25, 0.5, 0.75]), pd.DataFrame(index=[0.25, 0.5, 0.75]))

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
kdf.quantile(0.5, numeric_only=False)
with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
kdf.quantile([0.25, 0.5, 0.75], numeric_only=False)

def test_pct_change(self):
Expand Down
10 changes: 6 additions & 4 deletions databricks/koalas/tests/test_series.py
Expand Up @@ -1267,9 +1267,9 @@ def test_quantile(self):
with self.assertRaisesRegex(ValueError, "q must be a float or an array of floats;"):
ks.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q=["a"])

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
ks.Series(["a", "b", "c"]).quantile()
with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
ks.Series(["a", "b", "c"]).quantile([0.25, 0.5, 0.75])

def test_idxmax(self):
Expand Down Expand Up @@ -2228,9 +2228,11 @@ def test_product(self):
kser = ks.from_pandas(pser)
self.assert_eq(pser.prod(min_count=1), kser.prod(min_count=1))

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
ks.Series(["a", "b", "c"]).prod()
with self.assertRaisesRegex(TypeError, "Could not convert timestamp to numeric"):
with self.assertRaisesRegex(
TypeError, "Could not convert datetime64\\[ns\\] \\(timestamp\\) to numeric"
):
ks.Series([pd.Timestamp("2016-01-01") for _ in range(3)]).prod()

def test_hasnans(self):
Expand Down
12 changes: 8 additions & 4 deletions databricks/koalas/tests/test_stats.py
Expand Up @@ -140,9 +140,13 @@ def test_abs(self):
self.assert_eq(kdf[["B", "C"]].abs(), pdf[["B", "C"]].abs())
self.assert_eq(kdf[["E"]].abs(), pdf[["E"]].abs())

with self.assertRaisesRegex(TypeError, "bad operand type for abs\\(\\): string"):
with self.assertRaisesRegex(
TypeError, "bad operand type for abs\\(\\): object \\(string\\)"
):
kdf.abs()
with self.assertRaisesRegex(TypeError, "bad operand type for abs\\(\\): string"):
with self.assertRaisesRegex(
TypeError, "bad operand type for abs\\(\\): object \\(string\\)"
):
kdf.D.abs()

def test_axis_on_dataframe(self):
Expand Down Expand Up @@ -331,8 +335,8 @@ def test_numeric_only_unsupported(self):
pdf[["i", "b"]].sum(numeric_only=False).astype(int),
)

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
kdf.sum(numeric_only=False)

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
kdf.s.sum()

0 comments on commit 1e51477

Please sign in to comment.