Skip to content

Commit

Permalink
Use Python type name instead of Spark's in error messages. (#1985)
Browse files Browse the repository at this point in the history
Addressing #1980 (comment) to add pandas dtypes.
  • Loading branch information
ueshin committed Dec 26, 2020
1 parent 5f27857 commit 1e51477
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 25 deletions.
6 changes: 5 additions & 1 deletion databricks/koalas/frame.py
Expand Up @@ -10198,7 +10198,11 @@ def quantile(spark_column, spark_type):
if isinstance(spark_type, (BooleanType, NumericType)):
return SF.percentile_approx(spark_column.cast(DoubleType()), q, accuracy)
else:
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)

if isinstance(q, list):
# First calculate the percentiles from all columns and map it to each `quantiles`
Expand Down
55 changes: 45 additions & 10 deletions databricks/koalas/generic.py
Expand Up @@ -44,7 +44,7 @@
from databricks.koalas.indexing import AtIndexer, iAtIndexer, iLocIndexer, LocIndexer
from databricks.koalas.internal import InternalFrame
from databricks.koalas.spark import functions as SF
from databricks.koalas.typedef import Scalar
from databricks.koalas.typedef import Scalar, spark_type_to_pandas_dtype
from databricks.koalas.utils import (
is_name_like_tuple,
is_name_like_value,
Expand Down Expand Up @@ -1133,7 +1133,11 @@ def mean(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)
return F.mean(spark_column)

return self._reduce_for_stat_function(
Expand Down Expand Up @@ -1208,7 +1212,11 @@ def sum(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)
return F.coalesce(F.sum(spark_column), F.lit(0))

return self._reduce_for_stat_function(
Expand Down Expand Up @@ -1294,7 +1302,11 @@ def prod(spark_column, spark_type):
if isinstance(spark_type, IntegralType):
scol = F.round(scol).cast(LongType())
else:
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)

return F.coalesce(scol, F.lit(1))

Expand Down Expand Up @@ -1345,7 +1357,11 @@ def skew(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)
return F.skewness(spark_column)

return self._reduce_for_stat_function(
Expand Down Expand Up @@ -1394,7 +1410,11 @@ def kurtosis(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)
return F.kurtosis(spark_column)

return self._reduce_for_stat_function(
Expand Down Expand Up @@ -1633,7 +1653,11 @@ def std(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)
if ddof == 0:
return F.stddev_pop(spark_column)
else:
Expand Down Expand Up @@ -1703,7 +1727,11 @@ def var(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)
if ddof == 0:
return F.var_pop(spark_column)
else:
Expand Down Expand Up @@ -1807,7 +1835,11 @@ def median(spark_column, spark_type):
if isinstance(spark_type, (BooleanType, NumericType)):
return SF.percentile_approx(spark_column.cast(DoubleType()), 0.5, accuracy)
else:
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)

return self._reduce_for_stat_function(
median, name="median", numeric_only=numeric_only, axis=axis
Expand Down Expand Up @@ -1885,7 +1917,10 @@ def abs(kser):
return kser.spark.transform(F.abs)
else:
raise TypeError(
"bad operand type for abs(): {}".format(kser.spark.data_type.simpleString())
"bad operand type for abs(): {} ({})".format(
spark_type_to_pandas_dtype(kser.spark.data_type),
kser.spark.data_type.simpleString(),
)
)

return self._apply_series_op(abs)
Expand Down
22 changes: 18 additions & 4 deletions databricks/koalas/series.py
Expand Up @@ -82,7 +82,13 @@
from databricks.koalas.spark import functions as SF
from databricks.koalas.spark.accessors import SparkSeriesMethods
from databricks.koalas.strings import StringMethods
from databricks.koalas.typedef import infer_return_type, SeriesType, ScalarType, Scalar
from databricks.koalas.typedef import (
infer_return_type,
spark_type_to_pandas_dtype,
SeriesType,
ScalarType,
Scalar,
)


# This regular expression pattern is compiled and defined here to avoid compiling the same
Expand Down Expand Up @@ -3302,7 +3308,9 @@ def quantile(spark_column, spark_type):
return SF.percentile_approx(spark_column.cast(DoubleType()), q, accuracy)
else:
raise TypeError(
"Could not convert {} to numeric".format(spark_type.simpleString())
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)

return self._reduce_for_stat_function(quantile, name="quantile")
Expand Down Expand Up @@ -5703,7 +5711,10 @@ def _cumsum(self, skipna, part_cols=()):
kser = kser.spark.transform(lambda scol: scol.cast(LongType()))
elif not isinstance(kser.spark.data_type, NumericType):
raise TypeError(
"Could not convert {} to numeric".format(kser.spark.data_type.simpleString())
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(kser.spark.data_type),
kser.spark.data_type.simpleString(),
)
)
return kser._cum(F.sum, skipna, part_cols)

Expand Down Expand Up @@ -5731,7 +5742,10 @@ def _cumprod(self, skipna, part_cols=()):
scol = F.round(scol).cast(LongType())
else:
raise TypeError(
"Could not convert {} to numeric".format(self.spark.data_type.simpleString())
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(self.spark.data_type),
self.spark.data_type.simpleString(),
)
)

return self._with_new_scol(scol)
Expand Down
4 changes: 2 additions & 2 deletions databricks/koalas/tests/test_dataframe.py
Expand Up @@ -4283,9 +4283,9 @@ def test_quantile(self):
self.assert_eq(kdf.quantile(0.5), pd.Series(name=0.5))
self.assert_eq(kdf.quantile([0.25, 0.5, 0.75]), pd.DataFrame(index=[0.25, 0.5, 0.75]))

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
kdf.quantile(0.5, numeric_only=False)
with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
kdf.quantile([0.25, 0.5, 0.75], numeric_only=False)

def test_pct_change(self):
Expand Down
10 changes: 6 additions & 4 deletions databricks/koalas/tests/test_series.py
Expand Up @@ -1267,9 +1267,9 @@ def test_quantile(self):
with self.assertRaisesRegex(ValueError, "q must be a float or an array of floats;"):
ks.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q=["a"])

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
ks.Series(["a", "b", "c"]).quantile()
with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
ks.Series(["a", "b", "c"]).quantile([0.25, 0.5, 0.75])

def test_idxmax(self):
Expand Down Expand Up @@ -2228,9 +2228,11 @@ def test_product(self):
kser = ks.from_pandas(pser)
self.assert_eq(pser.prod(min_count=1), kser.prod(min_count=1))

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
ks.Series(["a", "b", "c"]).prod()
with self.assertRaisesRegex(TypeError, "Could not convert timestamp to numeric"):
with self.assertRaisesRegex(
TypeError, "Could not convert datetime64\\[ns\\] \\(timestamp\\) to numeric"
):
ks.Series([pd.Timestamp("2016-01-01") for _ in range(3)]).prod()

def test_hasnans(self):
Expand Down
12 changes: 8 additions & 4 deletions databricks/koalas/tests/test_stats.py
Expand Up @@ -140,9 +140,13 @@ def test_abs(self):
self.assert_eq(kdf[["B", "C"]].abs(), pdf[["B", "C"]].abs())
self.assert_eq(kdf[["E"]].abs(), pdf[["E"]].abs())

with self.assertRaisesRegex(TypeError, "bad operand type for abs\\(\\): string"):
with self.assertRaisesRegex(
TypeError, "bad operand type for abs\\(\\): object \\(string\\)"
):
kdf.abs()
with self.assertRaisesRegex(TypeError, "bad operand type for abs\\(\\): string"):
with self.assertRaisesRegex(
TypeError, "bad operand type for abs\\(\\): object \\(string\\)"
):
kdf.D.abs()

def test_axis_on_dataframe(self):
Expand Down Expand Up @@ -331,8 +335,8 @@ def test_numeric_only_unsupported(self):
pdf[["i", "b"]].sum(numeric_only=False).astype(int),
)

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
kdf.sum(numeric_only=False)

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
kdf.s.sum()

0 comments on commit 1e51477

Please sign in to comment.