From c8c0d49c6ee62c793479dbcfefdd51385924dc45 Mon Sep 17 00:00:00 2001 From: chenkai02 Date: Fri, 6 Nov 2020 13:51:22 +0000 Subject: [PATCH 1/3] support sample --- databricks/koalas/frame.py | 152 +++++++++++++---- databricks/koalas/tests/test_dataframe.py | 197 ++++++++++++++++++++-- 2 files changed, 310 insertions(+), 39 deletions(-) diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py index dd80728230..479936bd32 100644 --- a/databricks/koalas/frame.py +++ b/databricks/koalas/frame.py @@ -7194,7 +7194,9 @@ def sample( n: Optional[int] = None, frac: Optional[float] = None, replace: bool = False, + weights: Optional[Any] = None, random_state: Optional[int] = None, + axis: Optional[Any] = None, ) -> "DataFrame": """ Return a random sample of items from an axis of object. @@ -7215,14 +7217,34 @@ def sample( Fraction of axis items to return. replace : bool, default False Sample with or without replacement. + weights : str or ndarray-like, optional + Default 'None' results in equal probability weighting. + If passed a Series, will align with target object on index. Index + values in weights not found in sampled object will be ignored and + index values in sampled object not in weights will be assigned + weights of zero. + If called on a DataFrame, will accept the name of a column + when axis = 0. + Unless weights are a Series, weights must be same length as axis + being sampled. + If weights do not sum to 1, they will be normalized to sum to 1. + Missing values in the weights column will be treated as zero. + Infinite values not allowed. random_state : int, optional Seed for the random number generator (if int). + axis : {0 or ‘index’, 1 or ‘columns’, None}, default None + Axis to sample. Accepts axis number or name. Default is stat axis + for given data type (0 for Series and DataFrames). Returns ------- Series or DataFrame A new object of same type as caller containing the sampled items. + Notes + ----- + If `frac` > 1, `replacement` should be set to `True`. + Examples -------- >>> df = ks.DataFrame({'num_legs': [2, 4, 8, 0], @@ -7237,46 +7259,118 @@ def sample( spider 8 0 1 fish 0 0 8 - A random 25% sample of the ``DataFrame``. + Extract 3 random elements from the ``Series`` ``df['num_legs']``: Note that we use `random_state` to ensure the reproducibility of the examples. - >>> df.sample(frac=0.25, random_state=1) # doctest: +SKIP - num_legs num_wings num_specimen_seen - falcon 2 2 10 - fish 0 0 8 - - Extract 25% random elements from the ``Series`` ``df['num_legs']``, with replacement, - so the same items could appear more than once. - - >>> df['num_legs'].sample(frac=0.4, replace=True, random_state=1) # doctest: +SKIP + >>> df['num_legs'].sample(n=3, random_state=1).sort_index() falcon 2 - spider 8 + fish 0 spider 8 Name: num_legs, dtype: int64 - Specifying the exact number of items to return is not supported at the moment. + A random 50% sample of the ``DataFrame`` with replacement: - >>> df.sample(n=5) # doctest: +ELLIPSIS - Traceback (most recent call last): - ... - NotImplementedError: Function sample currently does not support specifying ... + >>> df.sample(frac=0.5, replace=True, random_state=1).sort_index() + num_legs num_wings num_specimen_seen + dog 4 0 2 + fish 0 0 8 """ - # Note: we don't run any of the doctests because the result can change depending on the - # system's core count. - if n is not None: - raise NotImplementedError( - "Function sample currently does not support specifying " - "exact number of items to return. Use frac instead." - ) + if axis in ("index", "rows", 0, None): + axis = 0 + elif axis in ("columns", 1): + raise NotImplementedError("Function sample currently does not support axis=1.") + else: + raise ValueError("No axis named %s for object type %s." % (axis, type(axis))) - if frac is None: - raise ValueError("frac must be specified.") + axis_length = self.shape[axis] - sdf = self._internal.resolved_copy.spark_frame.sample( - withReplacement=replace, fraction=frac, seed=random_state - ) - return DataFrame(self._internal.with_new_sdf(sdf)) + # Process random_state argument + import pandas.core.common as com + + rs = com.random_state(random_state) + + # Check weights for compliance + if weights is not None: + + # If a series, align with frame + if isinstance(weights, ks.Series): + weights = weights.reindex(self.axes[axis]) + + # Strings acceptable if a dataframe and axis = 0 + if isinstance(weights, str): + if isinstance(self, ks.DataFrame): + if axis == 0: + try: + weights = self[weights] + except KeyError as err: + raise KeyError("String passed to weights not a valid column") from err + else: + raise ValueError( + "Strings can only be passed to " + "weights when sampling from rows on " + "a DataFrame" + ) + else: + raise ValueError( + "Strings cannot be passed as weights " "when sampling from a Series." + ) + + # Because ks.Series currently does not support the Series.__iter__ method, + # It cannot be initialized to the pandas Series, so here is to_pandas. + if isinstance(weights, ks.Series): + weights = pd.Series(weights.to_pandas(), dtype="float64") + else: + weights = pd.Series(weights, dtype="float64") + + if len(weights) != axis_length: + raise ValueError("Weights and axis to be sampled must be of same length") + + if (weights == np.inf).any() or (weights == -np.inf).any(): + raise ValueError("weight vector may not include `inf` values") + + if (weights < 0).any(): + raise ValueError("weight vector many not include negative values") + + # If has nan, set to zero. + weights = weights.fillna(0) + + # Renormalize if don't sum to 1 + if weights.sum() != 1: + if weights.sum() != 0: + weights = weights / weights.sum() + else: + raise ValueError("Invalid weights: weights sum to zero") + + weights = weights._values + + # If no frac or n, default to n=1. + if n is None and frac is None: + n = 1 + elif frac is not None and frac > 1 and not replace: + raise ValueError( + "Replace has to be set to `True` when " "upsampling the population `frac` > 1." + ) + elif n is not None and frac is None and n % 1 != 0: + raise ValueError("Only integers accepted as `n` values") + elif n is None and frac is not None: + n = int(round(frac * axis_length)) + elif n is not None and frac is not None: + raise ValueError("Please enter a value for `frac` OR `n`, not both") + + # Check for negative sizes + if n < 0: + raise ValueError("A negative number of rows requested. Please provide positive value.") + + # Because duplicated row selection is not currently supported. + # So if frac > 1, use the pyspark implementation. + if frac is not None and frac > 1: + sdf = self._internal.resolved_copy.spark_frame.sample( + withReplacement=replace, fraction=frac, seed=random_state + ) + return DataFrame(self._internal.with_new_sdf(sdf)) + locs = rs.choice(axis_length, size=n, replace=replace, p=weights) + return self.take(locs, axis=axis) def astype(self, dtype) -> "DataFrame": """ diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py index b55b5c4265..b54f634cfe 100644 --- a/databricks/koalas/tests/test_dataframe.py +++ b/databricks/koalas/tests/test_dataframe.py @@ -2028,21 +2028,198 @@ def test_binary_operator_multiply(self): self.assertRaisesRegex(TypeError, ks_err_msg, lambda: 0.1 * kdf["a"]) def test_sample(self): - pdf = pd.DataFrame({"A": [0, 2, 4]}) + # A few dataframe test with degenerate weights. + easy_weight_list = [0] * 10 + easy_weight_list[5] = 1 + + pdf = pd.DataFrame( + { + "col1": range(10, 20), + "col2": range(20, 30), + "colString": ["a"] * 10, + "easyweights": easy_weight_list, + } + ) kdf = ks.from_pandas(pdf) - # Make sure the tests run, but we can't check the result because they are non-deterministic. - kdf.sample(frac=0.1) - kdf.sample(frac=0.2, replace=True) - kdf.sample(frac=0.2, random_state=5) - kdf["A"].sample(frac=0.2) - kdf["A"].sample(frac=0.2, replace=True) - kdf["A"].sample(frac=0.2, random_state=5) + self.assert_eq( + kdf.sample(n=1, weights="easyweights"), pdf.sample(n=1, weights="easyweights"), + ) + + # Test that function aligns weights with frame + pdf = pd.DataFrame({"col1": [5, 6, 7], "col2": ["a", "b", "c"]}, index=[9, 5, 3]) + pser = pd.Series([1, 0, 0], index=[3, 5, 9]) + + kdf = ks.from_pandas(pdf) + kser = ks.from_pandas(pser) + self.assert_eq(kdf.sample(1, weights=kser), pdf.sample(1, weights=pser)) + + # Weights have index values to be dropped because not in + # sampled DataFrame + pser2 = pd.Series([0.001, 0, 10000], index=[3, 5, 10]) + kser2 = ks.from_pandas(pser2) + self.assert_eq(kdf.sample(1, weights=kser2), pdf.sample(1, weights=pser2)) + + # Weights have empty values to be filed with zeros + pser3 = pd.Series([0.01, 0], index=[3, 5]) + kser3 = ks.from_pandas(pser3) + self.assert_eq(kdf.sample(1, weights=kser3), pdf.sample(1, weights=pser3)) + + # No overlap in weight and sampled DataFrame indices + pser4 = pd.Series([1, 0], index=[1, 2]) + kser4 = ks.from_pandas(pser4) + with self.assertRaises(ValueError): + kdf.sample(1, weights=kser4) + + ### + # Check behavior of random_state argument + ### + + # Check for stability when receives seed or random state -- run 10 + # times. + for test in range(10): + seed = np.random.randint(0, 100) + self.assert_eq(kdf.sample(n=2, random_state=seed), kdf.sample(n=2, random_state=seed)) + self.assert_eq( + kdf.sample(frac=0.7, random_state=seed), kdf.sample(frac=0.7, random_state=seed) + ) + self.assert_eq( + kdf.sample(n=2, random_state=np.random.RandomState(test)), + kdf.sample(n=2, random_state=np.random.RandomState(test)), + ) + self.assert_eq( + kdf.sample(frac=0.7, random_state=np.random.RandomState(test)), + kdf.sample(frac=0.7, random_state=np.random.RandomState(test)), + ) + + # Check for error when random_state argument invalid. + with self.assertRaises(ValueError): + kdf.sample(random_state="astring!") + + ### + # Check behavior of `frac` and `N` + ### + + # Giving both frac and N throws error + with self.assertRaises(ValueError): + kdf.sample(n=3, frac=0.3) + + # Check that raises right error for negative lengths + with self.assertRaises(ValueError): + kdf.sample(n=-3) + with self.assertRaises(ValueError): + kdf.sample(frac=-0.3) + + # Make sure float values of `n` give error + with self.assertRaises(ValueError): + kdf.sample(n=3.2) + + # Check lengths are right + assert len(kdf.sample(n=2) == 2) + assert len(kdf.sample(frac=0.34) == 1) + assert len(kdf.sample(frac=0.48) == 2) + + ### + # Check weights + ### + + # Weight length must be right + with self.assertRaises(ValueError): + kdf.sample(n=3, weights=[0, 1]) + + with self.assertRaises(ValueError): + bad_weights = [0.5] * 11 + kdf.sample(n=3, weights=bad_weights) + + with self.assertRaises(ValueError): + bad_weight_series = ks.Series([0, 0, 0.2]) + kdf.sample(n=4, weights=bad_weight_series) + + # Check won't accept negative weights + with self.assertRaises(ValueError): + bad_weights = [-0.1] * 10 + kdf.sample(n=3, weights=bad_weights) + + # Check inf and -inf throw errors: + with self.assertRaises(ValueError): + weights_with_inf = [0.1] * 10 + weights_with_inf[0] = np.inf + kdf.sample(n=3, weights=weights_with_inf) + + with self.assertRaises(ValueError): + weights_with_ninf = [0.1] * 10 + weights_with_ninf[0] = -np.inf + kdf.sample(n=3, weights=weights_with_ninf) + + # All zeros raises errors + zero_weights = [0] * 10 + with self.assertRaises(ValueError): + kdf.sample(n=3, weights=zero_weights) + + # All missing weights + nan_weights = [np.nan] * 10 + with self.assertRaises(ValueError): + kdf.sample(n=3, weights=nan_weights) + + # Check np.nan are replaced by zeros. + weights_with_nan = [np.nan] * 3 + weights_with_nan[2] = 0.5 + self.assert_eq( + kdf.sample(n=1, axis=0, weights=weights_with_nan), + pdf.sample(n=1, axis=0, weights=weights_with_nan), + ) + + # Check None are also replaced by zeros. + weights_with_None = [None] * 3 + weights_with_None[2] = 0.5 + self.assert_eq( + kdf.sample(n=1, axis=0, weights=weights_with_None), + pdf.sample(n=1, axis=0, weights=weights_with_None), + ) + + ### + # Test axis argument + ### + + # Test axis argument + pdf = pd.DataFrame({"col1": range(10), "col2": ["a"] * 10}) + kdf = ks.from_pandas(pdf) + second_column_weight = [0, 1] + + weight = [0] * 10 + weight[5] = 0.5 + self.assert_eq( + kdf.sample(n=1, axis="rows", weights=weight), + pdf.sample(n=1, axis="rows", weights=weight), + ) + self.assert_eq( + kdf.sample(n=1, axis="index", weights=weight), + pdf.sample(n=1, axis="index", weights=weight), + ) + # Check out of range axis values with self.assertRaises(ValueError): - kdf.sample() + kdf.sample(n=1, axis=2) + + with self.assertRaises(ValueError): + kdf.sample(n=1, axis="not_a_name") + + # Check for axis=1 raise NotImplementedError + with self.assertRaises(NotImplementedError): + kdf.sample(n=1, axis=1) + with self.assertRaises(NotImplementedError): - kdf.sample(n=1) + kdf.sample(n=1, axis="columns") + + # Check for frac > 1 and replace + kdf = ks.DataFrame({"A": list("abc")}) + msg = "Replace has to be set to `True` when " "upsampling the population `frac` > 1." + with self.assertRaisesRegex(ValueError, msg): + kdf.sample(frac=2, replace=False) + + # Check for frac > 1 and replace + # Make sure the tests run, but we can't check the result because they are non-deterministic. + kdf.sample(frac=2, replace=True) def test_add_prefix(self): pdf = pd.DataFrame({"A": [1, 2, 3, 4], "B": [3, 4, 5, 6]}, index=np.random.rand(4)) From 163442841e4eef40c0edcdafd97c0c5f82467e77 Mon Sep 17 00:00:00 2001 From: chenkai02 Date: Fri, 6 Nov 2020 13:51:22 +0000 Subject: [PATCH 2/3] Support the remaining parameters of the sample function of DataFrame, such as n, axis, weights. Now there are two unsupported situations: 1.does not support axis=1 2.If the value of the frac parameter > 1, the weights parameter is not supported --- databricks/koalas/frame.py | 154 +++++++++++++---- databricks/koalas/series.py | 11 +- databricks/koalas/tests/test_dataframe.py | 197 ++++++++++++++++++++-- 3 files changed, 322 insertions(+), 40 deletions(-) diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py index dd80728230..f6e4bea0f8 100644 --- a/databricks/koalas/frame.py +++ b/databricks/koalas/frame.py @@ -53,6 +53,7 @@ else: from pandas.core.dtypes.common import _get_dtype_from_object as infer_dtype_from_object from pandas.core.accessor import CachedAccessor +import pandas.core.common as com from pandas.core.dtypes.inference import is_sequence import pyspark from pyspark import StorageLevel @@ -7194,7 +7195,9 @@ def sample( n: Optional[int] = None, frac: Optional[float] = None, replace: bool = False, + weights: Optional[Any] = None, random_state: Optional[int] = None, + axis: Optional[Any] = None, ) -> "DataFrame": """ Return a random sample of items from an axis of object. @@ -7215,14 +7218,34 @@ def sample( Fraction of axis items to return. replace : bool, default False Sample with or without replacement. + weights : str or ndarray-like, optional + Default 'None' results in equal probability weighting. + If passed a Series, will align with target object on index. Index + values in weights not found in sampled object will be ignored and + index values in sampled object not in weights will be assigned + weights of zero. + If called on a DataFrame, will accept the name of a column + when axis = 0. + Unless weights are a Series, weights must be same length as axis + being sampled. + If weights do not sum to 1, they will be normalized to sum to 1. + Missing values in the weights column will be treated as zero. + Infinite values not allowed. random_state : int, optional Seed for the random number generator (if int). + axis : {0 or ‘index’, 1 or ‘columns’, None}, default None + Axis to sample. Accepts axis number or name. Default is stat axis + for given data type (0 for Series and DataFrames). Returns ------- Series or DataFrame A new object of same type as caller containing the sampled items. + Notes + ----- + If `frac` > 1, `replacement` should be set to `True`. + Examples -------- >>> df = ks.DataFrame({'num_legs': [2, 4, 8, 0], @@ -7237,46 +7260,119 @@ def sample( spider 8 0 1 fish 0 0 8 - A random 25% sample of the ``DataFrame``. + Extract 3 random elements from the ``Series`` ``df['num_legs']``: Note that we use `random_state` to ensure the reproducibility of the examples. - >>> df.sample(frac=0.25, random_state=1) # doctest: +SKIP - num_legs num_wings num_specimen_seen - falcon 2 2 10 - fish 0 0 8 - - Extract 25% random elements from the ``Series`` ``df['num_legs']``, with replacement, - so the same items could appear more than once. - - >>> df['num_legs'].sample(frac=0.4, replace=True, random_state=1) # doctest: +SKIP + >>> df['num_legs'].sample(n=3, random_state=1).sort_index() falcon 2 - spider 8 + fish 0 spider 8 Name: num_legs, dtype: int64 - Specifying the exact number of items to return is not supported at the moment. + A random 50% sample of the ``DataFrame`` with replacement: - >>> df.sample(n=5) # doctest: +ELLIPSIS - Traceback (most recent call last): - ... - NotImplementedError: Function sample currently does not support specifying ... + >>> df.sample(frac=0.5, replace=True, random_state=1).sort_index() + num_legs num_wings num_specimen_seen + dog 4 0 2 + fish 0 0 8 """ - # Note: we don't run any of the doctests because the result can change depending on the - # system's core count. - if n is not None: - raise NotImplementedError( - "Function sample currently does not support specifying " - "exact number of items to return. Use frac instead." - ) + if axis in ("index", "rows", 0, None): + axis = 0 + elif axis in ("columns", 1): + raise NotImplementedError("Function sample currently does not support axis=1.") + else: + raise ValueError("No axis named %s for object type %s." % (axis, type(axis))) - if frac is None: - raise ValueError("frac must be specified.") + axis_length = self.shape[axis] - sdf = self._internal.resolved_copy.spark_frame.sample( - withReplacement=replace, fraction=frac, seed=random_state - ) - return DataFrame(self._internal.with_new_sdf(sdf)) + # Process random_state argument + if LooseVersion(pd.__version__) >= LooseVersion("0.24"): + rs = com.random_state(random_state) + else: + rs = com._random_state(random_state) + + # Check weights for compliance + if weights is not None: + + # If a series, align with frame + if isinstance(weights, ks.Series): + weights = weights.reindex(self.axes[axis]) + + # Strings acceptable if a dataframe and axis = 0 + if isinstance(weights, str): + if isinstance(self, ks.DataFrame): + if axis == 0: + try: + weights = self[weights] + except KeyError as err: + raise KeyError("String passed to weights not a valid column") from err + else: + raise ValueError( + "Strings can only be passed to " + "weights when sampling from rows on " + "a DataFrame" + ) + else: + raise ValueError( + "Strings cannot be passed as weights " "when sampling from a Series." + ) + + # Because ks.Series currently does not support the Series.__iter__ method, + # It cannot be initialized to the pandas Series, so here is to_pandas. + if isinstance(weights, ks.Series): + weights = pd.Series(weights.to_pandas(), dtype="float64") + else: + weights = pd.Series(weights, dtype="float64") + + if len(weights) != axis_length: + raise ValueError("Weights and axis to be sampled must be of same length") + + if (weights == np.inf).any() or (weights == -np.inf).any(): + raise ValueError("weight vector may not include `inf` values") + + if (weights < 0).any(): + raise ValueError("weight vector many not include negative values") + + # If has nan, set to zero. + weights = weights.fillna(0) + + # Renormalize if don't sum to 1 + if weights.sum() != 1: + if weights.sum() != 0: + weights = weights / weights.sum() + else: + raise ValueError("Invalid weights: weights sum to zero") + + weights = weights._values + + # If no frac or n, default to n=1. + if n is None and frac is None: + n = 1 + elif frac is not None and frac > 1 and not replace: + raise ValueError( + "Replace has to be set to `True` when " "upsampling the population `frac` > 1." + ) + elif n is not None and frac is None and n % 1 != 0: + raise ValueError("Only integers accepted as `n` values") + elif n is None and frac is not None: + n = int(round(frac * axis_length)) + elif n is not None and frac is not None: + raise ValueError("Please enter a value for `frac` OR `n`, not both") + + # Check for negative sizes + if n < 0: + raise ValueError("A negative number of rows requested. Please provide positive value.") + + # Because duplicated row selection is not currently supported. + # So if frac > 1, use the pyspark implementation. + if frac is not None and frac > 1: + sdf = self._internal.resolved_copy.spark_frame.sample( + withReplacement=replace, fraction=float(frac), seed=random_state + ) + return DataFrame(self._internal.with_new_sdf(sdf)) + locs = rs.choice(axis_length, size=n, replace=replace, p=weights) + return self.take(locs, axis=axis) def astype(self, dtype) -> "DataFrame": """ diff --git a/databricks/koalas/series.py b/databricks/koalas/series.py index 8f9d03abaf..232efdec4f 100644 --- a/databricks/koalas/series.py +++ b/databricks/koalas/series.py @@ -2855,10 +2855,19 @@ def sample( n: Optional[int] = None, frac: Optional[float] = None, replace: bool = False, + weights: Optional[Any] = None, random_state: Optional[int] = None, + axis: Optional[Any] = None, ) -> "Series": return first_series( - self.to_frame().sample(n=n, frac=frac, replace=replace, random_state=random_state) + self.to_frame().sample( + n=n, + frac=frac, + replace=replace, + weights=weights, + random_state=random_state, + axis=axis, + ) ).rename(self.name) sample.__doc__ = DataFrame.sample.__doc__ diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py index b55b5c4265..b54f634cfe 100644 --- a/databricks/koalas/tests/test_dataframe.py +++ b/databricks/koalas/tests/test_dataframe.py @@ -2028,21 +2028,198 @@ def test_binary_operator_multiply(self): self.assertRaisesRegex(TypeError, ks_err_msg, lambda: 0.1 * kdf["a"]) def test_sample(self): - pdf = pd.DataFrame({"A": [0, 2, 4]}) + # A few dataframe test with degenerate weights. + easy_weight_list = [0] * 10 + easy_weight_list[5] = 1 + + pdf = pd.DataFrame( + { + "col1": range(10, 20), + "col2": range(20, 30), + "colString": ["a"] * 10, + "easyweights": easy_weight_list, + } + ) kdf = ks.from_pandas(pdf) - # Make sure the tests run, but we can't check the result because they are non-deterministic. - kdf.sample(frac=0.1) - kdf.sample(frac=0.2, replace=True) - kdf.sample(frac=0.2, random_state=5) - kdf["A"].sample(frac=0.2) - kdf["A"].sample(frac=0.2, replace=True) - kdf["A"].sample(frac=0.2, random_state=5) + self.assert_eq( + kdf.sample(n=1, weights="easyweights"), pdf.sample(n=1, weights="easyweights"), + ) + + # Test that function aligns weights with frame + pdf = pd.DataFrame({"col1": [5, 6, 7], "col2": ["a", "b", "c"]}, index=[9, 5, 3]) + pser = pd.Series([1, 0, 0], index=[3, 5, 9]) + + kdf = ks.from_pandas(pdf) + kser = ks.from_pandas(pser) + self.assert_eq(kdf.sample(1, weights=kser), pdf.sample(1, weights=pser)) + + # Weights have index values to be dropped because not in + # sampled DataFrame + pser2 = pd.Series([0.001, 0, 10000], index=[3, 5, 10]) + kser2 = ks.from_pandas(pser2) + self.assert_eq(kdf.sample(1, weights=kser2), pdf.sample(1, weights=pser2)) + + # Weights have empty values to be filed with zeros + pser3 = pd.Series([0.01, 0], index=[3, 5]) + kser3 = ks.from_pandas(pser3) + self.assert_eq(kdf.sample(1, weights=kser3), pdf.sample(1, weights=pser3)) + + # No overlap in weight and sampled DataFrame indices + pser4 = pd.Series([1, 0], index=[1, 2]) + kser4 = ks.from_pandas(pser4) + with self.assertRaises(ValueError): + kdf.sample(1, weights=kser4) + + ### + # Check behavior of random_state argument + ### + + # Check for stability when receives seed or random state -- run 10 + # times. + for test in range(10): + seed = np.random.randint(0, 100) + self.assert_eq(kdf.sample(n=2, random_state=seed), kdf.sample(n=2, random_state=seed)) + self.assert_eq( + kdf.sample(frac=0.7, random_state=seed), kdf.sample(frac=0.7, random_state=seed) + ) + self.assert_eq( + kdf.sample(n=2, random_state=np.random.RandomState(test)), + kdf.sample(n=2, random_state=np.random.RandomState(test)), + ) + self.assert_eq( + kdf.sample(frac=0.7, random_state=np.random.RandomState(test)), + kdf.sample(frac=0.7, random_state=np.random.RandomState(test)), + ) + + # Check for error when random_state argument invalid. + with self.assertRaises(ValueError): + kdf.sample(random_state="astring!") + + ### + # Check behavior of `frac` and `N` + ### + + # Giving both frac and N throws error + with self.assertRaises(ValueError): + kdf.sample(n=3, frac=0.3) + + # Check that raises right error for negative lengths + with self.assertRaises(ValueError): + kdf.sample(n=-3) + with self.assertRaises(ValueError): + kdf.sample(frac=-0.3) + + # Make sure float values of `n` give error + with self.assertRaises(ValueError): + kdf.sample(n=3.2) + + # Check lengths are right + assert len(kdf.sample(n=2) == 2) + assert len(kdf.sample(frac=0.34) == 1) + assert len(kdf.sample(frac=0.48) == 2) + + ### + # Check weights + ### + + # Weight length must be right + with self.assertRaises(ValueError): + kdf.sample(n=3, weights=[0, 1]) + + with self.assertRaises(ValueError): + bad_weights = [0.5] * 11 + kdf.sample(n=3, weights=bad_weights) + + with self.assertRaises(ValueError): + bad_weight_series = ks.Series([0, 0, 0.2]) + kdf.sample(n=4, weights=bad_weight_series) + + # Check won't accept negative weights + with self.assertRaises(ValueError): + bad_weights = [-0.1] * 10 + kdf.sample(n=3, weights=bad_weights) + + # Check inf and -inf throw errors: + with self.assertRaises(ValueError): + weights_with_inf = [0.1] * 10 + weights_with_inf[0] = np.inf + kdf.sample(n=3, weights=weights_with_inf) + + with self.assertRaises(ValueError): + weights_with_ninf = [0.1] * 10 + weights_with_ninf[0] = -np.inf + kdf.sample(n=3, weights=weights_with_ninf) + + # All zeros raises errors + zero_weights = [0] * 10 + with self.assertRaises(ValueError): + kdf.sample(n=3, weights=zero_weights) + + # All missing weights + nan_weights = [np.nan] * 10 + with self.assertRaises(ValueError): + kdf.sample(n=3, weights=nan_weights) + + # Check np.nan are replaced by zeros. + weights_with_nan = [np.nan] * 3 + weights_with_nan[2] = 0.5 + self.assert_eq( + kdf.sample(n=1, axis=0, weights=weights_with_nan), + pdf.sample(n=1, axis=0, weights=weights_with_nan), + ) + + # Check None are also replaced by zeros. + weights_with_None = [None] * 3 + weights_with_None[2] = 0.5 + self.assert_eq( + kdf.sample(n=1, axis=0, weights=weights_with_None), + pdf.sample(n=1, axis=0, weights=weights_with_None), + ) + + ### + # Test axis argument + ### + + # Test axis argument + pdf = pd.DataFrame({"col1": range(10), "col2": ["a"] * 10}) + kdf = ks.from_pandas(pdf) + second_column_weight = [0, 1] + + weight = [0] * 10 + weight[5] = 0.5 + self.assert_eq( + kdf.sample(n=1, axis="rows", weights=weight), + pdf.sample(n=1, axis="rows", weights=weight), + ) + self.assert_eq( + kdf.sample(n=1, axis="index", weights=weight), + pdf.sample(n=1, axis="index", weights=weight), + ) + # Check out of range axis values with self.assertRaises(ValueError): - kdf.sample() + kdf.sample(n=1, axis=2) + + with self.assertRaises(ValueError): + kdf.sample(n=1, axis="not_a_name") + + # Check for axis=1 raise NotImplementedError + with self.assertRaises(NotImplementedError): + kdf.sample(n=1, axis=1) + with self.assertRaises(NotImplementedError): - kdf.sample(n=1) + kdf.sample(n=1, axis="columns") + + # Check for frac > 1 and replace + kdf = ks.DataFrame({"A": list("abc")}) + msg = "Replace has to be set to `True` when " "upsampling the population `frac` > 1." + with self.assertRaisesRegex(ValueError, msg): + kdf.sample(frac=2, replace=False) + + # Check for frac > 1 and replace + # Make sure the tests run, but we can't check the result because they are non-deterministic. + kdf.sample(frac=2, replace=True) def test_add_prefix(self): pdf = pd.DataFrame({"A": [1, 2, 3, 4], "B": [3, 4, 5, 6]}, index=np.random.rand(4)) From 4394f5f76c8f78c46d74df674fc13b178672f949 Mon Sep 17 00:00:00 2001 From: chenkai02 Date: Thu, 12 Nov 2020 08:08:33 +0000 Subject: [PATCH 3/3] 1. For performance considerations, the weights parameter does not support str and series temporarily 2. Optimize part of the code based on review comments. --- databricks/koalas/frame.py | 45 +++++++------------ databricks/koalas/tests/test_dataframe.py | 54 +++-------------------- databricks/koalas/utils.py | 2 +- 3 files changed, 25 insertions(+), 76 deletions(-) diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py index e42db49c3c..76ab6511c1 100644 --- a/databricks/koalas/frame.py +++ b/databricks/koalas/frame.py @@ -7218,7 +7218,8 @@ def sample( Fraction of axis items to return. replace : bool, default False Sample with or without replacement. - weights : str or ndarray-like, optional + weights : ndarray-like, optional + Currently does not support Series and str. Default 'None' results in equal probability weighting. If passed a Series, will align with target object on index. Index values in weights not found in sampled object will be ignored and @@ -7277,12 +7278,9 @@ def sample( dog 4 0 2 fish 0 0 8 """ - if axis in ("index", "rows", 0, None): - axis = 0 - elif axis in ("columns", 1): + axis = validate_axis(axis) + if axis == 1: raise NotImplementedError("Function sample currently does not support axis=1.") - else: - raise ValueError("No axis named %s for object type %s." % (axis, type(axis))) axis_length = self.shape[axis] @@ -7295,25 +7293,15 @@ def sample( # Check weights for compliance if weights is not None: - # If a series, align with frame - if isinstance(weights, ks.Series): - weights = weights.reindex(self.axes[axis]) - - # Strings acceptable if a dataframe and axis = 0 - if isinstance(weights, str): - if isinstance(self, ks.DataFrame): - if axis == 0: - try: - weights = self[weights] - except KeyError as err: - raise KeyError("String passed to weights not a valid column") from err - - # Because ks.Series currently does not support the Series.__iter__ method, + # If a series or str, ks.Series currently does not support the Series.__iter__ method, # It cannot be initialized to the pandas Series, so here is to_pandas. - if isinstance(weights, ks.Series): - weights = pd.Series(weights.to_pandas(), dtype="float64") - else: - weights = pd.Series(weights, dtype="float64") + # Don't support weights as Series for now since it could occur performance degradation. + if isinstance(weights, (ks.Series, str)): + raise NotImplementedError( + "The weights parameter does not currently support the Series and str." + ) + + weights = pd.Series(weights, dtype="float64") if len(weights) != axis_length: raise ValueError("Weights and axis to be sampled must be of same length") @@ -7322,15 +7310,16 @@ def sample( raise ValueError("weight vector may not include `inf` values") if (weights < 0).any(): - raise ValueError("weight vector many not include negative values") + raise ValueError("weight vector may not include negative values") # If has nan, set to zero. weights = weights.fillna(0) # Renormalize if don't sum to 1 - if weights.sum() != 1: - if weights.sum() != 0: - weights = weights / weights.sum() + weights_sum = weights.sum() + if weights_sum != 1: + if weights_sum != 0: + weights = weights / weights_sum else: raise ValueError("Invalid weights: weights sum to zero") diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py index 12f3b63dda..076f94b5a8 100644 --- a/databricks/koalas/tests/test_dataframe.py +++ b/databricks/koalas/tests/test_dataframe.py @@ -2028,52 +2028,8 @@ def test_binary_operator_multiply(self): self.assertRaisesRegex(TypeError, ks_err_msg, lambda: 0.1 * kdf["a"]) def test_sample(self): - # A few dataframe test with degenerate weights. - easy_weight_list = [0] * 10 - easy_weight_list[5] = 1 - - pdf = pd.DataFrame( - { - "col1": range(10, 20), - "col2": range(20, 30), - "colString": ["a"] * 10, - "easyweights": easy_weight_list, - } - ) - kdf = ks.from_pandas(pdf) - - self.assert_eq( - kdf.sample(n=1, weights="easyweights"), pdf.sample(n=1, weights="easyweights"), - ) - - # Weights for invalid key - with self.assertRaises(KeyError): - kdf.sample(1, weights="col3") - - # Test that function aligns weights with frame pdf = pd.DataFrame({"col1": [5, 6, 7], "col2": ["a", "b", "c"]}, index=[9, 5, 3]) - pser = pd.Series([1, 0, 0], index=[3, 5, 9]) - kdf = ks.from_pandas(pdf) - kser = ks.from_pandas(pser) - self.assert_eq(kdf.sample(1, weights=kser), pdf.sample(1, weights=pser)) - - # Weights have index values to be dropped because not in - # sampled DataFrame - pser2 = pd.Series([0.001, 0, 10000], index=[3, 5, 10]) - kser2 = ks.from_pandas(pser2) - self.assert_eq(kdf.sample(1, weights=kser2), pdf.sample(1, weights=pser2)) - - # Weights have empty values to be filed with zeros - pser3 = pd.Series([0.01, 0], index=[3, 5]) - kser3 = ks.from_pandas(pser3) - self.assert_eq(kdf.sample(1, weights=kser3), pdf.sample(1, weights=pser3)) - - # No overlap in weight and sampled DataFrame indices - pser4 = pd.Series([1, 0], index=[1, 2]) - kser4 = ks.from_pandas(pser4) - with self.assertRaises(ValueError): - kdf.sample(1, weights=kser4) ### # Check behavior of random_state argument @@ -2135,9 +2091,13 @@ def test_sample(self): bad_weights = [0.5] * 11 kdf.sample(n=3, weights=bad_weights) - with self.assertRaises(ValueError): - bad_weight_series = ks.Series([0, 0, 0.2]) - kdf.sample(n=4, weights=bad_weight_series) + # Weight do not support a Series or str + with self.assertRaises(NotImplementedError): + weight_series = ks.Series([0, 0.2]) + kdf.sample(n=4, weights=weight_series) + + with self.assertRaises(NotImplementedError): + kdf.sample(n=4, weights="col1") # Check won't accept negative weights with self.assertRaises(ValueError): diff --git a/databricks/koalas/utils.py b/databricks/koalas/utils.py index 4c353928e8..776aa8db8e 100644 --- a/databricks/koalas/utils.py +++ b/databricks/koalas/utils.py @@ -652,7 +652,7 @@ def is_name_like_value( def validate_axis(axis=0, none_axis=0): """ Check the given axis is valid. """ # convert to numeric axis - axis = {None: none_axis, "index": 0, "columns": 1}.get(axis, axis) + axis = {None: none_axis, "index": 0, "rows": 0, "columns": 1}.get(axis, axis) if axis not in (none_axis, 0, 1): raise ValueError("No axis named {0}".format(axis)) return axis