diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
index cbe125759c..9e93c8ee65 100644
--- a/databricks/koalas/frame.py
+++ b/databricks/koalas/frame.py
@@ -70,6 +70,7 @@
     scol_for,
     validate_axis,
     verify_temp_column_name,
+    default_session,
 )
 from databricks.koalas.generic import Frame
 from databricks.koalas.internal import (
@@ -9742,6 +9743,113 @@ def calculate_columns_axis(*cols):
         )
         return first_series(DataFrame(internal))
 
+    def tail(self, n=5):
+        """
+        Return the last `n` rows.
+
+        This function returns last `n` rows from the object based on
+        position. It is useful for quickly verifying data, for example,
+        after sorting or appending rows.
+
+        For negative values of `n`, this function returns all rows except
+        the first `n` rows, equivalent to ``df[n:]``.
+
+        .. note:: This method requires PySpark 3.0 or later. The requested rows are
+            fetched with Spark's ``DataFrame.tail`` as a local list before being turned
+            back into a distributed frame, so `n` should stay reasonably small.
+
+        Parameters
+        ----------
+        n : int, default 5
+            Number of rows to select.
+
+        Returns
+        -------
+        type of caller
+            The last `n` rows of the caller object.
+
+        Raises
+        ------
+        RuntimeError
+            If the installed PySpark version is lower than 3.0.
+        TypeError
+            If `n` is not an integer.
+
+        See Also
+        --------
+        DataFrame.head : The first `n` rows of the caller object.
+
+        Examples
+        --------
+        >>> df = ks.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
+        ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
+        >>> df
+              animal
+        0  alligator
+        1        bee
+        2     falcon
+        3       lion
+        4     monkey
+        5     parrot
+        6      shark
+        7      whale
+        8      zebra
+
+        Viewing the last 5 lines
+
+        >>> df.tail()  # doctest: +SKIP
+           animal
+        4  monkey
+        5  parrot
+        6   shark
+        7   whale
+        8   zebra
+
+        Viewing the last `n` lines (three in this case)
+
+        >>> df.tail(3)  # doctest: +SKIP
+          animal
+        6  shark
+        7  whale
+        8  zebra
+
+        For negative values of `n`
+
+        >>> df.tail(-3)  # doctest: +SKIP
+           animal
+        3    lion
+        4  monkey
+        5  parrot
+        6   shark
+        7   whale
+        8   zebra
+        """
+        import numbers
+
+        if LooseVersion(pyspark.__version__) < LooseVersion("3.0"):
+            raise RuntimeError("tail can be used in PySpark >= 3.0")
+        # Accept any integral value (e.g. NumPy integers), not only built-in `int`.
+        # The message matches what evaluating `-n` raises for a non-numeric `n`.
+        if not isinstance(n, numbers.Integral):
+            raise TypeError("bad operand type for unary -: '{}'".format(type(n).__name__))
+        # Hand Spark a plain Python int regardless of the integral type given.
+        n = int(n)
+        if n < 0:
+            # A negative `n` drops the first `|n|` rows, which requires the row count.
+            n = len(self) + n
+        if n <= 0:
+            # Nothing to select: filter with a constant-false condition.
+            return DataFrame(self._internal.with_filter(F.lit(False)))
+        # NOTE(review): this reads the underlying Spark frame directly; confirm that
+        # frames carrying not-yet-materialized column expressions (e.g. `(kdf + 1).tail()`)
+        # are handled correctly here.
+        sdf = self._internal.spark_frame
+        # `tail` returns the rows as a local list; rebuild a Spark frame with the same schema.
+        rows = sdf.tail(n)
+        new_sdf = default_session().createDataFrame(rows, sdf.schema)
+
+        return DataFrame(self._internal.with_new_sdf(new_sdf))
+
     def _to_internal_pandas(self):
         """
         Return a pandas DataFrame directly from _internal to avoid overhead of copy.
diff --git a/databricks/koalas/missing/frame.py b/databricks/koalas/missing/frame.py
index abc61d0404..8ffb3c40a2 100644
--- a/databricks/koalas/missing/frame.py
+++ b/databricks/koalas/missing/frame.py
@@ -68,7 +68,6 @@ class _MissingPandasLikeDataFrame(object):
     slice_shift = _unsupported_function("slice_shift")
     swapaxes = _unsupported_function("swapaxes")
     swaplevel = _unsupported_function("swaplevel")
-    tail = _unsupported_function("tail")
     to_feather = _unsupported_function("to_feather")
     to_gbq = _unsupported_function("to_gbq")
     to_hdf = _unsupported_function("to_hdf")
diff --git a/databricks/koalas/missing/series.py b/databricks/koalas/missing/series.py
index dc8f91ba4c..b482d210b9 100644
--- a/databricks/koalas/missing/series.py
+++ b/databricks/koalas/missing/series.py
@@ -64,7 +64,6 @@ class MissingPandasLikeSeries(object):
     slice_shift = _unsupported_function("slice_shift")
     swapaxes = _unsupported_function("swapaxes")
     swaplevel = _unsupported_function("swaplevel")
-    tail = _unsupported_function("tail")
     to_hdf = _unsupported_function("to_hdf")
     to_period = _unsupported_function("to_period")
     to_sql = _unsupported_function("to_sql")
diff --git a/databricks/koalas/series.py b/databricks/koalas/series.py
index ec9854bf24..62b995bf07 100644
--- a/databricks/koalas/series.py
+++ b/databricks/koalas/series.py
@@ -72,6 +72,7 @@
     validate_axis,
     validate_bool_kwarg,
     verify_temp_column_name,
+    default_session,
 )
 from databricks.koalas.datetimes import DatetimeMethods
 from databricks.koalas.spark import functions as SF
@@ -4864,6 +4865,62 @@ def items(self) -> Iterable:
         """This is an alias of ``iteritems``."""
         return self.iteritems()
 
+    def tail(self, n=5):
+        """
+        Return the last `n` rows.
+
+        This function returns last `n` rows from the object based on
+        position. It is useful for quickly verifying data, for example,
+        after sorting or appending rows.
+
+        For negative values of `n`, this function returns all rows except
+        the first `n` rows, equivalent to ``df[n:]``.
+
+        .. note:: This is implemented on top of :meth:`DataFrame.tail`, so it is
+            only available with PySpark 3.0 or later.
+
+        Parameters
+        ----------
+        n : int, default 5
+            Number of rows to select.
+
+        Returns
+        -------
+        type of caller
+            The last `n` rows of the caller object.
+
+        Raises
+        ------
+        RuntimeError
+            If the installed PySpark version is lower than 3.0.
+        TypeError
+            If `n` is not an integer.
+
+        See Also
+        --------
+        DataFrame.head : The first `n` rows of the caller object.
+        DataFrame.tail : The last `n` rows of a DataFrame.
+
+        Examples
+        --------
+        >>> kser = ks.Series([1, 2, 3, 4, 5])
+        >>> kser
+        0    1
+        1    2
+        2    3
+        3    4
+        4    5
+        Name: 0, dtype: int64
+
+        >>> kser.tail(3)  # doctest: +SKIP
+        2    3
+        3    4
+        4    5
+        Name: 0, dtype: int64
+        """
+        # Reuse the DataFrame implementation via `to_frame()` and unwrap the result.
+        return first_series(self.to_frame().tail(n=n))
+
     def _cum(self, func, skipna, part_cols=()):
         # This is used to cummin, cummax, cumsum, etc.
 
diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py
index 466b43fa62..5ebd1e9fa9 100644
--- a/databricks/koalas/tests/test_dataframe.py
+++ b/databricks/koalas/tests/test_dataframe.py
@@ -3729,3 +3729,22 @@ def test_iteritems(self):
 
         for p_items, k_items in zip(pdf.iteritems(), kdf.iteritems()):
             self.assert_eq(repr(p_items), repr(k_items))
+
+    def test_tail(self):
+        pdf = pd.DataFrame(range(1000))
+        kdf = ks.from_pandas(pdf)
+
+        if LooseVersion(pyspark.__version__) < LooseVersion("3.0"):
+            # Unsupported before PySpark 3.0: the call must fail explicitly.
+            with self.assertRaisesRegex(RuntimeError, "tail can be used in PySpark >= 3.0"):
+                kdf.tail()
+            return
+
+        self.assert_eq(pdf.tail(), kdf.tail(), almost=True)
+        self.assert_eq(pdf.tail(10), kdf.tail(10), almost=True)
+        self.assert_eq(pdf.tail(-990), kdf.tail(-990), almost=True)
+        self.assert_eq(pdf.tail(0), kdf.tail(0), almost=True)
+        self.assert_eq(pdf.tail(-1001), kdf.tail(-1001), almost=True)
+        self.assert_eq(pdf.tail(1001), kdf.tail(1001), almost=True)
+        with self.assertRaisesRegex(TypeError, "bad operand type for unary -: 'str'"):
+            kdf.tail("10")
diff --git a/databricks/koalas/tests/test_series.py b/databricks/koalas/tests/test_series.py
index 0495aa1224..76197300ad 100644
--- a/databricks/koalas/tests/test_series.py
+++ b/databricks/koalas/tests/test_series.py
@@ -1806,3 +1806,22 @@ def test_iteritems(self):
 
         for p_items, k_items in zip(pser.iteritems(), kser.iteritems()):
             self.assert_eq(repr(p_items), repr(k_items))
+
+    def test_tail(self):
+        pser = pd.Series(range(1000), name="Koalas")
+        kser = ks.from_pandas(pser)
+
+        if LooseVersion(pyspark.__version__) < LooseVersion("3.0"):
+            # Unsupported before PySpark 3.0: the call must fail explicitly.
+            with self.assertRaisesRegex(RuntimeError, "tail can be used in PySpark >= 3.0"):
+                kser.tail()
+            return
+
+        self.assert_eq(pser.tail(), kser.tail())
+        self.assert_eq(pser.tail(10), kser.tail(10))
+        self.assert_eq(pser.tail(-990), kser.tail(-990))
+        self.assert_eq(pser.tail(0), kser.tail(0))
+        self.assert_eq(pser.tail(1001), kser.tail(1001))
+        self.assert_eq(pser.tail(-1001), kser.tail(-1001))
+        with self.assertRaisesRegex(TypeError, "bad operand type for unary -: 'str'"):
+            kser.tail("10")
diff --git a/docs/source/reference/frame.rst b/docs/source/reference/frame.rst
index 94809acc6b..e061dc6fa1 100644
--- a/docs/source/reference/frame.rst
+++ b/docs/source/reference/frame.rst
@@ -63,6 +63,7 @@ Indexing, iteration
    DataFrame.iterrows
    DataFrame.keys
    DataFrame.pop
+   DataFrame.tail
    DataFrame.xs
    DataFrame.get
    DataFrame.where
diff --git a/docs/source/reference/series.rst b/docs/source/reference/series.rst
index ecba4e6dad..73fb72caef 100644
--- a/docs/source/reference/series.rst
+++ b/docs/source/reference/series.rst
@@ -170,6 +170,7 @@ Reindexing / Selection / Label manipulation
    Series.reset_index
    Series.sample
    Series.take
+   Series.tail
    Series.where
    Series.mask
    Series.truncate