Implementing Series.tail() & DataFrame.tail() (#1632)
Since Spark 3.0 supports `tail()`, this PR proposes `Series.tail()` and `DataFrame.tail()` for Koalas based on it.


- Series
```python
>>> kser = ks.Series([1, 2, 3, 4, 5])
>>> kser
0    1
1    2
2    3
3    4
4    5
Name: 0, dtype: int64

>>> kser.tail(3)
2    3
3    4
4    5
Name: 0, dtype: int64
```

- DataFrame
```python
>>> df = ks.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
>>> df
      animal
0  alligator
1        bee
2     falcon
3       lion
4     monkey
5     parrot
6      shark
7      whale
8      zebra

>>> df.tail()
   animal
4  monkey
5  parrot
6   shark
7   whale
8   zebra

>>> df.tail(3)
  animal
6  shark
7  whale
8  zebra

>>> df.tail(-3)
   animal
3    lion
4  monkey
5  parrot
6   shark
7   whale
8   zebra
```
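
For context, the implementation delegates to PySpark 3.0's `DataFrame.tail(num)`, which collects the last `num` rows to the driver as a list of `Row` objects. A minimal sketch of that building block (assuming a running Spark 3.0+ session):
```python
>>> from pyspark.sql import SparkSession
>>> spark = SparkSession.builder.getOrCreate()
>>> spark.range(10).tail(3)  # the last rows come back to the driver as Rows
[Row(id=7), Row(id=8), Row(id=9)]
```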

Resolves #343
itholic committed Jul 7, 2020
1 parent 3c18bb4 commit ca3d277
Showing 8 changed files with 160 additions and 2 deletions.
85 changes: 85 additions & 0 deletions databricks/koalas/frame.py
@@ -70,6 +70,7 @@
    scol_for,
    validate_axis,
    verify_temp_column_name,
    default_session,
)
from databricks.koalas.generic import Frame
from databricks.koalas.internal import (
@@ -9742,6 +9743,90 @@ def calculate_columns_axis(*cols):
        )
        return first_series(DataFrame(internal))

    def tail(self, n=5):
        """
        Return the last `n` rows.

        This function returns last `n` rows from the object based on
        position. It is useful for quickly verifying data, for example,
        after sorting or appending rows.

        For negative values of `n`, this function returns all rows except
        the first `n` rows, equivalent to ``df[n:]``.

        Parameters
        ----------
        n : int, default 5
            Number of rows to select.

        Returns
        -------
        type of caller
            The last `n` rows of the caller object.

        See Also
        --------
        DataFrame.head : The first `n` rows of the caller object.

        Examples
        --------
        >>> df = ks.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
        ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
        >>> df
              animal
        0  alligator
        1        bee
        2     falcon
        3       lion
        4     monkey
        5     parrot
        6      shark
        7      whale
        8      zebra

        Viewing the last 5 lines

        >>> df.tail()  # doctest: +SKIP
           animal
        4  monkey
        5  parrot
        6   shark
        7   whale
        8   zebra

        Viewing the last `n` lines (three in this case)

        >>> df.tail(3)  # doctest: +SKIP
          animal
        6  shark
        7  whale
        8  zebra

        For negative values of `n`

        >>> df.tail(-3)  # doctest: +SKIP
           animal
        3    lion
        4  monkey
        5  parrot
        6   shark
        7   whale
        8   zebra
        """
        if LooseVersion(pyspark.__version__) < LooseVersion("3.0"):
            raise RuntimeError("tail can only be used with PySpark >= 3.0")
        if not isinstance(n, int):
            raise TypeError("bad operand type for unary -: '{}'".format(type(n).__name__))
        if n < 0:
            # Negative n keeps all rows but the first |n|, mirroring pandas.
            n = len(self) + n
        if n <= 0:
            # Nothing to return; produce an empty frame via an always-false filter.
            return ks.DataFrame(self._internal.with_filter(F.lit(False)))
        sdf = self._internal.spark_frame
        # Collect the last n rows to the driver, then rebuild a Spark DataFrame.
        rows = sdf.tail(n)
        new_sdf = default_session().createDataFrame(rows, sdf.schema)

        return DataFrame(self._internal.with_new_sdf(new_sdf))

    def _to_internal_pandas(self):
        """
        Return a pandas DataFrame directly from _internal to avoid overhead of copy.
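The negative-`n` branch above follows pandas semantics: `tail(-3)` keeps all but the first 3 rows. A standalone sketch of the row-count arithmetic (a hypothetical helper for illustration, not part of this commit):
```python
def resolve_tail_count(n: int, length: int) -> int:
    """Number of trailing rows tail(n) should keep, pandas-style."""
    if n < 0:
        n = length + n  # tail(-3) on 9 rows -> keep the last 6
    return max(n, 0)    # non-positive counts yield an empty result

assert resolve_tail_count(5, 9) == 5
assert resolve_tail_count(-3, 9) == 6
assert resolve_tail_count(-1001, 9) == 0  # matches the F.lit(False) filter path
```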
1 change: 0 additions & 1 deletion databricks/koalas/missing/frame.py
@@ -68,7 +68,6 @@ class _MissingPandasLikeDataFrame(object):
    slice_shift = _unsupported_function("slice_shift")
    swapaxes = _unsupported_function("swapaxes")
    swaplevel = _unsupported_function("swaplevel")
    tail = _unsupported_function("tail")
    to_feather = _unsupported_function("to_feather")
    to_gbq = _unsupported_function("to_gbq")
    to_hdf = _unsupported_function("to_hdf")
1 change: 0 additions & 1 deletion databricks/koalas/missing/series.py
@@ -64,7 +64,6 @@ class MissingPandasLikeSeries(object):
    slice_shift = _unsupported_function("slice_shift")
    swapaxes = _unsupported_function("swapaxes")
    swaplevel = _unsupported_function("swaplevel")
    tail = _unsupported_function("tail")
    to_hdf = _unsupported_function("to_hdf")
    to_period = _unsupported_function("to_period")
    to_sql = _unsupported_function("to_sql")
45 changes: 45 additions & 0 deletions databricks/koalas/series.py
@@ -72,6 +72,7 @@
    validate_axis,
    validate_bool_kwarg,
    verify_temp_column_name,
    default_session,
)
from databricks.koalas.datetimes import DatetimeMethods
from databricks.koalas.spark import functions as SF
@@ -4864,6 +4865,50 @@ def items(self) -> Iterable:
"""This is an alias of ``iteritems``."""
return self.iteritems()

    def tail(self, n=5):
        """
        Return the last `n` rows.

        This function returns last `n` rows from the object based on
        position. It is useful for quickly verifying data, for example,
        after sorting or appending rows.

        For negative values of `n`, this function returns all rows except
        the first `n` rows, equivalent to ``df[n:]``.

        Parameters
        ----------
        n : int, default 5
            Number of rows to select.

        Returns
        -------
        type of caller
            The last `n` rows of the caller object.

        See Also
        --------
        DataFrame.head : The first `n` rows of the caller object.

        Examples
        --------
        >>> kser = ks.Series([1, 2, 3, 4, 5])
        >>> kser
        0    1
        1    2
        2    3
        3    4
        4    5
        Name: 0, dtype: int64

        >>> kser.tail(3)  # doctest: +SKIP
        2    3
        3    4
        4    5
        Name: 0, dtype: int64
        """
        return first_series(self.to_frame().tail(n=n))

    def _cum(self, func, skipna, part_cols=()):
        # This is used for cummin, cummax, cumsum, etc.

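`Series.tail` simply reuses the frame path: the series is promoted to a single-column DataFrame, tailed, and unwrapped with `first_series`. A quick usage sketch (assuming a Spark 3.0+ session):
```python
>>> import databricks.koalas as ks
>>> ks.Series(range(1000), name="x").tail(3)  # doctest: +SKIP
997    997
998    998
999    999
Name: x, dtype: int64
```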
14 changes: 14 additions & 0 deletions databricks/koalas/tests/test_dataframe.py
@@ -3729,3 +3729,17 @@ def test_iteritems(self):

        for p_items, k_items in zip(pdf.iteritems(), kdf.iteritems()):
            self.assert_eq(repr(p_items), repr(k_items))

    def test_tail(self):
        if LooseVersion(pyspark.__version__) >= LooseVersion("3.0"):
            pdf = pd.DataFrame(range(1000))
            kdf = ks.from_pandas(pdf)

            self.assert_eq(pdf.tail(), kdf.tail(), almost=True)
            self.assert_eq(pdf.tail(10), kdf.tail(10), almost=True)
            self.assert_eq(pdf.tail(-990), kdf.tail(-990), almost=True)
            self.assert_eq(pdf.tail(0), kdf.tail(0), almost=True)
            self.assert_eq(pdf.tail(-1001), kdf.tail(-1001), almost=True)
            self.assert_eq(pdf.tail(1001), kdf.tail(1001), almost=True)
            with self.assertRaisesRegex(TypeError, "bad operand type for unary -: 'str'"):
                kdf.tail("10")
14 changes: 14 additions & 0 deletions databricks/koalas/tests/test_series.py
@@ -1806,3 +1806,17 @@ def test_iteritems(self):

        for p_items, k_items in zip(pser.iteritems(), kser.iteritems()):
            self.assert_eq(repr(p_items), repr(k_items))

    def test_tail(self):
        if LooseVersion(pyspark.__version__) >= LooseVersion("3.0"):
            pser = pd.Series(range(1000), name="Koalas")
            kser = ks.from_pandas(pser)

            self.assert_eq(pser.tail(), kser.tail())
            self.assert_eq(pser.tail(10), kser.tail(10))
            self.assert_eq(pser.tail(-990), kser.tail(-990))
            self.assert_eq(pser.tail(0), kser.tail(0))
            self.assert_eq(pser.tail(1001), kser.tail(1001))
            self.assert_eq(pser.tail(-1001), kser.tail(-1001))
            with self.assertRaisesRegex(TypeError, "bad operand type for unary -: 'str'"):
                kser.tail("10")
1 change: 1 addition & 0 deletions docs/source/reference/frame.rst
@@ -63,6 +63,7 @@ Indexing, iteration
   DataFrame.iterrows
   DataFrame.keys
   DataFrame.pop
   DataFrame.tail
   DataFrame.xs
   DataFrame.get
   DataFrame.where
1 change: 1 addition & 0 deletions docs/source/reference/series.rst
@@ -170,6 +170,7 @@ Reindexing / Selection / Label manipulation
   Series.reset_index
   Series.sample
   Series.take
   Series.tail
   Series.where
   Series.mask
   Series.truncate
