Implementing Series.tail() & DataFrame.tail() (#1632)
Since Spark 3.0 supports `tail()`, this PR proposes `Series.tail()` and `DataFrame.tail()` for Koalas based on it.


- Series
```python
>>> kser = ks.Series([1, 2, 3, 4, 5])
>>> kser
0    1
1    2
2    3
3    4
4    5
Name: 0, dtype: int64

>>> kser.tail(3)
2    3
3    4
4    5
Name: 0, dtype: int64
```

- DataFrame
```python
>>> df = ks.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
>>> df
      animal
0  alligator
1        bee
2     falcon
3       lion
4     monkey
5     parrot
6      shark
7      whale
8      zebra

>>> df.tail()
   animal
4  monkey
5  parrot
6   shark
7   whale
8   zebra

>>> df.tail(3)
  animal
6  shark
7  whale
8  zebra

>>> df.tail(-3)
   animal
3    lion
4  monkey
5  parrot
6   shark
7   whale
8   zebra
```
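
For context, the implementation delegates to PySpark 3.0's `DataFrame.tail(num)`, which collects the last `num` rows to the driver as a list of `Row` objects. A minimal sketch of that building block (assuming a running Spark 3.0+ session):
```python
>>> from pyspark.sql import SparkSession
>>> spark = SparkSession.builder.getOrCreate()
>>> spark.range(10).tail(3)  # the last rows come back to the driver as Rows
[Row(id=7), Row(id=8), Row(id=9)]
```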

Resolves #343
itholic committed Jul 7, 2020
1 parent 3c18bb4 commit ca3d277
Showing 8 changed files with 160 additions and 2 deletions.
85 changes: 85 additions & 0 deletions databricks/koalas/frame.py
@@ -70,6 +70,7 @@
    scol_for,
    validate_axis,
    verify_temp_column_name,
    default_session,
)
from databricks.koalas.generic import Frame
from databricks.koalas.internal import (
@@ -9742,6 +9743,90 @@ def calculate_columns_axis(*cols):
        )
        return first_series(DataFrame(internal))

    def tail(self, n=5):
        """
        Return the last `n` rows.

        This function returns last `n` rows from the object based on
        position. It is useful for quickly verifying data, for example,
        after sorting or appending rows.

        For negative values of `n`, this function returns all rows except
        the first `n` rows, equivalent to ``df[n:]``.

        Parameters
        ----------
        n : int, default 5
            Number of rows to select.

        Returns
        -------
        type of caller
            The last `n` rows of the caller object.

        See Also
        --------
        DataFrame.head : The first `n` rows of the caller object.

        Examples
        --------
        >>> df = ks.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
        ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
        >>> df
              animal
        0  alligator
        1        bee
        2     falcon
        3       lion
        4     monkey
        5     parrot
        6      shark
        7      whale
        8      zebra

        Viewing the last 5 lines

        >>> df.tail()  # doctest: +SKIP
           animal
        4  monkey
        5  parrot
        6   shark
        7   whale
        8   zebra

        Viewing the last `n` lines (three in this case)

        >>> df.tail(3)  # doctest: +SKIP
          animal
        6  shark
        7  whale
        8  zebra

        For negative values of `n`

        >>> df.tail(-3)  # doctest: +SKIP
           animal
        3    lion
        4  monkey
        5  parrot
        6   shark
        7   whale
        8   zebra
        """
        if LooseVersion(pyspark.__version__) < LooseVersion("3.0"):
            raise RuntimeError("tail can only be used with PySpark >= 3.0")
        if not isinstance(n, int):
            raise TypeError("bad operand type for unary -: '{}'".format(type(n).__name__))
        if n < 0:
            # Negative n keeps all rows but the first |n|, mirroring pandas.
            n = len(self) + n
        if n <= 0:
            # Nothing to return; produce an empty frame via an always-false filter.
            return ks.DataFrame(self._internal.with_filter(F.lit(False)))
        sdf = self._internal.spark_frame
        # Collect the last n rows to the driver, then rebuild a Spark DataFrame.
        rows = sdf.tail(n)
        new_sdf = default_session().createDataFrame(rows, sdf.schema)

        return DataFrame(self._internal.with_new_sdf(new_sdf))

    def _to_internal_pandas(self):
        """
        Return a pandas DataFrame directly from _internal to avoid overhead of copy.
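The negative-`n` branch above follows pandas semantics: `tail(-3)` keeps all but the first 3 rows. A standalone sketch of the row-count arithmetic (a hypothetical helper for illustration, not part of this commit):
```python
def resolve_tail_count(n: int, length: int) -> int:
    """Number of trailing rows tail(n) should keep, pandas-style."""
    if n < 0:
        n = length + n  # tail(-3) on 9 rows -> keep the last 6
    return max(n, 0)    # non-positive counts yield an empty result

assert resolve_tail_count(5, 9) == 5
assert resolve_tail_count(-3, 9) == 6
assert resolve_tail_count(-1001, 9) == 0  # matches the F.lit(False) filter path
```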
1 change: 0 additions & 1 deletion databricks/koalas/missing/frame.py
@@ -68,7 +68,6 @@ class _MissingPandasLikeDataFrame(object):
    slice_shift = _unsupported_function("slice_shift")
    swapaxes = _unsupported_function("swapaxes")
    swaplevel = _unsupported_function("swaplevel")
    tail = _unsupported_function("tail")
    to_feather = _unsupported_function("to_feather")
    to_gbq = _unsupported_function("to_gbq")
    to_hdf = _unsupported_function("to_hdf")
1 change: 0 additions & 1 deletion databricks/koalas/missing/series.py
@@ -64,7 +64,6 @@ class MissingPandasLikeSeries(object):
    slice_shift = _unsupported_function("slice_shift")
    swapaxes = _unsupported_function("swapaxes")
    swaplevel = _unsupported_function("swaplevel")
    tail = _unsupported_function("tail")
    to_hdf = _unsupported_function("to_hdf")
    to_period = _unsupported_function("to_period")
    to_sql = _unsupported_function("to_sql")
45 changes: 45 additions & 0 deletions databricks/koalas/series.py
@@ -72,6 +72,7 @@
    validate_axis,
    validate_bool_kwarg,
    verify_temp_column_name,
    default_session,
)
from databricks.koalas.datetimes import DatetimeMethods
from databricks.koalas.spark import functions as SF
@@ -4864,6 +4865,50 @@ def items(self) -> Iterable:
"""This is an alias of ``iteritems``."""
return self.iteritems()

    def tail(self, n=5):
        """
        Return the last `n` rows.

        This function returns last `n` rows from the object based on
        position. It is useful for quickly verifying data, for example,
        after sorting or appending rows.

        For negative values of `n`, this function returns all rows except
        the first `n` rows, equivalent to ``df[n:]``.

        Parameters
        ----------
        n : int, default 5
            Number of rows to select.

        Returns
        -------
        type of caller
            The last `n` rows of the caller object.

        See Also
        --------
        DataFrame.head : The first `n` rows of the caller object.

        Examples
        --------
        >>> kser = ks.Series([1, 2, 3, 4, 5])
        >>> kser
        0    1
        1    2
        2    3
        3    4
        4    5
        Name: 0, dtype: int64

        >>> kser.tail(3)  # doctest: +SKIP
        2    3
        3    4
        4    5
        Name: 0, dtype: int64
        """
        return first_series(self.to_frame().tail(n=n))

    def _cum(self, func, skipna, part_cols=()):
        # This is used for cummin, cummax, cumsum, etc.

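`Series.tail` simply reuses the frame path: the series is promoted to a single-column DataFrame, tailed, and unwrapped with `first_series`. A quick usage sketch (assuming a Spark 3.0+ session):
```python
>>> import databricks.koalas as ks
>>> ks.Series(range(1000), name="x").tail(3)  # doctest: +SKIP
997    997
998    998
999    999
Name: x, dtype: int64
```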
14 changes: 14 additions & 0 deletions databricks/koalas/tests/test_dataframe.py
@@ -3729,3 +3729,17 @@ def test_iteritems(self):

        for p_items, k_items in zip(pdf.iteritems(), kdf.iteritems()):
            self.assert_eq(repr(p_items), repr(k_items))

    def test_tail(self):
        if LooseVersion(pyspark.__version__) >= LooseVersion("3.0"):
            pdf = pd.DataFrame(range(1000))
            kdf = ks.from_pandas(pdf)

            self.assert_eq(pdf.tail(), kdf.tail(), almost=True)
            self.assert_eq(pdf.tail(10), kdf.tail(10), almost=True)
            self.assert_eq(pdf.tail(-990), kdf.tail(-990), almost=True)
            self.assert_eq(pdf.tail(0), kdf.tail(0), almost=True)
            self.assert_eq(pdf.tail(-1001), kdf.tail(-1001), almost=True)
            self.assert_eq(pdf.tail(1001), kdf.tail(1001), almost=True)
            with self.assertRaisesRegex(TypeError, "bad operand type for unary -: 'str'"):
                kdf.tail("10")
14 changes: 14 additions & 0 deletions databricks/koalas/tests/test_series.py
@@ -1806,3 +1806,17 @@ def test_iteritems(self):

        for p_items, k_items in zip(pser.iteritems(), kser.iteritems()):
            self.assert_eq(repr(p_items), repr(k_items))

    def test_tail(self):
        if LooseVersion(pyspark.__version__) >= LooseVersion("3.0"):
            pser = pd.Series(range(1000), name="Koalas")
            kser = ks.from_pandas(pser)

            self.assert_eq(pser.tail(), kser.tail())
            self.assert_eq(pser.tail(10), kser.tail(10))
            self.assert_eq(pser.tail(-990), kser.tail(-990))
            self.assert_eq(pser.tail(0), kser.tail(0))
            self.assert_eq(pser.tail(1001), kser.tail(1001))
            self.assert_eq(pser.tail(-1001), kser.tail(-1001))
            with self.assertRaisesRegex(TypeError, "bad operand type for unary -: 'str'"):
                kser.tail("10")
1 change: 1 addition & 0 deletions docs/source/reference/frame.rst
@@ -63,6 +63,7 @@ Indexing, iteration
   DataFrame.iterrows
   DataFrame.keys
   DataFrame.pop
   DataFrame.tail
   DataFrame.xs
   DataFrame.get
   DataFrame.where
1 change: 1 addition & 0 deletions docs/source/reference/series.rst
@@ -170,6 +170,7 @@ Reindexing / Selection / Label manipulation
   Series.reset_index
   Series.sample
   Series.take
   Series.tail
   Series.where
   Series.mask
   Series.truncate
