Skip to content

Commit

Permalink
API: Make describe changes backwards compatible (pandas-dev#34798)
Browse files Browse the repository at this point in the history
  • Loading branch information
TomAugspurger authored and fangchenli committed Jul 16, 2020
1 parent 8b25613 commit cf1b141
Show file tree
Hide file tree
Showing 4 changed files with 153 additions and 17 deletions.
10 changes: 1 addition & 9 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@ Other enhancements
- Added :meth:`DataFrame.value_counts` (:issue:`5377`)
- Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations.
- Added a :func:`pandas.api.indexers.VariableOffsetWindowIndexer` class to support ``rolling`` operations with non-fixed offsets (:issue:`34994`)
- :meth:`~DataFrame.describe` now includes a ``datetime_is_numeric`` keyword to control how datetime columns are summarized (:issue:`30164`, :issue:`34798`)
- :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`)
- :meth:`Styler.highlight_null` now accepts ``subset`` argument (:issue:`31345`)
- When writing directly to a sqlite connection :func:`to_sql` now supports the ``multi`` method (:issue:`29921`)
Expand Down Expand Up @@ -675,15 +676,6 @@ apply and applymap on ``DataFrame`` evaluates first row/column only once
df.apply(func, axis=1)
.. _whatsnew_110.api.other:

Other API changes
^^^^^^^^^^^^^^^^^

- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes; the statistics ``first`` and ``last``
are now reported as ``min`` and ``max`` to match numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`)


Increased minimum versions for dependencies
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Expand Down
54 changes: 48 additions & 6 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -9711,7 +9711,11 @@ def abs(self: FrameOrSeries) -> FrameOrSeries:
return np.abs(self)

def describe(
self: FrameOrSeries, percentiles=None, include=None, exclude=None
self: FrameOrSeries,
percentiles=None,
include=None,
exclude=None,
datetime_is_numeric=False,
) -> FrameOrSeries:
"""
Generate descriptive statistics.
Expand Down Expand Up @@ -9757,6 +9761,12 @@ def describe(
``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To
exclude pandas categorical columns, use ``'category'``
- None (default) : The result will exclude nothing.
datetime_is_numeric : bool, default False
Whether to treat datetime dtypes as numeric. This affects statistics
calculated for the column. For DataFrame input, this also
controls whether datetime columns are included by default.
.. versionadded:: 1.1.0
Returns
-------
Expand Down Expand Up @@ -9834,7 +9844,7 @@ def describe(
... np.datetime64("2010-01-01"),
... np.datetime64("2010-01-01")
... ])
>>> s.describe()
>>> s.describe(datetime_is_numeric=True)
count 3
mean 2006-09-01 08:00:00
min 2000-01-01 00:00:00
Expand Down Expand Up @@ -9992,8 +10002,37 @@ def describe_categorical_1d(data):
dtype = None
if result[1] > 0:
top, freq = objcounts.index[0], objcounts.iloc[0]
names += ["top", "freq"]
result += [top, freq]
if is_datetime64_any_dtype(data.dtype):
if self.ndim == 1:
stacklevel = 4
else:
stacklevel = 5
warnings.warn(
"Treating datetime data as categorical rather than numeric in "
"`.describe` is deprecated and will be removed in a future "
"version of pandas. Specify `datetime_is_numeric=True` to "
"silence this warning and adopt the future behavior now.",
FutureWarning,
stacklevel=stacklevel,
)
tz = data.dt.tz
asint = data.dropna().values.view("i8")
top = Timestamp(top)
if top.tzinfo is not None and tz is not None:
# Don't tz_localize(None) if key is already tz-aware
top = top.tz_convert(tz)
else:
top = top.tz_localize(tz)
names += ["top", "freq", "first", "last"]
result += [
top,
freq,
Timestamp(asint.min(), tz=tz),
Timestamp(asint.max(), tz=tz),
]
else:
names += ["top", "freq"]
result += [top, freq]

# If the DataFrame is empty, set 'top' and 'freq' to None
# to maintain output shape consistency
Expand All @@ -10019,7 +10058,7 @@ def describe_1d(data):
return describe_categorical_1d(data)
elif is_numeric_dtype(data):
return describe_numeric_1d(data)
elif is_datetime64_any_dtype(data.dtype):
elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
return describe_timestamp_1d(data)
elif is_timedelta64_dtype(data.dtype):
return describe_numeric_1d(data)
Expand All @@ -10030,7 +10069,10 @@ def describe_1d(data):
return describe_1d(self)
elif (include is None) and (exclude is None):
# by default keep only numeric columns (plus datetime columns when
# datetime_is_numeric is True); if none are found, describe all columns
data = self.select_dtypes(include=[np.number])
default_include = [np.number]
if datetime_is_numeric:
default_include.append("datetime")
data = self.select_dtypes(include=default_include)
if len(data.columns) == 0:
data = self
elif include == "all":
Expand Down
64 changes: 63 additions & 1 deletion pandas/tests/frame/methods/test_describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,69 @@ def test_describe_tz_values(self, tz_naive_fixture):
},
index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
)
result = df.describe(include="all")
result = df.describe(include="all", datetime_is_numeric=True)
tm.assert_frame_equal(result, expected)

def test_datetime_is_numeric_includes_datetime(self):
# Check that DataFrame.describe(datetime_is_numeric=True), called without
# include/exclude, summarizes the datetime column "a" alongside the
# integer column "b".
df = pd.DataFrame({"a": pd.date_range("2012", periods=3), "b": [1, 2, 3]})
result = df.describe(datetime_is_numeric=True)
# Values in each column follow the order of the index given below:
# count, mean, min, 25%, 50%, 75%, max, std.
expected = pd.DataFrame(
{
"a": [
3,
pd.Timestamp("2012-01-02"),
pd.Timestamp("2012-01-01"),
pd.Timestamp("2012-01-01T12:00:00"),
pd.Timestamp("2012-01-02"),
pd.Timestamp("2012-01-02T12:00:00"),
pd.Timestamp("2012-01-03"),
np.nan,  # the "std" row is expected to be NaN for the datetime column
],
"b": [3, 2, 1, 1.5, 2, 2.5, 3, 1],
},
index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
)
tm.assert_frame_equal(result, expected)

def test_describe_tz_values2(self):
# Check that DataFrame.describe(include="all") without datetime_is_numeric
# emits a FutureWarning and summarizes the tz-aware datetime column with
# count/unique/top/freq/first/last.
tz = "CET"
s1 = Series(range(5))
start = Timestamp(2018, 1, 1)
end = Timestamp(2018, 1, 5)
s2 = Series(date_range(start, end, tz=tz))
df = pd.DataFrame({"s1": s1, "s2": s2})

# Expected per-column summaries: s1 comes from Series.describe(), while
# the summary for the datetime column s2 is built by hand.
s1_ = s1.describe()
s2_ = pd.Series(
[
5,
5,
s2.value_counts().index[0],
1,
start.tz_localize(tz),
end.tz_localize(tz),
],
index=["count", "unique", "top", "freq", "first", "last"],
)
# Row order expected in the combined result.
idx = [
"count",
"unique",
"top",
"freq",
"first",
"last",
"mean",
"std",
"min",
"25%",
"50%",
"75%",
"max",
]
# Combine both summaries column-wise, then reorder the rows to match idx.
expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).loc[idx]

# The describe call itself must raise the FutureWarning.
with tm.assert_produces_warning(FutureWarning):
result = df.describe(include="all")
tm.assert_frame_equal(result, expected)

def test_describe_percentiles_integer_idx(self):
Expand Down
42 changes: 41 additions & 1 deletion pandas/tests/series/methods/test_describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def test_describe_with_tz(self, tz_naive_fixture):
start = Timestamp(2018, 1, 1)
end = Timestamp(2018, 1, 5)
s = Series(date_range(start, end, tz=tz), name=name)
result = s.describe()
result = s.describe(datetime_is_numeric=True)
expected = Series(
[
5,
Expand All @@ -98,3 +98,43 @@ def test_describe_with_tz(self, tz_naive_fixture):
index=["count", "mean", "min", "25%", "50%", "75%", "max"],
)
tm.assert_series_equal(result, expected)

def test_describe_with_tz_warns(self):
# Check that Series.describe() on tz-aware datetimes, called without
# datetime_is_numeric, emits a FutureWarning and returns
# count/unique/top/freq/first/last.
# The same string is used as both the Series name and the timezone.
name = tz = "CET"
start = Timestamp(2018, 1, 1)
end = Timestamp(2018, 1, 5)
s = Series(date_range(start, end, tz=tz), name=name)

# The describe call itself must raise the FutureWarning.
with tm.assert_produces_warning(FutureWarning):
result = s.describe()

# Values are listed in the order of the index given below.
expected = Series(
[
5,
5,
s.value_counts().index[0],
1,
start.tz_localize(tz),
end.tz_localize(tz),
],
name=name,
index=["count", "unique", "top", "freq", "first", "last"],
)
tm.assert_series_equal(result, expected)

def test_datetime_is_numeric_includes_datetime(self):
# Check that Series.describe(datetime_is_numeric=True) on a datetime
# Series returns the count plus Timestamp-valued mean, min, quartiles
# and max; the expected index has no "std" row.
s = Series(date_range("2012", periods=3))
result = s.describe(datetime_is_numeric=True)
# Values are listed in the order of the index given below.
expected = Series(
[
3,
Timestamp("2012-01-02"),
Timestamp("2012-01-01"),
Timestamp("2012-01-01T12:00:00"),
Timestamp("2012-01-02"),
Timestamp("2012-01-02T12:00:00"),
Timestamp("2012-01-03"),
],
index=["count", "mean", "min", "25%", "50%", "75%", "max"],
)
tm.assert_series_equal(result, expected)

0 comments on commit cf1b141

Please sign in to comment.