API: Make describe changes backwards compatible (pandas-dev#34798)

fangchenli · Jul 16, 2020 · cf1b141
1 parent 8b25613
commit cf1b141
Show file tree

Hide file tree

Showing 4 changed files with 153 additions and 17 deletions.
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -280,6 +280,7 @@ Other enhancements
 - Added :meth:`DataFrame.value_counts` (:issue:`5377`)
 - Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations.
 - Added a :func:`pandas.api.indexers.VariableOffsetWindowIndexer` class to support ``rolling`` operations with non-fixed offsets (:issue:`34994`)
+- :meth:`~DataFrame.describe` now includes a ``datetime_is_numeric`` keyword to control how datetime columns are summarized (:issue:`30164`, :issue:`34798`)
 - :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`)
 - :meth:`Styler.highlight_null` now accepts ``subset`` argument (:issue:`31345`)
 - When writing directly to a sqlite connection :func:`to_sql` now supports the ``multi`` method (:issue:`29921`)
@@ -675,15 +676,6 @@ apply and applymap on ``DataFrame`` evaluates first row/column only once
 
     df.apply(func, axis=1)
 
-.. _whatsnew_110.api.other:
-
-Other API changes
-^^^^^^^^^^^^^^^^^
-
-- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last``
-  will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`)
-
-
 Increased minimum versions for dependencies
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -9711,7 +9711,11 @@ def abs(self: FrameOrSeries) -> FrameOrSeries:
         return np.abs(self)
 
     def describe(
-        self: FrameOrSeries, percentiles=None, include=None, exclude=None
+        self: FrameOrSeries,
+        percentiles=None,
+        include=None,
+        exclude=None,
+        datetime_is_numeric=False,
     ) -> FrameOrSeries:
         """
         Generate descriptive statistics.
@@ -9757,6 +9761,12 @@ def describe(
               ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To
               exclude pandas categorical columns, use ``'category'``
             - None (default) : The result will exclude nothing.
+        datetime_is_numeric : bool, default False
+            Whether to treat datetime dtypes as numeric. This determines which
+            statistics are calculated for datetime data. For DataFrame input, it
+            also controls whether datetime columns are included by default.
+
+            .. versionadded:: 1.1.0
 
         Returns
         -------
@@ -9834,7 +9844,7 @@ def describe(
         ...   np.datetime64("2010-01-01"),
         ...   np.datetime64("2010-01-01")
         ... ])
-        >>> s.describe()
+        >>> s.describe(datetime_is_numeric=True)
         count                      3
         mean     2006-09-01 08:00:00
         min      2000-01-01 00:00:00
@@ -9992,8 +10002,37 @@ def describe_categorical_1d(data):
             dtype = None
             if result[1] > 0:
                 top, freq = objcounts.index[0], objcounts.iloc[0]
-                names += ["top", "freq"]
-                result += [top, freq]
+                if is_datetime64_any_dtype(data.dtype):
+                    if self.ndim == 1:
+                        stacklevel = 4
+                    else:
+                        stacklevel = 5
+                    warnings.warn(
+                        "Treating datetime data as categorical rather than numeric in "
+                        "`.describe` is deprecated and will be removed in a future "
+                        "version of pandas. Specify `datetime_is_numeric=True` to "
+                        "silence this warning and adopt the future behavior now.",
+                        FutureWarning,
+                        stacklevel=stacklevel,
+                    )
+                    tz = data.dt.tz
+                    asint = data.dropna().values.view("i8")
+                    top = Timestamp(top)
+                    if top.tzinfo is not None and tz is not None:
+                        # top is already tz-aware: convert it; tz_localize would raise
+                        top = top.tz_convert(tz)
+                    else:
+                        top = top.tz_localize(tz)
+                    names += ["top", "freq", "first", "last"]
+                    result += [
+                        top,
+                        freq,
+                        Timestamp(asint.min(), tz=tz),
+                        Timestamp(asint.max(), tz=tz),
+                    ]
+                else:
+                    names += ["top", "freq"]
+                    result += [top, freq]
 
             # If the DataFrame is empty, set 'top' and 'freq' to None
             # to maintain output shape consistency
@@ -10019,7 +10058,7 @@ def describe_1d(data):
                 return describe_categorical_1d(data)
             elif is_numeric_dtype(data):
                 return describe_numeric_1d(data)
-            elif is_datetime64_any_dtype(data.dtype):
+            elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
                 return describe_timestamp_1d(data)
             elif is_timedelta64_dtype(data.dtype):
                 return describe_numeric_1d(data)
@@ -10030,7 +10069,10 @@ def describe_1d(data):
             return describe_1d(self)
         elif (include is None) and (exclude is None):
             # when some numerics are found, keep only numerics
-            data = self.select_dtypes(include=[np.number])
+            default_include = [np.number]
+            if datetime_is_numeric:
+                default_include.append("datetime")
+            data = self.select_dtypes(include=default_include)
             if len(data.columns) == 0:
                 data = self
         elif include == "all":

diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py
@@ -267,7 +267,69 @@ def test_describe_tz_values(self, tz_naive_fixture):
             },
             index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
         )
-        result = df.describe(include="all")
+        result = df.describe(include="all", datetime_is_numeric=True)
+        tm.assert_frame_equal(result, expected)
+
+    def test_datetime_is_numeric_includes_datetime(self):
+        df = pd.DataFrame({"a": pd.date_range("2012", periods=3), "b": [1, 2, 3]})
+        result = df.describe(datetime_is_numeric=True)
+        expected = pd.DataFrame(
+            {
+                "a": [
+                    3,
+                    pd.Timestamp("2012-01-02"),
+                    pd.Timestamp("2012-01-01"),
+                    pd.Timestamp("2012-01-01T12:00:00"),
+                    pd.Timestamp("2012-01-02"),
+                    pd.Timestamp("2012-01-02T12:00:00"),
+                    pd.Timestamp("2012-01-03"),
+                    np.nan,
+                ],
+                "b": [3, 2, 1, 1.5, 2, 2.5, 3, 1],
+            },
+            index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_describe_tz_values2(self):
+        tz = "CET"
+        s1 = Series(range(5))
+        start = Timestamp(2018, 1, 1)
+        end = Timestamp(2018, 1, 5)
+        s2 = Series(date_range(start, end, tz=tz))
+        df = pd.DataFrame({"s1": s1, "s2": s2})
+
+        s1_ = s1.describe()
+        s2_ = pd.Series(
+            [
+                5,
+                5,
+                s2.value_counts().index[0],
+                1,
+                start.tz_localize(tz),
+                end.tz_localize(tz),
+            ],
+            index=["count", "unique", "top", "freq", "first", "last"],
+        )
+        idx = [
+            "count",
+            "unique",
+            "top",
+            "freq",
+            "first",
+            "last",
+            "mean",
+            "std",
+            "min",
+            "25%",
+            "50%",
+            "75%",
+            "max",
+        ]
+        expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).loc[idx]
+
+        with tm.assert_produces_warning(FutureWarning):
+            result = df.describe(include="all")
         tm.assert_frame_equal(result, expected)
 
     def test_describe_percentiles_integer_idx(self):

diff --git a/pandas/tests/series/methods/test_describe.py b/pandas/tests/series/methods/test_describe.py
@@ -83,7 +83,7 @@ def test_describe_with_tz(self, tz_naive_fixture):
         start = Timestamp(2018, 1, 1)
         end = Timestamp(2018, 1, 5)
         s = Series(date_range(start, end, tz=tz), name=name)
-        result = s.describe()
+        result = s.describe(datetime_is_numeric=True)
         expected = Series(
             [
                 5,
@@ -98,3 +98,43 @@ def test_describe_with_tz(self, tz_naive_fixture):
             index=["count", "mean", "min", "25%", "50%", "75%", "max"],
         )
         tm.assert_series_equal(result, expected)
+
+    def test_describe_with_tz_warns(self):
+        name = tz = "CET"
+        start = Timestamp(2018, 1, 1)
+        end = Timestamp(2018, 1, 5)
+        s = Series(date_range(start, end, tz=tz), name=name)
+
+        with tm.assert_produces_warning(FutureWarning):
+            result = s.describe()
+
+        expected = Series(
+            [
+                5,
+                5,
+                s.value_counts().index[0],
+                1,
+                start.tz_localize(tz),
+                end.tz_localize(tz),
+            ],
+            name=name,
+            index=["count", "unique", "top", "freq", "first", "last"],
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_datetime_is_numeric_includes_datetime(self):
+        s = Series(date_range("2012", periods=3))
+        result = s.describe(datetime_is_numeric=True)
+        expected = Series(
+            [
+                3,
+                Timestamp("2012-01-02"),
+                Timestamp("2012-01-01"),
+                Timestamp("2012-01-01T12:00:00"),
+                Timestamp("2012-01-02"),
+                Timestamp("2012-01-02T12:00:00"),
+                Timestamp("2012-01-03"),
+            ],
+            index=["count", "mean", "min", "25%", "50%", "75%", "max"],
+        )
+        tm.assert_series_equal(result, expected)