diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 2d30e00142846e..7d11b9f17255b6 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -7,8 +7,3 @@ This is a major release from 0.21.1 and includes a number of API changes, deprecations, new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. - -.. _whatsnew_0220.api_breaking: - -Backwards incompatible API changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 40e1e2011479c5..01983023fa0cf6 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -119,6 +119,59 @@ Current Behavior s.rank(na_option='top') + +.. _whatsnew_0230.enhancements.assign_dependent: + +``.assign()`` accepts dependent arguments +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :func:`DataFrame.assign()` now accepts dependent kwargs on Python 3.6 and later. In earlier versions this raised a ``KeyError`` exception. (:issue:`14207`) + +Specifically, a new column defined inside ``assign`` may be referenced later in the same ``assign`` call if a callable is used. For example: + +.. code-block:: ipython + + In [3]: df = pd.DataFrame({'A': [1, 2, 3]}) + + In [4]: df.assign(B=df.A, C=lambda x:x['A']+ x['B']) + Out[4]: + A B C + 0 1 1 2 + 1 2 2 4 + 2 3 3 6 + +.. warning:: + + This may subtly change the behavior of your code when you're + using ``assign`` to update an existing column. Previously, callables + referring to other variables being updated would get the "old" values: + +.. code-block:: ipython + + In [2]: df = pd.DataFrame({"A": [1, 2, 3]}) + + In [3]: df.assign(A=lambda df: df.A + 1, C=lambda df: df.A * -1) + Out[3]: + A C + 0 2 -1 + 1 3 -2 + 2 4 -3 + +Now, callables will get the "new" value: + +.. 
code-block:: ipython + + + In [4]: df = pd.DataFrame({"A": [1, 2, 3]}) + + In [5]: df.assign(A=df.A+1, C= lambda df: df.A* -1) + Out[5]: + A C + 0 2 -2 + 1 3 -3 + 2 4 -4 + + .. _whatsnew_0230.enhancements.other: Other Enhancements diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 26257f6ecbc37a..61ad0acc958273 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2659,8 +2659,11 @@ def assign(self, **kwargs): \*\*kwargs. For python 3.5 and earlier, since \*\*kwargs is unordered, the columns are inserted in alphabetical order at the end of your DataFrame. Assigning multiple columns within the same ``assign`` - is possible, but you cannot reference other columns created within - the same ``assign`` call. + is possible, but for python 3.5 and earlier, you cannot reference + other columns created within the same ``assign`` call. + For python 3.6 and above it is possible to reference columns created + in an assignment. To this end you have to respect the order of kwargs + and use callables referencing the assigned columns. Examples -------- @@ -2699,19 +2702,21 @@ def assign(self, **kwargs): """ data = self.copy() - # do all calculations first... - results = OrderedDict() - for k, v in kwargs.items(): - results[k] = com._apply_if_callable(v, data) - - # preserve order for 3.6 and later, but sort by key for 3.5 and earlier + # for 3.6 preserve order of kwargs if PY36: - results = results.items() + for k, v in kwargs.items(): + data[k] = com._apply_if_callable(v, data) else: + # for 3.5 or earlier: do all calculations first... + results = OrderedDict() + for k, v in kwargs.items(): + results[k] = com._apply_if_callable(v, data) + + # sort by key for 3.5 and earlier results = sorted(results.items()) - # ... and then assign - for k, v in results: - data[k] = v + # ... 
and then assign + for k, v in results: + data[k] = v return data def _sanitize_column(self, key, value, broadcast=True): diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 26e2b801f64607..8231f7eaa237ae 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -89,11 +89,35 @@ def test_assign_bad(self): df.assign(lambda x: x.A) with pytest.raises(AttributeError): df.assign(C=df.A, D=df.A + df.C) + + @pytest.mark.skipif(PY36, reason="""Issue #14207: valid for python + 3.6 and above""") + def test_assign_bad_old_version(self): + df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + + # Key C does not exist at definition time of df with pytest.raises(KeyError): - df.assign(C=lambda df: df.A, D=lambda df: df['A'] + df['C']) + df.assign(C=lambda df: df.A, + D=lambda df: df['A'] + df['C']) with pytest.raises(KeyError): df.assign(C=df.A, D=lambda x: x['A'] + x['C']) + @pytest.mark.skipif(not PY36, reason="""Issue #14207: not valid for + python 3.5 and below""") + def test_assign_dependent(self): + df = DataFrame({'A': [1, 2], 'B': [3, 4]}) + + result = df.assign(C=df.A, D=lambda x: x['A'] + x['C']) + expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], + columns=list('ABCD')) + assert_frame_equal(result, expected) + + result = df.assign(C=lambda df: df.A, + D=lambda df: df['A'] + df['C']) + expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], + columns=list('ABCD')) + assert_frame_equal(result, expected) + def test_insert_error_msmgs(self): # GH 7432