Patch summary: adds a whatsnew entry for the merge fix tracked as issue 22501
and two regression tests that merge frames indexed by equal, unordered
CategoricalDtype values. This text sits before the first "diff --git" header;
patch tools are expected to skip such leading text (confirm for your tool).

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index 49b2349851479..115ef0b343b9c 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -358,6 +358,7 @@ Reshaping
 - :func:`to_records` now accepts dtypes to its `column_dtypes` parameter (:issue:`24895`)
 - Bug in :func:`concat` where order of ``OrderedDict`` (and ``dict`` in Python 3.6+) is not respected, when passed in as ``objs`` argument (:issue:`21510`)
 - Bug in :func:`concat` where the resulting ``freq`` of two :class:`DatetimeIndex` with the same ``freq`` would be dropped (:issue:`3232`).
+- Bug in :func:`merge` where merging with equivalent Categorical dtypes was raising an error (:issue:`22501`)
 
 Sparse
 ^^^^^^
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index 57058f48d3351..8d8f8a723c97a 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -1666,3 +1666,68 @@ def test_merge_suffix_none_error(col1, col2, suffixes):
     msg = "iterable"
     with pytest.raises(TypeError, match=msg):
         pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes)
+
+
+@pytest.mark.parametrize("cat_dtype", ["one", "two"])
+@pytest.mark.parametrize("reverse", [True, False])
+def test_merge_equal_cat_dtypes(cat_dtype, reverse):
+    # see gh-22501
+    cat_dtypes = {
+        "one": CategoricalDtype(categories=["a", "b", "c"], ordered=False),
+        "two": CategoricalDtype(categories=["a", "b", "c"], ordered=False),
+    }
+
+    df1 = DataFrame({
+        "foo": Series(["a", "b", "c"]).astype(cat_dtypes["one"]),
+        "left": [1, 2, 3],
+    }).set_index("foo")
+
+    data_foo = ["a", "b", "c"]
+    data_right = [1, 2, 3]
+
+    if reverse:
+        data_foo.reverse()
+        data_right.reverse()
+
+    df2 = DataFrame({
+        "foo": Series(data_foo).astype(cat_dtypes[cat_dtype]),
+        "right": data_right
+    }).set_index("foo")
+
+    result = df1.merge(df2, left_index=True, right_index=True)
+
+    expected = DataFrame({
+        "left": [1, 2, 3],
+        "right": [1, 2, 3],
+        "foo": Series(["a", "b", "c"]).astype(cat_dtypes["one"]),
+    }).set_index("foo")
+
+    # Categorical is unordered, so don't check ordering.
+    tm.assert_frame_equal(result, expected, check_categorical=False)
+
+
+def test_merge_equal_cat_dtypes2():
+    # see gh-22501
+    cat_dtype = CategoricalDtype(categories=["a", "b", "c"], ordered=False)
+
+    # Test Data
+    df1 = DataFrame({
+        "foo": Series(["a", "b"]).astype(cat_dtype),
+        "left": [1, 2],
+    }).set_index("foo")
+
+    df2 = DataFrame({
+        "foo": Series(["a", "b", "c"]).astype(cat_dtype),
+        "right": [3, 2, 1],
+    }).set_index("foo")
+
+    result = df1.merge(df2, left_index=True, right_index=True)
+
+    expected = DataFrame({
+        "left": [1, 2],
+        "right": [3, 2],
+        "foo": Series(["a", "b"]).astype(cat_dtype),
+    }).set_index("foo")
+
+    # Categorical is unordered, so don't check ordering.
+    tm.assert_frame_equal(result, expected, check_categorical=False)