dask · jrbourbeau · May 16, 2023 · May 15, 2023 · May 15, 2023 · May 16, 2023
diff --git a/dask/dataframe/io/parquet/arrow.py b/dask/dataframe/io/parquet/arrow.py
@@ -1256,7 +1256,9 @@ def _create_dd_meta(cls, dataset_info):
             # Make sure all categories are set to "unknown".
             # Cannot include index names in the `cols` argument.
             meta = clear_known_categories(
-                meta, cols=[c for c in categories if c not in meta.index.names]
+                meta,
+                cols=[c for c in categories if c not in meta.index.names],
+                dtype_backend=dtype_backend,
             )
 
         if partition_obj:

diff --git a/dask/dataframe/io/tests/test_parquet.py b/dask/dataframe/io/tests/test_parquet.py
@@ -4904,3 +4904,15 @@ def test_read_parquet_preserve_categorical_column_dtype(tmp_path):
         index=[0, 0],
     )
     assert_eq(ddf, expected)
+
+
+@PYARROW_MARK
+@pytest.mark.skipif(not PANDAS_GT_150, reason="Requires pd.ArrowDtype")
+def test_dtype_backend_categoricals(tmp_path):
+    df = pd.DataFrame({"a": pd.Series(["x", "y"], dtype="category"), "b": [1, 2]})
+    outdir = tmp_path / "out.parquet"
+    df.to_parquet(outdir, engine="pyarrow")
+    ddf = dd.read_parquet(outdir, engine="pyarrow", dtype_backend="pyarrow")
+    pdf = pd.read_parquet(outdir, engine="pyarrow", dtype_backend="pyarrow")
+    # Compare without sorting (sort_results=False) to work around a pandas bug
+    assert_eq(ddf, pdf, sort_results=False)
diff --git a/dask/dataframe/utils.py b/dask/dataframe/utils.py
@@ -261,7 +261,7 @@ def strip_unknown_categories(x, just_drop_unknown=False):
     return x
 
 
-def clear_known_categories(x, cols=None, index=True):
+def clear_known_categories(x, cols=None, index=True, dtype_backend=None):
     """Set categories to be unknown.
 
     Parameters
@@ -273,7 +273,15 @@ def clear_known_categories(x, cols=None, index=True):
     index : bool, optional
         If True and x is a Series or DataFrame, set the clear known categories
         in the index as well.
+    dtype_backend : string, optional
+        If ``"pyarrow"``, ``x`` is returned unchanged: categoricals are then
+        PyArrow dictionary types, which lack the categorical accessor.
     """
+    if dtype_backend == "pyarrow":
+        # With the PyArrow backend, categoricals are currently dictionary types
+        # and the categorical accessor is not yet available; leave x untouched
+        return x
+
     if isinstance(x, (pd.Series, pd.DataFrame)):
         x = x.copy()
         if isinstance(x, pd.DataFrame):