Merge pull request #21 from data-apis/polars-arrow-support

Arrow support
data-apis · May 16, 2023 · 3b2da6d · 3b2da6d
2 parents 3b2ef43 + 0c74fcb
commit 3b2da6d
Show file tree

Hide file tree

Showing 7 changed files with 239 additions and 34 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -12,7 +12,7 @@ jobs:
     - name: Set up Python
       uses: actions/setup-python@v2
       with:
-        python-version: 3.8
+        python-version: "3.10"
     - name: Upgrade pip
       run: |
         pip install pip --upgrade
@@ -23,7 +23,8 @@ jobs:
       run: |
         pip install ray git+https://github.com/modin-project/modin
         pip install vaex  # use stable as no nightly builds and long build time
+        pip install --extra-index-url https://pypi.fury.io/arrow-nightlies/ --prefer-binary --pre pyarrow --force-reinstall
         pip install --pre --extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple pandas --ignore-installed --no-deps
     - name: Run tests
       run: |
-        pytest tests/ -v --ci
+        pytest tests/ -vv --ci
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -43,18 +43,13 @@ def pytest_configure(config):
     "test_signatures.py::test_buffer_method[cudf-__dlpack__]",
     "test_signatures.py::test_buffer_method[cudf-__dlpack_device__]",
     # https://github.com/vaexio/vaex/issues/2083
-    # https://github.com/vaexio/vaex/issues/2093
-    # https://github.com/vaexio/vaex/issues/2113
-    "test_from_dataframe.py::test_from_dataframe_roundtrip[modin-vaex]",
     "test_from_dataframe.py::test_from_dataframe_roundtrip[vaex-pandas]",
+    # https://github.com/modin-project/modin/issues/6143
+    # https://github.com/data-apis/dataframe-interchange-tests/pull/21#issuecomment-1495914398
+    "test_from_dataframe.py::test_from_dataframe_roundtrip[pyarrow.Table-vaex]",
+    "test_from_dataframe.py::test_from_dataframe_roundtrip[vaex-pyarrow.Table]",
     # https://github.com/rapidsai/cudf/issues/11389
     "test_column_object.py::test_dtype[cudf]",
-    # Raises RuntimeError, which is technically correct, but the spec will
-    # require TypeError soon.
-    # See https://github.com/data-apis/dataframe-api/pull/74
-    "test_column_object.py::test_describe_categorical[modin]",
-    # https://github.com/vaexio/vaex/issues/2113
-    "test_column_object.py::test_describe_categorical[vaex]",
     # https://github.com/modin-project/modin/issues/4687
     "test_column_object.py::test_null_count[modin]",
     # https://github.com/vaexio/vaex/issues/2121
@@ -68,9 +63,14 @@ def pytest_configure(config):
     "test_column_object.py::test_dtype[vaex]",
     # SEGFAULT
     "test_from_dataframe.py::test_from_dataframe_roundtrip[pandas-vaex]",
-    # modin flakiness
+    # modin flakiness - probably from monkeypatching done in wrappers.py
+    "test_from_dataframe.py::test_from_dataframe_roundtrip[pandas-modin]",
     "test_from_dataframe.py::test_from_dataframe_roundtrip[modin-pandas]",
     "test_from_dataframe.py::test_from_dataframe_roundtrip[modin-modin]",
+    "test_from_dataframe.py::test_from_dataframe_roundtrip[modin-vaex]",
+    "test_from_dataframe.py::test_from_dataframe_roundtrip[vaex-modin]",
+    "test_from_dataframe.py::test_from_dataframe_roundtrip[modin-pyarrow.Table]",
+    "test_from_dataframe.py::test_from_dataframe_roundtrip[pyarrow.Table-modin]",
     "test_meta.py::test_frame_equal[modin]",
 ]
 assert not any(case in ci_xfail_ids for case in ci_skip_ids)  # sanity check

diff --git a/tests/test_column_object.py b/tests/test_column_object.py
@@ -158,8 +158,16 @@ def test_null_count(libinfo: LibraryInfo, data: st.DataObject):
     null_count = col.null_count
     if null_count is not None:
         assert isinstance(null_count, int)
-        if mock_col.nominal_dtype != NominalDtype.UTF8:  # TODO: test string cols
-            assert null_count == sum(np.isnan(mock_col.array))
+        if mock_col.nominal_dtype == NominalDtype.UTF8:  # TODO: test string cols
+            return
+        nullinfo = col.describe_null
+        assert isinstance(nullinfo, tuple) and len(nullinfo) == 2  # sanity check
+        kind, value = nullinfo
+        nan_count = sum(np.isnan(mock_col.array))
+        if kind == 0:  # non-nullable
+            assert null_count in [0, nan_count]  # XXX: should null_count always be 0?
+        else:
+            assert null_count == nan_count
 
 
 @given(data=st.data())

diff --git a/tests/test_dataframe_object.py b/tests/test_dataframe_object.py
@@ -119,11 +119,16 @@ def test_get_chunks(libinfo: LibraryInfo, data: st.DataObject):
     df = data.draw(libinfo.interchange_dataframes(), label="df")
     _n_chunks = df.num_chunks()
     assert isinstance(_n_chunks, int)  # sanity check
-    n_chunks = data.draw(
-        st.none() | st.integers(1, 2).map(lambda n: n * _n_chunks), label="n_chunks"
-    )
-    if n_chunks is None and not data.draw(st.booleans(), label="pass n_chunks"):
-        args = []
+    if _n_chunks == 0:
+        df.get_chunks()
     else:
-        args = [n_chunks]
-    df.get_chunks(*args)
+        assert _n_chunks >= 1  # sanity check
+        n_chunks_strat = st.sampled_from([None, 1])
+        if _n_chunks > 1:
+            n_chunks_strat |= st.integers(1, 2).map(lambda n: n * _n_chunks)
+        n_chunks = data.draw(n_chunks_strat, label="n_chunks")
+        if n_chunks is None and not data.draw(st.booleans(), label="pass n_chunks"):
+            args = []
+        else:
+            args = [n_chunks]
+        df.get_chunks(*args)
diff --git a/tests/test_meta.py b/tests/test_meta.py
@@ -13,7 +13,13 @@
 def test_ci_has_correct_library_params(pytestconfig):
     if not pytestconfig.getoption("--ci"):
         pytest.skip("only intended for --ci runs")
-    assert set(libname_to_libinfo.keys()) == {"pandas", "vaex", "modin"}
+    assert set(libname_to_libinfo.keys()) == {
+        "pandas",
+        "vaex",
+        "modin",
+        "pyarrow.Table",
+        "pyarrow.RecordBatch",
+    }
 
 
 @given(utf8_strings())
@@ -50,3 +56,45 @@ def test_strategy(libinfo: LibraryInfo, func_name: str, data: st.DataObject):
 def test_frame_equal(libinfo: LibraryInfo, data: st.DataObject):
     df = data.draw(libinfo.toplevel_dataframes(), label="df")
     assert libinfo.frame_equal(df, df)
+
+
+def test_pandas_frame_equal_string_object_columns():
+    """Equal strings must compare equal across object and StringDtype columns."""
+    try:
+        import pandas as pd
+        libinfo = libname_to_libinfo["pandas"]
+    except (KeyError, ImportError) as e:
+        pytest.skip(repr(e))  # KeyError has no `.msg`; repr() works for both
+    df1 = pd.DataFrame({"foo": ["bar"]})
+    assert df1["foo"].dtype == object  # sanity check
+    df2 = pd.DataFrame({"foo": pd.Series(["bar"], dtype=pd.StringDtype())})
+    assert libinfo.frame_equal(df1, df2)
+    assert libinfo.frame_equal(df2, df1)
+
+
+@pytest.mark.parametrize("container_name", ["Table", "RecordBatch"])
+def test_pyarrow_frame_equal_string_columns(container_name):
+    """Equal values must compare equal across default and large_string columns."""
+    try:
+        import pyarrow as pa
+        libinfo = libname_to_libinfo[f"pyarrow.{container_name}"]
+    except (KeyError, ImportError) as e:
+        pytest.skip(repr(e))  # KeyError has no `.msg`; repr() works for both
+
+    container_class = getattr(pa, container_name)
+    df1 = container_class.from_pydict(
+        {
+            "a": pa.array(["foo"]),
+            "b": pa.DictionaryArray.from_arrays(pa.array([0]), pa.array(["bar"])),
+        }
+    )
+    df2 = container_class.from_pydict(
+        {
+            "a": pa.array(["foo"], type=pa.large_string()),
+            "b": pa.DictionaryArray.from_arrays(
+                pa.array([0]), pa.array(["bar"], type=pa.large_string())
+            ),
+        }
+    )
+    assert libinfo.frame_equal(df1, df2)
+    assert libinfo.frame_equal(df2, df1)
diff --git a/tests/test_signatures.py b/tests/test_signatures.py
@@ -20,7 +20,10 @@
 
 
 def _test_signature(func, stub):
-    sig = signature(func)
+    try:
+        sig = signature(func)
+    except ValueError:
+        pytest.skip("Signature not inspectable")
     stub_sig = signature(stub)
     params = list(sig.parameters.values())
     df_stub_params = list(stub_sig.parameters.values())