Skip to content

Commit

Permalink
Merge pull request #21 from data-apis/polars-arrow-support
Browse files Browse the repository at this point in the history
Arrow support
  • Loading branch information
honno committed May 16, 2023
2 parents 3b2ef43 + 0c74fcb commit 3b2da6d
Show file tree
Hide file tree
Showing 7 changed files with 239 additions and 34 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.8
python-version: "3.10"
- name: Upgrade pip
run: |
pip install pip --upgrade
Expand All @@ -23,7 +23,8 @@ jobs:
run: |
pip install ray git+https://github.com/modin-project/modin
pip install vaex # use stable as no nightly builds and long build time
pip install --extra-index-url https://pypi.fury.io/arrow-nightlies/ --prefer-binary --pre pyarrow --force-reinstall
pip install --pre --extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple pandas --ignore-installed --no-deps
- name: Run tests
run: |
pytest tests/ -v --ci
pytest tests/ -vv --ci
20 changes: 10 additions & 10 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,18 +43,13 @@ def pytest_configure(config):
"test_signatures.py::test_buffer_method[cudf-__dlpack__]",
"test_signatures.py::test_buffer_method[cudf-__dlpack_device__]",
# https://github.com/vaexio/vaex/issues/2083
# https://github.com/vaexio/vaex/issues/2093
# https://github.com/vaexio/vaex/issues/2113
"test_from_dataframe.py::test_from_dataframe_roundtrip[modin-vaex]",
"test_from_dataframe.py::test_from_dataframe_roundtrip[vaex-pandas]",
# https://github.com/modin-project/modin/issues/6143
# https://github.com/data-apis/dataframe-interchange-tests/pull/21#issuecomment-1495914398
"test_from_dataframe.py::test_from_dataframe_roundtrip[pyarrow.Table-vaex]",
"test_from_dataframe.py::test_from_dataframe_roundtrip[vaex-pyarrow.Table]",
# https://github.com/rapidsai/cudf/issues/11389
"test_column_object.py::test_dtype[cudf]",
# Raises RuntimeError, which is technically correct, but the spec will
# require TypeError soon.
# See https://github.com/data-apis/dataframe-api/pull/74
"test_column_object.py::test_describe_categorical[modin]",
# https://github.com/vaexio/vaex/issues/2113
"test_column_object.py::test_describe_categorical[vaex]",
# https://github.com/modin-project/modin/issues/4687
"test_column_object.py::test_null_count[modin]",
# https://github.com/vaexio/vaex/issues/2121
Expand All @@ -68,9 +63,14 @@ def pytest_configure(config):
"test_column_object.py::test_dtype[vaex]",
# SEGFAULT
"test_from_dataframe.py::test_from_dataframe_roundtrip[pandas-vaex]",
# modin flakiness
# modin flakiness - probably from monkeypatching done in wrappers.py
"test_from_dataframe.py::test_from_dataframe_roundtrip[pandas-modin]",
"test_from_dataframe.py::test_from_dataframe_roundtrip[modin-pandas]",
"test_from_dataframe.py::test_from_dataframe_roundtrip[modin-modin]",
"test_from_dataframe.py::test_from_dataframe_roundtrip[modin-vaex]",
"test_from_dataframe.py::test_from_dataframe_roundtrip[vaex-modin]",
"test_from_dataframe.py::test_from_dataframe_roundtrip[modin-pyarrow.Table]",
"test_from_dataframe.py::test_from_dataframe_roundtrip[pyarrow.Table-modin]",
"test_meta.py::test_frame_equal[modin]",
]
assert not any(case in ci_xfail_ids for case in ci_skip_ids) # sanity check
Expand Down
12 changes: 10 additions & 2 deletions tests/test_column_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,16 @@ def test_null_count(libinfo: LibraryInfo, data: st.DataObject):
null_count = col.null_count
if null_count is not None:
assert isinstance(null_count, int)
if mock_col.nominal_dtype != NominalDtype.UTF8: # TODO: test string cols
assert null_count == sum(np.isnan(mock_col.array))
if mock_col.nominal_dtype == NominalDtype.UTF8: # TODO: test string cols
return
nullinfo = col.describe_null
assert isinstance(nullinfo, tuple) and len(nullinfo) == 2 # sanity check
kind, value = nullinfo
nan_count = sum(np.isnan(mock_col.array))
if kind == 0: # non-nullable
assert null_count in [0, nan_count] # XXX: should null_count always be 0?
else:
assert null_count == nan_count


@given(data=st.data())
Expand Down
19 changes: 12 additions & 7 deletions tests/test_dataframe_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,11 +119,16 @@ def test_get_chunks(libinfo: LibraryInfo, data: st.DataObject):
df = data.draw(libinfo.interchange_dataframes(), label="df")
_n_chunks = df.num_chunks()
assert isinstance(_n_chunks, int) # sanity check
n_chunks = data.draw(
st.none() | st.integers(1, 2).map(lambda n: n * _n_chunks), label="n_chunks"
)
if n_chunks is None and not data.draw(st.booleans(), label="pass n_chunks"):
args = []
if _n_chunks == 0:
df.get_chunks()
else:
args = [n_chunks]
df.get_chunks(*args)
assert _n_chunks >= 1 # sanity check
n_chunks_strat = st.sampled_from([None, 1])
if _n_chunks > 1:
n_chunks_strat |= st.integers(1, 2).map(lambda n: n * _n_chunks)
n_chunks = data.draw(n_chunks_strat, label="n_chunks")
if n_chunks is None and not data.draw(st.booleans(), label="pass n_chunks"):
args = []
else:
args = [n_chunks]
df.get_chunks(*args)
50 changes: 49 additions & 1 deletion tests/test_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,13 @@
def test_ci_has_correct_library_params(pytestconfig):
if not pytestconfig.getoption("--ci"):
pytest.skip("only intended for --ci runs")
assert set(libname_to_libinfo.keys()) == {"pandas", "vaex", "modin"}
assert set(libname_to_libinfo.keys()) == {
"pandas",
"vaex",
"modin",
"pyarrow.Table",
"pyarrow.RecordBatch",
}


@given(utf8_strings())
Expand Down Expand Up @@ -50,3 +56,45 @@ def test_strategy(libinfo: LibraryInfo, func_name: str, data: st.DataObject):
def test_frame_equal(libinfo: LibraryInfo, data: st.DataObject):
    """A drawn top-level dataframe must compare equal to itself."""
    frame = data.draw(libinfo.toplevel_dataframes(), label="df")
    is_equal = libinfo.frame_equal(frame, frame)
    assert is_equal


def test_pandas_frame_equal_string_object_columns():
    """Check pandas ``frame_equal`` across object and ``StringDtype`` columns.

    Builds one frame whose string column has ``object`` dtype and another
    whose column uses ``pd.StringDtype()``, then asserts that
    ``libinfo.frame_equal`` reports them equal in both argument orders.

    The test is skipped when pandas cannot be imported or when "pandas" is
    not a key of ``libname_to_libinfo``.
    """
    try:
        import pandas as pd

        libinfo = libname_to_libinfo["pandas"]
    except (KeyError, ImportError) as e:
        # KeyError has no ``.msg`` attribute (only ImportError does), so
        # ``e.msg`` would raise AttributeError instead of skipping. ``str(e)``
        # works for both exception types.
        pytest.skip(str(e))
    df1 = pd.DataFrame({"foo": ["bar"]})
    assert df1["foo"].dtype == object  # sanity check
    df2 = pd.DataFrame({"foo": pd.Series(["bar"], dtype=pd.StringDtype())})
    assert libinfo.frame_equal(df1, df2)
    assert libinfo.frame_equal(df2, df1)


@pytest.mark.parametrize("container_name", ["Table", "RecordBatch"])
def test_pyarrow_frame_equal_string_columns(container_name):
    """Check pyarrow ``frame_equal`` across string and large_string columns.

    For the given pyarrow container class (``Table`` or ``RecordBatch``),
    builds two single-row frames holding the same values: one with default
    string arrays and one with ``pa.large_string()`` arrays, each in a plain
    column ("a") and a dictionary-encoded column ("b"). Asserts that
    ``libinfo.frame_equal`` reports them equal in both argument orders.

    The test is skipped when pyarrow cannot be imported or when
    ``f"pyarrow.{container_name}"`` is not a key of ``libname_to_libinfo``.
    """
    try:
        import pyarrow as pa

        libinfo = libname_to_libinfo[f"pyarrow.{container_name}"]
    except (KeyError, ImportError) as e:
        # KeyError has no ``.msg`` attribute (only ImportError does), so
        # ``e.msg`` would raise AttributeError instead of skipping. ``str(e)``
        # works for both exception types.
        pytest.skip(str(e))

    container_class = getattr(pa, container_name)
    df1 = container_class.from_pydict(
        {
            "a": pa.array(["foo"]),
            "b": pa.DictionaryArray.from_arrays(pa.array([0]), pa.array(["bar"])),
        }
    )
    df2 = container_class.from_pydict(
        {
            "a": pa.array(["foo"], type=pa.large_string()),
            "b": pa.DictionaryArray.from_arrays(
                pa.array([0]), pa.array(["bar"], type=pa.large_string())
            ),
        }
    )
    assert libinfo.frame_equal(df1, df2)
    assert libinfo.frame_equal(df2, df1)
5 changes: 4 additions & 1 deletion tests/test_signatures.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@


def _test_signature(func, stub):
sig = signature(func)
try:
sig = signature(func)
except ValueError:
pytest.skip("Signature not inspectable")
stub_sig = signature(stub)
params = list(sig.parameters.values())
df_stub_params = list(stub_sig.parameters.values())
Expand Down
Loading

0 comments on commit 3b2da6d

Please sign in to comment.