Rudimentary polars wrapper
honno committed May 19, 2023
1 parent 56020a5 commit ef80751
Showing 5 changed files with 67 additions and 6 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/test.yml
@@ -23,7 +23,8 @@ jobs:
run: |
pip install ray git+https://github.com/modin-project/modin
pip install vaex # use stable as no nightly builds and long build time
pip install --extra-index-url https://pypi.fury.io/arrow-nightlies/ --prefer-binary --pre pyarrow --force-reinstall
pip install polars # use stable as no nightly builds(?) and long build time(?)
pip install --pre --extra-index-url https://pypi.fury.io/arrow-nightlies/ pyarrow --ignore-installed --no-deps
pip install --pre --extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple pandas --ignore-installed --no-deps
- name: Run tests
run: |
3 changes: 2 additions & 1 deletion tests/conftest.py
@@ -44,10 +44,11 @@ def pytest_configure(config):
"test_signatures.py::test_buffer_method[cudf-__dlpack_device__]",
# https://github.com/vaexio/vaex/issues/2083
"test_from_dataframe.py::test_from_dataframe_roundtrip[vaex-pandas]",
# https://github.com/modin-project/modin/issues/6143
# https://github.com/data-apis/dataframe-interchange-tests/pull/21#issuecomment-1495914398
"test_from_dataframe.py::test_from_dataframe_roundtrip[pyarrow.Table-vaex]",
"test_from_dataframe.py::test_from_dataframe_roundtrip[vaex-pyarrow.Table]",
# TODO: triage
"test_from_dataframe.py::test_from_dataframe_roundtrip[vaex-polars]",
# https://github.com/rapidsai/cudf/issues/11389
"test_column_object.py::test_dtype[cudf]",
# https://github.com/modin-project/modin/issues/4687
8 changes: 5 additions & 3 deletions tests/test_column_object.py
@@ -62,7 +62,6 @@ class DtypeKind(IntEnum):
NominalDtype.UINT64: "L",
NominalDtype.FLOAT32: "f",
NominalDtype.FLOAT64: "g",
NominalDtype.UTF8: "u",
}


@@ -79,8 +78,11 @@ def test_dtype(libinfo: LibraryInfo, data: st.DataObject):
assert isinstance(fstring, str)
if mock_col.nominal_dtype == NominalDtype.DATETIME64NS:
assert fstring.startswith("tsn")
# TODO: test categorical format string (i.e. using col's actual dtype)
elif mock_col.nominal_dtype != NominalDtype.CATEGORY:
if mock_col.nominal_dtype == NominalDtype.UTF8:
assert fstring in ["u", "U"]
elif mock_col.nominal_dtype == NominalDtype.CATEGORY:
pass # TODO: test categorical format string (i.e. using col's actual dtype)
else:
assert fstring == NOMINAL_TO_FSTRING[mock_col.nominal_dtype]
assert isinstance(endianness, str)
assert len(endianness) == 1 # TODO: test actual value
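For context, a minimal sketch (not part of the commit) of how the test above reads a column's format string. The names `__dataframe__`, `get_column_by_name`, and the four-element `dtype` tuple follow the dataframe interchange protocol spec; whether polars reports "u" (string) or "U" (large string) can vary by build, which is what the new branch accounts for.

import polars as pl

df = pl.DataFrame({"a": ["x", "y", "z"]})
col = df.__dataframe__().get_column_by_name("a")
kind, bit_width, fstring, endianness = col.dtype
assert fstring in ("u", "U")  # Arrow format string for (large) string data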
1 change: 1 addition & 0 deletions tests/test_meta.py
@@ -19,6 +19,7 @@ def test_ci_has_correct_library_params(pytestconfig):
"modin",
"pyarrow.Table",
"pyarrow.RecordBatch",
"polars",
}


58 changes: 57 additions & 1 deletion tests/wrappers.py
@@ -104,7 +104,7 @@ def mock_to_pd_df(mock_df: MockDataFrame) -> pd.DataFrame:
df = pd.concat(serieses, axis=1)
return df

def pandas_frame_equal(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
def pandas_frame_equal(df1: pd.DataFrame, df2: pd.DataFrame) -> bool:
# pandas fails equality when an object column and a string column hold the
# same values. We don't really care about this, so we normalise any
# string columns to object columns.
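An illustrative sketch of the normalisation that comment describes (the function body itself is collapsed in this diff, so the helper below is an assumption rather than the repository's code):

import pandas as pd

def normalise_string_cols(df: pd.DataFrame) -> pd.DataFrame:
    # cast pandas "string" columns to plain object columns before comparing
    casts = {c: object for c in df.columns if pd.api.types.is_string_dtype(df[c])}
    return df.astype(casts)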
@@ -426,6 +426,61 @@ def pyarrow_from_dataframe_to_batch(_):
return pa_table_libinfo, pa_batch_libinfo


def make_polars_libinfo() -> LibraryInfo:
import polars as pl
from polars.convert import from_dataframe as pl_from_dataframe

def mock_to_pl_df(mock_df: MockDataFrame) -> pl.DataFrame:
if mock_df.ncols == 0:
return pl.DataFrame()
items: list[pl.DataFrame] = []
for name, (array, nominal_dtype) in mock_df.items():
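# each 1-D mock array becomes one n-row polars column; the (1, -1) reshape lets from_numpy read it column-wise under the given name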
item = pl.from_numpy(array.reshape((1, -1)), [name])
items.append(item)
df = pl.concat(items, how="horizontal")
return df

def pl_frame_equal(df1: pl.DataFrame, df2: pl.DataFrame) -> bool:
# Note pl.DataFrame.frame_equal(...) can't treat NaNs as equal, and
# assert_frame_equal(...) can't treat nulls as equal.
# We also don't care to distinguish NaNs from nulls.
# See https://github.com/apache/arrow/issues/35535#issuecomment-1543482341

if set(df1.columns) != set(df2.columns):
return False

for col in df1.columns:
s1 = df1[col]
s2 = df2[col]

if s1.dtype != s2.dtype:
return False

na_mask1 = s1.is_null()
na_mask2 = s2.is_null()
if s1.is_float():
na_mask1 |= s1.is_nan()
na_mask2 |= s2.is_nan()
if not (na_mask1 == na_mask2).all():
return False

if not (s1[~na_mask1] == s2[~na_mask1]).all():
return False

return True

return LibraryInfo(
name="polars",
mock_to_toplevel=mock_to_pl_df,
from_dataframe=pl_from_dataframe,
frame_equal=pl_frame_equal,
# TODO: support testing categoricals
supported_dtypes=set(NominalDtype) ^ {NominalDtype.CATEGORY},
# https://github.com/pola-rs/polars/issues/8884
allow_zero_cols=False,
)
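For illustration only (not part of the commit), a minimal sketch of the equality notion pl_frame_equal implements: a NaN and a null in the same position count as equal, and values are compared only where neither frame is missing.

import polars as pl

df_nan = pl.DataFrame({"a": [1.0, float("nan")]})
df_null = pl.DataFrame({"a": [1.0, None]})

s1, s2 = df_nan["a"], df_null["a"]
na1 = s1.is_null() | s1.is_nan()  # treat NaN and null alike as "missing"
na2 = s2.is_null() | s2.is_nan()
assert (na1 == na2).all()
assert (s1.filter(~na1) == s2.filter(~na1)).all()  # compare only non-missing values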


# ------------------------------------------------------- End wrapping libraries


@@ -437,6 +492,7 @@ def pyarrow_from_dataframe_to_batch(_):
("vaex", make_vaex_libinfo),
("modin", make_modin_libinfo),
("cudf", make_cudf_libinfo),
("polars", make_polars_libinfo),
]:
try:
libinfo = libinfo_factory()
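Finally, a hedged sketch of the direction the harness exercises for this wrapper: polars consuming another library's dataframe through the interchange protocol. It assumes a pandas recent enough to implement __dataframe__ (1.5+); the import path matches the one used in the wrapper above.

import pandas as pd
from polars.convert import from_dataframe as pl_from_dataframe

pd_df = pd.DataFrame({"x": [1, 2, 3], "y": [0.5, 1.5, 2.5]})
pl_df = pl_from_dataframe(pd_df)  # consumes pd_df.__dataframe__()
assert pl_df.columns == ["x", "y"]
assert pl_df["x"].to_list() == [1, 2, 3]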
