Rudimentary polars wrapper
honno committed May 19, 2023
1 parent 56020a5 commit ef80751
Showing 5 changed files with 67 additions and 6 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/test.yml
@@ -23,7 +23,8 @@ jobs:
run: |
pip install ray git+https://github.com/modin-project/modin
pip install vaex # use stable as no nightly builds and long build time
pip install --extra-index-url https://pypi.fury.io/arrow-nightlies/ --prefer-binary --pre pyarrow --force-reinstall
pip install polars # use stable as no nightly builds(?) and long build time(?)
pip install --pre --extra-index-url https://pypi.fury.io/arrow-nightlies/ pyarrow --ignore-installed --no-deps
pip install --pre --extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple pandas --ignore-installed --no-deps
- name: Run tests
run: |
3 changes: 2 additions & 1 deletion tests/conftest.py
@@ -44,10 +44,11 @@ def pytest_configure(config):
"test_signatures.py::test_buffer_method[cudf-__dlpack_device__]",
# https://github.com/vaexio/vaex/issues/2083
"test_from_dataframe.py::test_from_dataframe_roundtrip[vaex-pandas]",
# https://github.com/modin-project/modin/issues/6143
# https://github.com/data-apis/dataframe-interchange-tests/pull/21#issuecomment-1495914398
"test_from_dataframe.py::test_from_dataframe_roundtrip[pyarrow.Table-vaex]",
"test_from_dataframe.py::test_from_dataframe_roundtrip[vaex-pyarrow.Table]",
# TODO: triage
"test_from_dataframe.py::test_from_dataframe_roundtrip[vaex-polars]",
# https://github.com/rapidsai/cudf/issues/11389
"test_column_object.py::test_dtype[cudf]",
# https://github.com/modin-project/modin/issues/4687
8 changes: 5 additions & 3 deletions tests/test_column_object.py
@@ -62,7 +62,6 @@ class DtypeKind(IntEnum):
NominalDtype.UINT64: "L",
NominalDtype.FLOAT32: "f",
NominalDtype.FLOAT64: "g",
NominalDtype.UTF8: "u",
}


@@ -79,8 +78,11 @@ def test_dtype(libinfo: LibraryInfo, data: st.DataObject):
assert isinstance(fstring, str)
if mock_col.nominal_dtype == NominalDtype.DATETIME64NS:
assert fstring.startswith("tsn")
# TODO: test categorical format string (i.e. using col's actual dtype)
elif mock_col.nominal_dtype != NominalDtype.CATEGORY:
if mock_col.nominal_dtype == NominalDtype.UTF8:
assert fstring in ["u", "U"]
elif mock_col.nominal_dtype == NominalDtype.CATEGORY:
pass # TODO: test categorical format string (i.e. using col's actual dtype)
else:
assert fstring == NOMINAL_TO_FSTRING[mock_col.nominal_dtype]
assert isinstance(endianness, str)
assert len(endianness) == 1 # TODO: test actual value
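For context, a minimal sketch (not part of the commit) of how the test above reads a column's format string. The names `__dataframe__`, `get_column_by_name`, and the four-element `dtype` tuple follow the dataframe interchange protocol spec; whether polars reports "u" (string) or "U" (large string) can vary by build, which is what the new branch accounts for.

import polars as pl

df = pl.DataFrame({"a": ["x", "y", "z"]})
col = df.__dataframe__().get_column_by_name("a")
kind, bit_width, fstring, endianness = col.dtype
assert fstring in ("u", "U")  # Arrow format string for (large) string data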
1 change: 1 addition & 0 deletions tests/test_meta.py
@@ -19,6 +19,7 @@ def test_ci_has_correct_library_params(pytestconfig):
"modin",
"pyarrow.Table",
"pyarrow.RecordBatch",
"polars",
}


58 changes: 57 additions & 1 deletion tests/wrappers.py
@@ -104,7 +104,7 @@ def mock_to_pd_df(mock_df: MockDataFrame) -> pd.DataFrame:
df = pd.concat(serieses, axis=1)
return df

def pandas_frame_equal(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
def pandas_frame_equal(df1: pd.DataFrame, df2: pd.DataFrame) -> bool:
# pandas fails equality when an object column and a string column hold the
# same values. We don't really care about this, so we normalise any
# string columns to object columns.
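An illustrative sketch of the normalisation that comment describes (the function body itself is collapsed in this diff, so the helper below is an assumption rather than the repository's code):

import pandas as pd

def normalise_string_cols(df: pd.DataFrame) -> pd.DataFrame:
    # cast pandas "string" columns to plain object columns before comparing
    casts = {c: object for c in df.columns if pd.api.types.is_string_dtype(df[c])}
    return df.astype(casts)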
@@ -426,6 +426,61 @@ def pyarrow_from_dataframe_to_batch(_):
return pa_table_libinfo, pa_batch_libinfo


def make_polars_libinfo() -> LibraryInfo:
import polars as pl
from polars.convert import from_dataframe as pl_from_dataframe

def mock_to_pl_df(mock_df: MockDataFrame) -> pl.DataFrame:
if mock_df.ncols == 0:
return pl.DataFrame()
items: list[pl.DataFrame] = []
for name, (array, nominal_dtype) in mock_df.items():
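# each 1-D mock array becomes one n-row polars column; the (1, -1) reshape lets from_numpy read it column-wise under the given name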
item = pl.from_numpy(array.reshape((1, -1)), [name])
items.append(item)
df = pl.concat(items, how="horizontal")
return df

def pl_frame_equal(df1: pl.DataFrame, df2: pl.DataFrame) -> bool:
# Note pl.DataFrame.frame_equal(...) can't treat NaNs as equal, and
# assert_frame_equal(...) can't treat nulls as equal.
# We also don't care to distinguish NaNs from nulls.
# See https://github.com/apache/arrow/issues/35535#issuecomment-1543482341

if set(df1.columns) != set(df2.columns):
return False

for col in df1.columns:
s1 = df1[col]
s2 = df2[col]

if s1.dtype != s2.dtype:
return False

na_mask1 = s1.is_null()
na_mask2 = s2.is_null()
if s1.is_float():
na_mask1 |= s1.is_nan()
na_mask2 |= s2.is_nan()
if not (na_mask1 == na_mask2).all():
return False

if not (s1[~na_mask1] == s2[~na_mask1]).all():
return False

return True

return LibraryInfo(
name="polars",
mock_to_toplevel=mock_to_pl_df,
from_dataframe=pl_from_dataframe,
frame_equal=pl_frame_equal,
# TODO: support testing categoricals
supported_dtypes=set(NominalDtype) ^ {NominalDtype.CATEGORY},
# https://github.com/pola-rs/polars/issues/8884
allow_zero_cols=False,
)
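For illustration only (not part of the commit), a minimal sketch of the equality notion pl_frame_equal implements: a NaN and a null in the same position count as equal, and values are compared only where neither frame is missing.

import polars as pl

df_nan = pl.DataFrame({"a": [1.0, float("nan")]})
df_null = pl.DataFrame({"a": [1.0, None]})

s1, s2 = df_nan["a"], df_null["a"]
na1 = s1.is_null() | s1.is_nan()  # treat NaN and null alike as "missing"
na2 = s2.is_null() | s2.is_nan()
assert (na1 == na2).all()
assert (s1.filter(~na1) == s2.filter(~na1)).all()  # compare only non-missing values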


# ------------------------------------------------------- End wrapping libraries


@@ -437,6 +492,7 @@ def pyarrow_from_dataframe_to_batch(_):
("vaex", make_vaex_libinfo),
("modin", make_modin_libinfo),
("cudf", make_cudf_libinfo),
("polars", make_polars_libinfo),
]:
try:
libinfo = libinfo_factory()
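Finally, a hedged sketch of the direction the harness exercises for this wrapper: polars consuming another library's dataframe through the interchange protocol. It assumes a pandas recent enough to implement __dataframe__ (1.5+); the import path matches the one used in the wrapper above.

import pandas as pd
from polars.convert import from_dataframe as pl_from_dataframe

pd_df = pd.DataFrame({"x": [1, 2, 3], "y": [0.5, 1.5, 2.5]})
pl_df = pl_from_dataframe(pd_df)  # consumes pd_df.__dataframe__()
assert pl_df.columns == ["x", "y"]
assert pl_df["x"].to_list() == [1, 2, 3]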
