In [1]:
from typing import Any, Dict

import numpy as np
import polars as pl
from polars import DataFrame
from polars_pipeline import Pipeline
from polars_pipeline.model import LightGBM
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
all_df = pl.from_numpy(
    np.concatenate([X, y.reshape(-1, 1)], axis=1),  # type: ignore
    schema={
        "sepal_length": pl.Float32,
        "sepal_width": pl.Float32,
        "petal_length": pl.Float32,
        "petal_width": pl.Float32,
        "species": pl.UInt8,
    },
)
train_df, test_df = train_test_split(all_df, test_size=0.2)
train_df

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

  from .autonotebook import tqdm as notebook_tqdm


sepal_length,sepal_width,petal_length,petal_width,species
f32,f32,f32,f32,u8
5.8,2.8,5.1,2.4,2
6.0,2.2,5.0,1.5,2
4.9,2.5,4.5,1.7,2
5.7,2.8,4.1,1.3,1
6.4,2.9,4.3,1.3,1
…,…,…,…,…
5.9,3.0,5.1,1.8,2
5.4,3.4,1.5,0.4,0
6.1,2.6,5.6,1.4,2
6.7,3.1,4.7,1.5,1


In [2]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold


def metrics(y_true: DataFrame, y_pred: DataFrame) -> Dict[str, Any]:
    y_pred = (
        Pipeline()
        .argmax_horizontal(["species_0", "species_1", "species_2"], name="species")
        .select("species")
        .transform(y_pred)
    )
    return {"accuracy": accuracy_score(y_true, y_pred)}


model = (
    Pipeline(log_dir="./log")
    .model.predict(
        Pipeline().model.stack(
            LightGBM({"objective": "multiclass", "num_class": 3}),
            fold=StratifiedKFold(n_splits=5),
            metrics_fn=metrics,
        ),
        target="species",
    )
    .argmax_horizontal(["species_0", "species_1", "species_2"], name="species")
    .select("species")
)
model.fit_transform(train_df)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000041 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 78
[LightGBM] [Info] Number of data points in the train set: 96, number of used features: 4
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.130361
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000026 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 78
[LightGBM] [Info] Number of data points in the train set: 96, number of used features: 4
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Start training from score -1.163151
[LightGBM] [Info] Start training from score -1.067841
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000026 seconds.
You ca

species
i32
2
1
1
1
1
…
2
0
1
1


In [3]:
y_true = test_df["species"]
y_pred = model.transform(test_df)
accuracy_score(y_true, y_pred)

0.9666666666666667