In [54]:
import numpy as np
import polars as pl
from polars_pipeline import Pipeline
from polars_pipeline.model import LightGBM
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test split -- and every number derived from it
# further down -- is reproducible under Restart Kernel -> Run All.
SEED = 42

# Load iris as plain NumPy arrays: feature matrix `X` and class labels `y`.
X, y = load_iris(return_X_y=True)

# Append the labels as a trailing column and wrap everything in a single
# polars frame; the schema names the columns and casts them (features to
# Float32, label to UInt8).
all_df = pl.from_numpy(
    np.concatenate([X, y.reshape(-1, 1)], axis=1),  # type: ignore
    schema={
        "sepal_length": pl.Float32,
        "sepal_width": pl.Float32,
        "petal_length": pl.Float32,
        "petal_width": pl.Float32,
        "species": pl.UInt8,
    },
)

# Hold out 20% of the rows for evaluation. Without `random_state` the split
# is reshuffled on every execution, so the reported metrics would drift
# between runs.
train_df, test_df = train_test_split(all_df, test_size=0.2, random_state=SEED)
train_df

sepal_length,sepal_width,petal_length,petal_width,species
f32,f32,f32,f32,u8
6.0,3.4,4.5,1.6,1
5.5,2.4,3.8,1.1,1
5.7,3.8,1.7,0.3,0
6.1,2.8,4.0,1.3,1
5.7,4.4,1.5,0.4,0
…,…,…,…,…
5.2,3.4,1.4,0.2,0
5.0,3.0,1.6,0.2,0
6.2,2.2,4.5,1.5,1
5.7,2.6,3.5,1.0,1


In [55]:
# LightGBM parameters: multiclass objective over three classes.
lgbm_params = {"objective": "multiclass", "num_class": 3}

# Columns fed to the horizontal argmax -- presumably the per-class scores
# emitted by the predict step for target "species" (TODO confirm naming
# convention against polars_pipeline).
class_score_columns = ["species_0", "species_1", "species_2"]

# Pipeline steps, in order:
#   1. predict with LightGBM, using `species` as the target,
#   2. argmax across the three score columns, stored under the name `species`,
#   3. keep only the resulting `species` column.
model = (
    Pipeline()
    .model.predict(LightGBM(lgbm_params), target="species")
    .argmax_horizontal(class_score_columns, name="species")
    .select("species")
)

# Fit on the training split and display the pipeline output for those rows.
model.fit_transform(train_df)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000969 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 87
[LightGBM] [Info] Number of data points in the train set: 120, number of used features: 4
[LightGBM] [Info] Start training from score -1.073920
[LightGBM] [Info] Start training from score -1.123930
[LightGBM] [Info] Start training from score -1.098612


species
i32
1
1
0
1
0
…
0
0
1
1


In [56]:
from sklearn.metrics import accuracy_score

# Ground-truth labels of the held-out rows, as a 1-D polars Series.
y_true = test_df["species"]

# The pipeline ends in `.select("species")`, so its output is a one-column
# frame rather than a Series. Pull the column out explicitly so that
# accuracy_score receives two 1-D inputs instead of relying on implicit
# column-vector coercion of a 2-D frame.
y_pred = model.transform(test_df)["species"]

# Fraction of held-out rows whose predicted class matches the true class.
accuracy_score(y_true, y_pred)

0.9333333333333333