In [1]:
import copy
import datetime
import functools
import inspect
import itertools
import json
import logging
import math
import os
import time
from pathlib import Path
from typing import (
    Any,
    Callable,
    Collection,
    Iterable,
    Iterator,
    Mapping,
    Optional,
    Sequence,
    Union,
)

import catboost as cat
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import polars as pl
import polars.selectors as cs
import seaborn as sns
import xgboost as xgb
from polars import DataFrame, Expr, Series
from sklearn.datasets import fetch_covtype
from tqdm import tqdm

import polars_ml as pml
from polars_ml import Pipeline
from polars_ml.model_selection import KFold, train_test_split
from polars_ml.model_selection.metrics import evaluate_classification_metrics

bunch = fetch_covtype(as_frame=True)
df: DataFrame = pl.from_pandas(bunch.data).with_columns(pl.from_pandas(bunch.target))  # type: ignore
df

  from .autonotebook import tqdm as notebook_tqdm


Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area_0,Wilderness_Area_1,Wilderness_Area_2,Wilderness_Area_3,Soil_Type_0,Soil_Type_1,Soil_Type_2,Soil_Type_3,Soil_Type_4,Soil_Type_5,Soil_Type_6,Soil_Type_7,Soil_Type_8,Soil_Type_9,Soil_Type_10,Soil_Type_11,Soil_Type_12,Soil_Type_13,Soil_Type_14,Soil_Type_15,Soil_Type_16,Soil_Type_17,Soil_Type_18,Soil_Type_19,Soil_Type_20,Soil_Type_21,Soil_Type_22,Soil_Type_23,Soil_Type_24,Soil_Type_25,Soil_Type_26,Soil_Type_27,Soil_Type_28,Soil_Type_29,Soil_Type_30,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39,Cover_Type
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i32
2596.0,51.0,3.0,258.0,0.0,510.0,221.0,232.0,148.0,6279.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
2590.0,56.0,2.0,212.0,-6.0,390.0,220.0,235.0,151.0,6225.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
2804.0,139.0,9.0,268.0,65.0,3180.0,234.0,238.0,135.0,6121.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2785.0,155.0,18.0,242.0,118.0,3090.0,238.0,238.0,122.0,6211.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2595.0,45.0,2.0,153.0,-1.0,391.0,220.0,234.0,150.0,6172.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2396.0,153.0,20.0,85.0,17.0,108.0,240.0,237.0,118.0,837.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
2391.0,152.0,19.0,67.0,12.0,95.0,240.0,237.0,119.0,845.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
2386.0,159.0,17.0,60.0,7.0,90.0,236.0,241.0,130.0,854.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
2384.0,170.0,15.0,60.0,5.0,90.0,230.0,245.0,143.0,864.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3


In [2]:
pp = Pipeline(
    lambda df: (
        df.with_columns(
            pl.concat_str(
                cs.starts_with("Soil_Type_").cast(pl.UInt8).cast(pl.String)
            ).alias("Soil_Type"),
            pl.concat_str(
                cs.starts_with("Wilderness_Area_").cast(pl.UInt8).cast(pl.String)
            ).alias("Wilderness_Area"),
            pl.col("Cover_Type").cast(pl.String),
        ).drop(cs.starts_with("Soil_Type_"), cs.starts_with("Wilderness_Area_"))
    ),
    pml.LabelEncoder("Soil_Type", "Wilderness_Area"),
)

numerical_columns = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]
categorical_columns = ["Soil_Type", "Wilderness_Area"]
target_column = "Cover_Type"
pp.fit_transform(df)

Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Cover_Type,Soil_Type,Wilderness_Area
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,u32,u32
2596.0,51.0,3.0,258.0,0.0,510.0,221.0,232.0,148.0,6279.0,"""5""",0,0
2590.0,56.0,2.0,212.0,-6.0,390.0,220.0,235.0,151.0,6225.0,"""5""",0,0
2804.0,139.0,9.0,268.0,65.0,3180.0,234.0,238.0,135.0,6121.0,"""2""",1,0
2785.0,155.0,18.0,242.0,118.0,3090.0,238.0,238.0,122.0,6211.0,"""2""",2,0
2595.0,45.0,2.0,153.0,-1.0,391.0,220.0,234.0,150.0,6172.0,"""5""",0,0
…,…,…,…,…,…,…,…,…,…,…,…,…
2396.0,153.0,20.0,85.0,17.0,108.0,240.0,237.0,118.0,837.0,"""3""",25,1
2391.0,152.0,19.0,67.0,12.0,95.0,240.0,237.0,119.0,845.0,"""3""",25,1
2386.0,159.0,17.0,60.0,7.0,90.0,236.0,241.0,130.0,854.0,"""3""",25,1
2384.0,170.0,15.0,60.0,5.0,90.0,230.0,245.0,143.0,864.0,"""3""",25,1


In [3]:
if False:
    data = pp.fit_transform(df)
    out_dir = Path("out/scatter")
    out_dir.mkdir(parents=True, exist_ok=True)
    for x, y, hue in tqdm(
        pml.iter_axes(data, numerical_columns, numerical_columns, target_column)
    ):
        fig, ax = plt.subplots()
        sns.scatterplot(
            data=data.sample(10000),
            x=x,
            y=y,
            hue=hue,
            ax=ax,
            s=10,
            edgecolor=None,
            alpha=0.5,
        )
        title = f"{x} vs {y} by {hue}"

        plt.title(title)
        plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
        plt.tight_layout()

        plt.savefig(out_dir / f"{title}.png")
        plt.close(fig)


In [4]:
metrics = []
for train_idx, valid_idx in KFold(
    n_splits=5, shuffle=True, seed=42, stratify=target_column
).split(df):
    train_df = df.select(pl.all().gather(train_idx))
    valid_df = df.select(pl.all().gather(valid_idx))

    model = Pipeline(
        lambda df: (
            df.with_columns(
                pl.concat_str(
                    cs.starts_with("Soil_Type_").cast(pl.UInt8).cast(pl.String)
                ).alias("Soil_Type"),
                pl.concat_str(
                    cs.starts_with("Wilderness_Area_").cast(pl.UInt8).cast(pl.String)
                ).alias("Wilderness_Area"),
                pl.col("Cover_Type").cast(pl.String),
            ).drop(cs.starts_with("Soil_Type_"), cs.starts_with("Wilderness_Area_"))
        ),
        pml.LabelEncoder("Soil_Type", "Wilderness_Area", target_column),
        pml.gbdt.LightGBM(
            target_column,
            {
                "objective": "multiclass",
                "num_class": 8,
            },
            num_boost_round=1000,
            callbacks=[lgb.early_stopping(100), lgb.log_evaluation(10)],
        ),
        pml.HorizontalArgMax(cs.starts_with("lightgbm_"), value_name="lightgbm"),
        lambda df: df.with_columns(
            pl.col("lightgbm")
            .list.first()
            .str.extract(r"lightgbm_(\d+)")
            .cast(pl.UInt8)
        ),
    )

    model.fit(train_df, valid_df)
    valid_pred_df = model.transform(valid_df)
    metrics.append(
        evaluate_classification_metrics(
            valid_pred_df,
            "Cover_Type",
            y_pred_class="lightgbm",
            y_pred_proba_prefix="lightgbm_",
            n_classes=8,
        )
    )

pl.concat(metrics)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005228 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2220
[LightGBM] [Info] Number of data points in the train set: 464807, number of used features: 12
[LightGBM] [Info] Start training from score -4.114264
[LightGBM] [Info] Start training from score -0.718259
[LightGBM] [Info] Start training from score -1.008935
[LightGBM] [Info] Start training from score -3.343853
[LightGBM] [Info] Start training from score -2.788111
[LightGBM] [Info] Start training from score -3.510237
[LightGBM] [Info] Start training from score -5.354529
[LightGBM] [Info] Start training from score -34.538776
Training until validation scores don't improve for 100 rounds
[10]	train's multi_logloss: 0.623967	valid's multi_logloss: 0.635864
[20]	train's multi_logloss: 0.513368	valid's multi_logloss: 0.526574
[30]	train's 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004106 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2224
[LightGBM] [Info] Number of data points in the train set: 464808, number of used features: 12
[LightGBM] [Info] Start training from score -4.114266
[LightGBM] [Info] Start training from score -0.718257
[LightGBM] [Info] Start training from score -1.008937
[LightGBM] [Info] Start training from score -3.343855
[LightGBM] [Info] Start training from score -2.788113
[LightGBM] [Info] Start training from score -3.510239
[LightGBM] [Info] Start training from score -5.354532
[LightGBM] [Info] Start training from score -34.538776
Training until validation scores don't improve for 100 rounds
[10]	train's multi_logloss: 0.617628	valid's multi_logloss: 0.631968
[20]	train's multi_logloss: 0.511063	valid's multi_logloss: 0.52964
[30]	train's m

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011894 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2225
[LightGBM] [Info] Number of data points in the train set: 464810, number of used features: 12
[LightGBM] [Info] Start training from score -4.114270
[LightGBM] [Info] Start training from score -0.718261
[LightGBM] [Info] Start training from score -1.008941
[LightGBM] [Info] Start training from score -3.343860
[LightGBM] [Info] Start training from score -2.788117
[LightGBM] [Info] Start training from score -3.510172
[LightGBM] [Info] Start training from score -5.354081
[LightGBM] [Info] Start training from score -34.538776
Training until validation scores don't improve for 100 rounds
[10]	train's multi_logloss: 0.610512	valid's multi_logloss: 0.619971
[20]	train's multi_logloss: 0.503156	valid's multi_logloss: 0.515175
[30]	train's 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009848 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2214
[LightGBM] [Info] Number of data points in the train set: 464811, number of used features: 12
[LightGBM] [Info] Start training from score -4.114141
[LightGBM] [Info] Start training from score -0.718264
[LightGBM] [Info] Start training from score -1.008943
[LightGBM] [Info] Start training from score -3.343862
[LightGBM] [Info] Start training from score -2.788119
[LightGBM] [Info] Start training from score -3.510174
[LightGBM] [Info] Start training from score -5.354083
[LightGBM] [Info] Start training from score -34.538776
Training until validation scores don't improve for 100 rounds
[10]	train's multi_logloss: 0.615729	valid's multi_logloss: 0.623069
[20]	train's multi_logloss: 0.507751	valid's multi_logloss: 0.517891
[30]	train's multi_logloss: 0.460279	valid's multi_logloss: 0.471675
[40]	tra

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002365 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2220
[LightGBM] [Info] Number of data points in the train set: 464812, number of used features: 12
[LightGBM] [Info] Start training from score -4.114143
[LightGBM] [Info] Start training from score -0.718266
[LightGBM] [Info] Start training from score -1.008945
[LightGBM] [Info] Start training from score -3.343864
[LightGBM] [Info] Start training from score -2.788086
[LightGBM] [Info] Start training from score -3.510176
[LightGBM] [Info] Start training from score -5.354085
[LightGBM] [Info] Start training from score -34.538776
Training until validation scores don't improve for 100 rounds
[10]	train's multi_logloss: 0.617236	valid's multi_logloss: 0.62706
[20]	train's multi_logloss: 0.508847	valid's multi_logloss: 0.522102
[30]	train's m

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


accuracy,balanced_accuracy,precision_macro,precision_weighted,recall_macro,recall_weighted,f1_macro,f1_weighted,matthews_corrcoef,cohen_kappa_score,roc_auc_ovo,log_loss
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.86566,0.827019,0.751452,0.865613,0.723641,0.86566,0.736036,0.865231,0.783299,0.783008,0.978161,0.368811
0.845014,0.792508,0.735857,0.844781,0.693444,0.845014,0.711916,0.844274,0.749685,0.749337,0.974059,0.411544
0.853961,0.808271,0.746022,0.853695,0.707237,0.853961,0.72426,0.853295,0.764262,0.763952,0.981816,0.382002
0.852635,0.803598,0.740624,0.852515,0.703148,0.852635,0.718993,0.851928,0.762072,0.761708,0.979646,0.402664
0.859484,0.81661,0.749931,0.859439,0.714533,0.859484,0.729487,0.858847,0.773275,0.772945,0.979394,0.388424


In [5]:
def make_model(
    *,
    learning_rate: float,
    num_leaves: int,
    max_depth: int,
    feature_fraction: float,
    min_data_in_leaf: int,
    bagging_fraction: float,
    lambda_l1: float,
    lambda_l2: float,
    verbose: int = -1,
    trial: Optional["optuna.Trial"] = None,
) -> pml.Component:
    return Pipeline(
        lambda df: (
            df.with_columns(
                pl.concat_str(
                    cs.starts_with("Soil_Type_").cast(pl.UInt8).cast(pl.String)
                ).alias("Soil_Type"),
                pl.concat_str(
                    cs.starts_with("Wilderness_Area_").cast(pl.UInt8).cast(pl.String)
                ).alias("Wilderness_Area"),
                pl.col("Cover_Type").cast(pl.String),
            ).drop(cs.starts_with("Soil_Type_"), cs.starts_with("Wilderness_Area_"))
        ),
        pml.LabelEncoder("Soil_Type", "Wilderness_Area", target_column),
        pml.gbdt.LightGBM(
            target_column,
            {
                "objective": "multiclass",
                "num_class": 8,
                "learning_rate": learning_rate,
                "num_leaves": num_leaves,
                "max_depth": max_depth,
                "feature_fraction": feature_fraction,
                "min_data_in_leaf": min_data_in_leaf,
                "bagging_fraction": bagging_fraction,
                "lambda_l1": lambda_l1,
                "lambda_l2": lambda_l2,
                "verbose": verbose,
            },
            num_boost_round=1000,
            callbacks=[lgb.early_stopping(10)],
        ),
        pml.HorizontalArgMax(cs.starts_with("lightgbm_"), value_name="lightgbm"),
        lambda df: df.with_columns(
            pl.col("lightgbm")
            .list.first()
            .str.extract(r"lightgbm_(\d+)")
            .cast(pl.UInt8)
        ),
    )


def objective(
    model: pml.Component,
    data: DataFrame,
    validation_data: DataFrame | Mapping[str, DataFrame] | None = None,
    *,
    trial: Optional["optuna.Trial"] = None,
) -> float:
    acc_list = []
    for train_idx, valid_idx in KFold(
        n_splits=5, shuffle=True, seed=42, stratify=target_column
    ).split(data):
        train_df = data.select(pl.all().gather(train_idx))
        valid_df = data.select(pl.all().gather(valid_idx))
        model = copy.deepcopy(model)
        model.fit(train_df, valid_df)
        valid_pred_df = model.transform(valid_df).select(target_column, "lightgbm")
        acc_list.append(
            valid_pred_df.select(pl.col(target_column) == pl.col("lightgbm"))
            .mean()
            .item()
        )
    return float(np.mean(acc_list))


train_idx, valid_idx = train_test_split(
    df, test_size=0.2, seed=42, shuffle=True, stratify=target_column
)
train_df = df.select(pl.all().gather(train_idx))
valid_df = df.select(pl.all().gather(valid_idx))

optimizer = pml.optimize.OptunaOptimizer(
    make_model,
    objective,
    {
        "learning_rate": {"min": 0.001, "max": 0.2, "log": True},
        "num_leaves": {"min": 10, "max": 100},
        "max_depth": {"min": 3, "max": 12},
        "feature_fraction": {"min": 0.5, "max": 1.0},
        "min_data_in_leaf": {"min": 20, "max": 200},
        "bagging_fraction": {"min": 0.6, "max": 1.0},
        "lambda_l1": {"min": 0.0, "max": 10.0},
        "lambda_l2": {"min": 0.0, "max": 10.0},
    },
    storage="out/journal.log",
    is_higher_better=True,
    n_trials=100,
)

optimizer.fit(train_df, valid_df)
valid_pred_df = model.transform(valid_df)
metrics = evaluate_classification_metrics(
    valid_pred_df,
    "Cover_Type",
    y_pred_class="lightgbm",
    y_pred_proba_prefix="lightgbm_",
    n_classes=8,
)

[I 2025-04-29 22:25:10,600] A new study created in Journal with name: no-name-4edafa3e-a87b-4361-b9f4-2bbd8f4d0026


Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1000]	train's multi_logloss: 0.287862	valid's multi_logloss: 0.306318
Training until validation scores don't improve for 10 rounds


[W 2025-04-29 22:28:22,377] Trial 0 failed with parameters: {'learning_rate': 0.019996922171467656, 'num_leaves': 51, 'max_depth': 6, 'feature_fraction': 0.9514619247114321, 'min_data_in_leaf': 87, 'bagging_fraction': 0.6389006416799826, 'lambda_l1': 3.6049061095067616, 'lambda_l2': 2.4274513894359053} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/hoge/polars-ml/.venv/lib/python3.10/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/home/hoge/polars-ml/src/polars_ml/optimize/optuna_.py", line 104, in _objective
    return objective(
  File "/tmp/ipykernel_8743/3616443809.py", line 69, in objective
    model.fit(train_df, valid_df)
  File "/home/hoge/polars-ml/src/polars_ml/pipeline.py", line 23, in fit
    data = component.fit_transform(data, validation_data)
  File "/home/hoge/polars-ml/src/polars_ml/component.py", line 23, in fit_transform
    return self.fit(data, valida

KeyboardInterrupt: 

In [18]:
from autogluon.tabular import TabularPredictor

train_idx, valid_idx = train_test_split(
    df, test_size=0.2, seed=42, shuffle=True, stratify=target_column
)
train_df = df.select(pl.all().gather(train_idx))
valid_df = df.select(pl.all().gather(valid_idx))


pre = Pipeline(
    lambda df: (
        df.with_columns(
            pl.concat_str(
                cs.starts_with("Soil_Type_").cast(pl.UInt8).cast(pl.String)
            ).alias("Soil_Type"),
            pl.concat_str(
                cs.starts_with("Wilderness_Area_").cast(pl.UInt8).cast(pl.String)
            ).alias("Wilderness_Area"),
            pl.col("Cover_Type").cast(pl.String),
        ).drop(cs.starts_with("Soil_Type_"), cs.starts_with("Wilderness_Area_"))
    ),
)
train_df = pre.fit_transform(train_df)
valid_df = pre.transform(valid_df)

predictor = TabularPredictor(
    label=target_column,
    eval_metric="accuracy",
    problem_type="multiclass",
    path="out/autogluon",
    verbosity=2,
).fit(
    train_df.to_pandas(),
    tuning_data=valid_df.to_pandas(),
    time_limit=60 * 60 * 6,
    # presets="best_quality",
)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.10.16
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Oct 5 21:02:42 UTC 2023
CPU Count:          12
Memory Avail:       29.17 GB / 47.05 GB (62.0%)
Disk Space Avail:   885.00 GB / 1006.85 GB (87.9%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accuracy with fast inference speed.
	presets='good'         : Good accuracy with very fast inferen

[1000]	valid_set's multi_error: 0.105688
[2000]	valid_set's multi_error: 0.0811962
[3000]	valid_set's multi_error: 0.0688296
[4000]	valid_set's multi_error: 0.0612392
[5000]	valid_set's multi_error: 0.0560929
[6000]	valid_set's multi_error: 0.0522719
[7000]	valid_set's multi_error: 0.0493976
[8000]	valid_set's multi_error: 0.0472892
[9000]	valid_set's multi_error: 0.0450775
[10000]	valid_set's multi_error: 0.0434423


	0.9566	 = Validation score   (accuracy)
	592.31s	 = Training   runtime
	202.3s	 = Validation runtime
Fitting model: LightGBM ... Training model for up to 20546.94s of the 20546.94s of remaining time.


[1000]	valid_set's multi_error: 0.0687694
[2000]	valid_set's multi_error: 0.046988
[3000]	valid_set's multi_error: 0.0375731
[4000]	valid_set's multi_error: 0.0336919
[5000]	valid_set's multi_error: 0.0315404
[6000]	valid_set's multi_error: 0.0299828
[7000]	valid_set's multi_error: 0.0286919
[8000]	valid_set's multi_error: 0.0280809
[9000]	valid_set's multi_error: 0.0275818
[10000]	valid_set's multi_error: 0.0275215


	0.9726	 = Validation score   (accuracy)
	464.0s	 = Training   runtime
	122.28s	 = Validation runtime
Fitting model: RandomForestGini ... Training model for up to 19959.00s of the 19959.00s of remaining time.
	0.9504	 = Validation score   (accuracy)
	40.23s	 = Training   runtime
	0.71s	 = Validation runtime
Fitting model: RandomForestEntr ... Training model for up to 19915.59s of the 19915.59s of remaining time.
	0.958	 = Validation score   (accuracy)
	43.51s	 = Training   runtime
	0.69s	 = Validation runtime
Fitting model: CatBoost ... Training model for up to 19869.08s of the 19869.08s of remaining time.
	0.9396	 = Validation score   (accuracy)
	6744.8s	 = Training   runtime
	0.83s	 = Validation runtime
Fitting model: ExtraTreesGini ... Training model for up to 13123.14s of the 13123.13s of remaining time.
	0.91	 = Validation score   (accuracy)
	18.95s	 = Training   runtime
	0.69s	 = Validation runtime
Fitting model: ExtraTreesEntr ... Training model for up to 13101.74s of the 13101.

In [19]:
leaderboard = predictor.leaderboard()
pl.from_pandas(leaderboard)

model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
str,f64,str,f64,f64,f64,f64,i64,bool,i64
"""WeightedEnsemble_L2""",0.976446,"""accuracy""",335.815542,11101.672282,0.008856,3.148657,2,true,13
"""LightGBM""",0.972565,"""accuracy""",122.279317,463.99586,122.279317,463.99586,1,true,5
"""XGBoost""",0.970439,"""accuracy""",211.110487,10588.124225,211.110487,10588.124225,1,true,11
"""KNeighborsDist""",0.969966,"""accuracy""",0.868624,1.430939,0.868624,1.430939,1,true,2
"""KNeighborsUnif""",0.968322,"""accuracy""",0.861417,1.460242,0.861417,1.460242,1,true,1
…,…,…,…,…,…,…,…,…,…
"""NeuralNetTorch""",0.939759,"""accuracy""",0.309721,2273.483238,0.309721,2273.483238,1,true,12
"""CatBoost""",0.939613,"""accuracy""",0.830587,6744.798803,0.830587,6744.798803,1,true,8
"""ExtraTreesEntr""",0.913021,"""accuracy""",0.819788,18.75014,0.819788,18.75014,1,true,10
"""NeuralNetFastAI""",0.912522,"""accuracy""",0.737899,248.807499,0.737899,248.807499,1,true,3


In [20]:
detailed_leaderboard = predictor.leaderboard(extra_info=True)
pl.from_pandas(detailed_leaderboard)

If you only need to load model weights and optimizer state, use the safe `Learner.load` instead.
  warn("load_learner` uses Python's insecure pickle module, which can execute malicious arbitrary code when loading. Only load files you trust.\nIf you only need to load model weights and optimizer state, use the safe `Learner.load` instead.")


model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order,num_features,num_models,num_models_w_ancestors,memory_size,memory_size_w_ancestors,memory_size_min,memory_size_min_w_ancestors,num_ancestors,num_descendants,model_type,child_model_type,stopping_metric,hyperparameters,hyperparameters_fit,ag_args_fit,features,compile_time,child_hyperparameters,child_hyperparameters_fit,child_ag_args_fit,ancestors,descendants
str,f64,str,f64,f64,f64,f64,i64,bool,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str,str,struct[51],struct[7],struct[13],list[str],str,struct[2],struct[1],struct[13],list[str],list[str]
"""WeightedEnsemble_L2""",0.976446,"""accuracy""",335.815542,11101.672282,0.008856,3.148657,2,true,13,35,1,6,7824,1630249198,7824,1080078580,5,0,"""WeightedEnsembleModel""","""GreedyWeightedEnsembleModel""","""accuracy""","{null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,0,""auto"",null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,true,null,null,null,false,true,null,null,null,null}","{null,null,null,null,null,null,null}","{false,null,null,null,null,1.0,null,1.0,0,null,null,null,null}","[""LightGBM_0"", ""XGBoost_1"", … ""LightGBM_6""]",,"{25,1000000}",{19},"{false,null,null,null,null,1.0,null,1.0,0,null,null,null,null}","[""LightGBM"", ""KNeighborsDist"", … ""XGBoost""]",[]
"""LightGBM""",0.972565,"""accuracy""",122.279317,463.99586,122.279317,463.99586,1,true,5,12,1,1,220661609,220661609,220661609,220661609,0,1,"""LGBModel""",,"""accuracy""","{null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,0.05,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null}","{null,null,null,null,null,9343,null}","{null,null,null,null,null,1.0,null,1.0,0,null,null,[""bool"", ""int"", … ""category""],null}","[""Elevation"", ""Aspect"", … ""Wilderness_Area""]",,,,,[],"[""WeightedEnsemble_L2""]"
"""XGBoost""",0.970439,"""accuracy""",211.110487,10588.124225,211.110487,10588.124225,1,true,11,12,1,1,196774764,196774764,196774764,196774764,0,1,"""XGBoostModel""",,"""accuracy""","{null,null,""gbtree"",null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,0.1,null,null,null,null,null,null,null,10000,-1,7,null,null,""multi:softprob"",null,null,null,100,null,null,null,null,null,null,null,null,null,null,null,null,null,null}","{null,null,null,null,9976,null,null}","{null,null,null,null,null,1.0,null,1.0,0,null,null,[""bool"", ""int"", … ""category""],null}","[""Elevation"", ""Aspect"", … ""Wilderness_Area""]",,,,,[],"[""WeightedEnsemble_L2""]"
"""KNeighborsDist""",0.969966,"""accuracy""",0.868624,1.430939,0.868624,1.430939,1,true,2,10,1,1,66363211,66363211,66363211,66363211,0,1,"""KNNModel""",,"""accuracy""","{null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,""distance"",null,null}","{null,null,null,null,null,null,null}","{null,null,null,null,[""bool""],1.0,null,1.0,0,null,null,[""int"", ""float""],null}","[""Elevation"", ""Aspect"", … ""Horizontal_Distance_To_Fire_Points""]",,,,,[],"[""WeightedEnsemble_L2""]"
"""KNeighborsUnif""",0.968322,"""accuracy""",0.861417,1.460242,0.861417,1.460242,1,true,1,10,1,1,66363210,66363210,66363210,66363210,0,1,"""KNNModel""",,"""accuracy""","{null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,""uniform"",null,null}","{null,null,null,null,null,null,null}","{null,null,null,null,[""bool""],1.0,null,1.0,0,null,null,[""int"", ""float""],null}","[""Elevation"", ""Aspect"", … ""Horizontal_Distance_To_Fire_Points""]",,,,,[],"[""WeightedEnsemble_L2""]"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""NeuralNetTorch""",0.939759,"""accuracy""",0.309721,2273.483238,0.309721,2273.483238,1,true,12,12,1,1,280374,280374,280374,280374,0,0,"""TabularNeuralNetTorchModel""",,"""accuracy""","{""relu"",null,null,null,null,null,0.1,null,null,null,0.56,1.0,null,null,null,null,128,null,null,0.0003,""auto"",null,null,null,512,100,null,null,null,null,1000,4,null,""adam"",4,""median"",100,0.99,null,null,null,null,null,false,false,null,null,0.000001,null,null,0.05}","{256,null,null,null,null,null,330}","{null,null,null,null,[""text_ngram"", ""text_as_category""],1.0,null,1.0,0,null,null,[""bool"", ""int"", … ""category""],null}","[""Elevation"", ""Aspect"", … ""Wilderness_Area""]",,,,,[],[]
"""CatBoost""",0.939613,"""accuracy""",0.830587,6744.798803,0.830587,6744.798803,1,true,8,12,1,1,43572970,43572970,43572970,43572970,0,0,"""CatBoostModel""",,"""accuracy""","{null,false,null,null,null,null,null,null,null,null,null,null,null,null,""Accuracy"",null,null,10000,null,0.05,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,0,null,null,null,null,null,null,null,null,null,null,null}","{null,null,null,9994,null,null,null}","{null,null,null,null,null,1.0,null,1.0,0,null,null,[""bool"", ""int"", … ""category""],null}","[""Elevation"", ""Aspect"", … ""Wilderness_Area""]",,,,,[],[]
"""ExtraTreesEntr""",0.913021,"""accuracy""",0.819788,18.75014,0.819788,18.75014,1,true,10,12,1,1,1080078574,1080078574,1080078574,1080078574,0,0,"""XTModel""",,"""accuracy""","{null,null,null,true,null,""entropy"",null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,15000,300,-1,null,null,null,null,null,null,null,null,null,null,null,0,null,null,null,null,null,null,null,null,null,null}","{null,null,null,null,300,null,null}","{null,null,null,null,null,1.0,null,1.0,0,null,null,[""bool"", ""int"", … ""category""],null}","[""Elevation"", ""Aspect"", … ""Wilderness_Area""]",,,,,[],[]
"""NeuralNetFastAI""",0.912522,"""accuracy""",0.737899,248.807499,0.737899,248.807499,1,true,3,12,1,1,1074639,1074639,1074639,1074639,0,0,"""NNFastAiTabularModel""",,"""accuracy""","{null,null,null,null,""auto"",null,null,0.0001,20,0.1,null,null,""auto"",null,null,null,null,null,null,null,null,0.01,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,0.1,null,null,null,0.0,null,null,null,null,null,null,null,null}","{null,27,30,null,null,null,null}","{null,null,null,null,[""text_ngram"", ""text_as_category""],1.0,null,1.0,0,null,null,[""bool"", ""int"", … ""category""],null}","[""Elevation"", ""Aspect"", … ""Wilderness_Area""]",,,,,[],[]
