In [1]:
%reload_ext autoreload
%autoreload 2

import warnings

import polars as pl
from sklearn import set_config

warnings.filterwarnings("ignore")

_ = pl.Config.set_tbl_rows(5)
_ = pl.Config.set_tbl_cols(20)

set_config(transform_output="polars")

In [2]:
import polars as pl

train_df = pl.read_csv(
    "https://huggingface.co/datasets/inria-soda/tabular-benchmark/resolve/main/reg_cat/diamonds.csv"
)

train_df.head()

carat,cut,color,clarity,depth,table,x,y,z,price
f64,i64,i64,i64,f64,f64,f64,f64,f64,f64
0.23,2,1,3,61.5,55.0,3.95,3.98,2.43,5.78996
0.21,3,1,2,59.8,61.0,3.89,3.84,2.31,5.78996
0.23,1,1,4,56.9,65.0,4.05,4.07,2.31,5.793014
0.29,3,5,5,62.4,58.0,4.2,4.23,2.63,5.814131
0.31,1,6,3,63.3,58.0,4.34,4.35,2.75,5.817111


In [3]:
# Cast column to correct types
train_df = train_df.with_columns(
    pl.col("cut").cast(pl.String).cast(pl.Categorical),
    pl.col("color").cast(pl.String).cast(pl.Categorical),
    pl.col("clarity").cast(pl.String).cast(pl.Categorical),
)

In [4]:
import polars.selectors as cs

from src.features import target_encoding

for col in train_df.select(cs.categorical()).columns:
    train_df = target_encoding(train_df, "price", col, alpha=0.1, stats=["mean"])

train_df.head()

carat,cut,color,clarity,depth,table,x,y,z,price,cut_price_mean,color_price_mean,clarity_price_mean
f64,cat,cat,cat,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.23,"""2""","""1""","""3""",61.5,55.0,3.95,3.98,2.43,5.78996,7.654913,7.600866,8.129272
0.21,"""3""","""1""","""2""",59.8,61.0,3.89,3.84,2.31,5.78996,7.934975,7.600866,7.842485
0.23,"""1""","""1""","""4""",56.9,65.0,4.05,4.07,2.31,5.793014,7.837835,7.600866,7.731672
0.29,"""3""","""5""","""5""",62.4,58.0,4.2,4.23,2.63,5.814131,7.934975,7.999944,7.766076
0.31,"""1""","""6""","""3""",63.3,58.0,4.34,4.35,2.75,5.817111,7.837835,8.110581,8.129272


In [46]:
from lightgbm import LGBMRegressor
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

X = train_df.drop("price")
y = train_df["price"]

# Define the preprocessing for numeric and categorical features
numeric_features = X.select(cs.numeric()).columns
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

categorical_features = X.select(cs.categorical()).columns
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Create the pipeline with a RandomForestRegressor
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", LGBMRegressor(force_row_wise=True, verbose=-1)),
    ]
)

pipeline.fit(X, y)


In [11]:
pipeline.named_steps["preprocessor"].transform(X).head()

num__carat,num__depth,num__table,num__x,num__y,num__z,num__cut_price_mean,num__color_price_mean,num__clarity_price_mean,cat__cut_0,…,cat__color_5,cat__color_6,cat__clarity_0,cat__clarity_1,cat__clarity_2,cat__clarity_3,cat__clarity_4,cat__clarity_5,cat__clarity_6,cat__clarity_7
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
-1.198168,-0.174092,-1.099672,-1.587837,-1.536196,-1.571129,-1.078043,-1.272948,1.657144,0.0,…,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
-1.240361,-1.360738,1.585529,-1.641325,-1.658774,-1.741175,1.200398,-1.272948,0.266914,0.0,…,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
-1.198168,-3.385019,3.375663,-1.498691,-1.457395,-1.741175,0.410117,-1.272948,-0.270262,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
-1.071587,0.454133,0.242928,-1.364971,-1.317305,-1.28772,1.200398,1.450096,-0.103485,0.0,…,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
-1.029394,1.082358,0.242928,-1.240167,-1.212238,-1.117674,0.410117,2.205005,1.657144,0.0,…,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [8]:
pipeline.fit(X, y)
pipeline.score(X, y)

0.9927051040364384

In [45]:
import optuna
from sklearn.model_selection import cross_val_score


def objective(trial):
    # Define parameter ranges for optimization
    params = {"regressor__n_estimators": trial.suggest_int("n_estimators", 100, 1000)}

    # Create a new pipeline with the suggested parameters
    pipeline.set_params(**params)

    # Fit and return the score
    pipeline.fit(X, y)
    cvs = cross_val_score(pipeline, X, y, cv=5).mean()  # type: ignore
    return cvs


optuna.logging.set_verbosity(optuna.logging.WARNING)
study = optuna.create_study()
study.optimize(objective, n_trials=10, n_jobs=-1)  # type: ignore

print(study.best_params)
print(study.best_value)


{'n_estimators': 388}
0.09538792060687706
