In [1]:
import warnings

import polars as pl
from sklearn import set_config

# Notebook-wide setup. Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Have scikit-learn transformers return polars DataFrames from transform().
set_config(transform_output="polars")

# Keep rendered polars tables compact: at most 5 rows and 20 columns.
# Each Config setter returns the Config class itself, so the calls chain.
_ = pl.Config.set_tbl_rows(5).set_tbl_cols(20)

In [2]:
from src.datasets import load_dataset

# Load the "diamonds" dataset through the project loader and bind it to `df`,
# the frame that the later cells in this notebook build on.
df = load_dataset("diamonds")

# Bare last expression -> rich table preview of the frame.
# NOTE(review): in the saved output `price` spans roughly 5.8-7.9 and
# cut/color/clarity are i64, so the loader presumably returns a log-scaled
# price and integer-coded categories - TODO confirm against src.datasets.
df

carat,cut,color,clarity,depth,table,x,y,z,price
f64,i64,i64,i64,f64,f64,f64,f64,f64,f64
0.23,2,1,3,61.5,55.0,3.95,3.98,2.43,5.78996
0.21,3,1,2,59.8,61.0,3.89,3.84,2.31,5.78996
0.23,1,1,4,56.9,65.0,4.05,4.07,2.31,5.793014
…,…,…,…,…,…,…,…,…,…
0.86,3,4,3,61.0,58.0,6.15,6.12,3.74,7.922261
0.75,2,0,3,62.2,55.0,5.83,5.87,3.64,7.922261


In [3]:
from src.features import target_encoding

# Add a target-encoded feature; the saved output shows the result as a new
# `cut_target_encoded` column (presumably `cut` encoded against `price`).
# NOTE(review): `df` is rebound to the helper's own output, so re-running this
# cell feeds an already-encoded frame back in - confirm target_encoding
# tolerates that, or bind the result to a new name.
# NOTE(review): the encoding looks like it is computed over the whole frame; if
# it is ever used as a model feature, fit it on training folds only to avoid
# target leakage - TODO confirm.
df = target_encoding(df, "price", "cut")

df


carat,cut,color,clarity,depth,table,x,y,z,price,cut_target_encoded
f64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64
0.23,2,1,3,61.5,55.0,3.95,3.98,2.43,5.78996,7.640189
0.21,3,1,2,59.8,61.0,3.89,3.84,2.31,5.78996,7.951369
0.23,1,1,4,56.9,65.0,4.05,4.07,2.31,5.793014,7.843436
…,…,…,…,…,…,…,…,…,…,…
0.86,3,4,3,61.0,58.0,6.15,6.12,3.74,7.922261,7.951369
0.75,2,0,3,62.2,55.0,5.83,5.87,3.64,7.922261,7.640189


In [3]:
from lightgbm import LGBMRegressor
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Preprocessing for the numeric columns: median imputation, then standardisation.
numeric_features = ["carat", "depth", "table", "x", "y", "z"]
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

# Preprocessing for the categorical columns: fill gaps with the most frequent
# value, then one-hot encode. handle_unknown="ignore" encodes categories not
# seen during fit as all-zero rows instead of raising.
categorical_features = ["cut", "color", "clarity"]
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Combine preprocessing steps.
# remainder="drop" is scikit-learn's default, spelled out because it matters
# here: every column of X that is not in one of the two lists above (currently
# `cut_target_encoded`) is discarded before the regressor sees it.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",
)

# Full model: preprocessing followed by a LightGBM gradient-boosted regressor.
# random_state pins LightGBM's seed so a refit on a fresh kernel is repeatable.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "regressor",
            LGBMRegressor(force_row_wise=True, verbose=-1, random_state=42),
        ),
    ]
)

# Fit the pipeline to the data: `price` is the target, every other column of
# `df` is offered to the preprocessor.
X = df.drop("price")
y = df["price"]
pipeline.fit(X, y)


In [4]:
# Run only the fitted preprocessing step on X to inspect the scaled numeric
# columns and one-hot columns that are handed to the regressor.
pipeline["preprocessor"].transform(X)

num__carat,num__depth,num__table,num__x,num__y,num__z,cat__cut_0,cat__cut_1,cat__cut_2,cat__cut_3,…,cat__color_5,cat__color_6,cat__clarity_0,cat__clarity_1,cat__clarity_2,cat__clarity_3,cat__clarity_4,cat__clarity_5,cat__clarity_6,cat__clarity_7
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
-1.198168,-0.174092,-1.099672,-1.587837,-1.536196,-1.571129,0.0,0.0,1.0,0.0,…,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
-1.240361,-1.360738,1.585529,-1.641325,-1.658774,-1.741175,0.0,0.0,0.0,1.0,…,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
-1.198168,-3.385019,3.375663,-1.498691,-1.457395,-1.741175,0.0,1.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.130927,-0.523105,0.242928,0.373383,0.337506,0.285204,0.0,0.0,0.0,1.0,…,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
-0.101137,0.314528,-1.099672,0.088115,0.118616,0.143499,0.0,0.0,1.0,0.0,…,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [5]:
import optuna


def objective(trial):
    """Toy Optuna objective: squared distance of the sampled ``x`` from 2.

    Args:
        trial: Object exposing ``suggest_float(name, low, high)`` (an Optuna
            trial); used to draw ``x`` between -10 and 10.

    Returns:
        ``(x - 2) ** 2`` for the drawn ``x``, which is smallest (0) at ``x == 2``.
    """
    sampled = trial.suggest_float("x", -10, 10)
    offset = sampled - 2
    return offset ** 2


# Keep Optuna quiet: only warnings and errors are logged.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Seed the sampler and run the trials sequentially so that Restart & Run All
# yields the same best_params every time. Optuna cannot guarantee reproducible
# results when trials run in parallel (n_jobs > 1), even with a seeded sampler,
# and this toy objective is cheap enough that parallelism buys nothing.
study = optuna.create_study(
    direction="minimize",  # the default, made explicit: we minimise (x - 2) ** 2
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

# Best parameters found; expected to land close to x = 2.
study.best_params


{'x': 1.9801966676480007}

In [9]:
# Display the fitted regressor step of the pipeline.
pipeline["regressor"]