In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
base_path = "data"
train = pd.read_csv(f'{base_path}/train.csv.zip', index_col="ID")
test = pd.read_csv(f'{base_path}/test.csv.zip', index_col="ID")
submission = pd.read_csv(f'{base_path}/sample_submission.csv.zip', index_col="ID")

In [3]:
cat_col = train.select_dtypes(include="object").columns
train[cat_col] = train[cat_col].astype("category")
test[cat_col] = test[cat_col].astype("category")

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4209 entries, 0 to 8417
Columns: 377 entries, y to X385
dtypes: category(8), float64(1), int64(368)
memory usage: 11.9 MB


In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4209 entries, 1 to 8416
Columns: 376 entries, X0 to X385
dtypes: category(8), int64(368)
memory usage: 11.9 MB


In [6]:
X, y = train.drop(columns="y"), train["y"]
X.shape, y.shape

((4209, 376), (4209,))

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, random_state=42)

* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html
* https://scikit-learn.org/stable/modules/ensemble.html#histogram-based-gradient-boosting
* https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_categorical.html

In [8]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

ct = make_column_transformer(
      (StandardScaler(),
       make_column_selector(dtype_include=np.number)),
      (OneHotEncoder(handle_unknown="ignore"),
       make_column_selector(dtype_include="category")))

# X_train = ct.fit_transform(X_train_raw)
# X_valid = ct.transform(X_valid_raw)

# X_train.shape, X_valid.shape

In [9]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import make_pipeline

hist_pipe = make_pipeline(ct, HistGradientBoostingRegressor(random_state=42))
hist_pipe

In [10]:
from sklearn.model_selection import cross_validate

scoring = "neg_mean_absolute_percentage_error"
n_cv_folds = 3

cv_result = cross_validate(hist_pipe, X_valid, y_valid, cv=n_cv_folds, scoring=scoring)
cv_result

{'fit_time': array([1.03315806, 1.03112292, 0.86310601]),
 'score_time': array([0.01162982, 0.00973701, 0.01069498]),
 'test_score': array([-0.05479039, -0.06301914, -0.06175187])}

In [11]:
# fit & predict
hist_pipe.fit(X_train, y_train)

In [12]:
# ct.get_feature_names_out()

https://scikit-learn.org/stable/modules/permutation_importance.html#permutation-importance

In [13]:
from sklearn.inspection import permutation_importance

model = hist_pipe
result = permutation_importance(model, X_train, y_train,
                                n_repeats=3, random_state=0)

In [14]:
feature_names = ct.get_feature_names_out()

In [15]:
# for i in result.importances_mean.argsort()[::-1]:
#     if result.importances_mean[i] - 2 * result.importances_std[i] > 0:
#         print(f"{feature_names[i]:<8}"
#               f"{result.importances_mean[i]:.3f}"
#               f" +/- {result.importances_std[i]:.3f}")

In [16]:
hgbr_score = hist_pipe.score(X_valid, y_valid)
hgbr_score

0.5722766050774675

In [17]:
y_pred_hgbr = hist_pipe.predict(test)
y_pred_hgbr[:5]

array([ 76.99591411,  92.18224817,  77.30829545,  75.78294454,
       111.97681194])

In [18]:
submission["y"] = y_pred_hgbr

https://www.kaggle.com/competitions/mercedes-benz-greener-manufacturing/submissions

In [19]:
file_name = f"submit_hgbr_category_{hgbr_score:.5f}.csv"
file_name

'submit_hgbr_category_0.57228.csv'

In [20]:
submission.to_csv(file_name)
pd.read_csv(file_name).head(2)

Unnamed: 0,ID,y
0,1,76.995914
1,2,92.182248
