In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# Directory that holds the zipped competition CSV files.
base_path = "data"

# All three files share the same layout: a zipped CSV indexed by the "ID" column.
train, test, submission = (
    pd.read_csv(f"{base_path}/{stem}.csv.zip", index_col="ID")
    for stem in ("train", "test", "sample_submission")
)

In [3]:
# Columns read as strings (dtype "object") are treated as the categorical features.
cat_col = train.select_dtypes(include="object").columns

# Cast train and test with ONE shared CategoricalDtype per column. Casting each
# frame independently with astype("category") derives the category list from
# that frame alone, so the same label can receive different integer codes in
# train and test, and labels seen in only one frame are unknown to the other.
for col in cat_col:
    levels = sorted(set(train[col].dropna()) | set(test[col].dropna()))
    shared_dtype = pd.CategoricalDtype(categories=levels)
    train[col] = train[col].astype(shared_dtype)
    test[col] = test[col].astype(shared_dtype)

In [4]:
# Summarise the training frame: index range, column count, dtype counts and memory use.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4209 entries, 0 to 8417
Columns: 377 entries, y to X385
dtypes: category(8), float64(1), int64(368)
memory usage: 11.9 MB


In [5]:
# Same summary for the test frame, to compare its column count and dtypes with train.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4209 entries, 1 to 8416
Columns: 376 entries, X0 to X385
dtypes: category(8), int64(368)
memory usage: 11.9 MB


In [6]:
# Separate the target column "y" from the feature columns.
y = train["y"]
X = train.drop(columns="y")

# Display both shapes as a sanity check.
X.shape, y.shape

((4209, 376), (4209,))

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation; the fixed seed keeps the split reproducible.
splits = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_valid, y_train, y_valid = splits

References for the histogram-based gradient boosting model used below:

* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html
* https://scikit-learn.org/stable/modules/ensemble.html#histogram-based-gradient-boosting
* https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_categorical.html

In [8]:
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import make_pipeline

# Preprocessing step: drop every column with "category" dtype and pass the
# remaining columns through untouched.
dropper = make_column_transformer(
    ("drop", make_column_selector(dtype_include="category")),
    remainder="passthrough",
)

# Model: histogram-based gradient boosting with a fixed seed, fed by the dropper.
hist_dropped = make_pipeline(
    dropper,
    HistGradientBoostingRegressor(random_state=42),
)

# Display the assembled pipeline.
hist_dropped

In [9]:
from sklearn.model_selection import cross_validate

# Cross-validation settings: number of folds and the metric (negated MAPE,
# so values closer to zero are better).
n_cv_folds = 3
scoring = "neg_mean_absolute_percentage_error"

# Cross-validate the pipeline on the full training data.
dropped_result = cross_validate(
    estimator=hist_dropped,
    X=X,
    y=y,
    scoring=scoring,
    cv=n_cv_folds,
)

# Display fit times, score times and per-fold test scores.
dropped_result

{'fit_time': array([2.24631691, 2.31711411, 2.12450528]),
 'score_time': array([0.01309085, 0.01051807, 0.00829077]),
 'test_score': array([-0.05001658, -0.05351378, -0.05726412])}

In [10]:
# Fit the pipeline on the training split only; prediction happens in later cells.
hist_dropped.fit(X_train, y_train)

In [11]:
# Names of the columns that remain after the dropper removes the categorical
# ones; passthrough columns carry a "remainder__" prefix.
dropper.get_feature_names_out()

array(['remainder__X10', 'remainder__X11', 'remainder__X12',
       'remainder__X13', 'remainder__X14', 'remainder__X15',
       'remainder__X16', 'remainder__X17', 'remainder__X18',
       'remainder__X19', 'remainder__X20', 'remainder__X21',
       'remainder__X22', 'remainder__X23', 'remainder__X24',
       'remainder__X26', 'remainder__X27', 'remainder__X28',
       'remainder__X29', 'remainder__X30', 'remainder__X31',
       'remainder__X32', 'remainder__X33', 'remainder__X34',
       'remainder__X35', 'remainder__X36', 'remainder__X37',
       'remainder__X38', 'remainder__X39', 'remainder__X40',
       'remainder__X41', 'remainder__X42', 'remainder__X43',
       'remainder__X44', 'remainder__X45', 'remainder__X46',
       'remainder__X47', 'remainder__X48', 'remainder__X49',
       'remainder__X50', 'remainder__X51', 'remainder__X52',
       'remainder__X53', 'remainder__X54', 'remainder__X55',
       'remainder__X56', 'remainder__X57', 'remainder__X58',
       'remainder__X59',

Reference for permutation importance (scikit-learn user guide): https://scikit-learn.org/stable/modules/permutation_importance.html#permutation-importance

In [12]:
from sklearn.inspection import permutation_importance

# Permutation importance of the fitted pipeline, measured on the training
# split: each input column is shuffled 3 times with a fixed seed.
model = hist_dropped
result = permutation_importance(
    estimator=model,
    X=X_train,
    y=y_train,
    n_repeats=3,
    random_state=0,
)

In [13]:
# permutation_importance was run on the whole pipeline with X_train as input,
# so result.importances_* hold one entry per column of X_train (including the
# categorical columns the pipeline drops). The names must therefore come from
# X_train: dropper.get_feature_names_out() lists only the columns that survive
# the drop, so it is shorter and its positions are shifted relative to the
# importance arrays.
feature_names = X_train.columns.to_numpy()

In [14]:
# Disabled report: lists features whose mean permutation importance exceeds
# two standard deviations, most important first.
# NOTE(review): result.importances_mean has one entry per column of the X that
# was passed to permutation_importance; feature_names must have the same length
# and order before this loop is re-enabled.
# for i in result.importances_mean.argsort()[::-1]:
#     if result.importances_mean[i] - 2 * result.importances_std[i] > 0:
#         print(f"{feature_names[i]:<8}"
#               f"{result.importances_mean[i]:.3f}"
#               f" +/- {result.importances_std[i]:.3f}")

In [15]:
# Score the fitted pipeline on the held-out validation split. The pipeline's
# score() delegates to the final regressor (R^2 for scikit-learn regressors).
hgbr_score = hist_dropped.score(X=X_valid, y=y_valid)
hgbr_score

0.574719060141833

In [16]:
# Predict the target for every row of the test frame with the fitted pipeline.
y_pred_hgbr = hist_dropped.predict(X=test)

# Show the first five predictions only.
y_pred_hgbr[:5]

array([ 77.95152062,  92.91676481,  77.2406661 ,  75.75973602,
       112.03576608])

In [17]:
# Put the predictions into the submission frame's "y" column. The values are
# assigned by position, so this relies on `test` and the sample submission
# listing the IDs in the same order.
submission = submission.assign(y=y_pred_hgbr)

https://www.kaggle.com/competitions/mercedes-benz-greener-manufacturing/submissions

In [18]:
# Build the output file name, embedding the validation score (hgbr_score)
# rounded to 5 decimals so each submission file can be traced to its score.
# NOTE(review): the name says "category", but the hist_dropped pipeline drops
# the categorical columns - confirm the label is intended.
file_name = f"submit_hgbr_category_{hgbr_score:.5f}.csv"
file_name

'submit_hgbr_category_0.57472.csv'

In [19]:
# Write the submission (index included) to disk.
submission.to_csv(path_or_buf=file_name)

# Read the file back and show its first two rows to verify what was written.
submission_check = pd.read_csv(file_name)
submission_check.head(2)

Unnamed: 0,ID,y
0,1,77.951521
1,2,92.916765
