In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
base_path = "data"
train = pd.read_csv(f'{base_path}/train.csv.zip', index_col="ID")
test = pd.read_csv(f'{base_path}/test.csv.zip', index_col="ID")
submission = pd.read_csv(f'{base_path}/sample_submission.csv.zip', index_col="ID")

In [3]:
cat_col = train.select_dtypes(include="object").columns
train[cat_col] = train[cat_col].astype("category")
test[cat_col] = test[cat_col].astype("category")

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4209 entries, 0 to 8417
Columns: 377 entries, y to X385
dtypes: category(8), float64(1), int64(368)
memory usage: 11.9 MB


In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4209 entries, 1 to 8416
Columns: 376 entries, X0 to X385
dtypes: category(8), int64(368)
memory usage: 11.9 MB


In [6]:
X, y = train.drop(columns="y"), train["y"]
X.shape, y.shape

((4209, 376), (4209,))

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, random_state=42)

* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html
* https://scikit-learn.org/stable/modules/ensemble.html#histogram-based-gradient-boosting
* https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_categorical.html

In [8]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

dropper = make_column_transformer(
    ("drop", make_column_selector(dtype_include="category")), remainder="passthrough"
)
hist_dropped = make_pipeline(dropper, HistGradientBoostingRegressor(random_state=42))
hist_dropped

In [9]:
from sklearn.model_selection import cross_validate

scoring = "neg_mean_absolute_percentage_error"
n_cv_folds = 3

dropped_result = cross_validate(hist_dropped, X, y, cv=n_cv_folds, scoring=scoring)
dropped_result

{'fit_time': array([1.90482211, 1.80982304, 1.5913527 ]),
 'score_time': array([0.00770378, 0.007725  , 0.00758815]),
 'test_score': array([-0.05001658, -0.05351378, -0.05726412])}

In [10]:
# fit & predict
hist_dropped.fit(X_train, y_train)

https://scikit-learn.org/stable/modules/permutation_importance.html#permutation-importance

In [15]:
from sklearn.inspection import permutation_importance

model = hist_dropped
result = permutation_importance(model, X_train, y_train,
                                n_repeats=3, random_state=0)
result

{'importances_mean': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         5.76906523e-05,  0.00000000e+00,  3.63042874e-03,  8.48859521e-04,
         3.31730126e-03,  0.00000000e+00,  0.00000000e+00,  1.10216617e-05,
         2.22744900e-04,  1.38239627e-03,  6.71191044e-04,  0.00000000e+00,
         8.51003689e-04,  2.84241050e-04,  0.00000000e+00,  0.00000000e+00,
         5.88580564e-03,  4.31457143e-04,  1.14293905e-01,  0.00000000e+00,
         7.58608776e-03,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  4.10973128e-03,
         0.00000000e+00,  0.00000000e+00,  1.04252975e-04,  0.00000000e+00,
         4.11091523e-04,  8.20173561e-06,  8.87395999e-04,  4.23422275e-03,
         9.24722326e-03,  7.28744806e-04,  5.33400501e-05,  3.11114651e-03,
         4.83235369e-03,  1.34866118e-03,  0.00000000e+00,  0.000000

In [16]:
result.keys()

dict_keys(['importances_mean', 'importances_std', 'importances'])

In [13]:
pd.DataFrame(r[""])

ValueError: Per-column arrays must each be 1-dimensional

In [None]:
for i in r.importances_mean.argsort()[::-1]:
    if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
        print(f"{diabetes.feature_names[i]:<8}"
              f"{r.importances_mean[i]:.3f}"
              f" +/- {r.importances_std[i]:.3f}")

In [None]:
hgbr_score = hist_dropped.score(X_valid, y_valid)
hgbr_score

In [None]:
y_pred_hgbr = hist_dropped.predict(test)
y_pred_hgbr[:5]

In [None]:
submission["y"] = y_pred_hgbr

https://www.kaggle.com/competitions/mercedes-benz-greener-manufacturing/submissions

In [None]:
file_name = f"submit_hgbr_category_{hgbr_score:.5f}.csv"
file_name

In [None]:
submission.to_csv(file_name)
pd.read_csv(file_name).head(2)