https://www.kaggle.com/competitions/mercedes-benz-greener-manufacturing/overview

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# !unzip mercedes-benz-greener-manufacturing.zip -d data

In [None]:
# Directory that holds the competition files.
base_path = "data"

# All three tables are keyed by their "ID" column; pandas reads the zipped
# CSVs directly, so no manual extraction of the inner files is needed.
id_indexed = {"index_col": "ID"}
train = pd.read_csv(f'{base_path}/train.csv.zip', **id_indexed)
test = pd.read_csv(f'{base_path}/test.csv.zip', **id_indexed)
submission = pd.read_csv(f'{base_path}/sample_submission.csv.zip', **id_indexed)

In [None]:
# Columns stored as Python objects (strings) in the training frame are treated
# as categorical features; cast them in both frames so the dtype-based column
# selector used later can pick them up.
cat_col = train.select_dtypes(include="object").columns
for frame in (train, test):
    frame[cat_col] = frame[cat_col].astype("category")

In [None]:
# Print column dtypes, non-null counts and memory usage of the training frame.
train.info()

In [None]:
# Same overview for the test frame, to compare its columns/dtypes with train.
test.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the target y.
train["y"].describe()

In [None]:
# Histogram of the target y, to eyeball its distribution and spot extreme values.
train["y"].hist()

In [None]:
# Remove outliers: keep only the rows whose target y is below 250.
is_inlier = train["y"].lt(250)
train = train.loc[is_inlier]
train.shape

In [None]:
# Separate the target column from the feature matrix.
y = train["y"]
X = train.drop(columns="y")
X.shape, y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as a validation split; the fixed random_state keeps
# the split reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, random_state=42)

## HistGradientBoosting
* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html
* https://scikit-learn.org/stable/modules/ensemble.html#histogram-based-gradient-boosting
* https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_categorical.html

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

# Preprocessing: standardize the numeric columns and one-hot encode the
# categorical ones (categories unseen during fit are encoded as all zeros).
#
# OneHotEncoder emits a sparse matrix, and the column transformer keeps the
# stacked result sparse whenever its overall density falls below
# sparse_threshold (default 0.3). HistGradientBoostingRegressor only accepts
# dense input, so sparse_threshold=0 is set to always return a dense array
# instead of relying on the data happening to be dense enough.
ct = make_column_transformer(
      (StandardScaler(),
       make_column_selector(dtype_include=np.number)),
      (OneHotEncoder(handle_unknown="ignore"),
       make_column_selector(dtype_include="category")),
      sparse_threshold=0)

In [None]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import make_pipeline

# Chain the column transformer with a seeded histogram-based gradient boosting
# regressor; the last expression renders the pipeline diagram.
regressor = HistGradientBoostingRegressor(random_state=42)
hist_pipe = make_pipeline(ct, regressor)
hist_pipe

In [None]:
from sklearn.model_selection import cross_validate

scoring = "neg_mean_absolute_percentage_error"
n_cv_folds = 3

# Cross-validate on the training split. The hold-out split (X_valid, y_valid)
# must stay unseen so it can serve as an independent check later; it is also
# only 10% of the data, which would make each fold very small.
# cross_validate fits clones of the pipeline, so hist_pipe itself stays unfitted.
cv_result = cross_validate(hist_pipe, X_train, y_train, cv=n_cv_folds, scoring=scoring)
cv_result

In [None]:
# Fit the whole pipeline (preprocessing + regressor) on the training split.
hist_pipe.fit(X_train, y_train)

In [None]:
# ct.get_feature_names_out()

https://scikit-learn.org/stable/modules/permutation_importance.html#permutation-importance

In [None]:
from sklearn.inspection import permutation_importance

# The fitted pipeline is passed as a whole, so the columns being shuffled are
# the raw columns of X_train (3 shuffles per column, seeded for reproducibility).
model = hist_pipe
result = permutation_importance(
    model, X_train, y_train, n_repeats=3, random_state=0
)

In [None]:
# permutation_importance was run on the full pipeline, so `result` holds one
# entry per raw input column of X_train, not per transformed (one-hot expanded)
# column as returned by ct.get_feature_names_out(). Use the raw column names so
# that result.importances_mean[i] and feature_names[i] describe the same feature.
feature_names = X_train.columns.to_numpy()

In [None]:
# for i in result.importances_mean.argsort()[::-1]:
#     if result.importances_mean[i] - 2 * result.importances_std[i] > 0:
#         print(f"{feature_names[i]:<8}"
#               f"{result.importances_mean[i]:.3f}"
#               f" +/- {result.importances_std[i]:.3f}")

In [None]:
# Score the fitted pipeline on the hold-out split. For a regressor, score()
# returns R^2, which is also the competition metric.
hgbr_score = hist_pipe.score(X_valid, y_valid)
hgbr_score

In [None]:
# Predict the target for the competition test set and preview the first five values.
y_pred_hgbr = hist_pipe.predict(test)
y_pred_hgbr[:5]

In [None]:
# Positional assignment of the predictions.
# NOTE(review): assumes submission rows are in the same ID order as test — confirm.
submission["y"] = y_pred_hgbr

## submit
* https://www.kaggle.com/competitions/mercedes-benz-greener-manufacturing/submissions

* 대회 측정 지표 : 결정계수(R-squared 또는 R^2)는 회귀분석에서 사용되는 통계적 척도로, 종속 변수의 총 변동 중에서 독립 변수에 의해 설명되는 변동의 비율을 나타냅니다. 즉, 회귀모델이 얼마나 데이터를 잘 설명하고 있는지를 나타내는 지표입니다.
    * $R^2 = 1 - \dfrac{SSR}{SST}$ (여기서 SSR은 회귀 제곱합이 아니라 잔차 제곱합을 뜻하며, RSS 또는 SSE로 표기하기도 합니다.)
    * SSR(Residual Sum of Squares): 잔차 제곱합입니다. 이는 실제 값과 예측 값의 차이를 제곱하여 합산한 값입니다. 즉, 모델이 예측한 값이 실제 값과 얼마나 차이나는지를 나타내는 지표입니다. 이 값이 작을수록 모델의 예측이 실제 값과 가깝다는 것을 의미합니다.
    * SST(Total Sum of Squares): 총 제곱합입니다. 이는 실제 값과 실제 값의 평균의 차이를 제곱하여 합산한 값입니다. 즉, 실제 값이 얼마나 분산되어 있는지를 나타내는 지표입니다.

In [None]:
# Embed the hold-out score (5 decimals) in the file name so that submissions
# from different runs can be told apart.
file_name = f"submit_hgbr_category_{hgbr_score:.5f}.csv"
file_name

In [None]:
# Write the submission (the ID index is saved as the first column), then read
# the file back and show two rows as a sanity check of its layout.
submission.to_csv(file_name)
pd.read_csv(file_name).head(2)