In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Load data/hotel_bookings.csv into a DataFrame; the trailing expression
# displays its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer="data/hotel_bookings.csv")
df.shape

In [None]:
# Preview the first two rows.
df.head(n=2)

In [None]:
# Print pandas' per-column summary of the frame (dtypes and non-null counts).
df.info()

In [None]:
# Count missing values per column, then summarize how those per-column
# counts are distributed.
df.isna().sum().describe()

In [None]:
# Show one sample row restricted to the object-dtype columns.
df.select_dtypes(include=["O"]).head(n=1)

In [None]:
# Name of the target column used as the label throughout the notebook.
lable_name: str = "is_canceled"
lable_name

In [None]:
# Class balance of the label in the full data set.
df.loc[:, lable_name].value_counts()

In [None]:
# Number of rows whose label equals 1.
label_one_count = df[lable_name].eq(1).sum()
label_one_count

In [None]:
# Undersample the rows whose label is not 1: draw exactly as many of them as
# there are rows with label == 1. The fixed seed keeps the drawn subset (and
# every metric computed from it further down) identical on each
# "Restart & Run All"; 42 matches the seed used elsewhere in this notebook.
df_label_not_one = df[df[lable_name] != 1].sample(
    n=label_one_count, random_state=42)

In [None]:
# Balanced frame: the sampled label != 1 rows followed by every label == 1 row.
df_under = pd.concat(
    objs=[df_label_not_one, df.loc[df[lable_name].eq(1)]],
)
df_under.shape

In [None]:
# Class balance of the label after undersampling.
df_under.loc[:, lable_name].value_counts()

In [None]:
# Features: every column except the label and the two reservation_status
# columns (presumably excluded because they reveal the outcome -- confirm).
X = df_under.drop(
    columns=[lable_name, "reservation_status", "reservation_status_date"],
)
# Target: the label column.
y = df_under[lable_name]
X.shape, y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as a test split, stratified on the label, with a
# fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)

In [None]:
# Every non-numeric feature column is treated as categorical.
cat_col = X.select_dtypes(exclude="number").columns
print(cat_col)
# Cast via astype with a per-column dtype mapping, which returns new frames.
# Assigning columns into the frames handed back by train_test_split mutates
# them in place and can trigger pandas' SettingWithCopyWarning.
category_dtypes = {col: "category" for col in cat_col}
X_train = X_train.astype(category_dtypes)
X_test = X_test.astype(category_dtypes)

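References for `HistGradientBoostingClassifier`:

* API reference: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html
* User guide, histogram-based gradient boosting: https://scikit-learn.org/stable/modules/ensemble.html#histogram-based-gradient-boosting
* Example, categorical features in gradient boosting: https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_categorical.html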

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

# Preprocessing: standardize the numeric columns and one-hot encode the
# columns carrying the "category" dtype (categories unseen at fit time are
# ignored at transform time).
ct = make_column_transformer(
    (StandardScaler(),
     make_column_selector(dtype_include=np.number)),
    (OneHotEncoder(handle_unknown="ignore"),
     make_column_selector(dtype_include="category")),
    # OneHotEncoder emits a sparse matrix, and with the default threshold
    # (0.3) the combined output stays sparse whenever its density is lower
    # than that. HistGradientBoostingClassifier, which consumes this output
    # in the pipeline, only accepts dense arrays, so always return dense.
    sparse_threshold=0,
)

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import make_pipeline

# Chain the column transformer with a seeded histogram gradient-boosting
# classifier; the trailing expression displays the pipeline.
hist_pipe = make_pipeline(
    ct,
    HistGradientBoostingClassifier(random_state=42),
)
hist_pipe

In [None]:
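# Optional sanity check (disabled): 3-fold cross-validation of the pipeline.
# NOTE: as written this cross-validates on the held-out test split; switch to
# X_train / y_train before enabling it so the test split stays unseen.
# from sklearn.model_selection import cross_validate

# cv_result = cross_validate(hist_pipe, X_test, y_test, cv=3)
# cv_result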

In [None]:
# Fit the preprocessing + classifier pipeline on the training split.
hist_pipe.fit(X=X_train, y=y_train)

In [None]:
# ct.get_feature_names_out()

* User guide, permutation importance: https://scikit-learn.org/stable/modules/permutation_importance.html#permutation-importance

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of the fitted pipeline, evaluated on the training
# split with 3 shuffles per feature and a fixed seed.
model = hist_pipe
result = permutation_importance(
    estimator=model,
    X=X_train,
    y=y_train,
    n_repeats=3,
    random_state=0,
)

In [None]:
# permutation_importance was given the whole pipeline and the raw X_train, so
# it shuffled (and scored) one raw input column at a time: its result arrays
# line up with X_train.columns. The transformer's get_feature_names_out()
# describes the expanded, reordered post-encoding matrix instead, so indexing
# it with these positions would attach the wrong name to each importance.
feature_names = X_train.columns

In [None]:
# Report features by decreasing mean permutation importance, keeping only
# those whose mean is more than two standard deviations above zero.
# Pad names to the longest one (plus a gap) so a long name never runs
# straight into its value, as it does with a fixed 8-character pad.
name_width = max((len(str(name)) for name in feature_names), default=0) + 2
for i in result.importances_mean.argsort()[::-1]:
    mean_i = result.importances_mean[i]
    std_i = result.importances_std[i]
    if mean_i - 2 * std_i > 0:
        print(f"{str(feature_names[i]):<{name_width}}"
              f"{mean_i:.3f}"
              f" +/- {std_i:.3f}")

In [None]:
# Accuracy of the fitted pipeline on the held-out test split.
hgb_score = hist_pipe.score(X=X_test, y=y_test)
hgb_score

In [None]:
# Predicted labels for the test split; preview the first five.
y_pred_hgb = hist_pipe.predict(X=X_test)
y_pred_hgb[:5]

In [None]:
# How many test rows were predicted as each class.
pd.Series(data=y_pred_hgb).value_counts()

In [None]:
# Confusion table on the test split. The axes are named explicitly because
# the predictions are a plain array and would otherwise get a generic column
# header, leaving it unclear which axis is actual and which is predicted.
pd.crosstab(y_test, y_pred_hgb, rownames=["actual"], colnames=["predicted"])