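* Dataset article (ScienceDirect): https://www.sciencedirect.com/science/article/pii/S2352340918315191
* Reference Kaggle notebook (EDA of bookings and ML to predict cancelations): https://www.kaggle.com/marcuswingen/eda-of-bookings-and-ml-to-predict-cancelations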


<img src="https://ars.els-cdn.com/content/image/1-s2.0-S2352340918315191-gr1.jpg" alt="Figure 1 from the dataset article linked above">

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Read the hotel-bookings CSV (path is relative to the notebook's working
# directory) and display its (rows, columns) shape as a quick sanity check.
file_path = "data/hotel_bookings.csv"
df = pd.read_csv(filepath_or_buffer=file_path)
df.shape

In [None]:
# Preview the first five rows of the raw frame.
df.head(n=5)

In [None]:
# Summary statistics of the raw frame (pandas describes numeric columns by default).
df.describe()

In [None]:
# Name of the target column; later cells use it to build the label and the class counts.
label_name = "is_canceled"

In [None]:
# Class balance of the label: number of rows per distinct label value.
label_count = df.loc[:, label_name].value_counts()
label_count

In [None]:
# Undersample to balance the classes: keep every row whose label is 1 and draw
# an equally sized random sample from the remaining rows.
# random_state pins the draw so a Restart & Run All reproduces the same subset.
# NOTE(review): the cells visible below build X and y from `df`, not `df_under`
# -- confirm whether the balanced frame is meant to be used for training.
n_positive = label_count.loc[1]  # label-based lookup: number of rows with label == 1
df_0 = df[df[label_name] != 1].sample(n=n_positive, random_state=42)
df_1 = df[df[label_name] == 1]

df_under = pd.concat([df_0, df_1])
df_under[label_name].value_counts()

In [None]:
# Features: everything except the label and the two reservation-status columns.
# NOTE(review): presumably the reservation-status columns are dropped because
# they reveal the outcome -- confirm against the dataset description.
excluded_cols = [label_name, "reservation_status", "reservation_status_date"]
X = df.drop(columns=excluded_cols)
# Target: boolean Series, True where the label equals 1.
y = df[label_name] == 1
X.shape, y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Both splits use the same held-out fraction and seed; each one is stratified
# on the labels of the data being split.
split_options = {"test_size": 0.1, "random_state": 42}

# Stage 1: 90% of the rows go to training, 10% are held out.
X_train, X_valid_raw, y_train, y_valid_raw = train_test_split(
    X, y, stratify=y, **split_options)
# Stage 2: the held-out rows are split again, 90% validation / 10% test.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_valid_raw, y_valid_raw, stratify=y_valid_raw, **split_options)

In [None]:
# Number of distinct values in each non-numeric feature column.
X.select_dtypes(exclude="number").nunique()

In [None]:
# Every non-numeric feature column is converted to the pandas "category" dtype.
cat_col = X.select_dtypes(exclude="number").columns
print(cat_col)

# Use a non-mutating astype instead of assigning into column slices of the
# split frames: this avoids pandas' SettingWithCopyWarning and keeps the cell
# idempotent on re-run. The validation split is cast as well, so all three
# splits carry the same column dtypes.
category_dtypes = {col: "category" for col in cat_col}
X_train = X_train.astype(category_dtypes)
X_valid = X_valid.astype(category_dtypes)
X_test = X_test.astype(category_dtypes)

In [None]:
# LightGBM is required below; if it is not installed, run once: %pip install lightgbm

In [None]:
import lightgbm

# Hyper-parameters gathered in one place so they are easy to read and tune:
# many shallow trees with a small learning rate, and early stopping after
# 5 rounds (takes effect when an eval_set is passed to fit).
lgbm_params = {
    "boosting_type": 'goss',
    "n_estimators": 1000,
    "max_depth": 3,
    "early_stopping_rounds": 5,
    "n_jobs": -1,
    "learning_rate": 0.01,
    "random_state": 42,
}
model_lgbm = lightgbm.LGBMClassifier(**lgbm_params)
model_lgbm

In [None]:
# Fit the model. Evaluation (and therefore the early stopping configured on
# the model) is monitored on the validation split, so the test split stays
# unseen until the final evaluation and its metrics are not optimistically biased.
# The cast below is a no-op when the categorical columns are already "category".
X_valid_cat = X_valid.astype({col: "category" for col in cat_col})
model_lgbm.fit(X_train, y_train, eval_set=[(X_valid_cat, y_valid)],
        callbacks=[lightgbm.log_evaluation(1)])

In [None]:
# Best score of the fitted model, as recorded by LightGBM during fitting.
model_lgbm.best_score_

In [None]:
# Bar chart of the 20 most important features of the fitted model.
lightgbm.plot_importance(booster=model_lgbm, max_num_features=20)

In [None]:
# Draw the first tree of the ensemble, annotating nodes with the listed fields.
tree_node_info = ['split_gain', 'internal_value', 'internal_count', 'leaf_count']
lightgbm.plot_tree(
    model_lgbm,
    tree_index=0,
    figsize=(20, 20),
    show_info=tree_node_info,
)

In [None]:
# Predicted class labels for the test split.
y_pred = model_lgbm.predict(X_test)

In [None]:
# Accuracy on the test split: fraction of predictions that equal the true label.
y_test.eq(y_pred).mean()

In [None]:
# Confusion table: true labels as rows, predicted labels as columns.
pd.crosstab(index=y_test, columns=y_pred)