In [1]:
import numpy as np
import pandas as pd
from category_encoders import *
from sklearn.compose import *
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, IsolationForest
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import *
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, RidgeClassifier, SGDClassifier
from sklearn.metrics import make_scorer, balanced_accuracy_score, f1_score,  precision_score, recall_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import *
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
booking_data = pd.read_csv("hotel_reservations.csv")

In [3]:
y = booking_data["is_canceled"] # extract target

In [4]:
X = booking_data.drop(["is_canceled"],
                      axis=1) 

In [5]:
cat_columns = X.dtypes==object
con_columns = ~cat_columns

In [6]:
cat_columns [["company", "agent", "reservation_status", "reservation_status_date"]] = False
con_columns [["company", "agent", "reservation_status", "reservation_status_date"]] = False

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [8]:
cat_pipe = Pipeline([("cat_imputer", SimpleImputer(missing_values=np.nan,
                                              strategy="most_frequent")),
                     ("ohe", OneHotEncoder(handle_unknown="ignore"))])

con_pipe = Pipeline([("con_imputer", SimpleImputer(missing_values=np.nan,
                                                  strategy="median")),
                    ("scaler", StandardScaler())])

preprocessing = ColumnTransformer([("categorical", cat_pipe, cat_columns),
                                  ("continuous", con_pipe, con_columns)],
                                 remainder="drop")


In [9]:
pipe = Pipeline([("preprocessing", preprocessing),
                ("rf", RandomForestClassifier())])

In [10]:
rf_hyperparams = {"rf__n_estimators": [10,20,50], # number of trees used in random forest, very high values could lead to overfitting
                 "rf__max_depth": [5, 10,15], # max depth of each tree, if the depth is too low, the accuracy is poor
                 "rf__criterion": ["gini", "entropy"], # to check whether impurity or information gain is the best way to split
                 "rf__min_samples_leaf": [3,5,10], # minimum samples beyond which a node cannot be split, higher values imply more generality
                 "rf__max_features": ["sqrt", "log2"], # to check what is the best way limit the number of features to each tree
                 "rf__bootstrap": [True, False]} # to check whether bagging and aggregating results in a better model 


In [11]:
f1_wtd = make_scorer(f1_score, average="weighted")
pr_wtd = make_scorer(precision_score, average="weighted")
bal_acc_score = make_scorer(balanced_accuracy_score)
recall_wtd = make_scorer(recall_score, average = "weighted")

scoring_dict = {"bal_acc_score": bal_acc_score,
            "f1_wtd": f1_wtd,
            "pr_wtd": pr_wtd,
            "recall_wtd": recall_wtd}

In [12]:
rscv = RandomizedSearchCV(estimator=pipe,
                    param_distributions=rf_hyperparams,
                    n_iter = 10,
                    scoring = scoring_dict,
                    refit = "bal_acc_score",
                    n_jobs = -1,
                    cv = 5,
                    random_state=42)

In [13]:
result = rscv.fit(X_train, y_train)



In [14]:
print(result.best_params_)
print(result.best_score_)

{'rf__n_estimators': 10, 'rf__min_samples_leaf': 5, 'rf__max_features': 'sqrt', 'rf__max_depth': 15, 'rf__criterion': 'entropy', 'rf__bootstrap': False}
0.7875068685106845


In [15]:
pipe = result.best_estimator_
model = pipe.fit(X_train, y_train)

In [16]:
# Print model hyper parameters
model.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('cat_imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  hotel                              True
lead_time                         False
arrival_date_year                 False
arrival_date_month                 True
arrival_date_week_number          False
arrival_date_day_of_month         False
stays_in_weekend_nights...
deposit_type                      False
agent                             False
company                           False
days_in_waiting_list               True
customer_typ

In [17]:
y_pred = model.predict(X_test)

In [18]:
print(f"Balanced accuracy score: {balanced_accuracy_score(y_test, y_pred): .4f}")
print(f"Precision score: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall score: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 score: {f1_score(y_test, y_pred, average='weighted'):.4f}")

Balanced accuracy score:  0.7951
Precision score: 0.8521
Recall score: 0.8390
F1 score: 0.8305
