In [23]:
import pandas as pd
from phik import phik_matrix

# Loading the cleaned hotel-bookings dataset (path is relative to the notebook's working directory)
df = pd.read_csv('hotel_bookings_cleaned.csv')

---

# **Data Preprocessing**

## Correlation

In [24]:
# Columns excluded from the numerical correlation matrix
unused_columns = ['total_stays_in_nights', 'total_guests', 'revenue']

# Computing pairwise correlation of numerical variables
# (Spearman's rank correlation is used because it is non-parametric)
numeric_correlation = df.drop(columns=unused_columns).corr(method='spearman', numeric_only=True)

# Adding colors & rounding numbers displayed
numeric_correlation.style.background_gradient(cmap=None).format(formatter='{:.3f}')

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
is_canceled,1.0,0.225,0.087,-0.0,0.005,0.059,0.095,0.089,0.061,-0.021,-0.09,0.126,-0.101,-0.125,0.015,0.141,-0.186,-0.129
lead_time,0.225,1.0,0.136,0.097,0.012,0.292,0.415,0.228,0.05,-0.002,-0.22,0.0,-0.191,0.056,0.098,0.101,-0.11,0.061
arrival_date_year,0.087,0.136,1.0,-0.516,-0.011,0.009,0.018,0.058,0.043,-0.023,0.023,-0.109,0.033,-0.005,-0.029,0.189,-0.041,0.066
arrival_date_week_number,-0.0,0.097,-0.516,1.0,0.088,0.036,0.041,0.025,0.017,0.017,-0.039,0.047,-0.047,0.018,-0.0,0.114,0.01,0.044
arrival_date_day_of_month,0.005,0.012,-0.011,0.088,1.0,-0.009,-0.017,0.003,0.017,0.0,-0.004,-0.004,-0.001,0.005,0.012,0.019,0.008,-0.003
stays_in_weekend_nights,0.059,0.292,0.009,0.036,-0.009,1.0,0.326,0.128,0.035,0.018,-0.121,-0.028,-0.108,0.021,-0.039,0.044,-0.044,0.035
stays_in_week_nights,0.095,0.415,0.018,0.041,-0.017,0.326,1.0,0.17,0.045,0.024,-0.15,-0.031,-0.135,0.051,0.006,0.081,-0.051,0.054
adults,0.089,0.228,0.058,0.025,0.003,0.128,0.17,1.0,0.068,0.027,-0.219,-0.087,-0.238,-0.075,-0.034,0.335,0.013,0.157
children,0.061,0.05,0.043,0.017,0.017,0.035,0.045,0.068,1.0,0.032,-0.049,-0.03,-0.05,0.047,-0.029,0.284,0.041,0.055
babies,-0.021,-0.002,-0.023,0.017,0.0,0.018,0.024,0.027,0.032,1.0,-0.014,-0.007,-0.016,0.117,-0.007,0.021,0.033,0.092


Notes:
* If there are no repeated/duplicated data values, a perfect Spearman correlation of +1 or −1 occurs when each of the variables is a perfect monotone (not necessarily linear) function of the other. If the Y variable tends to increase when the X variable increases, the Spearman correlation coefficient is positive —and vice versa for a negative coefficient.
* Pairing of variables with moderate to high positive correlation value (0.33 - 1) are:
    * `is_repeated_guest` & `previous_bookings_not_canceled`
    * `previous_cancellations` & `previous_bookings_not_canceled`

Correlated variables provide redundant information and reduce the prediction model's ability to generalize (causing overfitting). Therefore, **the variable which causes this correlation will not be used: `previous_bookings_not_canceled`**.

* adults - adr = 0.34  
    * moderately correlated?  
    * which one to drop? adr?
    * total_guests = medium - high correlation with others
* arrival_date_week_number = drop
* lead_time - stays_in_week_nights?
    * total_stays = also medium - high?

In [25]:
# Selecting categorical variables as subset and combining it with the target variable
# ('combine_first' returns the union of both column sets, aligned on the row index)
categorical_target_vars = (
    df.select_dtypes(include='object')
      .combine_first(other=df[['is_canceled']])
)

# Computing pairwise correlation of categorical & target variables
# NOTE: phik documents 'interval_cols' as a list of column names, so the target is passed
# as a one-element list instead of a bare string
(
    phik_matrix(df=categorical_target_vars,
                # Applying target as interval column/variable
                interval_cols=['is_canceled'])
    .style.background_gradient(cmap=None).format(formatter='{:.3f}')
)

Unnamed: 0,arrival_date_month,assigned_room_type,country,customer_type,deposit_type,distribution_channel,hotel,is_canceled,market_segment,meal,reservation_status,reservation_status_date,reserved_room_type
arrival_date_month,1.0,0.095,0.262,0.213,0.114,0.125,0.076,0.108,0.168,0.147,0.151,0.977,0.11
assigned_room_type,0.095,1.0,0.139,0.175,0.135,0.146,0.477,0.12,0.171,0.346,0.148,0.219,0.94
country,0.262,0.139,1.0,0.23,0.226,0.316,0.445,0.235,0.401,0.232,0.322,0.505,0.163
customer_type,0.213,0.175,0.23,1.0,0.053,0.111,0.094,0.193,0.453,0.194,0.097,0.493,0.152
deposit_type,0.114,0.135,0.226,0.053,1.0,0.049,0.034,0.1,0.304,0.034,0.342,0.581,0.081
distribution_channel,0.125,0.146,0.316,0.111,0.049,1.0,0.111,0.123,0.772,0.115,0.147,0.333,0.224
hotel,0.076,0.477,0.445,0.094,0.034,0.111,1.0,0.108,0.164,0.467,0.043,0.26,0.408
is_canceled,0.108,0.12,0.235,0.193,0.1,0.123,0.108,1.0,0.206,0.082,1.0,0.367,0.072
market_segment,0.168,0.171,0.401,0.453,0.304,0.772,0.164,0.206,1.0,0.231,0.234,0.477,0.21
meal,0.147,0.346,0.232,0.194,0.034,0.115,0.467,0.082,0.231,1.0,0.043,0.321,0.253


Notes:
* Phi_K library is used to compute correlation between categorical and interval variables.
    * Based on Pearson's chi-squared contingency test —a hypothesis test for independence between two or more variables.
    * `is_canceled` is added into this analysis because we would want to find the association between categorical variables and the target of this prediction model, and also applied as interval variable because the difference between values in this column (`1` or `0`) is meaningful.
    * Phi_K correlation matrix drops missing value from the data to calculate the correlation coefficient by default (parameter `dropna` already set to `True`).
    * It captures any non-linear relationship between variables, and has a built-in noise reduction against statistical fluctuations or outliers (parameter `noise_correction` also set to `True` by default).
    * Correlation coefficient value is between 0 and 1, hence no indication of relationship direction between variables.
* There are plenty of variables pairing with moderate to high correlation value (0.33 - 1).
    * First step of this analysis segment will drop correlated variables with target `is_canceled`:
        * `agent`
        * `company`
        * `reservation_status`
        * `reservation_status_date` 

In [26]:
# Dropping variables correlated with target
# Re-assigning (instead of 'inplace=True') together with errors='ignore' keeps this cell idempotent:
# running it a second time no longer raises KeyError for columns that are already gone
categorical_target_vars = categorical_target_vars.drop(
    columns=['reservation_status', 'reservation_status_date'],
    errors='ignore')

# Verifying process
# ('interval_cols' is passed as a list of column names, as documented by phik)
phik_matrix(df=categorical_target_vars, interval_cols=['is_canceled']).style.background_gradient(cmap=None).format(formatter='{:.3f}')

Unnamed: 0,arrival_date_month,assigned_room_type,country,customer_type,deposit_type,distribution_channel,hotel,is_canceled,market_segment,meal,reserved_room_type
arrival_date_month,1.0,0.095,0.262,0.213,0.114,0.125,0.076,0.108,0.168,0.147,0.11
assigned_room_type,0.095,1.0,0.139,0.175,0.135,0.146,0.477,0.12,0.171,0.346,0.94
country,0.262,0.139,1.0,0.23,0.226,0.316,0.445,0.235,0.401,0.232,0.163
customer_type,0.213,0.175,0.23,1.0,0.053,0.111,0.094,0.193,0.453,0.194,0.152
deposit_type,0.114,0.135,0.226,0.053,1.0,0.049,0.034,0.1,0.304,0.034,0.081
distribution_channel,0.125,0.146,0.316,0.111,0.049,1.0,0.111,0.123,0.772,0.115,0.224
hotel,0.076,0.477,0.445,0.094,0.034,0.111,1.0,0.108,0.164,0.467,0.408
is_canceled,0.108,0.12,0.235,0.193,0.1,0.123,0.108,1.0,0.206,0.082,0.072
market_segment,0.168,0.171,0.401,0.453,0.304,0.772,0.164,0.206,1.0,0.231,0.21
meal,0.147,0.346,0.232,0.194,0.034,0.115,0.467,0.082,0.231,1.0,0.253


Notes:
* Number of pairings of variables with moderate to high correlation are listed below:
    * `hotel`: 4 pairings
    * `assigned_room_type` and `market_segment`: 3 pairings
    * `country`, `meal`, and `reserved_room_type`: 2 pairings
    * `customer_type` and `distribution_channel`: 1 pairing
* To reduce this number, the second step of this segment will drop top 3 variables with the highest amount of pairings:
    * `hotel`
    * `assigned_room_type`
    * `market_segment`

In [27]:
# Dropping top 3 variables with the highest amount of correlation pairings
# Re-assigning (instead of 'inplace=True') together with errors='ignore' keeps this cell idempotent:
# running it a second time no longer raises KeyError for columns that are already gone
categorical_target_vars = categorical_target_vars.drop(
    columns=['hotel', 'assigned_room_type', 'market_segment'],
    errors='ignore')

# Verifying process
# ('interval_cols' is passed as a list of column names, as documented by phik)
phik_matrix(df=categorical_target_vars, interval_cols=['is_canceled']).style.background_gradient(cmap=None).format(formatter='{:.3f}')

Unnamed: 0,arrival_date_month,country,customer_type,deposit_type,distribution_channel,is_canceled,meal,reserved_room_type
arrival_date_month,1.0,0.262,0.213,0.114,0.125,0.108,0.147,0.11
country,0.262,1.0,0.23,0.226,0.316,0.235,0.232,0.163
customer_type,0.213,0.23,1.0,0.053,0.111,0.193,0.194,0.152
deposit_type,0.114,0.226,0.053,1.0,0.049,0.1,0.034,0.081
distribution_channel,0.125,0.316,0.111,0.049,1.0,0.123,0.115,0.224
is_canceled,0.108,0.235,0.193,0.1,0.123,1.0,0.082,0.072
meal,0.147,0.232,0.194,0.034,0.115,0.082,1.0,0.253
reserved_room_type,0.11,0.163,0.152,0.081,0.224,0.072,0.253,1.0


## Outlier

In [28]:
# Defining function for detecting outlier
def detect_outlier(variable, data=None, factor=1.5):
    """Return the rows whose value in `variable` lies outside the IQR fences.

    Parameters
    ----------
    variable : str
        Name of the numerical column to inspect.
    data : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level `df` is used, which is
        the behavior of the original one-argument call.
    factor : float, default 1.5
        Multiple of the interquartile range added above Q3 and subtracted below Q1
        to obtain the upper and lower boundaries.

    Returns
    -------
    pandas.DataFrame
        The rows of the inspected frame (all columns, original index) where
        `variable` is strictly below the lower or strictly above the upper boundary.
    """
    # Falling back to the global dataframe keeps existing `detect_outlier(var)` calls working
    frame = df if data is None else data
    values = frame[variable]

    Q1, Q3 = values.quantile(q=0.25), values.quantile(q=0.75)
    IQR = Q3 - Q1
    lower_boundary, upper_boundary = Q1 - factor * IQR, Q3 + factor * IQR

    outliers = frame.loc[(values < lower_boundary) | (values > upper_boundary)]
    return outliers

Notes:

* Outlier detection uses the box-plot (1.5 × IQR fences) method:
    * Used for data with non-normal (skewed) distribution.
    * Q1 and Q3 are defined as 25th and 75th percentile of the data, respectively.
    * Interquartile range (IQR) as the distance between Q1 and Q3.
    * Boundaries for this method are defined as 1.5 × IQR below Q1 and above Q3, and
    * Data points outside these boundaries are flagged as outliers.

In [29]:
numeric_vars = df.select_dtypes(include='number')

# Detecting outliers only once per variable (instead of once per summary column)
# and keeping just the flagged values of that variable
outlier_values = {var: detect_outlier(var)[var] for var in numeric_vars}

# Showing summary of outlier in numerical variables
pd.DataFrame(data={'numeric_variables': numeric_vars.columns,
                   'n': [values.shape[0] for values in outlier_values.values()],
                   '%': [round(number=values.shape[0] / df.shape[0] * 100, ndigits=2) for values in outlier_values.values()],
                   'sample': [sorted(values.unique()) for values in outlier_values.values()],
                   'min': [values.min() for values in outlier_values.values()],
                   'max': [values.max() for values in outlier_values.values()]})

Unnamed: 0,numeric_variables,n,%,sample,min,max
0,is_canceled,0,0.0,[],,
1,lead_time,2470,2.84,"[295, 296, 297, 298, 299, 300, 301, 302, 303, ...",295.0,737.0
2,arrival_date_year,0,0.0,[],,
3,arrival_date_week_number,0,0.0,[],,
4,arrival_date_day_of_month,0,0.0,[],,
5,stays_in_weekend_nights,218,0.25,"[6, 7, 8, 9, 10, 12, 13, 14, 16]",6.0,16.0
6,stays_in_week_nights,1522,1.75,"[9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20...",9.0,41.0
7,total_stays_in_nights,3001,3.45,"[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 2...",10.0,57.0
8,adults,22600,25.99,"[0, 1, 3, 4, 5, 6, 10, 20, 26, 27, 40, 50, 55]",0.0,55.0
9,children,8353,9.61,"[1, 2, 3, 10]",1.0,10.0


Notes:

* Depending on the context, outliers either (1) deserve special attention or (2) should be completely ignored. 
    1. Take an example of `lead_time`: if unusual number of days are observed, it may be a good idea to pay extra attention to them and figure out whether these values are sensible or not (based on the data description). So in instances like these, it is useful to investigate the outlier values and handling them accordingly.
    2. If outliers are however, introduced due to mechanical error, measurement error or anything else that cannot be generalized, it is a good idea to remove these outliers before feeding the data to the modeling algorithm —because some algorithms are sensitive to outliers.
* Tree-based algorithms are almost the only algorithms that are not affected by the magnitude of the input, as we can easily see from how trees are built.  When deciding how to make a split, tree algorithm look for decisions like "whether feature value X > a certain value" and compute the purity of the child node after the split, so the scale of the feature does not count.

| Variable | Insights |
| --- | --- |
| lead_time | customer bisa aja booking dari jauh hari; misal booking pada tahun x untuk menginap tahun x + 2 sehingga `lead_time` = ~730 |
| stays_in_weekend_nights | customer bisa aja booking untuk menginap selama 19 hari weekend |
| stays_in_week_nights | customer bisa aja booking untuk menginap selama 50 hari weekday |
| adults | customer bisa memesan untuk rombongan yang tidak berisi orang dewasa (`adults` = 0)? |
| | customer bisa aja memesan untuk rombongan yang berisi 55 orang dewasa |
| children | customer bisa aja memesan untuk rombongan yang berisi 10 orang remaja |
| babies | customer bisa aja memesan untuk rombongan yang berisi 10 orang anak-anak |
| previous_cancellations | customer bisa aja memiliki riwayat pembatalan booking hingga 26x |
| previous_bookings_not_canceled | menandakan bahwa customer adalah pelanggan yang rutin booking Hotel (hingga memiliki riwayat 72x booking) |
| booking_changes | customer bisa aja memiliki riwayat mengubah spesifikasi di dalam booking hingga 21x, bahkan hingga booking tersebut dibatalkan |
| days_in_waiting_list | customer bisa menunggu hingga ~365 hari (1 tahun) hingga pemesanannya disetujui oleh pihak hotel? |
| adr | menandakan bahwa pendapatan harian hotel bisa berfluktuatif hingga 5.400 satuan moneter per hari |
| required_car_parking_spaces | customer bisa aja memesan untuk rombongan yang membutuhkan 8 tempat parkir |
| total_of_special_requests | customer bisa aja memiliki 5 permintaan khusus dalam pemesanannya |

Notes:

* `previous_cancellations` tinggi (outlier) belum tentu `is_canceled`, karena nilai median dari bivariat `previous_cancellations` dengan `is_canceled` adalah 0
    * begitu pula dengan `previous_bookings_not_canceled`, tapi kolom ini redundant & di drop
    * begitu pula dengan `booking_changes`
    * begitu pula dengan `days_in_waiting_list`?
* There are several options to handle outlier, namely:
    * Scaling
        * When the distribution of the variable is skewed, it is better to scale using the median and quantiles method, which is more robust to outliers.
    * Transformation
        * Since this analysis is in the context of a business setting, it is better to use the original variable without transformation to train the model, as this would represent a simpler situation at the time of asking developers to implement the model in real life, and also it will be easier to interpret.
    * Binning
        * Equal frequency binning is straightforward to implement, and by spreading the values of the observations more evenly, it may help boost the model's performance. On the other hand, arbitrary binning (including equal width binning) may also disrupt the relationship with the target on occasion.
        * When engineering variables in a business setting, the business experts determine the intervals in which they think the variable should be divided so that it makes sense for the business. These intervals may be defined both arbitrarily or following some criteria of use to the business. Typical variables to be applied using this domain knowledge discretization are Age and Income. 
        * Therefore, whenever possible, it will bring value to examine whether such type of binning is the right strategy, and it will depend on the variable and the model that will be used to make the prediction.

## Cardinality

In [30]:
# Listing categorical variables
categorical_vars = df.select_dtypes(include='object')

# Flagging, per variable, the labels whose relative frequency is below 5% (missing values counted as a label)
rare_label_flags = {var: df[var].value_counts(normalize=True, dropna=False) < 0.05
                    for var in categorical_vars}

# Showing summary of cardinality in categorical variables
pd.DataFrame(data={
    'categorical_columns': categorical_vars.columns,
    'unique (n)': categorical_vars.nunique(dropna=False).values,
    'unique_sample': [df[var].unique() for var in categorical_vars],
    'rare_label (n)': [sum(flags) for flags in rare_label_flags.values()],
    # Mapping each row's label to its "rare" flag and keeping the distinct labels of the flagged rows
    'rare_label_sample': [df[df[var].map(arg=flags)][var].unique()
                          for var, flags in rare_label_flags.items()]})

Unnamed: 0,categorical_columns,unique (n),unique_sample,rare_label (n),rare_label_sample
0,hotel,2,"[Resort Hotel, City Hotel]",0,[]
1,arrival_date_month,12,"[July, August, September, October, November, D...",0,[]
2,meal,4,"[BB, FB, HB, SC]",1,[FB]
3,country,177,"[PRT, GBR, USA, ESP, IRL, FRA, ROU, NOR, OMN, ...",172,"[USA, IRL, ROU, NOR, OMN, ARG, POL, BEL, CHE, ..."
4,market_segment,7,"[Direct, Corporate, Online TA, Offline TA/TO, ...",3,"[Corporate, Complementary, Aviation]"
5,distribution_channel,5,"[Direct, Corporate, TA/TO, Undefined, GDS]",2,"[Undefined, GDS]"
6,reserved_room_type,10,"[C, A, D, E, G, F, H, L, B, P]",7,"[C, G, F, H, L, B, P]"
7,assigned_room_type,12,"[C, A, D, E, G, F, I, B, H, L, K, P]",9,"[C, G, F, I, B, H, L, K, P]"
8,deposit_type,3,"[No Deposit, Refundable, Non Refund]",2,"[Refundable, Non Refund]"
9,customer_type,4,"[Transient, Contract, Transient-Party, Group]",2,"[Contract, Group]"


Notes:

* The number of different (unique) labels within a categorical variable is known as cardinality. A high number of labels within a variable is known as high cardinality.
    * Also different categories appear in the dataset with different frequencies. Some labels appear a lot in the dataset, whereas some other labels appear only in a few number of observations —hence called rare labels.
* High cardinality & rare values may pose the following problems:
    * A big number of labels (whether frequent or infrequent) within a variable may introduce noise with little, if any, information, therefore making machine learning model prone to over-fit.
    * Some of the labels may only be present in the training data set, but not in the test set, therefore machine learning algorithm may over-fit to the training set.
    * Contrarily, some labels may appear only in the test set, therefore leaving the machine learning algorithm unable to perform a calculation over the new (unseen) observation —a case of model underfit.
    * In particular, tree-based model can be biased towards variable with lots of labels. Thus, their performance may be affected by high cardinality.
    * Sometimes rare values are indeed important. If we are building a model to predict hotel booking cancellation, which are —by nature— rare compared to successful booking, then a rare value in a certain variable may be very predictive. This rare value could be telling us that the observation is most likely a cancelled booking, and it would be best not to ignore it.
* High number of cardinality in dataset are detected in `country`, `agent`, `company`, and `reservation_status_date` (relative to other variables), while rare labels are found in all variables except `hotel` and `arrival_date_month`
* solution: 
    1. reduce by recategorize, either all categories or only specific labels deemed rare
    2. use the labels as-is (without any modifications)


* `lead_time` binning 1 bulan, 2-12 bulan, > 12 bulan
* `stays_in_nights` berarti checkout malam; bisa aja guest checkout siang sehingga `stays_in_nights` = 0
* `adult` + `children` + `babies` = `total_guest`
    * `total_guest` = 0 maka missing value & drop
* lengkapi EDA

---

# Modeling

In [158]:
# Importing libraries for prediction modeling
from sklearn.model_selection import train_test_split
from pycaret.internal.preprocess.transformers import TransformerWrapper
from category_encoders import OrdinalEncoder, BinaryEncoder
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from pycaret.classification import *
import numpy as np
from sklearn.metrics import fbeta_score
from feature_engine.encoding import RareLabelEncoder

In [32]:
# Fixing the seed so that the split below is reproducible
random_state_ = 1

# Splitting dataset into dataframes for training and testing, stratified on the target
df_train, df_test = train_test_split(df, random_state=random_state_, stratify=df.is_canceled)

# Collecting both partitions under their display names
split_frames = {'df_train': df_train, 'df_test': df_test}

# Showing dataframes' summary: size of each partition
display(pd.DataFrame(data={
    'dataset': list(split_frames),
    'row (n)': [frame.shape[0] for frame in split_frames.values()],
    'row (%)': [frame.shape[0] / df.shape[0] * 100 for frame in split_frames.values()],
    'column (n)': [frame.shape[1] for frame in split_frames.values()]}))

# Showing dataframes' summary: target class share (%) in each partition
display(pd.DataFrame(data={
    name: round(number=frame.is_canceled.value_counts(normalize=True) * 100, ndigits=3)
    for name, frame in split_frames.items()}).T)

Unnamed: 0,dataset,row (n),row (%),column (n)
0,df_train,65205,75.0,33
1,df_test,21735,25.0,33


Unnamed: 0,0,1
df_train,72.41,27.59
df_test,72.409,27.591


Notes:
* Splitting is done to partition dataset into:
    * Training set (`df_train`), which is used to train the machine learning model. This is the core dataset where the model learns to understand patterns and relationship in the data.
    * Test set (`df_test`), which provides a basis for evaluating the model's performance on unseen data.
        * Even though `df_test` originally comes from the same `df` dataset, this set is not used in training the model, and its target (`is_canceled`) will be hidden in the later stage of this modeling section.
        * This dataset is crucial for assessing the model's ability to generalize into unknown data.
* `random_state` is required to control the random sampling process applied to dataset before splitting into `df_train` and `df_test` —including data for validation, which will be implemented later on.
    * This is done so that the result of splitting (rows, columns, and all their values/data points/observations) will be identical for each time the programming code in this modeling section is called.
* By applying `stratify` parameter, the dataset is sampled in a stratified fashion, to ensure that relative class/target `is_canceled` frequencies are approximately preserved in both the training and test sets.
    * tapi masih imbalance...

In [173]:
# Creating custom pipeline for modeling

# Explicit integer code for every 'meal' label
meal_order = {'SC': 0,
              'BB': 1,
              'HB': 2,
              'FB': 3}

# Columns expanded into one indicator column per label
onehot_columns = ['arrival_date_month', 'distribution_channel', 'reserved_room_type', 'deposit_type', 'customer_type']

ordinal_step = TransformerWrapper(
    transformer=OrdinalEncoder(mapping=[{'col': 'meal', 'mapping': meal_order}]))

binary_step = TransformerWrapper(
    include=['country'],
    transformer=BinaryEncoder())

onehot_step = TransformerWrapper(
    include=onehot_columns,
    transformer=OneHotEncoder(handle_unknown='ignore'))

# Steps are listed in the order: ordinal -> binary -> one-hot
pipeline = [('ordinal', ordinal_step),
            ('binary', binary_step),
            ('onehot', onehot_step)]

Notes:

* `meal` is encoded with `OrdinalEncoder` because categories in this variable can be meaningfully ordered by the type of Hotel meal booked: `SC < BB < HB < FB`.
* Several categorical variables are encoded with `OneHotEncoder`:
    * `arrival_date_month`, `distribution_channel`, `reserved_room_type`, `deposit_type`, and `customer_type`, because these columns have low cardinality.
    * Encoding/replacing these variables to boolean variables of 1 and 0 would still relatively increase the feature space (number of variables used in model training), but not to a large degree.
    * `handle_unknown` parameter applied to `ignore` unknown categories found in the encoding process.
        * After splitting the dataset, certain categories may be found in `df_train` but not in `df_test`, for example.
        * By using this parameter, categories only encountered in `df_test` will all be encoded to zeros.
* `country` encoded with `BinaryEncoder` because, without any modifications to its original categories, this variable contains a large amount of cardinality
    * Encoding this variable with `OneHotEncoder` would result in a large feature space, and further slowing down the training time of the model
    * `BinaryEncoder` is appropriate to use in this situation; this encoding scheme uses binary value to split a categorical feature into several columns
        * In the case of `country`, 177 categories in this variable will be converted to its binary value of 10110001 —a value with 8 digits. This binary is then used to split this variable into 8 columns.
        * Each categories in this variable will be converted to numerical (using ordinal encoding scheme) and then transformed to binary value. Each binary value would then be plotted accordingly to these 8 columns.

## Experiment #0: Basic Model

In [34]:
# Features excluded from model training
ignored_features = ['hotel', 'arrival_date_week_number', 'total_stays_in_nights', 'total_guests', 'market_segment',
                    'previous_bookings_not_canceled', 'assigned_room_type', 'agent', 'company', 'revenue',
                    'reservation_status', 'reservation_status_date']

# Defining keyword arguments for PyCaret setup
kwargs = dict(
    data=df_train,
    target='is_canceled',
    preprocess=False,            # Disabling PyCaret's in-built preprocessing pipeline
    session_id=random_state_,    # Using defined random state for train-validation splitting
    ignore_features=ignored_features)

# Initializing model training environment with the custom pipeline
experiment0 = setup(custom_pipeline=pipeline, **kwargs)

Unnamed: 0,Description,Value
0,Session id,1
1,Target,is_canceled
2,Target type,Binary
3,Original data shape,"(65205, 33)"
4,Transformed data shape,"(65205, 58)"
5,Transformed train set shape,"(45643, 58)"
6,Transformed test set shape,"(19562, 58)"
7,Ignore features,12
8,Numeric features,15
9,Categorical features,7


Notes:

* ignore features berasal dari...
* pycaret pakai default `stratifiedkfold` (parameter `fold_strategy`) untuk crossval
* test set = validation set

In [35]:
# Inspecting the pipeline stored in the experiment to confirm the custom steps were registered
experiment0.get_config('pipeline')

In [36]:
# Fetching the training features as they look after the custom pipeline has been applied
X_train_transformed = experiment0.get_config('X_train_transformed')

# Lifting the column-display limit only for this output so that every encoded column is visible
with pd.option_context('display.max_columns', None):
    display(X_train_transformed)

Unnamed: 0,lead_time,arrival_date_year,arrival_date_month_April,arrival_date_month_August,arrival_date_month_December,arrival_date_month_February,arrival_date_month_January,arrival_date_month_July,arrival_date_month_June,arrival_date_month_March,arrival_date_month_May,arrival_date_month_November,arrival_date_month_October,arrival_date_month_September,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country_0,country_1,country_2,country_3,country_4,country_5,country_6,country_7,distribution_channel_Corporate,distribution_channel_Direct,distribution_channel_GDS,distribution_channel_TA/TO,distribution_channel_Undefined,is_repeated_guest,previous_cancellations,reserved_room_type_A,reserved_room_type_B,reserved_room_type_C,reserved_room_type_D,reserved_room_type_E,reserved_room_type_F,reserved_room_type_G,reserved_room_type_H,reserved_room_type_L,booking_changes,deposit_type_No Deposit,deposit_type_Non Refund,deposit_type_Refundable,days_in_waiting_list,customer_type_Contract,customer_type_Group,customer_type_Transient,customer_type_Transient-Party,adr,required_car_parking_spaces,total_of_special_requests
57405,3,2016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,19,1,3,1,0,0,1,0,0,0,0,0,0,0,1,0.0,0.0,0.0,1.0,0.0,1,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0,0.0,0.0,1.0,0.0,110.000000,0,0
20559,172,2016,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4,1,5,2,1,1,1,0,0,0,0,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1.0,0.0,0.0,0,0.0,0.0,1.0,0.0,155.649994,1,2
48057,169,2017,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21,2,3,2,1,0,1,0,0,0,0,0,0,1,1,0.0,0.0,0.0,1.0,0.0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0,0.0,0.0,1.0,0.0,117.000000,0,1
24493,92,2016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,8,0,4,2,0,0,1,0,0,0,0,0,1,0,0,0.0,0.0,0.0,1.0,0.0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0,0.0,0.0,0.0,1.0,37.799999,0,0
30547,15,2017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,30,0,4,2,0,0,1,0,0,0,0,0,1,0,1,0.0,0.0,0.0,1.0,0.0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0,0.0,0.0,1.0,0.0,84.000000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33952,0,2015,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9,2,0,0,0,0,1,0,0,0,0,0,0,1,1,0.0,0.0,0.0,1.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.000000,0,0
50538,199,2017,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6,2,3,2,0,0,0,0,0,0,0,1,0,1,0,0.0,0.0,0.0,1.0,0.0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0,0.0,0.0,1.0,0.0,89.099998,0,1
71420,182,2016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,31,1,3,0,2,0,1,0,0,0,0,1,0,1,0,0.0,0.0,0.0,1.0,0.0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0,0.0,0.0,1.0,0.0,83.500000,0,1
74739,2,2017,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21,0,1,2,0,0,0,0,0,0,0,0,0,0,1,0.0,1.0,0.0,0.0,0.0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0,0.0,0.0,1.0,0.0,88.000000,0,0


Notes:

* Setelah kita berhasil mentransformasi data *train*, berikutnya dapat digunakan untuk pemodelan klasifikasi.  
* PyCaret menyediakan kumpulan (*library*) model yang dapat diakses dengan fungsi `compare_models()`, sehingga kita bisa membandingkan performa serangkaian model dan memilih beberapa *estimator* saja sebagai *benchmark* untuk tahapan pemodelan berikutnya.

In [37]:
# Defining Hotel Cost per class prediction error
false_negative = 175              # cost charged for every false negative
false_positive = (225 + 375) / 2  # cost charged for every false positive (mean of the two figures)

# Defining function to calculate Hotel Cost
def calculate_cost(y, y_pred):
    """Return the total cost of the misclassified observations.

    Parameters
    ----------
    y : array-like
        True labels (1 = positive class, 0 = negative class).
    y_pred : array-like
        Predicted labels, in the same row order as `y`.

    Returns
    -------
    numpy scalar
        `false_negative` for every row predicted 0 whose true label is 1, plus
        `false_positive` for every row predicted 1 whose true label is 0.
    """
    # Comparing positionally on flat arrays: combining two pandas Series with '&' would first
    # align them on their index labels, silently pairing the wrong rows when the indexes differ
    actual = np.asarray(y).ravel()
    predicted = np.asarray(y_pred).ravel()

    FN = np.where((predicted == 0) & (actual == 1), false_negative, 0)
    FP = np.where((predicted == 1) & (actual == 0), false_positive, 0)
    return np.sum(a=[FN, FP])

# Adding 'F-beta' score & Hotel Cost metric
# NOTE(review): the metric is labelled 'F2' but is computed with beta=0.5; if `fbeta_score` is
# scikit-learn's, that is the F0.5 score (precision weighted above recall) — confirm which is intended.
# The 'F2' label is kept as-is because later cells select the 'F2' column by name.
experiment0.add_metric(id='F2', name='F2', score_func=fbeta_score, beta=0.5)
# Registered with greater_is_better=False because a lower total cost is the better result
experiment0.add_metric(id='cost', name='Cost', score_func=calculate_cost, greater_is_better=False)

# Comparing performance of models available in PyCaret
# (both custom metrics must be registered before this call so that they appear in the comparison grid)
holdout_experiment0 = experiment0.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,F2,Cost,TT (Sec)
catboost,CatBoost Classifier,0.8258,0.8813,0.5755,0.7359,0.6458,0.5326,0.5398,0.697,171645.0,20.149
rf,Random Forest Classifier,0.8233,0.8742,0.5442,0.7469,0.6296,0.5173,0.5285,0.6951,170200.0,6.383
xgboost,Extreme Gradient Boosting,0.8221,0.8777,0.5783,0.7219,0.6421,0.5257,0.5315,0.6877,177195.0,7.414
lightgbm,Light Gradient Boosting Machine,0.8206,0.8754,0.553,0.7317,0.6298,0.5144,0.5233,0.6872,175277.5,0.855
et,Extra Trees Classifier,0.8133,0.8578,0.5316,0.7186,0.6111,0.4918,0.5015,0.6713,181965.0,7.482
gbc,Gradient Boosting Classifier,0.7958,0.84,0.4206,0.7236,0.5319,0.4128,0.4377,0.6324,188460.0,6.5
ada,Ada Boost Classifier,0.7792,0.8128,0.4108,0.6607,0.5065,0.3741,0.3917,0.5889,209590.0,2.379
lr,Logistic Regression,0.7696,0.7864,0.346,0.6566,0.4529,0.3243,0.3511,0.5563,212590.0,9.984
lda,Linear Discriminant Analysis,0.7655,0.7806,0.3002,0.6662,0.4135,0.2925,0.3289,0.5351,210987.5,0.959
ridge,Ridge Classifier,0.7622,0.0,0.2435,0.6977,0.3607,0.2545,0.308,0.5077,206532.5,0.298


Processing:   0%|          | 0/69 [00:00<?, ?it/s]

Notes:

* Highlight bug in `Cost`?
* `CatBoost Classifier` scores higher on F2, but `Random Forest Classifier` has the higher Precision and the lower Cost

In [38]:
# Assigning 'Random Forest Classifier' as basic model (the benchmark for the later experiments)
model_experiment0 = experiment0.create_model(estimator='rf',
                                             return_train_score=True) # Also reporting the CV-Train scores in the cross-validation grid

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,F2,Cost
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
CV-Train,0,0.9979,1.0,0.9946,0.9976,0.9961,0.9946,0.9946,0.997,18775.0
CV-Train,1,0.9979,1.0,0.9945,0.9978,0.9962,0.9947,0.9947,0.9971,18350.0
CV-Train,2,0.9977,1.0,0.9948,0.9969,0.9958,0.9943,0.9943,0.9965,20825.0
CV-Train,3,0.9979,1.0,0.9954,0.997,0.9962,0.9948,0.9948,0.9967,19300.0
CV-Train,4,0.9979,1.0,0.9955,0.9968,0.9962,0.9947,0.9947,0.9966,19725.0
CV-Train,5,0.998,1.0,0.9958,0.9968,0.9963,0.9949,0.9949,0.9966,19200.0
CV-Train,6,0.9981,1.0,0.9958,0.9973,0.9965,0.9952,0.9952,0.997,17700.0
CV-Train,7,0.9977,1.0,0.9949,0.9969,0.9959,0.9943,0.9943,0.9965,20650.0
CV-Train,8,0.998,1.0,0.9955,0.9973,0.9964,0.995,0.995,0.9969,18225.0
CV-Train,9,0.9979,1.0,0.9952,0.997,0.9961,0.9946,0.9946,0.9966,19650.0


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Notes:

* This model, together with its metrics, will be used as the benchmark for the models from the following experiments

## Experiment #1: Handling Outlier with Scaling

In [174]:
# Adding 'RobustScaler' to custom pipeline
# `pipeline` is a shared list that is mutated in place, so the append is guarded by step name:
# re-running this cell must not stack a second 'robust' step onto it.
# NOTE(review): assumes every pipeline entry is a (name, transformer) tuple — confirm against the cell that builds `pipeline`.
if not any(step[0] == 'robust' for step in pipeline):
    pipeline.append(
        ('robust', TransformerWrapper(
            # Only the columns listed here are passed to the scaler
            include=['lead_time', 'arrival_date_year', 'arrival_date_day_of_month', 'stays_in_weekend_nights', 'stays_in_week_nights',
                     'adults', 'children', 'babies', 'previous_cancellations', 'booking_changes', 'days_in_waiting_list', 'adr',
                     'required_car_parking_spaces', 'total_of_special_requests'],
            transformer=RobustScaler())))

# Setting experiment #1 training environment (same `kwargs` as the other experiments, plus the extended pipeline)
experiment1 = setup(**kwargs, custom_pipeline=pipeline)

Unnamed: 0,Description,Value
0,Session id,1
1,Target,is_canceled
2,Target type,Binary
3,Original data shape,"(65205, 33)"
4,Transformed data shape,"(65205, 58)"
5,Transformed train set shape,"(45643, 58)"
6,Transformed test set shape,"(19562, 58)"
7,Ignore features,12
8,Numeric features,15
9,Categorical features,7


In [40]:
# Validating usage of custom pipeline: displaying the pipeline stored in experiment #1's configuration
experiment1.get_config(variable='pipeline')

In [41]:
# Validating feature transformation using custom pipeline
# (the scaled columns should no longer show their raw values)
X_train_transformed = experiment1.get_config(variable='X_train_transformed')
# The column-display limit is lifted only inside this block so every transformed column is visible
with pd.option_context('display.max_columns', None):
    display(X_train_transformed)

Unnamed: 0,lead_time,arrival_date_year,arrival_date_month_April,arrival_date_month_August,arrival_date_month_December,arrival_date_month_February,arrival_date_month_January,arrival_date_month_July,arrival_date_month_June,arrival_date_month_March,arrival_date_month_May,arrival_date_month_November,arrival_date_month_October,arrival_date_month_September,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country_0,country_1,country_2,country_3,country_4,country_5,country_6,country_7,distribution_channel_Corporate,distribution_channel_Direct,distribution_channel_GDS,distribution_channel_TA/TO,distribution_channel_Undefined,is_repeated_guest,previous_cancellations,reserved_room_type_A,reserved_room_type_B,reserved_room_type_C,reserved_room_type_D,reserved_room_type_E,reserved_room_type_F,reserved_room_type_G,reserved_room_type_H,reserved_room_type_L,booking_changes,deposit_type_No Deposit,deposit_type_Non Refund,deposit_type_Refundable,days_in_waiting_list,customer_type_Contract,customer_type_Group,customer_type_Transient,customer_type_Transient-Party,adr,required_car_parking_spaces,total_of_special_requests
57405,-0.412281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.200000,0.0,0.333333,-1.0,0.0,0.0,1,0,0,0,0,0,0,0,1,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.175705,0.0,0.0
20559,1.070175,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.800000,0.0,1.000000,0.0,1.0,1.0,1,0,0,0,0,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.904880,1.0,2.0
48057,1.043860,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.5,0.333333,0.0,1.0,0.0,1,0,0,0,0,0,0,1,1,0.0,0.0,0.0,1.0,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.287517,0.0,1.0
24493,0.368421,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.533333,-0.5,0.666667,0.0,0.0,0.0,1,0,0,0,0,0,1,0,0,0.0,0.0,0.0,1.0,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.977558,0.0,0.0
30547,-0.307018,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.933333,-0.5,0.666667,0.0,0.0,0.0,1,0,0,0,0,0,1,0,1,0.0,0.0,0.0,1.0,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.239597,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33952,-0.438596,-1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.466667,0.5,-0.666667,-2.0,0.0,0.0,1,0,0,0,0,0,0,1,1,0.0,0.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.581343,0.0,0.0
50538,1.307018,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.666667,0.5,0.333333,0.0,0.0,0.0,0,0,0,0,0,1,0,1,0,0.0,0.0,0.0,1.0,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.158134,0.0,1.0
71420,1.157895,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.000000,0.0,0.333333,-2.0,2.0,0.0,1,0,0,0,0,1,0,1,0,0.0,0.0,0.0,1.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.247584,0.0,1.0
74739,-0.421053,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,-0.5,-0.333333,0.0,0.0,0.0,0,0,0,0,0,0,0,0,1,0.0,1.0,0.0,0.0,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.175705,0.0,0.0


In [43]:
# Adding 'F-beta' score & Hotel Cost metric
# NOTE(review): the metric is labelled 'F2' but is computed with beta=0.5; if `fbeta_score` is
# scikit-learn's, that is the F0.5 score (precision weighted above recall) — confirm which is intended.
# The 'F2' label is kept as-is because later cells select the 'F2' column by name.
experiment1.add_metric(id='F2', name='F2', score_func=fbeta_score, beta=0.5)
# Registered with greater_is_better=False because a lower total cost is the better result
experiment1.add_metric(id='cost', name='Cost', score_func=calculate_cost, greater_is_better=False)

# Comparing performances of models in experiment #1
holdout_experiment1 = experiment1.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,F2,Cost,TT (Sec)
catboost,CatBoost Classifier,0.8258,0.8813,0.5755,0.7359,0.6458,0.5326,0.5398,0.697,171645.0,22.096
rf,Random Forest Classifier,0.8233,0.8742,0.5445,0.7467,0.6297,0.5174,0.5285,0.695,170250.0,5.756
xgboost,Extreme Gradient Boosting,0.8221,0.8777,0.5783,0.7219,0.6421,0.5257,0.5315,0.6877,177195.0,7.976
lightgbm,Light Gradient Boosting Machine,0.8205,0.8754,0.5526,0.7315,0.6295,0.5141,0.523,0.6869,175335.0,0.846
et,Extra Trees Classifier,0.8133,0.8578,0.5316,0.7186,0.6111,0.4918,0.5015,0.6713,181965.0,8.199
gbc,Gradient Boosting Classifier,0.7958,0.84,0.4206,0.7236,0.5319,0.4128,0.4377,0.6324,188460.0,6.958
ada,Ada Boost Classifier,0.7792,0.8128,0.4108,0.6607,0.5065,0.3741,0.3917,0.5889,209590.0,2.749
lr,Logistic Regression,0.7701,0.7885,0.3477,0.6573,0.4546,0.3262,0.3527,0.5577,212175.0,3.372
knn,K Neighbors Classifier,0.7694,0.7721,0.5065,0.5969,0.5479,0.3945,0.3969,0.5763,238122.5,2.759
lda,Linear Discriminant Analysis,0.7655,0.7806,0.3002,0.6662,0.4135,0.2925,0.3289,0.5351,210987.5,1.141


Processing:   0%|          | 0/69 [00:00<?, ?it/s]

Notes:

* The results are nearly identical to those of the previous experiment

In [44]:
# Assigning 'Random Forest Classifier' as experimentation model (same estimator as the benchmark, for a like-for-like comparison)
model_experiment1 = experiment1.create_model(estimator='rf', 
                                             return_train_score=True) # Also reporting the CV-Train scores in the cross-validation grid

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,F2,Cost
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
CV-Train,0,0.9979,1.0,0.9946,0.9976,0.9961,0.9946,0.9946,0.997,18775.0
CV-Train,1,0.9979,1.0,0.9945,0.9978,0.9962,0.9947,0.9947,0.9971,18350.0
CV-Train,2,0.9977,1.0,0.9948,0.9969,0.9958,0.9943,0.9943,0.9965,20825.0
CV-Train,3,0.9979,1.0,0.9955,0.9969,0.9962,0.9948,0.9948,0.9966,19425.0
CV-Train,4,0.9979,1.0,0.9954,0.9969,0.9962,0.9947,0.9947,0.9966,19600.0
CV-Train,5,0.998,1.0,0.9958,0.9968,0.9963,0.9949,0.9949,0.9966,19200.0
CV-Train,6,0.9981,1.0,0.9958,0.9973,0.9965,0.9952,0.9952,0.997,17700.0
CV-Train,7,0.9977,1.0,0.9949,0.9969,0.9959,0.9943,0.9943,0.9965,20650.0
CV-Train,8,0.998,1.0,0.9954,0.9973,0.9964,0.995,0.995,0.997,18100.0
CV-Train,9,0.9979,1.0,0.9952,0.997,0.9961,0.9946,0.9946,0.9966,19650.0


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [108]:
# Reading the cross-validation summary rows (validation folds) of the key metrics from each
# experiment's most recent score grid
cv_metrics = ['Prec.', 'F2', 'Cost']
cv_grid_experiment0 = experiment0.pull()
cv_grid_experiment1 = experiment1.pull()

experiment0_mean = cv_grid_experiment0.loc[('CV-Val', 'Mean'), cv_metrics]
experiment0_std = cv_grid_experiment0.loc[('CV-Val', 'Std'), cv_metrics]
experiment1_mean = cv_grid_experiment1.loc[('CV-Val', 'Mean'), cv_metrics]
experiment1_std = cv_grid_experiment1.loc[('CV-Val', 'Std'), cv_metrics]

# Width of the [mean - std, mean + std] band per metric (i.e. about 2 x std):
# the narrower the band, the steadier the scores across folds
band_width_by_experiment = {
    'Experiment #0': abs((experiment0_mean - experiment0_std) - (experiment0_mean + experiment0_std)),
    'Experiment #1': abs((experiment1_mean - experiment1_std) - (experiment1_mean + experiment1_std)),
}
pd.DataFrame(data=list(band_width_by_experiment.values()),
             index=list(band_width_by_experiment))

Unnamed: 0,Prec.,F2,Cost
Experiment #0,0.0326,0.0258,12818.5022
Experiment #1,0.033,0.0258,12909.1634


Notes:

* The table measures, for each experiment, the width of the interval from (mean − std) to (mean + std) of the cross-validation scores, i.e. 2 × std
    * there is no significant difference between the two experiments
    * the experiment with the narrowest range is considered the better (more stable) one
* As stated previously, tree-based algorithms (including Random Forest) tend to be insensitive to outliers when creating the branches of their trees. Typically, a tree makes decisions by asking whether variable `x` >= a certain value. An outlier therefore simply falls on one side of the split and is treated the same as the remaining values on that side, regardless of its magnitude.

## Experiment #2: Handling Cardinality/Rare Label

In [175]:
# Removing the 'robust' step added for experiment #1 so that experiment #2 does not scale the features.
# Steps are filtered by name instead of a blind `pipeline.pop()`, which keeps the cell idempotent:
# a re-run no longer drops an unrelated trailing step, and a stale 'rare_label' step is replaced
# rather than duplicated. The slice assignment mutates the shared list in place, as pop/insert did.
# NOTE(review): assumes every pipeline entry is a (name, transformer) tuple — confirm against the cell that builds `pipeline`.
pipeline[:] = [step for step in pipeline if step[0] not in ('robust', 'rare_label')]

# Adding 'RareLabelEncoder' to custom pipeline
# It is inserted at position 1 (not appended), so it runs before the steps that currently follow index 0.
# n_categories=3 is the encoder's category-count threshold — see the RareLabelEncoder docs for the exact rule.
pipeline.insert(1,
    ('rare_label', TransformerWrapper(
        include=['meal', 'country', 'distribution_channel', 'reserved_room_type', 'deposit_type', 'customer_type'],
        transformer=RareLabelEncoder(n_categories=3))))

# Setting experiment #2 training environment (same `kwargs` as the other experiments)
experiment2 = setup(**kwargs, custom_pipeline=pipeline)

Unnamed: 0,Description,Value
0,Session id,1
1,Target,is_canceled
2,Target type,Binary
3,Original data shape,"(65205, 33)"
4,Transformed data shape,"(65205, 46)"
5,Transformed train set shape,"(45643, 46)"
6,Transformed test set shape,"(19562, 46)"
7,Ignore features,12
8,Numeric features,15
9,Categorical features,7


Notes:

* insert before binary because...
* n_categories...
* Reduced the number of transformed columns (from 58 in Experiment #1 to 46)

In [176]:
# Validating usage of custom pipeline: displaying the pipeline stored in experiment #2's configuration
experiment2.get_config(variable='pipeline')

In [177]:
# Validating feature transformation using custom pipeline
# NOTE: this reuses the name `X_train_transformed`, overwriting the frame taken from experiment #1
X_train_transformed = experiment2.get_config(variable='X_train_transformed')
# The column-display limit is lifted only inside this block so every transformed column is visible
with pd.option_context('display.max_columns', None):
    display(X_train_transformed)

Unnamed: 0,lead_time,arrival_date_year,arrival_date_month_April,arrival_date_month_August,arrival_date_month_December,arrival_date_month_February,arrival_date_month_January,arrival_date_month_July,arrival_date_month_June,arrival_date_month_March,arrival_date_month_May,arrival_date_month_November,arrival_date_month_October,arrival_date_month_September,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country_0,country_1,country_2,distribution_channel_Corporate,distribution_channel_Direct,distribution_channel_Rare,distribution_channel_TA/TO,is_repeated_guest,previous_cancellations,reserved_room_type_A,reserved_room_type_D,reserved_room_type_E,reserved_room_type_Rare,booking_changes,deposit_type_No Deposit,deposit_type_Non Refund,deposit_type_Refundable,days_in_waiting_list,customer_type_Rare,customer_type_Transient,customer_type_Transient-Party,adr,required_car_parking_spaces,total_of_special_requests
57405,3,2016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,19,1,3,1,0,0,1,0,0,1,0.0,0.0,0.0,1.0,1,0,0.0,1.0,0.0,0.0,0,1.0,0.0,0.0,0,0.0,1.0,0.0,110.000000,0,0
20559,172,2016,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4,1,5,2,1,1,1,0,1,0,0.0,1.0,0.0,0.0,0,0,0.0,0.0,0.0,1.0,0,1.0,0.0,0.0,0,0.0,1.0,0.0,155.649994,1,2
48057,169,2017,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21,2,3,2,1,0,1,0,1,1,0.0,0.0,0.0,1.0,0,0,1.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0,0.0,1.0,0.0,117.000000,0,1
24493,92,2016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,8,0,4,2,0,0,1,0,1,0,0.0,0.0,0.0,1.0,0,0,1.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0,0.0,0.0,1.0,37.799999,0,0
30547,15,2017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,30,0,4,2,0,0,1,0,1,0,0.0,0.0,0.0,1.0,0,0,1.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0,0.0,1.0,0.0,84.000000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33952,0,2015,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9,2,0,0,0,0,1,0,1,1,0.0,0.0,0.0,1.0,0,0,0.0,0.0,0.0,1.0,0,1.0,0.0,0.0,0,1.0,0.0,0.0,0.000000,0,0
50538,199,2017,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6,2,3,2,0,0,0,1,0,0,0.0,0.0,0.0,1.0,0,0,1.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0,0.0,1.0,0.0,89.099998,0,1
71420,182,2016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,31,1,3,0,2,0,1,1,0,0,0.0,0.0,0.0,1.0,0,0,0.0,0.0,0.0,1.0,0,1.0,0.0,0.0,0,0.0,1.0,0.0,83.500000,0,1
74739,2,2017,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21,0,1,2,0,0,0,0,0,1,0.0,1.0,0.0,0.0,0,0,1.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0,0.0,1.0,0.0,88.000000,0,0


Notes:

* Rare...

In [178]:
# Adding 'F-beta' score & Hotel Cost metric
# NOTE(review): the metric is labelled 'F2' but is computed with beta=0.5; if `fbeta_score` is
# scikit-learn's, that is the F0.5 score (precision weighted above recall) — confirm which is intended.
# The 'F2' label is kept as-is because other cells select the 'F2' column by name.
experiment2.add_metric(id='F2', name='F2', score_func=fbeta_score, beta=0.5)
# Registered with greater_is_better=False because a lower total cost is the better result
experiment2.add_metric(id='cost', name='Cost', score_func=calculate_cost, greater_is_better=False)

# Comparing performances of models in experiment #2
holdout_experiment2 = experiment2.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,F2,Cost,TT (Sec)
rf,Random Forest Classifier,0.8226,0.8744,0.5515,0.7397,0.6318,0.5182,0.528,0.6924,172280.0,5.49
et,Extra Trees Classifier,0.8114,0.8558,0.5422,0.7062,0.6134,0.4915,0.499,0.6659,186177.5,6.324
gbc,Gradient Boosting Classifier,0.8029,0.8497,0.4447,0.7372,0.5546,0.4378,0.461,0.6513,182407.5,5.336
ada,Ada Boost Classifier,0.7869,0.8157,0.42,0.6858,0.5209,0.3939,0.4135,0.6086,200510.0,1.946
lr,Logistic Regression,0.7749,0.7867,0.3573,0.6734,0.4666,0.3407,0.3683,0.5718,207027.5,9.023
lda,Linear Discriminant Analysis,0.7693,0.7814,0.2965,0.6909,0.4144,0.2986,0.3401,0.545,205102.5,0.733
ridge,Ridge Classifier,0.7652,0.0,0.2405,0.7251,0.3607,0.2591,0.3187,0.5159,201930.0,0.481
dt,Decision Tree Classifier,0.7572,0.7014,0.5737,0.5584,0.5659,0.3974,0.3975,0.5614,265420.0,0.538
svm,SVM - Linear Kernel,0.6949,0.0,0.2437,0.4075,0.2472,0.1353,0.1609,0.3105,298700.0,1.483
knn,K Neighbors Classifier,0.6937,0.6412,0.2914,0.4204,0.3442,0.1529,0.1572,0.3862,307922.5,2.206


Processing:   0%|          | 0/69 [00:00<?, ?it/s]

In [None]:
# Assigning 'Random Forest Classifier' as experimentation model (same estimator as the benchmark, for a like-for-like comparison)
model_experiment2 = experiment2.create_model(estimator='rf', 
                                             return_train_score=True) # Also reporting the CV-Train scores in the cross-validation grid

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,F2,Cost
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
CV-Train,0,0.9979,1.0,0.9946,0.9976,0.9961,0.9946,0.9946,0.997,18775.0
CV-Train,1,0.9979,1.0,0.9945,0.9978,0.9962,0.9947,0.9947,0.9971,18350.0
CV-Train,2,0.9977,1.0,0.9948,0.9969,0.9958,0.9943,0.9943,0.9965,20825.0
CV-Train,3,0.9979,1.0,0.9955,0.9969,0.9962,0.9948,0.9948,0.9966,19425.0
CV-Train,4,0.9979,1.0,0.9954,0.9969,0.9962,0.9947,0.9947,0.9966,19600.0
CV-Train,5,0.998,1.0,0.9958,0.9968,0.9963,0.9949,0.9949,0.9966,19200.0
CV-Train,6,0.9981,1.0,0.9958,0.9973,0.9965,0.9952,0.9952,0.997,17700.0
CV-Train,7,0.9977,1.0,0.9949,0.9969,0.9959,0.9943,0.9943,0.9965,20650.0
CV-Train,8,0.998,1.0,0.9954,0.9973,0.9964,0.995,0.995,0.997,18100.0
CV-Train,9,0.9979,1.0,0.9952,0.997,0.9961,0.9946,0.9946,0.9966,19650.0


Processing:   0%|          | 0/4 [00:00<?, ?it/s]