In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import lines, patches
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, MinMaxScaler
import category_encoders as ce
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score

# column transformer wrapper for PyCaret
from pycaret.internal.preprocess.transformers import TransformerWrapper

# my own function
import function as fnk

import warnings
# Suppress all Python warnings for the rest of the session (presumably to keep
# cell output uncluttered). Note that this also hides deprecation notices.
warnings.filterwarnings("ignore")

In [2]:
# Load the raw hotel booking demand dataset (path is relative to the notebook).
data = pd.read_csv("../data/raw/data_hotel_booking_demand.csv")
# Preview 10 random rows; the fixed seed keeps the preview identical on every
# Restart & Run All instead of showing a different sample each time.
data.sample(10, random_state=42)

Unnamed: 0,country,market_segment,previous_cancellations,booking_changes,deposit_type,days_in_waiting_list,customer_type,reserved_room_type,required_car_parking_spaces,total_of_special_requests,is_canceled
53006,PRT,Online TA,0,0,No Deposit,0,Transient,A,0,0,0
9025,NOR,Groups,0,0,No Deposit,0,Transient-Party,A,0,0,0
40207,PRT,Direct,0,2,No Deposit,0,Transient,E,1,0,0
67937,ESP,Offline TA/TO,0,0,No Deposit,0,Transient,A,1,0,0
16938,PRT,Corporate,0,1,No Deposit,0,Transient-Party,A,0,0,1
1957,PRT,Groups,1,0,No Deposit,0,Transient-Party,A,0,0,1
41241,PRT,Online TA,0,0,No Deposit,0,Transient,H,0,0,1
45353,FRA,Online TA,0,0,No Deposit,0,Transient,A,0,2,1
67274,PRT,Offline TA/TO,1,0,No Deposit,0,Transient-Party,A,0,0,1
27678,IRL,Online TA,0,0,No Deposit,0,Transient,A,0,3,0


In [3]:
# Show two summary tables in one cell output:
#   include=None     -> pandas default, i.e. the numerical variables
#   include='object' -> the categorical (object-dtype) variables
display(*[data.describe(include=dtype_filter) for dtype_filter in (None, 'object')])

Unnamed: 0,previous_cancellations,booking_changes,days_in_waiting_list,required_car_parking_spaces,total_of_special_requests,is_canceled
count,83573.0,83573.0,83573.0,83573.0,83573.0,83573.0
mean,0.086798,0.220897,2.330561,0.062999,0.573211,0.368277
std,0.841011,0.648635,17.673051,0.246919,0.795163,0.48234
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,1.0,1.0
max,26.0,21.0,391.0,8.0,5.0,1.0


Unnamed: 0,country,market_segment,deposit_type,customer_type,reserved_room_type
count,83222,83573,83573,83573,83573
unique,162,8,3,4,10
top,PRT,Online TA,No Deposit,Transient,A
freq,34097,39460,73352,62732,60041


## `Data Preparation`

In [4]:
# Discretize the numerical columns into binary 0/1 flags.
#
# Every column listed below holds a count. It is collapsed to 1 when the count
# is at least 1 and to 0 otherwise. The comparison is vectorized (no row-wise
# apply) and the result is cast to int64, the same dtype the flags had before.
binary_flag_columns = [
    # 1 = has at least one previous cancellation, 0 = none
    'previous_cancellations',
    # 1 = made at least one special request, 0 = none
    'total_of_special_requests',
    # 1 = spent at least one day on the waiting list, 0 = zero days on it
    'days_in_waiting_list',
    # 1 = requested at least one car parking space, 0 = none
    'required_car_parking_spaces',
]

for column in binary_flag_columns:
    # Missing values compare as False and therefore become 0.
    data[column] = data[column].ge(1).astype('int64')

In [5]:
# Discretize the categorical columns.
#
# For each column, the levels listed below are merged into a single bucket.
# Values not listed (including missing values) are left untouched.

# market_segment: Complementary, Aviation and Undefined are grouped as "Others";
# Online TA, Direct, Offline TA/TO, Groups and Corporate keep their own label.
other_market_segments = ['Complementary', 'Aviation', 'Undefined']
data['market_segment'] = data['market_segment'].mask(
    data['market_segment'].isin(other_market_segments), 'Others'
)

# reserved_room_type: F, G, B, C, H, P and L are grouped as "Others";
# A, D and E keep their own label.
other_room_types = ['F', 'G', 'B', 'C', 'H', 'P', 'L']
data['reserved_room_type'] = data['reserved_room_type'].mask(
    data['reserved_room_type'].isin(other_room_types), 'Others'
)

# deposit_type: 1 when a deposit was made (Refundable or Non Refund),
# 0 for everything else (i.e. "No Deposit").
# NOTE: this step is not idempotent. After it runs the column holds 0/1, so
# re-running the cell without reloading the data turns every value into 0.
deposit_made_types = ['Refundable', 'Non Refund']
data['deposit_type'] = data['deposit_type'].isin(deposit_made_types).astype('int64')

# customer_type: Contract and Group are grouped as "Others";
# Transient and Transient-Party keep their own label.
other_customer_types = ['Contract', 'Group']
data['customer_type'] = data['customer_type'].mask(
    data['customer_type'].isin(other_customer_types), 'Others'
)

# drop the country and booking_changes columns
