In [3]:
import matplotlib.pyplot as plt
# Global Matplotlib styling applied to every figure created afterwards
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=18,
    titlepad=10,
    # Draw negative signs with the ASCII hyphen instead of the Unicode minus
    # (presumably because the Korean font below lacks that glyph - TODO confirm)
    unicode_minus=False,
)
plt.rc("animation", html="html5")

# Korean font setup (assumes macOS, where AppleGothic is available)
plt.rc("font", family="AppleGothic")

In [4]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

# Load the hotel bookings table; the path is relative to the working directory
hotel = pd.read_csv('dataset/hotel.csv')
# Bare last expression so the notebook renders a preview of the frame
hotel

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.00,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.00,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.00,0,1,Check-Out,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,0,23,2017,August,35,30,2,5,2,...,No Deposit,394.0,,0,Transient,96.14,0,0,Check-Out,2017-09-06
119386,City Hotel,0,102,2017,August,35,31,2,5,3,...,No Deposit,9.0,,0,Transient,225.43,0,2,Check-Out,2017-09-07
119387,City Hotel,0,34,2017,August,35,31,2,5,2,...,No Deposit,9.0,,0,Transient,157.71,0,4,Check-Out,2017-09-07
119388,City Hotel,0,109,2017,August,35,31,2,5,2,...,No Deposit,89.0,,0,Transient,104.40,0,0,Check-Out,2017-09-07


In [10]:
# Summarise every column: name, non-null count and dtype
hotel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [11]:
# List the distinct values stored in the "hotel" column
hotel["hotel"].unique()

array(['Resort Hotel', 'City Hotel'], dtype=object)

In [5]:
# Target: the cancellation flag; features: every other column.
# Both are built from `hotel` without modifying it.
y = hotel["is_canceled"].copy()
X = hotel.drop(columns="is_canceled")

In [6]:
# Replace month names with their calendar number (1-12).
# The names are read from the untouched `hotel` frame rather than from `X`
# itself, which makes this cell safe to re-run: mapping X's own column a
# second time would look up the already-numeric values in a dict keyed by
# month names and turn the whole column into NaN.
X['arrival_date_month'] = \
    hotel['arrival_date_month'].map(
        {'January':1, 'February': 2, 'March':3,
         'April':4, 'May':5, 'June':6, 'July':7,
         'August':8, 'September':9, 'October':10,
         'November':11, 'December':12}
    )

# Columns routed through the numeric preprocessing pipeline
features_num = [
    # booking timing
    "lead_time",
    "arrival_date_week_number",
    "arrival_date_day_of_month",
    # length of stay
    "stays_in_weekend_nights",
    "stays_in_week_nights",
    # party composition
    "adults",
    "children",
    "babies",
    # guest history
    "is_repeated_guest",
    "previous_cancellations",
    "previous_bookings_not_canceled",
    # extras and price
    "required_car_parking_spaces",
    "total_of_special_requests",
    "adr",
]

# Columns routed through the categorical (one-hot) preprocessing pipeline
features_cat = [
    "hotel",
    "arrival_date_month",
    "meal",
    "market_segment",
    "distribution_channel",
    "reserved_room_type",
    "deposit_type",
    "customer_type",
]

# Categorical branch: fill gaps with the placeholder label "NA", then one-hot
# encode. handle_unknown='ignore' is set so categories not seen while fitting
# do not raise at transform time.
transformer_cat = make_pipeline(
    SimpleImputer(fill_value="NA", strategy="constant"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Numeric branch: constant-fill the few missing values (no fill_value is
# given, so the imputer's default constant applies), then standardise.
transformer_num = make_pipeline(
    SimpleImputer(strategy="constant"),
    StandardScaler(),
)

# Route each column group through its branch; numeric columns come first in
# the transformed output, followed by the one-hot columns.
preprocessor = make_column_transformer(
    (transformer_num, features_num),
    (transformer_cat, features_cat),
)

In [12]:
# Inspect the column names produced by the preprocessing step.
# In notebook order this cell sits before the cell that fits `preprocessor`,
# so on a fresh "Restart & Run All" the transformer is still unfitted here and
# get_feature_names_out() would raise. ColumnTransformer only gains the
# `transformers_` attribute once it has been fitted, so use it as the guard.
if hasattr(preprocessor, "transformers_"):
    feature_names = preprocessor.get_feature_names_out()
    print(feature_names)
else:
    print("preprocessor is not fitted yet; run the train/validation split cell first.")


['pipeline-1__lead_time' 'pipeline-1__arrival_date_week_number'
 'pipeline-1__arrival_date_day_of_month'
 'pipeline-1__stays_in_weekend_nights' 'pipeline-1__stays_in_week_nights'
 'pipeline-1__adults' 'pipeline-1__children' 'pipeline-1__babies'
 'pipeline-1__is_repeated_guest' 'pipeline-1__previous_cancellations'
 'pipeline-1__previous_bookings_not_canceled'
 'pipeline-1__required_car_parking_spaces'
 'pipeline-1__total_of_special_requests' 'pipeline-1__adr'
 'pipeline-2__hotel_City Hotel' 'pipeline-2__hotel_Resort Hotel'
 'pipeline-2__arrival_date_month_1' 'pipeline-2__arrival_date_month_2'
 'pipeline-2__arrival_date_month_3' 'pipeline-2__arrival_date_month_4'
 'pipeline-2__arrival_date_month_5' 'pipeline-2__arrival_date_month_6'
 'pipeline-2__arrival_date_month_7' 'pipeline-2__arrival_date_month_8'
 'pipeline-2__arrival_date_month_9' 'pipeline-2__arrival_date_month_10'
 'pipeline-2__arrival_date_month_11' 'pipeline-2__arrival_date_month_12'
 'pipeline-2__meal_BB' 'pipeline-2__meal_FB

In [7]:
# Split into train/validation sets.
# stratify=y keeps the share of cancelled bookings the same in both splits;
# random_state pins the shuffle so the split, the fitted preprocessing
# statistics and every downstream result are reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, stratify=y, train_size=0.75, random_state=42
)

# Fit the preprocessing on the training split only, then apply the already
# fitted transformer to the validation split.
X_train = preprocessor.fit_transform(X_train)
X_valid = preprocessor.transform(X_valid)

# Number of feature columns after preprocessing, wrapped in a list
# (presumably the input shape for a model defined later - TODO confirm)
input_shape = [X_train.shape[1]]

In [8]:
# Display the preprocessed training feature matrix
X_train

array([[-0.36605002, -0.89492138, -0.20410334, ...,  0.        ,
         0.        ,  1.        ],
       [-0.97327768, -1.40983869,  0.70715898, ...,  0.        ,
         0.        ,  1.        ],
       [-0.89854197,  0.06135363, -0.54582671, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.88920001, -0.67424253,  1.50451351, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.21315175,  0.28203247,  1.16279014, ...,  0.        ,
         1.        ,  0.        ],
       [-0.02973931, -0.89492138, -1.34318124, ...,  0.        ,
         1.        ,  0.        ]])

In [9]:
# Display the training labels (the is_canceled column)
y_train

35263    0
33425    0
13918    1
31215    0
18234    0
        ..
52103    1
57941    1
88112    0
94342    0
48984    1
Name: is_canceled, Length: 89542, dtype: int64