In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the hotel bookings CSV from the working directory and preview the
# first five rows.
data = pd.read_csv(filepath_or_buffer="hotel_bookings.csv")
data.head(n=5)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [3]:
# Print a summary of the frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29264 entries, 0 to 29263
Data columns (total 32 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           29264 non-null  object 
 1   is_canceled                     29264 non-null  int64  
 2   lead_time                       29264 non-null  int64  
 3   arrival_date_year               29264 non-null  int64  
 4   arrival_date_month              29264 non-null  object 
 5   arrival_date_week_number        29264 non-null  int64  
 6   arrival_date_day_of_month       29264 non-null  int64  
 7   stays_in_weekend_nights         29264 non-null  int64  
 8   stays_in_week_nights            29264 non-null  int64  
 9   adults                          29264 non-null  int64  
 10  children                        29264 non-null  int64  
 11  babies                          29264 non-null  int64  
 12  meal                            

# Encoding

In [4]:
from sklearn.preprocessing import LabelEncoder , OneHotEncoder

# Label Encoder

In [5]:
# Fit a LabelEncoder on the hotel names and overwrite the column with the
# integer codes it produces.
le = LabelEncoder()
hotel_codes = le.fit_transform(data['hotel'])
data['hotel'] = hotel_codes

# Show how many rows received each code, followed by the labels the encoder
# learned.
print(data['hotel'].value_counts())
print(le.classes_)

0    29264
Name: hotel, dtype: int64
['Resort Hotel']


# One Hot Encoder

In [6]:
# Count how many rows fall into each customer_type category.
data['customer_type'].value_counts()

Transient          22533
Transient-Party     5316
Contract            1211
Group                204
Name: customer_type, dtype: int64

In [7]:
# One-hot encode customer_type. The encoder expects a 2-D input, so the
# column is reshaped to (n_rows, 1) first; the sparse result is then
# converted to a dense array.
one_hot = OneHotEncoder()
customer_type_column = data['customer_type'].values.reshape(-1, 1)
encoded_sparse = one_hot.fit_transform(customer_type_column)
transformed_data = encoded_sparse.toarray()

# Categories discovered during fitting, in output-column order.
one_hot.categories_

[array(['Contract', 'Group', 'Transient', 'Transient-Party'], dtype=object)]

In [8]:
# Wrap the dense one-hot matrix in a DataFrame.
# Column labels are read from the fitted encoder (one_hot.categories_[0])
# instead of being retyped by hand, so they always match the column order the
# encoder actually produced. The index is taken from `data` so every encoded
# row stays aligned with its source row.
transformed_data = pd.DataFrame(transformed_data,
                                columns=one_hot.categories_[0],
                                index=data.index)

transformed_data.head()

Unnamed: 0,Contract,Group,Transient,Transient-Party
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0


In [9]:
# Inspect the one-hot encoding of the row at position 90.
transformed_data.iloc[90]

Contract           0.0
Group              0.0
Transient          1.0
Transient-Party    0.0
Name: 90, dtype: float64

In [10]:
# Look up the original customer_type value for the row labelled 90, to
# compare against its one-hot encoding.
data.loc[90, 'customer_type']

'Transient'

# Normalization and Standardization

In [11]:
# Keep only the columns whose dtype is not `object`, i.e. the numeric ones.
numeric_columns = [
    column_name
    for column_name, column_dtype in data.dtypes.items()
    if column_dtype != np.dtype('O')
]

# Exclude 'company' and 'agent' from the numeric set.
# NOTE(review): presumably because they are identifier-like columns with many
# missing values - confirm.
for excluded_column in ('company', 'agent'):
    numeric_columns.remove(excluded_column)

temp_data = data[numeric_columns]
temp_data

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
0,0,0,342,2015,27,1,0,0,2,0,0,0,0,0,3,0,0.00,0,0
1,0,0,737,2015,27,1,0,0,2,0,0,0,0,0,4,0,0.00,0,0
2,0,0,7,2015,27,1,0,1,1,0,0,0,0,0,0,0,75.00,0,0
3,0,0,13,2015,27,1,0,1,1,0,0,0,0,0,0,0,75.00,0,0
4,0,0,14,2015,27,1,0,2,2,0,0,0,0,0,0,0,98.00,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29259,0,0,386,2016,43,20,1,3,2,0,0,0,0,0,0,0,49.00,0,0
29260,0,0,77,2016,43,17,2,5,2,0,0,0,0,0,0,0,10.00,0,0
29261,0,0,134,2016,43,17,2,5,2,0,0,0,0,0,0,0,100.00,0,2
29262,0,0,220,2016,43,17,2,5,2,0,0,0,0,0,0,0,72.75,0,0


## Normalization

In [12]:
from sklearn.preprocessing import StandardScaler , MinMaxScaler
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides warnings that may
# point at real problems in later cells - consider narrowing it.
warnings.simplefilter('ignore')

# Min-max scaler used by the normalization step.
normalizer = MinMaxScaler()

In [13]:
# Drop every column that still contains missing values.
# The result is reassigned rather than produced with inplace=True: temp_data
# was sliced out of `data`, and mutating such a slice in place is what
# triggers pandas' SettingWithCopyWarning. Reassignment also keeps this cell
# safe to re-run.
temp_data = temp_data.dropna(axis=1)

# Rescale each remaining column with the MinMaxScaler and show the result
# with the original column labels.
normalized_data = normalizer.fit_transform(temp_data)
print("Normalized Data:")
pd.DataFrame(normalized_data, columns=temp_data.columns)

Normalized Data:


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
0,0.0,0.0,0.464043,0.0,0.500000,0.000000,0.000000,0.00,0.036364,0.0,0.0,0.0,0.0,0.0,0.176471,0.0,0.012403,0.0,0.0
1,0.0,0.0,1.000000,0.0,0.500000,0.000000,0.000000,0.00,0.036364,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.012403,0.0,0.0
2,0.0,0.0,0.009498,0.0,0.500000,0.000000,0.000000,0.02,0.018182,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.158210,0.0,0.0
3,0.0,0.0,0.017639,0.0,0.500000,0.000000,0.000000,0.02,0.018182,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.158210,0.0,0.0
4,0.0,0.0,0.018996,0.0,0.500000,0.000000,0.000000,0.04,0.036364,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.202924,0.0,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29259,0.0,0.0,0.523745,0.5,0.807692,0.633333,0.052632,0.06,0.036364,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.107664,0.0,0.0
29260,0.0,0.0,0.104478,0.5,0.807692,0.533333,0.105263,0.10,0.036364,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.031844,0.0,0.0
29261,0.0,0.0,0.181818,0.5,0.807692,0.533333,0.105263,0.10,0.036364,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.206812,0.0,0.4
29262,0.0,0.0,0.298507,0.5,0.807692,0.533333,0.105263,0.10,0.036364,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.153836,0.0,0.0


## Standardization

In [14]:
# Standardize every column of temp_data with a StandardScaler: fit the scaler
# on the frame, then transform that same frame.
standard_scaler = StandardScaler()
standardized_data = standard_scaler.fit(temp_data).transform(temp_data)

# Display the scaled values under the original column labels.
pd.DataFrame(data=standardized_data, columns=temp_data.columns)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
0,0.0,-0.782409,2.450563,-1.336825,-0.087693,-1.662866,-1.047552,-1.290491,0.162332,-0.290833,-0.115348,-0.226746,-0.089132,-0.160391,3.958982,-0.075475,-1.567540,-0.365193,-0.717463
1,0.0,-0.782409,6.418824,-1.336825,-0.087693,-1.662866,-1.047552,-1.290491,0.162332,-0.290833,-0.115348,-0.226746,-0.089132,-0.160391,5.404323,-0.075475,-1.567540,-0.365193,-0.717463
2,0.0,-0.782409,-0.914925,-1.336825,-0.087693,-1.662866,-1.047552,-0.881988,-1.138814,-0.290833,-0.115348,-0.226746,-0.089132,-0.160391,-0.377041,-0.075475,-0.331520,-0.365193,-0.717463
3,0.0,-0.782409,-0.854647,-1.336825,-0.087693,-1.662866,-1.047552,-0.881988,-1.138814,-0.290833,-0.115348,-0.226746,-0.089132,-0.160391,-0.377041,-0.075475,-0.331520,-0.365193,-0.717463
4,0.0,-0.782409,-0.844601,-1.336825,-0.087693,-1.662866,-1.047552,-0.473484,0.162332,-0.290833,-0.115348,-0.226746,-0.089132,-0.160391,-0.377041,-0.075475,0.047527,-0.365193,0.562147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29259,0.0,-0.782409,2.892597,0.185505,1.109571,0.483038,-0.174075,-0.064980,0.162332,-0.290833,-0.115348,-0.226746,-0.089132,-0.160391,-0.377041,-0.075475,-0.760007,-0.365193,-0.717463
29260,0.0,-0.782409,-0.211689,0.185505,1.109571,0.144211,0.699403,0.752027,0.162332,-0.290833,-0.115348,-0.226746,-0.089132,-0.160391,-0.377041,-0.075475,-1.402738,-0.365193,-0.717463
29261,0.0,-0.782409,0.360946,0.185505,1.109571,0.144211,0.699403,0.752027,0.162332,-0.290833,-0.115348,-0.226746,-0.089132,-0.160391,-0.377041,-0.075475,0.080487,-0.365193,1.841757
29262,0.0,-0.782409,1.224922,0.185505,1.109571,0.144211,0.699403,0.752027,0.162332,-0.290833,-0.115348,-0.226746,-0.089132,-0.160391,-0.377041,-0.075475,-0.368600,-0.365193,-0.717463


# Discretization

In [15]:
from sklearn.preprocessing import KBinsDiscretizer
# Discretize every column of temp_data into 10 bins using the 'uniform'
# strategy, encoding each value as the ordinal index of its bin.
discretizer_options = {'n_bins': 10, 'encode': 'ordinal', 'strategy': 'uniform'}
kbins = KBinsDiscretizer(**discretizer_options)
data_trans = kbins.fit_transform(temp_data)

print("Data after performing Discretization:")
pd.DataFrame(data=data_trans, columns=temp_data.columns)

Data after performing Discretization:


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
0,0.0,0.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,9.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29259,0.0,0.0,5.0,5.0,8.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
29260,0.0,0.0,1.0,5.0,8.0,5.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29261,0.0,0.0,1.0,5.0,8.0,5.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,4.0
29262,0.0,0.0,2.0,5.0,8.0,5.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# Imputation of missing values

In [16]:
# Count the missing values in each column of the full dataset.
data.isna().sum()

hotel                                 0
is_canceled                           0
lead_time                             0
arrival_date_year                     0
arrival_date_month                    0
arrival_date_week_number              0
arrival_date_day_of_month             0
stays_in_weekend_nights               0
stays_in_week_nights                  0
adults                                0
children                              0
babies                                0
meal                                  0
country                             461
market_segment                        0
distribution_channel                  0
is_repeated_guest                     0
previous_cancellations                0
previous_bookings_not_canceled        0
reserved_room_type                    0
assigned_room_type                    0
booking_changes                       0
deposit_type                          0
agent                              6043
company                           26913


# Simple Imputer

In [17]:
from sklearn.impute import SimpleImputer
# Fill the missing (NaN) entries of 'agent' with the mean of the observed
# values. The column is reshaped to (n_rows, 1) because the imputer expects a
# 2-D input.
# NOTE(review): 'agent' looks like an identifier column, so a mean may not be
# a meaningful fill value - confirm before reusing this outside the demo.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
agent_column = data['agent'].to_numpy().reshape(-1, 1)
imputed_data = imputer.fit_transform(agent_column)
print("Imputed data:\n",imputed_data)

Imputed data:
 [[214.38034538]
 [214.38034538]
 [214.38034538]
 ...
 [240.        ]
 [243.        ]
 [314.        ]]


In [18]:
# Confirm that no missing values remain after imputation.
pd.DataFrame(imputed_data).isna().sum()

0    0
dtype: int64