# **Data Preprocessing**

In [3]:
#importing the libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
#importing dataset
data = pd.read_csv('hotel_bookings.csv')
data.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [5]:
#Observations in the dataset
print("Total count of samples:\n",data['hotel'].value_counts())
print(data.count())

Total count of samples:
 City Hotel      79330
Resort Hotel    40060
Name: hotel, dtype: int64
hotel                             119390
is_canceled                       119390
lead_time                         119390
arrival_date_year                 119390
arrival_date_month                119390
arrival_date_week_number          119390
arrival_date_day_of_month         119390
stays_in_weekend_nights           119390
stays_in_week_nights              119390
adults                            119390
children                          119386
babies                            119390
meal                              119390
country                           118902
market_segment                    119390
distribution_channel              119390
is_repeated_guest                 119390
previous_cancellations            119390
previous_bookings_not_canceled    119390
reserved_room_type                119390
assigned_room_type                119390
booking_changes                   119390
dep

## **Encoding**

In [6]:
# Label Encoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data['hotel']=le.fit_transform(data['hotel'])
print("Count of samples in each hotel after encoding:\n",data['hotel'].value_counts())
print("Hotels: ",le.classes_)
data

Count of samples in each hotel after encoding:
 0    79330
1    40060
Name: hotel, dtype: int64
Hotels:  ['City Hotel' 'Resort Hotel']


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,1,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
1,1,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
2,1,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.00,0,0,Check-Out,2015-07-02
3,1,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.00,0,0,Check-Out,2015-07-02
4,1,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.00,0,1,Check-Out,2015-07-03
5,1,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.00,0,1,Check-Out,2015-07-03
6,1,0,0,2015,July,27,1,0,2,2,...,No Deposit,,,0,Transient,107.00,0,0,Check-Out,2015-07-03
7,1,0,9,2015,July,27,1,0,2,2,...,No Deposit,303.0,,0,Transient,103.00,0,1,Check-Out,2015-07-03
8,1,1,85,2015,July,27,1,0,3,2,...,No Deposit,240.0,,0,Transient,82.00,0,1,Canceled,2015-05-06
9,1,1,75,2015,July,27,1,0,3,2,...,No Deposit,15.0,,0,Transient,105.50,0,0,Canceled,2015-04-22


In [7]:
# One hot Encoder
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
transformed_data = encoder.fit_transform(data['hotel'].values.reshape(-1,1)).toarray()
print("Encoded Categories: ",encoder.categories_)

#Transforming the encoded data to dataframe
transformed_data = pd.DataFrame(transformed_data , columns = ['Contract', 'Group'])
transformed_data

Encoded Categories:  [array([0, 1])]


Unnamed: 0,Contract,Group
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0
5,0.0,1.0
6,0.0,1.0
7,0.0,1.0
8,0.0,1.0
9,0.0,1.0


## **Normalization and Standardization**

In [8]:
#considering only numeric columns
numeric_columns = [c for c in data.columns if data[c].dtype != np.dtype('O')]
numeric_columns.remove('company')
numeric_columns.remove('agent')
temp_data = data[numeric_columns]
temp_data

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
0,1,0,342,2015,27,1,0,0,2,0.0,0,0,0,0,3,0,0.00,0,0
1,1,0,737,2015,27,1,0,0,2,0.0,0,0,0,0,4,0,0.00,0,0
2,1,0,7,2015,27,1,0,1,1,0.0,0,0,0,0,0,0,75.00,0,0
3,1,0,13,2015,27,1,0,1,1,0.0,0,0,0,0,0,0,75.00,0,0
4,1,0,14,2015,27,1,0,2,2,0.0,0,0,0,0,0,0,98.00,0,1
5,1,0,14,2015,27,1,0,2,2,0.0,0,0,0,0,0,0,98.00,0,1
6,1,0,0,2015,27,1,0,2,2,0.0,0,0,0,0,0,0,107.00,0,0
7,1,0,9,2015,27,1,0,2,2,0.0,0,0,0,0,0,0,103.00,0,1
8,1,1,85,2015,27,1,0,3,2,0.0,0,0,0,0,0,0,82.00,0,1
9,1,1,75,2015,27,1,0,3,2,0.0,0,0,0,0,0,0,105.50,0,0


## **Normalization**

In [9]:
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')
normalizer = MinMaxScaler()
temp_data.dropna(axis = 1 , inplace = True)
normalized_data = normalizer.fit_transform(temp_data)
print("Normalized Data:")
pd.DataFrame(normalized_data , columns = temp_data.columns)

Normalized Data:


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
0,1.0,0.0,0.464043,0.0,0.500000,0.000000,0.000000,0.00,0.036364,0.0,0.0,0.0,0.0,0.142857,0.0,0.001180,0.000,0.0
1,1.0,0.0,1.000000,0.0,0.500000,0.000000,0.000000,0.00,0.036364,0.0,0.0,0.0,0.0,0.190476,0.0,0.001180,0.000,0.0
2,1.0,0.0,0.009498,0.0,0.500000,0.000000,0.000000,0.02,0.018182,0.0,0.0,0.0,0.0,0.000000,0.0,0.015053,0.000,0.0
3,1.0,0.0,0.017639,0.0,0.500000,0.000000,0.000000,0.02,0.018182,0.0,0.0,0.0,0.0,0.000000,0.0,0.015053,0.000,0.0
4,1.0,0.0,0.018996,0.0,0.500000,0.000000,0.000000,0.04,0.036364,0.0,0.0,0.0,0.0,0.000000,0.0,0.019307,0.000,0.2
5,1.0,0.0,0.018996,0.0,0.500000,0.000000,0.000000,0.04,0.036364,0.0,0.0,0.0,0.0,0.000000,0.0,0.019307,0.000,0.2
6,1.0,0.0,0.000000,0.0,0.500000,0.000000,0.000000,0.04,0.036364,0.0,0.0,0.0,0.0,0.000000,0.0,0.020972,0.000,0.0
7,1.0,0.0,0.012212,0.0,0.500000,0.000000,0.000000,0.04,0.036364,0.0,0.0,0.0,0.0,0.000000,0.0,0.020232,0.000,0.2
8,1.0,1.0,0.115332,0.0,0.500000,0.000000,0.000000,0.06,0.036364,0.0,0.0,0.0,0.0,0.000000,0.0,0.016347,0.000,0.2
9,1.0,1.0,0.101764,0.0,0.500000,0.000000,0.000000,0.06,0.036364,0.0,0.0,0.0,0.0,0.000000,0.0,0.020694,0.000,0.0


## **Standardization**

In [10]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
standardized_data = sc.fit_transform(temp_data)
pd.DataFrame(standardized_data , columns = temp_data.columns)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
0,1.407224,-0.767040,2.227051,-1.634768,-0.012141,-1.685297,-0.928890,-1.310240,0.247897,-0.081579,-0.18156,-0.10318,-0.091555,4.260101,-0.131924,-2.015038,-0.254873,-0.720694
1,1.407224,-0.767040,5.923385,-1.634768,-0.012141,-1.685297,-0.928890,-1.310240,0.247897,-0.081579,-0.18156,-0.10318,-0.091555,5.793131,-0.131924,-2.015038,-0.254873,-0.720694
2,1.407224,-0.767040,-0.907814,-1.634768,-0.012141,-1.685297,-0.928890,-0.786207,-1.478447,-0.081579,-0.18156,-0.10318,-0.091555,-0.338990,-0.131924,-0.530935,-0.254873,-0.720694
3,1.407224,-0.767040,-0.851667,-1.634768,-0.012141,-1.685297,-0.928890,-0.786207,-1.478447,-0.081579,-0.18156,-0.10318,-0.091555,-0.338990,-0.131924,-0.530935,-0.254873,-0.720694
4,1.407224,-0.767040,-0.842309,-1.634768,-0.012141,-1.685297,-0.928890,-0.262174,0.247897,-0.081579,-0.18156,-0.10318,-0.091555,-0.338990,-0.131924,-0.075810,-0.254873,0.540666
5,1.407224,-0.767040,-0.842309,-1.634768,-0.012141,-1.685297,-0.928890,-0.262174,0.247897,-0.081579,-0.18156,-0.10318,-0.091555,-0.338990,-0.131924,-0.075810,-0.254873,0.540666
6,1.407224,-0.767040,-0.973319,-1.634768,-0.012141,-1.685297,-0.928890,-0.262174,0.247897,-0.081579,-0.18156,-0.10318,-0.091555,-0.338990,-0.131924,0.102282,-0.254873,-0.720694
7,1.407224,-0.767040,-0.889098,-1.634768,-0.012141,-1.685297,-0.928890,-0.262174,0.247897,-0.081579,-0.18156,-0.10318,-0.091555,-0.338990,-0.131924,0.023130,-0.254873,0.540666
8,1.407224,1.303712,-0.177905,-1.634768,-0.012141,-1.685297,-0.928890,0.261858,0.247897,-0.081579,-0.18156,-0.10318,-0.091555,-0.338990,-0.131924,-0.392419,-0.254873,0.540666
9,1.407224,1.303712,-0.271483,-1.634768,-0.012141,-1.685297,-0.928890,0.261858,0.247897,-0.081579,-0.18156,-0.10318,-0.091555,-0.338990,-0.131924,0.072600,-0.254873,-0.720694


## **Discretization**

In [11]:
from sklearn.preprocessing import KBinsDiscretizer
kbins = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
data_trans = kbins.fit_transform(temp_data)
print("Data after performing Discretization:")
pd.DataFrame(data_trans,columns = temp_data.columns )

Data after performing Discretization:


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
0,9.0,0.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,9.0,0.0,9.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,9.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
5,9.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
6,9.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,9.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
8,9.0,9.0,1.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
9,9.0,9.0,1.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## **Imputation of missing values**

In [12]:
#Checking null values in dataset
data.isnull().sum()

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

In [13]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan , strategy='mean')
imputed_data = imputer.fit_transform(data['agent'].values.reshape(-1,1))
print("Imputed data:\n",imputed_data)

Imputed data:
 [[86.69338185]
 [86.69338185]
 [86.69338185]
 ...
 [ 9.        ]
 [89.        ]
 [ 9.        ]]


In [14]:
#Check if any null values after imputation of missing values
pd.DataFrame(imputed_data).isnull().sum()

0    0
dtype: int64