In [3]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import folium
import plotly.express as px

# for ML
from sklearn.model_selection import cross_val_score, train_test_split, KFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
#from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
#import eli5

sns.set(style='whitegrid')
# Show every column when displaying wide frames. Use the full option key:
# abbreviated keys are resolved by pattern matching and can become ambiguous.
pd.set_option('display.max_columns', None)

In [4]:
# load the dataset

In [5]:
# Read the raw bookings CSV; the path is relative to the notebook's working directory.
df_hotels = pd.read_csv('../../datasets/hotel/HotelBookings.csv')

In [6]:
# Preview the first rows of the raw frame to sanity-check the load.
df_hotels.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,01-07-2015
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,01-07-2015
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,02-07-2015
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,02-07-2015
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,03-07-2015


In [8]:
# (rows, columns) of the raw frame.
df_hotels.shape

(119390, 32)

In [10]:
# Number of missing values per column, largest counts first.
df_hotels.isna().sum().sort_values(ascending=False)


company                           112593
agent                              16340
country                              488
children                               4
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
hotel                                  0
previous_cancellations                 0
days_in_waiting_list                   0
customer_type                          0
adr                                    0
required_car_parking_spaces            0
total_of_special_requests              0
reservation_status                     0
previous_bookings_not_canceled         0
is_repeated_guest                      0
is_canceled                            0
distribution_channel                   0
market_segment                         0
meal                                   0
babies                                 0
adults                                 0
stays_in_week_ni

In [12]:
# Names of the columns that contain at least one missing value,
# kept in the frame's original column order.
has_missing = df_hotels.isnull().any()
var_with_na = df_hotels.columns[has_missing].tolist()

In [13]:
# Show only the columns that have at least one missing value.
df_hotels[var_with_na]

Unnamed: 0,children,country,agent,company
0,0.0,PRT,,
1,0.0,PRT,,
2,0.0,GBR,,
3,0.0,GBR,304.0,
4,0.0,GBR,240.0,
...,...,...,...,...
119385,0.0,BEL,394.0,
119386,0.0,FRA,9.0,
119387,0.0,DEU,9.0,
119388,0.0,GBR,89.0,


In [19]:
# Share of missing values per affected column, highest first.
# Note: 'percentage' holds a fraction in [0, 1] (mean of the null mask), not 0-100.
data_na = (
    df_hotels[var_with_na]
    .isnull()
    .mean()
    .rename_axis('variable')
    .reset_index(name='percentage')
    .sort_values(by='percentage', ascending=False)
)
data_na

Unnamed: 0,variable,percentage
3,company,0.943069
2,agent,0.136862
1,country,0.004087
0,children,3.4e-05


In [23]:
# Per-column fill values for the columns that contain missing data.
na_replacement = {
    "company": 0,
    "agent": 0,
    "children": 0,
    "country": "Unknown",
}

In [24]:
# Build the cleaned frame as a new object; df_hotels itself is left untouched.
clean_df = df_hotels.fillna(value=na_replacement)

In [25]:
# Re-display the columns that had missing values to confirm the fill was applied.
clean_df[var_with_na]

Unnamed: 0,children,country,agent,company
0,0.0,PRT,0.0,0.0
1,0.0,PRT,0.0,0.0
2,0.0,GBR,0.0,0.0
3,0.0,GBR,304.0,0.0
4,0.0,GBR,240.0,0.0
...,...,...,...,...
119385,0.0,BEL,394.0,0.0
119386,0.0,FRA,9.0,0.0
119387,0.0,DEU,9.0,0.0
119388,0.0,GBR,89.0,0.0


In [26]:
# Fold the 'Undefined' meal label into 'SC'
# (presumably both mean "no meal package" -- confirm against the data dictionary).
# The result is assigned back to the column rather than using inplace=True on a
# column selection, which is chained assignment and may not write through to clean_df.
clean_df['meal'] = clean_df['meal'].replace('Undefined', 'SC')

In [27]:
# Index labels of bookings whose adults + children + babies total is zero.
no_guests_mask = (clean_df['adults'] + clean_df['children'] + clean_df['babies']).eq(0)
zero_quests = clean_df.index[no_guests_mask].tolist()

In [28]:
# zero_quests holds index *labels*, so drop by label directly. Looking them up
# positionally via clean_df.index[...] is only correct while the index is an
# untouched 0..n-1 range and removes the wrong rows once any row has been dropped.
# errors='ignore' keeps the cell safe to re-run after the rows are already gone.
clean_df = clean_df.drop(index=zero_quests, errors='ignore')

In [29]:
# Preview the first rows of the cleaned frame.
clean_df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,0.0,0.0,0,Transient,0.0,0,0,Check-Out,01-07-2015
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,0.0,0.0,0,Transient,0.0,0,0,Check-Out,01-07-2015
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,0.0,0.0,0,Transient,75.0,0,0,Check-Out,02-07-2015
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,0.0,0,Transient,75.0,0,0,Check-Out,02-07-2015
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,0.0,0,Transient,98.0,0,1,Check-Out,03-07-2015


In [30]:
# Resort Hotel bookings that were not canceled.
is_resort = clean_df['hotel'].eq('Resort Hotel')
rh = clean_df.loc[is_resort & clean_df['is_canceled'].eq(0)]

In [33]:
# City Hotel bookings that were not canceled.
is_city = clean_df['hotel'].eq('City Hotel')
ch = clean_df.loc[is_city & clean_df['is_canceled'].eq(0)]

Topics covered and questions to answer from the data:

* Where do the guests come from?
* How much do guests pay for a room per night?
* How does the price per night vary over the year?
* Which are the busiest months?
* How long do people stay at the hotels?
* Bookings by market segment
* How many bookings were canceled?
* Which month has the highest number of cancellations?