In [None]:

# https://plotly.com/python/renderers/
import pandas as pd
import numpy as np
import missingno as msno
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns
from matplotlib import pyplot as plt
import utils

In [None]:
# Shared colour sequence handed to the plotly express figures below
# via `color_discrete_sequence`.
colors=px.colors.sequential.RdBu

# Month names in calendar order.
# NOTE(review): `months` is not referenced in the visible cells - confirm it is still needed.
months = ["January", "February", "March", "April", "May", "June", 
          "July", "August", "September", "October", "November", "December"]

In [None]:
# Load the bookings table and report how many rows and columns it has.
df = pd.read_csv('hotels.csv')
n_records, n_parameters = df.shape
print(f'Data set has {n_records} records and {n_parameters} parameters')


In [None]:
# Replace the raw 0/1 codes of the two binary columns with readable labels
# and store the result as categoricals.
map_target_col = {0 : 'not_canceled', 1 : 'canceled'}
map_is_repeated_guest_col = {0 : 'new_guest', 1 : 'repeated_guest'}

for column, mapping in (('is_canceled', map_target_col),
                        ('is_repeated_guest', map_is_repeated_guest_col)):
    # Re-running this cell must be a no-op: pushing the already-relabelled
    # strings through an int-keyed dict would silently turn every value into NaN.
    if not isinstance(df[column].dtype, pd.CategoricalDtype):
        df[column] = df[column].map(mapping).astype('category')



In [None]:
# Preview the first rows of the frame.
df.head()

### Missing values
<ul>
<li>'Company' column can be eliminated since it is almost empty (95% of missing values).</li>
<li>'Agent' column has 13% of NaN</li>
<li>'previous_cancellations' and 'country' columns have less than 1% of NaN values</li>
</ul>
Thus, it is acceptable to remove the records that contain NaN values.

In [None]:
# Nullity matrix: shows where values are missing across all columns.
msno.matrix(df)

In [None]:
# Percentage of missing values per column, five worst columns first.
# Scale to percent first and round last, so the result does not carry
# floating-point residue from multiplying an already-rounded fraction.
df.isnull().mean().mul(100).round(2).sort_values(ascending=False).head(5)

In [None]:
# Drop the 'company' column as a whole, then drop every row that still has
# a missing value in any remaining column.
# errors='ignore' keeps the cell re-runnable once 'company' is already gone
# (a plain drop would raise KeyError on the second run).
df = df.drop(columns=['company'], errors='ignore')
df = df.dropna()

In [None]:
# Column dtypes and non-null counts of the cleaned frame.
df.info()

#### Target value - <b><i>is_canceled</i></b>
Target value is imbalanced - 75166 records of class '0' and 44224 for class '1'

Ratio 0.588

In [None]:
# Number of bookings per target class.
df.is_canceled.value_counts()


In [None]:
# Ratio of canceled to not-canceled bookings.
# Look the counts up by label: the index of value_counts() holds the string
# categories, so integer keys only worked through the deprecated positional
# fallback and silently depended on which class happens to be more frequent.
class_counts = df.is_canceled.value_counts()
print(f"Classes ratio: {round(class_counts['canceled'] / class_counts['not_canceled'], 3)}")

# EDA 

#### <b>hotel</b> variable - type of a hotel

In [None]:
# Number of bookings per hotel type.
df.hotel.value_counts()


In [None]:
# Plot the 'hotel' column under the title 'Hotel type'.
# Drawing is delegated to the project-local `utils` module (presumably a pie chart - helper not visible here).
utils.get_pie_plot(df,'hotel', 'Hotel type')

In [None]:
# Plot the 'hotel' column via the project-local helper
# (presumably a histogram split by the target column - helper not visible here).
utils.get_histogram_target_plot(df,'hotel', 'Hotel type')

### Prices by hotel type 
Average daily rate by reservation

In [None]:
# Average daily rate per paying guest (adults + children; babies are not counted).
# Bookings with zero adults and zero children would divide by zero and yield
# inf, so the guest count is masked to NaN for those rows.
guests = (df["adults"] + df["children"]).replace(0, np.nan)
df['adr_per_person'] = df["adr"] / guests

In [None]:
# Per-person price distribution of bookings that were not canceled,
# one box per hotel type and reserved room type.
arrived_bookings = df[df['is_canceled'] == 'not_canceled']
fig = px.box(
    arrived_bookings,
    x="hotel",
    y="adr_per_person",
    color='reserved_room_type',
    color_discrete_sequence=colors,
)
fig.update_traces(boxpoints=False)  # draw the boxes only, no individual points
fig.update_layout(
    title_text='Price room by hotel type and room type',
    yaxis_title="Price",
    showlegend=True,
    width=800,
    height=600,
)
fig.show()


In [None]:
# Per-person price distribution of bookings that were not canceled,
# one box per hotel type.
arrived_bookings = df[df['is_canceled'] == 'not_canceled']
fig = px.box(
    arrived_bookings,
    x="hotel",
    y="adr_per_person",
    color_discrete_sequence=colors,
)
fig.update_traces(boxpoints=False)  # draw the boxes only, no individual points
fig.update_layout(
    title_text='Price room by hotel type',
    yaxis_title="Price",
    showlegend=True,
    width=800,
    height=600,
)
fig.show()

### lead_time
As the interval between booking and arrival increases, the number of cancellations also increases. Starting from reservations made 250 days in advance, the number of cancellations exceeds the number of actual arrivals.

In [None]:
# Summary statistics of the lead_time column.
df.lead_time.describe()


In [None]:
# Plot lead_time via the project-local helper.
# NOTE(review): the trailing 50 is presumably the number of bins - confirm in utils.
utils.get_histogram_plot(df,'lead_time', 'days between booking and arriving',50)

In [None]:
# Plot lead_time via the project-local helper (presumably split by the target column).
# Title typo fixed: "respest" -> "respect".
utils.get_histogram_target_plot(df,'lead_time', "days between booking and arriving with respect to cancelation", 50)

In [None]:
# Plot lead_time via the project-local helper (presumably a box plot - helper not visible here).
utils.get_box_plot(df, 'lead_time', "box plot for lead_time column")

### Dates columns
<ul> 
<li><b>arrival_date_year</b></li>
<li><b>arrival_date_month</b></li>
<li><b>arrival_date_week_number</b></li>
<li><b>arrival_date_day_of_month</b></li>
</ul>



From histogram plots we can observe that summer is most popular season and winter is the slowest one. 

A new variable, "Season", will be created with the following categories:
<ul> 
<li><b>winter</b></li>
<li><b>spring</b></li>
<li><b>summer</b></li>
<li><b>autumn</b></li>
</ul>


In [None]:
# Treat arrival year and month as categorical labels before plotting them.
names = ['year', 'month']  # NOTE(review): not referenced in the visible cells
date_columns = ['arrival_date_year', 'arrival_date_month']

# The year goes through 'string' first so its categories are text labels
# rather than numbers.
year_as_text = df['arrival_date_year'].astype('string')
df['arrival_date_year'] = year_as_text.astype('category')
df['arrival_date_month'] = df['arrival_date_month'].astype('category')

utils.get_hist_subplot(df, date_columns, "Years and months Histograms", ("Year", "Month"))

In [None]:
# Plot arrival year and month side by side via the project-local helper
# (presumably split by cancellation status - helper not visible here).
utils.get_subplots_with_cancelation(df,['arrival_date_year','arrival_date_month'],"Years and months Histograms",("year", "month"))

In [None]:
# Plot arrival month on its own via the project-local helper.
utils.get_histogram_target_plot(df, 'arrival_date_month', 'Monthly season')

### Monthly price changes

In [None]:
# Plot monthly prices via the project-local helper
# (which columns it reads is not visible from this notebook).
utils.get_plot_price_monthly(df)

### stays_in_weekend_nights, stays_in_week_nights

In [None]:
# Plot weekend nights and week nights side by side via the project-local helper.
# The title was copy-pasted from the year/month cell; it now names the plotted columns.
utils.get_subplots_with_cancelation(df,['stays_in_weekend_nights','stays_in_week_nights'],"Weekend and week nights Histograms",("weekend", "week"))

In [None]:
# Total nights per booking: weekend nights plus week nights.
df['stay_duration'] = df["stays_in_weekend_nights"].add(df["stays_in_week_nights"])

In [None]:
# Stay length of bookings that were not canceled, as a percentage
# histogram with one bar group per hotel type.
completed_stays = df[df['is_canceled'] == 'not_canceled']
fig = px.histogram(
    completed_stays,
    x='stay_duration',
    color='hotel',
    barmode='group',
    histnorm='percent',
    nbins=15,
    title='Length of stay',
    color_discrete_sequence=colors,
    width=800,
    height=600,
)
fig.update_layout(xaxis_title="Duration (days)", yaxis_title="% of booking by hotel type")
fig.show()

### Adults, children, babies
Classes are unbalanced, can be eliminated 

In [None]:
# Number of bookings per adult count.
df.adults.value_counts()

In [None]:
# Number of bookings per children count.
df.children.value_counts()

In [None]:
# Number of bookings per babies count.
df.babies.value_counts()

In [None]:
# Plot the 'adults' column via the project-local helper.
utils.get_histogram_target_plot(df,'adults', 'Amount of Adults')

In [None]:
# Plot 'babies' and 'children' side by side via the project-local helper.
utils.get_subplots_with_cancelation(df,['babies','children'], 'Children',('babies','children'))

### meal
<ul>
<li>Undefined/SC – no meal package;</li>
<li>BB – Bed & Breakfast;</li>
<li>HB – Half board (breakfast and one other meal – usually dinner);</li>
<li>FB – Full board (breakfast, lunch and dinner)</li>
</ul>

75.5% of bookings are with included breakfast

In [None]:

# Plot the 'meal' column via the project-local helper (presumably a pie chart).
utils.get_pie_plot(df,'meal', 'Type of meal booked')

In [None]:
# Number of bookings for every (hotel, meal) combination.
df.groupby(by=['hotel','meal'])['meal'].count()

In [None]:
# Meal type of bookings that were not canceled, as a percentage histogram
# with one bar group per hotel type.
# The x-axis label was copy-pasted from the stay-length plot ("Duration (days)");
# the axis shows meal types.
fig = px.histogram(df[df['is_canceled'] == 'not_canceled'],x='meal', width=800, height=600, barmode='group', 
                   title = 'Meal type vs Hotel type',color='hotel', color_discrete_sequence=colors, nbins=15, histnorm='percent')
fig.update_layout(
    xaxis_title="Meal type", yaxis_title="% of reservations by hotel type"
)
fig.show()

### Country 
There are 178 countries in 'country' column. 15 countries occur in 90% of records.

Only in Portugal does the number of canceled reservations exceed the number of completed ones.

In [None]:
# The 15 most frequent countries with their booking counts.
df.country.value_counts().head(15)

In [None]:
# Restrict the frame to the 15 most frequent countries and plot them
# via the project-local helper.
top_country_counts = df.country.value_counts().head(15)
freq_countries = top_country_counts.index.tolist()
bookings_from_top_countries = df[df['country'].isin(freq_countries)]
utils.get_histogram_target_plot(bookings_from_top_countries, 'country', 'Top 15 countries where booking were made', 15)


### market_segment and distribution_channel

In [None]:
# Plot the 'market_segment' column via the project-local helper (presumably a pie chart).
utils.get_pie_plot(df, 'market_segment', 'Market segment designation')

In [None]:
# Plot the 'distribution_channel' column via the project-local helper (presumably a pie chart).
utils.get_pie_plot(df, 'distribution_channel', 'Booking distribution channel')


In [None]:
# Market segment as a percentage histogram, one bar group per
# cancellation status.
fig = px.histogram(
    df,
    x='market_segment',
    color='is_canceled',
    barmode='group',
    histnorm='percent',
    nbins=15,
    title='Cancellation vs Market segment designation',
    color_discrete_sequence=colors,
    width=800,
    height=600,
)
fig.update_layout(xaxis_title="Designation", yaxis_title="percents")
fig.show()

In [None]:
# Plot the 'distribution_channel' column via the project-local helper.
# The title 'TBookings' was a garbled leftover; it now names the plotted column.
utils.get_histogram_target_plot( df, 'distribution_channel','Bookings by distribution channel', 15)

In [None]:
# Plot the 'market_segment' column via the project-local helper.
utils.get_histogram_target_plot( df, 'market_segment','Market segment designation', 15)

In [None]:
# Plot distribution channel and market segment side by side via the project-local helper.
utils.get_subplots_with_cancelation(df,['distribution_channel','market_segment'], 'Market Segments',('Booking distribution channel','Market segment designation'))

### is_repeated_guest, previous_cancellations and previous_bookings_not_canceled
The cancellation ratio is higher among bookings by new guests than among returning customers (39% versus 27%).


In [None]:
# Plot the 'is_repeated_guest' column via the project-local helper (presumably a pie chart).
utils.get_pie_plot(df, 'is_repeated_guest', 'Ratio of repeated customers')


In [None]:

# Booking counts per (guest type, cancellation status), then each count as
# a percentage of its guest-type total.
is_repeated_guest = df.groupby(by=['is_repeated_guest','is_canceled']).agg({'is_canceled': 'count'})
# transform('sum') broadcasts every group total back onto the original index.
# This avoids float() on a one-element Series (deprecated) and the duplicated
# group-key index level that groupby.apply adds in pandas >= 2.0.
guests_pcts = 100 * is_repeated_guest / is_repeated_guest.groupby(level=0).transform('sum')
guests_pcts

In [None]:
# Booking counts per (number of previous cancellations, cancellation status),
# then each count as a percentage of its previous-cancellations total.
previous_cancellations = df.groupby(by=['previous_cancellations','is_canceled']).agg({'is_canceled': 'count'})
# transform('sum') broadcasts every group total back onto the original index.
# This avoids float() on a one-element Series (deprecated) and the duplicated
# group-key index level that groupby.apply adds in pandas >= 2.0.
cancel_pcts = 100 * previous_cancellations / previous_cancellations.groupby(level=0).transform('sum')
cancel_pcts

In [None]:
# Plot both previous-booking counters side by side via the project-local helper.
# NOTE(review): the subplot titles are passed as a list here but as a tuple in the other calls - confirm the helper accepts both.
utils.get_subplots_with_cancelation(df,['previous_cancellations','previous_bookings_not_canceled'],'Number of previous bookings that were canceled or not', ['were canceled', 'were not canceled'])

### reserved_room_type and assigned_room_type

In [None]:
# Plot reserved and assigned room type side by side via the project-local helper.
# Title typo fixed: "assigned o the booking" -> "assigned to the booking".
utils.get_subplots_with_cancelation(df,("reserved_room_type", "assigned_room_type"),'Room types: reserved and assigned to the booking',('Reserved type','assigned type'))

In [None]:
# Booking counts per (reserved room type, cancellation status), then each
# count as a percentage of its room-type total.
reserved_room_type = df.groupby(by=['reserved_room_type','is_canceled']).agg({'is_canceled': 'count'})
# transform('sum') broadcasts every group total back onto the original index.
# This avoids float() on a one-element Series (deprecated) and the duplicated
# group-key index level that groupby.apply adds in pandas >= 2.0.
reserved_room_type_pcts = 100 * reserved_room_type / reserved_room_type.groupby(level=0).transform('sum')
reserved_room_type_pcts

In [None]:
# Booking counts per (assigned room type, cancellation status), then each
# count as a percentage of its room-type total.
assigned_room_type = df.groupby(by=['assigned_room_type','is_canceled']).agg({'is_canceled': 'count'})
# transform('sum') broadcasts every group total back onto the original index.
# This avoids float() on a one-element Series (deprecated) and the duplicated
# group-key index level that groupby.apply adds in pandas >= 2.0.
assigned_room_type_pcts = 100 * assigned_room_type / assigned_room_type.groupby(level=0).transform('sum')
assigned_room_type_pcts

Column with reduced room type

In [None]:
#df['decreased_room_type'] = np.where(df['reserved_room_type'] < df['assigned_room_type'],1, 0)

In [None]:
# Cancellation shares among bookings whose reserved room-type code sorts
# before the assigned one (plain string comparison of the codes).
# NOTE(review): treating that ordering as a "downgrade" assumes later letters mean lower room classes - confirm.
# Title typo fixed: "downgraging" -> "downgrading".
utils.get_pie_plot(df.loc[df['reserved_room_type'] < df['assigned_room_type']], 'is_canceled', 'Cancelations with downgrading room type')

In [None]:
# Cancellation shares among bookings whose reserved room-type code sorts
# after the assigned one (plain string comparison of the codes).
# NOTE(review): the title says "updated"; "upgraded" may have been intended - confirm.
utils.get_pie_plot(df.loc[df['reserved_room_type'] > df['assigned_room_type']], 'is_canceled', 'Cancelations with updated room type')

### booking_changes
Number of changes/amendments made to the booking from the moment the booking was entered on the PMS until the moment of check-in or cancellation

In [None]:
# Plot the 'booking_changes' column via the project-local helper.
utils.get_histogram_plot(df,'booking_changes','Number of changes/amendments made to the booking',20)

In [None]:
# Plot 'booking_changes' again, this time through the target-aware helper.
utils.get_histogram_target_plot(df,'booking_changes','Number of changes/amendments made to the booking',20)

### deposit_type
Indication on if the customer made a deposit to guarantee the booking. This variable can assume three categories:

In [None]:
# Plot the 'deposit_type' column via the project-local helper (presumably a pie chart).
utils.get_pie_plot(df,'deposit_type', 'Indication on if the customer made a deposit to guarantee the booking')


In [None]:
# Plot 'deposit_type' through the target-aware helper; the last argument is
# the number of distinct deposit types (presumably used as the bin count).
utils.get_histogram_target_plot(df,'deposit_type', 'Indication on if the customer made a deposit to guarantee the booking', len(df.deposit_type.unique()))

In [None]:
# Deposit type of canceled bookings, as a percentage histogram with one bar
# group per customer type.
# Axis label typo fixed: "Deposite type" -> "Deposit type".
fig = px.histogram(df[df['is_canceled'] == 'canceled'],x='deposit_type', width=800, height=600, barmode='group', 
                   title = 'Canceled bookings by type of group deposit',color='customer_type', color_discrete_sequence=colors, nbins=15, histnorm='percent')
fig.update_layout(
    xaxis_title="Deposit type", yaxis_title="% of cancellation"
)
fig.show()

In [None]:
# Booking counts per (deposit type, cancellation status), then each count
# as a percentage of its deposit-type total.
deposit_type = df.groupby(by=['deposit_type','is_canceled']).agg({'is_canceled': 'count'})
# transform('sum') broadcasts every group total back onto the original index.
# This avoids float() on a one-element Series (deprecated) and the duplicated
# group-key index level that groupby.apply adds in pandas >= 2.0.
deposit_type_pcts = 100 * deposit_type / deposit_type.groupby(level=0).transform('sum')
deposit_type_pcts

In [None]:
# Preview the frame, including the derived columns added above.
# Show only the first rows instead of dumping the whole DataFrame into the output.
df.head()

### Agent 
Agent ID 



In [None]:
# Plot the 'agent' column via the project-local helper.
# NOTE(review): 400 is presumably the bin count - confirm in utils.
utils.get_histogram_target_plot(df,'agent','ID of the travel agency that made the booking', 400)

In [None]:
# Total number of bookings made through the 15 most frequent agents.
df['agent'].value_counts().head(15).sum()

### days_in_waiting_list
Number of days the booking was in the waiting list before it was confirmed to the customer

In [None]:
# Plot the 'days_in_waiting_list' column via the project-local helper.
utils.get_histogram_plot(df,'days_in_waiting_list','Number of days the booking was in the waiting list before it was confirmed', 20)

In [None]:
# Same column through the target-aware helper, restricted to bookings that
# waited fewer than 100 days.
short_waits = df[df['days_in_waiting_list'] < 100]
utils.get_histogram_target_plot(short_waits, 'days_in_waiting_list', 'Number of days the booking was in the waiting list before it was confirmed', 20)

### customer_type
#### Type of booking, assuming one of four categories:
<ul>
<li> Contract - when the booking has an allotment or other type of contract associated to it</li>
<li> Group – when the booking is associated to a group</li>
<li> Transient – when the booking is not part of a group or contract, and is not associated to other transient booking</li>
<li> Transient-party – when the booking is transient, but is associated to at least other transient booking</li>
</ul>

In [None]:
# Plot the 'customer_type' column via the project-local helper (presumably a pie chart).
utils.get_pie_plot(df,'customer_type', 'Type of booking')

In [None]:
# Plot 'customer_type' through the target-aware helper.
utils.get_histogram_target_plot(df,'customer_type', 'Type of booking')

### adr
Average Daily Rate as defined by dividing the sum of all lodging transactions by the total number of staying nights

In [None]:
# Plot the 'adr' column via the project-local helper (presumably a box plot).
utils.get_box_plot(df,'adr', 'Average Daily Rate')

In [None]:
# ADR histogram coloured by cancellation status, restricted to rates below 500.
adr_below_500 = df[df['adr'] < 500]
fig = px.histogram(
    adr_below_500,
    x="adr",
    color="is_canceled",
    marginal="rug",  # plotly also accepts "box", "violin" or "histogram" here
    hover_data=df.columns,
)
fig.show()

### required_car_parking_spaces
Number of car parking spaces required by the customer



In [None]:
# Plot the 'required_car_parking_spaces' column via the project-local helper (presumably a pie chart).
utils.get_pie_plot(df,'required_car_parking_spaces', 'Number of car parking spaces required by the customer')

In [None]:
# Plot 'required_car_parking_spaces' through the target-aware helper.
utils.get_histogram_target_plot(df,'required_car_parking_spaces', 'Number of car parking spaces required by the customer')

### total_of_special_requests
Number of special requests made by the customer (e.g. twin bed or high floor)



In [None]:
# Plot the 'total_of_special_requests' column via the project-local helper (presumably a pie chart).
utils.get_pie_plot(df,'total_of_special_requests', 'Number of special requests made by the customer (e.g. twin bed or high floor)')

In [None]:
# Plot 'total_of_special_requests' through the target-aware helper.
utils.get_histogram_target_plot(df,'total_of_special_requests', 'Number of special requests made by the customer (e.g. twin bed or high floor)')

### reservation_status
#### character
Reservation last status, assuming one of three categories:

<ul>
<li>Canceled – booking was canceled by the customer; </li>
<li>Check-Out – customer has checked in but already departed;</li>
<li>No-Show – customer did not check-in and did inform the hotel of the reason why</li>
</ul>



In [None]:
# Plot the 'reservation_status' column via the project-local helper (presumably a pie chart).
utils.get_pie_plot(df,'reservation_status', 'Reservation last status, assuming one of three categories:')

#### Explore data for Portugal 
72% of cancellations relate to City hotels

In [None]:
# Independent copy of the bookings whose country code is 'PRT'.
is_portugal = df['country'] == 'PRT'
prt_data = df.loc[is_portugal].copy()

In [None]:
# Number of 'PRT' bookings for every (hotel, cancellation status) combination.
prt_data.groupby(by=['hotel','is_canceled'])['is_canceled'].count()

In [None]:
# Pie chart of hotel-type shares among the 'PRT' bookings.
fig = px.pie(
    prt_data,
    names='hotel',
    title='Hotel type in Portugal',
    color_discrete_sequence=px.colors.sequential.thermal,
    width=600,
    height=400,
)
fig.show()

In [None]:
# Number of 'PRT' bookings per agent ID, most frequent first.
prt_data.agent.value_counts()

In [None]:
# Plot the 'agent' column of the 'PRT' subset via the project-local helper.
# NOTE(review): 400 is presumably the bin count - confirm in utils.
utils.get_histogram_target_plot(prt_data,'agent','ID of the travel agency that made the booking', 400)