In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the bookings table from the CSV file that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer='hotel.csv')

## Exploratory Data Analysis and Data Cleaning

In [None]:
# Dimensions of the loaded table as a (number of rows, number of columns) tuple.
df.shape

In [None]:
# Preview the first rows to see what each column holds.
# NOTE(review): at this point the frame still contains the name, email,
# phone-number and credit_card columns (they are only dropped further down),
# so clear this cell's output before sharing the notebook.
df.head()

In [None]:
# List the column names available in the loaded table.
df.columns

In [None]:
# Column dtypes and non-null counts, used to spot missing values and columns
# that were read with the wrong type.
df.info()

In [None]:
# Convert the status date column to datetime. errors='coerce' turns any value
# that cannot be parsed into NaT instead of raising, so bad dates show up as
# missing values in the null counts.
df = df.assign(
    reservation_status_date=lambda frame: pd.to_datetime(
        frame['reservation_status_date'], errors='coerce'
    )
)

In [None]:
# Re-check the dtypes to confirm reservation_status_date is now a datetime column.
df.info()

In [None]:
# Build a separate frame without the personally identifying columns.
# `columns=` already selects the axis, so no `axis` argument is needed.
# NOTE(review): df_clean is not referenced by any later cell visible in this
# notebook; the same columns are dropped from df itself further down.
pii_columns = ['name', 'email', 'phone-number', 'credit_card']
df_clean = df.drop(columns=pii_columns)

In [None]:
# NOTE(review): this inspects df, not the df_clean frame created in the
# previous cell, so the name/email/phone-number/credit_card columns are still
# listed here.
df.info()

In [None]:
# Summary of the text (object-dtype) columns only: count, number of unique
# values, most frequent value and its frequency.
df.describe(include = 'object')

In [None]:
# Print the distinct values of every text (object-dtype) column so that
# misspelled or unexpected categories are easy to spot.
# select_dtypes picks the column names directly instead of computing a full
# describe() summary just to read its column index.
for col in df.select_dtypes(include='object').columns:
    print(col)
    print(df[col].unique())
    print('-' * 50)  # visual separator between columns

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Remove the 'company' and 'agent' columns entirely, then discard every row
# that still has a missing value in any remaining column. This includes rows
# whose reservation_status_date was coerced to NaT during date parsing.
#
# df is rebound rather than mutated in place, and errors='ignore' tolerates the
# columns already being gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=['company', 'agent'], errors='ignore').dropna()

In [None]:
# Confirm that no missing values remain after the drop/dropna step.
df.isnull().sum()

In [None]:
# Remove the personally identifying columns from the working frame.
# Rebinding df with errors='ignore' keeps the cell safe to re-run: a second
# execution is a no-op instead of a KeyError for the already-dropped columns.
df = df.drop(
    columns=['email', 'name', 'phone-number', 'credit_card'],
    errors='ignore',
)

In [None]:
# Missing-value counts again, now without the dropped identifying columns.
df.isnull().sum()

In [None]:
# Summary statistics of the numeric columns (count, mean, std, min,
# quartiles, max); useful for spotting implausible extremes.
df.describe()

In [None]:
# Keep only bookings whose average daily rate (adr) is below 50000, treating
# anything at or above that as an outlier.
# .copy() makes the result an independent frame, so adding columns to df later
# (e.g. df['month'] = ...) does not write into a slice of the previous frame
# and trigger SettingWithCopyWarning.
df = df[df['adr'] < 50000].copy()

## Data Analysis and Visualization

In [None]:
# is_canceled encoding: 0 = not canceled, 1 = canceled.

# Share of each status among all bookings.
cancelled_perc = df['is_canceled'].value_counts(normalize=True)
print(cancelled_perc)

# value_counts() orders its result by frequency, not by value, so the counts
# are reindexed to the fixed order [0, 1]. That guarantees each bar gets the
# matching label whichever status is more common; a status that never occurs
# is shown as 0.
status_labels = {0: 'Not canceled', 1: 'Canceled'}
status_counts = (
    df['is_canceled']
    .value_counts()
    .reindex(list(status_labels), fill_value=0)
)

plt.figure(figsize=(5, 4))
plt.title('Reservation status count')
plt.bar(
    list(status_labels.values()),
    status_counts.to_numpy(),
    edgecolor='k',
    width=0.7,
)
plt.show()

In [None]:
# Number of reservations per hotel type, split by cancellation status
# (hue values: 0 = not canceled, 1 = canceled).
plt.figure(figsize=(4, 8))  # 4 wide by 8 high: a tall, narrow figure
ax1 = sns.countplot(x='hotel', hue='is_canceled', data=df, palette='Blues')
# Anchor the legend box to the point (1, 1) in axes coordinates, i.e. the
# top-right corner of the plot area. The previously fetched legend handles
# were never used, so that lookup is omitted.
ax1.legend(bbox_to_anchor=(1, 1))
plt.title('Reservation status in different hotels', size=20)
plt.xlabel('hotel')
plt.ylabel('number of reservation')
plt.show()


In [None]:
# Bookings made at the resort hotel, followed by the share of them that were
# kept (0) versus canceled (1).
is_resort = df['hotel'].eq('Resort Hotel')
resort_hotel = df.loc[is_resort]
resort_hotel['is_canceled'].value_counts(normalize=True)

In [None]:
# Bookings made at the city hotel, followed by the share of them that were
# kept (0) versus canceled (1).
is_city = df['hotel'].eq('City Hotel')
city_hotel = df.loc[is_city]
city_hotel['is_canceled'].value_counts(normalize=True)

In [None]:
# Collapse each hotel's bookings to one row per reservation_status_date holding
# the mean adr for that date. The date becomes the index of the result.
# Note that both names are rebound here: from this cell on they refer to the
# daily aggregates, no longer to the row-level booking subsets.
resort_hotel = resort_hotel.groupby('reservation_status_date').agg({'adr': 'mean'})
city_hotel = city_hotel.groupby('reservation_status_date').agg({'adr': 'mean'})

In [None]:
# Daily mean adr over time, one line per hotel type.
plt.figure(figsize=(20, 8))
plt.title('Average Daily Rate in city and Resort Hotel', fontsize=30)
plt.plot(resort_hotel.index, resort_hotel['adr'], label='Resort Hotel')
plt.plot(city_hotel.index, city_hotel['adr'], label='city Hotel')
plt.xlabel('reservation status date')
plt.ylabel('average daily rate (adr)')
# The label= arguments above are only displayed once a legend is drawn;
# without it the two lines cannot be told apart.
plt.legend(fontsize=20)
plt.show()

In [None]:
# Calendar month (1-12) taken from reservation_status_date. The column is
# stored on df because a later cell groups by it.
df['month'] = df['reservation_status_date'].dt.month

# Number of reservations per month, split by cancellation status.
plt.figure(figsize=(14, 8))
ax1 = sns.countplot(x='month', hue='is_canceled', data=df, palette='bright')
plt.title('Reservation status per month', size=20)
plt.xlabel('month')
plt.ylabel('number of reservations')
# One legend call carries both the readable labels (hue order: 0, then 1) and
# the anchor point. Previously an anchored legend was created and then
# immediately replaced by a second, un-anchored one.
ax1.legend(['not canceled', 'canceled'], bbox_to_anchor=(1, 1))
plt.show()

In [None]:
# Total adr of canceled bookings per calendar month.
# Relies on the 'month' column that an earlier cell adds to df.
# NOTE(review): this is the SUM of adr over canceled bookings, so a month with
# more cancellations gets a taller bar even at identical prices. Confirm
# whether the mean was intended before reading the chart as a price effect.
canceled_rows = df.loc[df['is_canceled'] == 1]
monthly_adr_data = canceled_rows.groupby('month', as_index=False)['adr'].sum()

plt.figure(figsize=(12, 8))
plt.title('ADR per month', fontsize=30)
sns.barplot(data=monthly_adr_data, x='month', y='adr')
plt.show()
# Author's interpretation: higher price = higher cancellation.

In [None]:
# Canceled bookings only; this frame is reused by later cells.
cancelled_data = df.loc[df['is_canceled'] == 1]

# The ten countries with the most canceled bookings, most frequent first.
top_10_country = cancelled_data['country'].value_counts().head(10)

# The pie slices are shares within these ten countries, not within all
# cancellations, because only the top-10 counts are passed in.
plt.figure(figsize=(8, 8))
plt.title('Top 10 countries with reservation canceled')
plt.pie(top_10_country, labels=top_10_country.index, autopct='%.2f')
plt.show()

In [None]:
# Number of bookings per market segment, most common first.
df['market_segment'].value_counts()

In [None]:
# Share of all bookings per market segment.
# Author's reading of the output: most customers book online rather than
# through offline channels.
df['market_segment'].value_counts(normalize= True)

In [None]:
# Share of canceled bookings per market segment.
# Author's reading of the output: most cancellations come from online bookings.
# Hypotheses offered by the author (not tested in this notebook):
#   1. the place may differ from how it was presented online;
#   2. the facilities may be more limited than expected.
cancelled_data['market_segment'].value_counts(normalize=True)

In [None]:
# Mean adr per reservation_status_date for canceled bookings, as a flat table
# (date, adr) ordered by date. Built as one chain so the intermediate frames
# are never mutated in place.
cancelled_df_adr = (
    cancelled_data
    .groupby('reservation_status_date')[['adr']]
    .mean()
    .reset_index()
    .sort_values('reservation_status_date')
)

# Same aggregation for bookings that were not canceled.
not_cancelled_data = df[df['is_canceled'] == 0]
not_cancelled_df_adr = (
    not_cancelled_data
    .groupby('reservation_status_date')[['adr']]
    .mean()
    .reset_index()
    .sort_values('reservation_status_date')
)

# Compare the two daily series over the full date range.
plt.figure(figsize=(20, 6))
plt.title('Average Daily Rate')
plt.plot(not_cancelled_df_adr['reservation_status_date'], not_cancelled_df_adr['adr'], label='not cancelled')
plt.plot(cancelled_df_adr['reservation_status_date'], cancelled_df_adr['adr'], label='cancelled')
plt.legend()
# End with plt.show() like the other plotting cells, so the cell does not
# print the Legend object's repr as its output.
plt.show()

In [None]:
# Restrict both daily-adr tables to the same date window: strictly later than
# '2016' and strictly earlier than '2017-09' (pandas compares the datetime
# column against these strings as dates; presumably 2016-01-01 and
# 2017-09-01 - TODO confirm).
status_day = cancelled_df_adr['reservation_status_date']
cancelled_df_adr = cancelled_df_adr.loc[(status_day > '2016') & (status_day < '2017-09')]

status_day = not_cancelled_df_adr['reservation_status_date']
not_cancelled_df_adr = not_cancelled_df_adr.loc[(status_day > '2016') & (status_day < '2017-09')]

In [None]:
# Daily mean adr for kept versus canceled bookings, drawn from the
# date-filtered tables.
plt.figure(figsize=(20, 6))
plt.title('Average Daily Rate', fontsize=30)
for daily_adr, series_label in (
    (not_cancelled_df_adr, 'not cancelled'),
    (cancelled_df_adr, 'cancelled'),
):
    plt.plot(daily_adr['reservation_status_date'], daily_adr['adr'], label=series_label)
plt.legend(fontsize=20)
plt.show()