In [23]:
#Author:  Anthony Tugman
#Title:   Predicting Hotel Reservation Cancellation Rates
#Course:   E599 - Big Data
#Description: Implementation of a basic pythonic framework for predicting hotel reservation cancellation rates

In [None]:
pip install cloudmesh-common -U

In [25]:
#Import required libraries and dependencies
from cloudmesh.common.StopWatch import StopWatch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import *
from matplotlib import pyplot as plt
import seaborn as sns
import pydot

In [26]:
#Read the hotel bookings CSV into a pandas dataframe, timing the load
DATASET_URL = "https://raw.githubusercontent.com/cybertraining-dsc/fa20-523-323/main/project/dataset/hotel_bookings.csv"

#"Code Execution" spans the whole notebook; it is stopped in the final cell
StopWatch.start("Code Execution")

StopWatch.start("Data Load")
df = pd.read_csv(DATASET_URL)
StopWatch.stop("Data Load")
StopWatch.status("Data Load", True)

In [None]:
#Report the size of the freshly loaded data and how many rows are exact duplicates
#The "Prepare Data" timer started here is stopped after the cleaning cells
StopWatch.start("Prepare Data")
initial_duplicates = df.duplicated().sum()
print("shape: ",df.shape)
print("duplicate: ",initial_duplicates)

shape:  (119390, 32)
duplicate:  31994



In [28]:
#Remove categories
#errors='ignore' keeps this cell re-runnable: on a second run the columns are
#already gone and a plain drop would raise KeyError.
UNUSED_COLUMNS = ['country', 'agent', 'babies', 'company', 'children',
                  'reservation_status_date']
df = df.drop(columns=UNUSED_COLUMNS, errors='ignore')

#Remove duplicates
df = df.drop_duplicates()

In [29]:
#Flag each booking whose assigned room type equals the reserved room type,
#then discard the two raw room-type columns in favour of the new flag
room_type_columns = ['reserved_room_type', 'assigned_room_type']
reserved_type = df[room_type_columns[0]]
assigned_type = df[room_type_columns[1]]
df['room_correct'] = reserved_type.eq(assigned_type)
df.drop(columns=room_type_columns, inplace=True)

In [None]:
#Convert strings to numerical values
#One lookup table (category text -> integer code) applied to every non-numeric
#column. The codes are real integers, not digit strings, so the encoded columns
#end up with a numeric dtype.
CATEGORY_CODES = {
    'City Hotel': 0, 'HB': 0, 'Online TA': 0, 'TA/TO': 0, 'No Deposit': 0,
    'Transient': 0, 'Check-Out': 0,
    'Resort Hotel': 1, 'January': 1, 'BB': 1, 'Offline TA/TO': 1, 'GDS': 1,
    'Non Refund': 1, 'Transient-Party': 1, 'Canceled': 1,
    'February': 2, 'SC': 2, 'Groups': 2, 'Refundable': 2, 'Group': 2,
    'No-Show': 2,
    'March': 3, 'FB': 3, 'Direct': 3, 'Contract': 3,
    'April': 4, 'Undefined': 4, 'Corporate': 4,
    'May': 5, 'Complementary': 5,
    'June': 6, 'Aviation': 6,
    'July': 7,
    'August': 8,
    'September': 9,
    'October': 10,
    'November': 11,
    'December': 12,
}

#Columns that are already numeric (including the boolean room_correct) are left
#alone, which also makes re-running this cell harmless.
text_columns = [column for column in df.columns
                if not pd.api.types.is_numeric_dtype(df[column])]
for column in text_columns:
    #Series.map turns unmapped values into NaN, which dropna() below would then
    #silently delete - fail loudly instead so a missing code is noticed.
    unexpected = set(df[column].dropna().unique()) - set(CATEGORY_CODES)
    if unexpected:
        raise ValueError("No numeric code defined for column '%s' values: %s"
                         % (column, sorted(map(str, unexpected))))
    df[column] = df[column].map(CATEGORY_CODES)
df['room_correct'] = df['room_correct'].astype(int)

#Remove duplicate and null values
#dropna() returns a new frame; the result must be assigned to take effect.
df = df.drop_duplicates().dropna()

#With null rows gone the encoded columns can be stored as plain integers
df = df.astype({column: int for column in text_columns})

In [None]:
#Report size and duplicate count again now that cleaning is done,
#then close the "Prepare Data" timer
remaining_duplicates = df.duplicated().sum()
print("shape: ",df.shape)
print("duplicate: ",remaining_duplicates)

StopWatch.stop("Prepare Data")
StopWatch.status("Prepare Data", True)

In [None]:
#Feature columns used by the first model
FEATURE_COLUMNS = [
    'hotel', 'lead_time', 'arrival_date_year', 'arrival_date_month',
    'arrival_date_week_number', 'arrival_date_day_of_month',
    'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'meal',
    'market_segment', 'distribution_channel', 'is_repeated_guest',
    'previous_cancellations', 'previous_bookings_not_canceled',
    'booking_changes', 'deposit_type', 'days_in_waiting_list',
    'customer_type', 'adr', 'required_car_parking_spaces',
    'total_of_special_requests', 'room_correct',
]
X = df[FEATURE_COLUMNS]

#labels
Y = df[['is_canceled']]

#Fixed seed so the split and the forest are reproducible between runs
RANDOM_STATE = 42

#split into training and testing sets
StopWatch.start("Generate Model 1")
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.4, random_state=RANDOM_STATE)
classifier = RandomForestClassifier(random_state=RANDOM_STATE)
#fit() expects a 1-D label array, hence the ravel()
classifier.fit(X_train, y_train.values.ravel())
y_pred = classifier.predict(X_test)
StopWatch.stop("Generate Model 1")
StopWatch.status("Generate Model 1", True)

#Accuracy = percentage of test rows whose predicted label equals the true label.
#(Dividing the number of predicted cancellations by the number of actual
#cancellations only compares totals and can look perfect while individual
#predictions are wrong.)
accuracy = np.mean(y_pred == y_test.values.ravel()) * 100
print(accuracy)

In [None]:
#List the fitted classifier's importance score for each feature, one per line;
#the feature number is the column's position in the training data
importance = classifier.feature_importances_
score_lines = ['Feature: %0d, Score: %.2f' % (position, score)
               for position, score in enumerate(importance)]
print('\n'.join(score_lines))

In [None]:
#Pairwise Pearson correlation of the dataframe's columns, drawn as an annotated heatmap
#`cor` is reused by the next cell to pick the relevant features
cor = df.corr()
fig, ax = plt.subplots(figsize=(12,10))
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds, ax=ax)
plt.show()

In [None]:
#Strength (absolute value) of each column's correlation with the cancellation label
cor_target = cor["is_canceled"].abs()
#Keep only the columns whose absolute correlation exceeds 0.1
is_relevant = cor_target.gt(0.1)
relevant_features = cor_target.loc[is_relevant]
relevant_features

In [None]:
#Remove unnecessary features
X1 = df[['lead_time', 'adr', 'room_correct']]
Y = df[['is_canceled']]

#Fixed seed so the split and the forest are reproducible between runs
RANDOM_STATE = 42

#split into training and testing sets
StopWatch.start("Generate Model 2")
X_train, X_test, y_train, y_test = train_test_split(
    X1, Y, test_size=0.4, random_state=RANDOM_STATE)
classifier = RandomForestClassifier(random_state=RANDOM_STATE)
#fit() expects a 1-D label array, hence the ravel()
classifier.fit(X_train, y_train.values.ravel())
y_pred = classifier.predict(X_test)
StopWatch.stop("Generate Model 2")
StopWatch.status("Generate Model 2", True)

#Accuracy = percentage of test rows whose predicted label equals the true label.
#(Dividing the number of predicted cancellations by the number of actual
#cancellations only compares totals and can look perfect while individual
#predictions are wrong.)
accuracy = np.mean(y_pred == y_test.values.ravel()) * 100
print(accuracy)

In [None]:
#Pearson correlation heatmap restricted to a few selected columns plus the label
final_columns = ['lead_time', 'adr',  'total_of_special_requests', 'room_correct', 'is_canceled']
X2 = df[final_columns]
cor = X2.corr()
fig, ax = plt.subplots(figsize=(12,10))
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds, ax=ax)
plt.show()

In [None]:
#Graph Predicted vs. Actual Values
#Density curves of the actual and predicted labels from the most recent model.
#sns.distplot is deprecated; kdeplot draws the same curve-only plot
#that distplot(hist=False) produced.
fig, ax = plt.subplots(figsize=(12, 10))
#np.ravel flattens y_test to 1-D so it can be passed as a single vector
sns.kdeplot(x=np.ravel(y_test), color="r", label="Actual Values", ax=ax)
sns.kdeplot(x=np.ravel(y_pred), color="b", label="Predicted Values", ax=ax)
ax.set_title('Actual vs Predicted Values for Cancellation')
ax.set_xlabel('is_canceled')
#kdeplot does not draw a legend for `label` on its own
ax.legend()
plt.show()
plt.close(fig)

In [None]:
#Close the notebook-wide timer opened in the data-load cell, then print the
#StopWatch benchmark report
overall_timer = "Code Execution"
StopWatch.stop(overall_timer)
StopWatch.status(overall_timer, True)

StopWatch.benchmark()