In [1]:
# Initial imports.
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
# Read the cleaned hotel bookings CSV and make sure the frame carries a
# fresh 0..n-1 index before previewing the first rows.
file = 'Resources/clean_hotel_dataset.csv'
booking_df = pd.read_csv(file).reset_index(drop=True)
booking_df.head()

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status,total_nights
0,INN00001,2,0,1,2,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled,3
1,INN00002,2,0,2,3,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled,5
2,INN00003,1,0,2,1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled,3
3,INN00004,2,0,0,2,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled,2
4,INN00005,2,0,1,1,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled,2


In [3]:
# Integer encoding of the target column.
# `le` is the general-purpose encoder that is refit on other columns further
# down, which overwrites its fitted classes_. The target therefore gets its
# own encoder so the integer -> label mapping stays available afterwards
# (status_encoder.classes_ / status_encoder.inverse_transform).
le = LabelEncoder()
status_encoder = LabelEncoder()

# Work on a copy so the raw frame loaded from disk is left untouched.
booking_encoded = booking_df.copy()
booking_encoded["booking_encoded"] = status_encoder.fit_transform(
    booking_encoded["booking_status"]
)
booking_encoded.head(10)

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status,total_nights,booking_encoded
0,INN00001,2,0,1,2,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled,3,1
1,INN00002,2,0,2,3,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled,5,1
2,INN00003,1,0,2,1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled,3,0
3,INN00004,2,0,0,2,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled,2,0
4,INN00005,2,0,1,1,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled,2,0
5,INN00006,2,0,0,2,0,Room_Type 1,346,2018,9,13,Online,0,0,0,115.0,1,Canceled,2,0
6,INN00007,2,0,1,3,0,Room_Type 1,34,2017,10,15,Online,0,0,0,107.55,1,Not_Canceled,4,1
7,INN00008,2,0,1,3,0,Room_Type 4,83,2018,12,26,Online,0,0,0,105.61,1,Not_Canceled,4,1
8,INN00009,3,0,0,4,0,Room_Type 1,121,2018,7,6,Offline,0,0,0,96.9,1,Not_Canceled,4,1
9,INN00010,2,0,0,5,0,Room_Type 4,44,2018,10,18,Online,0,0,0,133.44,3,Not_Canceled,5,1


In [4]:
# Integer-encode the reserved room type into a new numeric column.
# NOTE(review): LabelEncoder assigns codes by sorted class name, so the
# integers carry no real ordering; a linear model will still treat them as
# ordered. Consider one-hot encoding if that matters for the results.
room_type_codes = le.fit_transform(booking_encoded["room_type_reserved"])
booking_encoded["room_type_encoded"] = room_type_codes
booking_encoded.head(10)

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,...,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status,total_nights,booking_encoded,room_type_encoded
0,INN00001,2,0,1,2,0,Room_Type 1,224,2017,10,...,Offline,0,0,0,65.0,0,Not_Canceled,3,1,0
1,INN00002,2,0,2,3,0,Room_Type 1,5,2018,11,...,Online,0,0,0,106.68,1,Not_Canceled,5,1,0
2,INN00003,1,0,2,1,0,Room_Type 1,1,2018,2,...,Online,0,0,0,60.0,0,Canceled,3,0,0
3,INN00004,2,0,0,2,0,Room_Type 1,211,2018,5,...,Online,0,0,0,100.0,0,Canceled,2,0,0
4,INN00005,2,0,1,1,0,Room_Type 1,48,2018,4,...,Online,0,0,0,94.5,0,Canceled,2,0,0
5,INN00006,2,0,0,2,0,Room_Type 1,346,2018,9,...,Online,0,0,0,115.0,1,Canceled,2,0,0
6,INN00007,2,0,1,3,0,Room_Type 1,34,2017,10,...,Online,0,0,0,107.55,1,Not_Canceled,4,1,0
7,INN00008,2,0,1,3,0,Room_Type 4,83,2018,12,...,Online,0,0,0,105.61,1,Not_Canceled,4,1,3
8,INN00009,3,0,0,4,0,Room_Type 1,121,2018,7,...,Offline,0,0,0,96.9,1,Not_Canceled,4,1,0
9,INN00010,2,0,0,5,0,Room_Type 4,44,2018,10,...,Online,0,0,0,133.44,3,Not_Canceled,5,1,3


In [5]:
# Integer-encode the market segment into a new numeric column.
# NOTE(review): as with any label-encoded nominal feature, the integer codes
# follow sorted class names and imply an order the categories do not have.
market_segment_codes = le.fit_transform(booking_encoded["market_segment_type"])
booking_encoded["market_segment_encoded"] = market_segment_codes
booking_encoded.head(10)

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,...,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status,total_nights,booking_encoded,room_type_encoded,market_segment_encoded
0,INN00001,2,0,1,2,0,Room_Type 1,224,2017,10,...,0,0,0,65.0,0,Not_Canceled,3,1,0,3
1,INN00002,2,0,2,3,0,Room_Type 1,5,2018,11,...,0,0,0,106.68,1,Not_Canceled,5,1,0,4
2,INN00003,1,0,2,1,0,Room_Type 1,1,2018,2,...,0,0,0,60.0,0,Canceled,3,0,0,4
3,INN00004,2,0,0,2,0,Room_Type 1,211,2018,5,...,0,0,0,100.0,0,Canceled,2,0,0,4
4,INN00005,2,0,1,1,0,Room_Type 1,48,2018,4,...,0,0,0,94.5,0,Canceled,2,0,0,4
5,INN00006,2,0,0,2,0,Room_Type 1,346,2018,9,...,0,0,0,115.0,1,Canceled,2,0,0,4
6,INN00007,2,0,1,3,0,Room_Type 1,34,2017,10,...,0,0,0,107.55,1,Not_Canceled,4,1,0,4
7,INN00008,2,0,1,3,0,Room_Type 4,83,2018,12,...,0,0,0,105.61,1,Not_Canceled,4,1,3,4
8,INN00009,3,0,0,4,0,Room_Type 1,121,2018,7,...,0,0,0,96.9,1,Not_Canceled,4,1,0,3
9,INN00010,2,0,0,5,0,Room_Type 4,44,2018,10,...,0,0,0,133.44,3,Not_Canceled,5,1,3,4


In [6]:
# Separate the target from the features.
# The booking identifier, the raw text columns (which have integer-encoded
# counterparts) and the target in both its text and encoded form are all
# excluded from the feature matrix.
y = booking_encoded["booking_encoded"]
non_feature_columns = [
    "Booking_ID",
    "room_type_reserved",
    "market_segment_type",
    "booking_status",
    "booking_encoded",
]
X = booking_encoded.drop(columns=non_feature_columns)
X.head()

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_year,arrival_month,arrival_date,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,total_nights,room_type_encoded,market_segment_encoded
0,2,0,1,2,0,224,2017,10,2,0,0,0,65.0,0,3,0,3
1,2,0,2,3,0,5,2018,11,6,0,0,0,106.68,1,5,0,4
2,1,0,2,1,0,1,2018,2,28,0,0,0,60.0,0,3,0,4
3,2,0,0,2,0,211,2018,5,20,0,0,0,100.0,0,2,0,4
4,2,0,1,1,0,48,2018,4,11,0,0,0,94.5,0,2,0,4


In [7]:
# Inspect the dtype of each feature column before handing X to the model.
X.dtypes

no_of_adults                              int64
no_of_children                            int64
no_of_weekend_nights                      int64
no_of_week_nights                         int64
required_car_parking_space                int64
lead_time                                 int64
arrival_year                              int64
arrival_month                             int64
arrival_date                              int64
repeated_guest                            int64
no_of_previous_cancellations              int64
no_of_previous_bookings_not_canceled      int64
avg_price_per_room                      float64
no_of_special_requests                    int64
total_nights                              int64
room_type_encoded                         int32
market_segment_encoded                    int32
dtype: object

In [8]:
# Split into training and test sets.
# random_state pins the shuffle so the split is reproducible; stratify=y
# keeps the class proportions of the target the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y,
)
X_train.shape, X_test.shape

((27206, 17), (9069, 17))

In [9]:
# Create the model: a support vector classifier with a linear kernel.
# NOTE(review): the features reach the SVC unscaled (no scaler is applied
# between building X and fitting). SVMs are sensitive to feature scale, so
# consider standardising the features first - confirm impact on results.
model = SVC(kernel='linear')

In [10]:
# Train the model on the training split only; the test split is held back
# for evaluation.
model.fit(X_train, y_train)

SVC(kernel='linear')

In [11]:
# Predict on the held-out features and line the predictions up against the
# true labels. y_test is converted to a plain array so both columns share a
# fresh 0..n-1 index instead of the shuffled row labels from the split.
y_pred = model.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test.to_numpy()})
results.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,1,1
2,1,1
3,1,1
4,0,0
5,1,1
6,1,1
7,1,1
8,1,1
9,1,1


In [12]:
# Calculate model accuracy: the fraction of test rows whose predicted label
# matches the actual label.
accuracy_score(y_test, y_pred)

0.8022935274010365

In [13]:
# Confusion matrix and classification report
# Rows are actual classes and columns are predicted classes, in sorted label
# order (0 then 1); the names below assume 0 = cancelled, 1 = not cancelled.
row_labels = ["Actual Cancelled", "Actual Not Cancelled"]
column_labels = ["Predicted Cancelled", "Predicted Not Cancelled"]

matrix = confusion_matrix(y_test, y_pred)
matrix_df = pd.DataFrame(matrix, index=row_labels, columns=column_labels)
matrix_df


Unnamed: 0,Predicted Cancelled,Predicted Not Cancelled
Actual Cancelled,1924,1047
Actual Not Cancelled,746,5352


In [14]:
# Per-class precision, recall and F1 on the test split. The report is a
# pre-formatted string, so it is printed rather than displayed as a repr.
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.72      0.65      0.68      2971
           1       0.84      0.88      0.86      6098

    accuracy                           0.80      9069
   macro avg       0.78      0.76      0.77      9069
weighted avg       0.80      0.80      0.80      9069

