In [2]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [8]:
# Loading data
file_path = ('Resources/clean_hotel_dataset.csv')
df_hotel = pd.read_csv(file_path)
df_hotel.head()

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status,total_nights
0,INN00001,2,0,1,2,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled,3
1,INN00002,2,0,2,3,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled,5
2,INN00003,1,0,2,1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled,3
3,INN00004,2,0,0,2,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled,2
4,INN00005,2,0,1,1,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled,2


In [9]:
# Define the features set.
X = df_hotel.copy()
X = X = X.drop(["Booking_ID","booking_status","room_type_reserved","market_segment_type"],axis =1)
X.head()

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_year,arrival_month,arrival_date,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,total_nights
0,2,0,1,2,0,224,2017,10,2,0,0,0,65.0,0,3
1,2,0,2,3,0,5,2018,11,6,0,0,0,106.68,1,5
2,1,0,2,1,0,1,2018,2,28,0,0,0,60.0,0,3
3,2,0,0,2,0,211,2018,5,20,0,0,0,100.0,0,2
4,2,0,1,1,0,48,2018,4,11,0,0,0,94.5,0,2


In [10]:
y = df_hotel["booking_status"].values.reshape(-1, 1)
y[:5]

array([['Not_Canceled'],
       ['Not_Canceled'],
       ['Canceled'],
       ['Canceled'],
       ['Canceled']], dtype=object)

In [11]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [12]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(27206, 15)
(9069, 15)
(27206, 1)
(9069, 1)


In [13]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=78, train_size=0.80)

In [14]:
print(X_train2.shape)
print(X_test2.shape)
print(y_train2.shape)
print(y_test2.shape)

(29020, 15)
(7255, 15)
(29020, 1)
(7255, 1)


In [18]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [19]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [20]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [21]:
# Creating the decision tree classifier instance
model = tree.DecisionTreeClassifier()

In [22]:
# Fitting the model
model = model.fit(X_train_scaled, y_train)

In [23]:
# Making predictions using the testing data
predictions = model.predict(X_test_scaled)

In [24]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [25]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2275,698
Actual 1,699,5397


Accuracy Score : 0.8459587606130775
Classification Report
              precision    recall  f1-score   support

    Canceled       0.76      0.77      0.77      2973
Not_Canceled       0.89      0.89      0.89      6096

    accuracy                           0.85      9069
   macro avg       0.83      0.83      0.83      9069
weighted avg       0.85      0.85      0.85      9069

