In [1]:
# Initial imports.
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the cleaned hotel bookings CSV and preview the first rows.
file = 'Resources/clean_hotel_dataset.csv'
model_df = pd.read_csv(file).reset_index(drop=True)
model_df.head()

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status,total_nights
0,INN00001,2,0,1,2,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled,3
1,INN00002,2,0,2,3,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled,5
2,INN00003,1,0,2,1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled,3
3,INN00004,2,0,0,2,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled,2
4,INN00005,2,0,1,1,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled,2


In [3]:
# Build the feature matrix: keep every column except the booking identifier,
# the target label, and the two text-valued categorical columns.
X = model_df.drop(
    columns=[
        "Booking_ID",
        "booking_status",
        "room_type_reserved",
        "market_segment_type",
    ]
)
X.head()

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_year,arrival_month,arrival_date,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,total_nights
0,2,0,1,2,0,224,2017,10,2,0,0,0,65.0,0,3
1,2,0,2,3,0,5,2018,11,6,0,0,0,106.68,1,5
2,1,0,2,1,0,1,2018,2,28,0,0,0,60.0,0,3
3,2,0,0,2,0,211,2018,5,20,0,0,0,100.0,0,2
4,2,0,1,1,0,48,2018,4,11,0,0,0,94.5,0,2


In [4]:
# Target vector: the booking outcome label of every row, as an array.
y = model_df.loc[:, "booking_status"].values
y[:5]

array(['Not_Canceled', 'Not_Canceled', 'Canceled', 'Canceled', 'Canceled'],
      dtype=object)

In [5]:
# Partition features and target into training and testing subsets.
# The fixed seed keeps the partition identical across re-runs; the split
# proportions are train_test_split's defaults.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [6]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits, so the test split never influences the
# scaling statistics.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)  # fit() hands back the fitted scaler

# Apply the same fitted transformation to the training and testing features.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [7]:
# Instantiate the random forest: 128 trees, seeded for reproducible fits.
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=78,
)

In [8]:
# Train the forest on the scaled training features and their labels.
# The result is assigned back so the cell does not echo the estimator repr.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [9]:
# Predict a booking status for every row of the scaled testing features.
predictions = rf_model.predict(X=X_test_scaled)

In [10]:
# Calculating the confusion matrix.
# The label order is pinned explicitly so that the hard-coded row/column
# captions below are guaranteed to line up with the matrix, instead of
# relying on scikit-learn's implicit sorted-label ordering. It also keeps
# the matrix 2x2 even if one class happens to be absent from the test
# labels or the predictions.
class_labels = ["Canceled", "Not_Canceled"]
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Create a DataFrame from the confusion matrix.
# Rows are the true classes and columns the predicted classes, both in
# the order given by class_labels.
cm_df = pd.DataFrame(
    cm,
    index=["Actual Cancelled", "Actual Not Cancelled"],
    columns=["Predicted Cancelled", "Predicted Not Cancelled"],
)

cm_df

Unnamed: 0,Predicted Cancelled,Predicted Not Cancelled
Actual Cancelled,2269,704
Actual Not Cancelled,312,5784


In [11]:
# Fraction of test rows whose predicted label matches the true label.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [12]:
# Report the evaluation in one place: the confusion matrix table, the
# overall accuracy, and the per-class precision/recall/F1 breakdown.
print("Confusion Matrix Random Forest")
display(cm_df)  # rich table rendering rather than plain-text print
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_true=y_test, y_pred=predictions))

Confusion Matrix Random Forest


Unnamed: 0,Predicted Cancelled,Predicted Not Cancelled
Actual Cancelled,2269,704
Actual Not Cancelled,312,5784


Accuracy Score : 0.8879700077186018
Classification Report
              precision    recall  f1-score   support

    Canceled       0.88      0.76      0.82      2973
Not_Canceled       0.89      0.95      0.92      6096

    accuracy                           0.89      9069
   macro avg       0.89      0.86      0.87      9069
weighted avg       0.89      0.89      0.89      9069

