In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

## Loading Data

In [2]:
# Load the data
file = ('Resources/clean_hotel_dataset.csv')
booking_df = pd.read_csv(file)
booking_df.reset_index(inplace=True, drop=True)
booking_df.head()

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status,total_nights
0,INN00001,2,0,1,2,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled,3
1,INN00002,2,0,2,3,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled,5
2,INN00003,1,0,2,1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled,3
3,INN00004,2,0,0,2,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled,2
4,INN00005,2,0,1,1,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled,2


## Data Preprocessing

In [3]:
# Columns
booking_df.columns

Index(['Booking_ID', 'no_of_adults', 'no_of_children', 'no_of_weekend_nights',
       'no_of_week_nights', 'required_car_parking_space', 'room_type_reserved',
       'lead_time', 'arrival_year', 'arrival_month', 'arrival_date',
       'market_segment_type', 'repeated_guest', 'no_of_previous_cancellations',
       'no_of_previous_bookings_not_canceled', 'avg_price_per_room',
       'no_of_special_requests', 'booking_status', 'total_nights'],
      dtype='object')

In [4]:
# List dataframe data types
booking_df.dtypes

Booking_ID                               object
no_of_adults                              int64
no_of_children                            int64
no_of_weekend_nights                      int64
no_of_week_nights                         int64
required_car_parking_space                int64
room_type_reserved                       object
lead_time                                 int64
arrival_year                              int64
arrival_month                             int64
arrival_date                              int64
market_segment_type                      object
repeated_guest                            int64
no_of_previous_cancellations              int64
no_of_previous_bookings_not_canceled      int64
avg_price_per_room                      float64
no_of_special_requests                    int64
booking_status                           object
total_nights                              int64
dtype: object

In [5]:
# Integer encoding
le = LabelEncoder()
booking_encoded = booking_df.copy()
booking_encoded['booking_status'] = le.fit_transform(booking_encoded['booking_status'])
booking_encoded.head(10)

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status,total_nights
0,INN00001,2,0,1,2,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,1,3
1,INN00002,2,0,2,3,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,1,5
2,INN00003,1,0,2,1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,0,3
3,INN00004,2,0,0,2,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,0,2
4,INN00005,2,0,1,1,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,0,2
5,INN00006,2,0,0,2,0,Room_Type 1,346,2018,9,13,Online,0,0,0,115.0,1,0,2
6,INN00007,2,0,1,3,0,Room_Type 1,34,2017,10,15,Online,0,0,0,107.55,1,1,4
7,INN00008,2,0,1,3,0,Room_Type 4,83,2018,12,26,Online,0,0,0,105.61,1,1,4
8,INN00009,3,0,0,4,0,Room_Type 1,121,2018,7,6,Offline,0,0,0,96.9,1,1,4
9,INN00010,2,0,0,5,0,Room_Type 4,44,2018,10,18,Online,0,0,0,133.44,3,1,5


In [6]:
# Remove unwanted columns from features data
y = booking_encoded["booking_status"]
X = booking_encoded.drop(columns=["Booking_ID","room_type_reserved","market_segment_type","booking_status"])
X.head()

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_year,arrival_month,arrival_date,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,total_nights
0,2,0,1,2,0,224,2017,10,2,0,0,0,65.0,0,3
1,2,0,2,3,0,5,2018,11,6,0,0,0,106.68,1,5
2,1,0,2,1,0,1,2018,2,28,0,0,0,60.0,0,3
3,2,0,0,2,0,211,2018,5,20,0,0,0,100.0,0,2
4,2,0,1,1,0,48,2018,4,11,0,0,0,94.5,0,2


In [7]:
X.describe()

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_year,arrival_month,arrival_date,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,total_nights
count,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0
mean,1.844962,0.105279,0.810724,2.2043,0.030986,85.232557,2017.820427,7.423653,15.596995,0.025637,0.023349,0.153411,103.423539,0.619655,3.015024
std,0.518715,0.402648,0.870644,1.410905,0.173281,85.930817,0.383836,3.069894,8.740447,0.158053,0.368331,1.754171,35.089424,0.786236,1.786017
min,0.0,0.0,0.0,0.0,0.0,0.0,2017.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,1.0,0.0,17.0,2018.0,5.0,8.0,0.0,0.0,0.0,80.3,0.0,2.0
50%,2.0,0.0,1.0,2.0,0.0,57.0,2018.0,8.0,16.0,0.0,0.0,0.0,99.45,0.0,3.0
75%,2.0,0.0,2.0,3.0,0.0,126.0,2018.0,10.0,23.0,0.0,0.0,0.0,120.0,1.0,4.0
max,4.0,10.0,7.0,17.0,1.0,443.0,2018.0,12.0,31.0,1.0,13.0,58.0,540.0,5.0,24.0


In [8]:
# Check our target values
y.value_counts()

1    24390
0    11885
Name: booking_status, dtype: int64

 ## Split our Data into Training and Testing

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(27206, 15)

 ## Create a Logistic Regression Model

In [10]:
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

## Fit (train) or model using the training data

In [11]:
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(max_iter=200, random_state=1)

 ## Make predictions

In [12]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,1,1
2,1,1
3,1,1
4,0,0
5,1,1
6,1,1
7,1,1
8,1,1
9,1,1


In [13]:
# Print accuracy score
accuracy_score = accuracy_score(y_test, y_pred)
accuracy_score

0.7821148969015327

## Evaluation Metrics

In [14]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
matrix = confusion_matrix(y_test, y_pred)

# Create a DataFrame from confusion matrix
matrix_df = pd.DataFrame (matrix, index=[
    "Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"
])
matrix_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1613,1358
Actual 1,618,5480


In [15]:
# Display classification report
report = classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.72      0.54      0.62      2971
           1       0.80      0.90      0.85      6098

    accuracy                           0.78      9069
   macro avg       0.76      0.72      0.73      9069
weighted avg       0.78      0.78      0.77      9069



In [34]:
# Display results
print("Confusion Matrix: Logistic Regression")
display(matrix_df)
print(f"Accuracy Score : {accuracy_score}")
print("Classification Report")
print(report)

Confusion Matrix: Logistic Regression


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1613,1358
Actual 1,618,5480


Accuracy Score : 0.7821148969015327
Classification Report
              precision    recall  f1-score   support

           0       0.72      0.54      0.62      2971
           1       0.80      0.90      0.85      6098

    accuracy                           0.78      9069
   macro avg       0.76      0.72      0.73      9069
weighted avg       0.78      0.78      0.77      9069

