In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from google.colab import drive

import joblib
from tqdm import tqdm

# Load the training data from the Excel workbook
df = pd.read_excel('train_data.xlsx')

# Parse ArrivalDate once so the .dt accessor can be used below
df['ArrivalDate'] = pd.to_datetime(df['ArrivalDate'])

# Flag bookings whose LeadTime is greater than 65
df['LongLeadTime'] = (df['LeadTime'] > 65).astype(int)

# Extract the month number from ArrivalDate into its own column
df['ArrivalMonth'] = df['ArrivalDate'].dt.month

# Flag arrivals whose month is strictly between 4 and 9 (i.e. months 5-8).
# NOTE(review): the column name suggests the range 4..9 inclusive; the
# comparison below excludes both endpoints -- confirm which is intended.
df['ArrivalMonth_4_9'] = ((df['ArrivalMonth'] > 4) & (df['ArrivalMonth'] < 9)).astype(int)


# Define features and target variable
X = df.drop(['BookingStatus', 'ArrivalDate'], axis=1)
y = df['BookingStatus']

# Derive indicator features from the raw category labels.
# This must happen BEFORE label-encoding: once the columns hold integer codes,
# comparing them with the strings below is always False and the indicators
# would be constant zero.
# NOTE(review): assumes 'Online' and 'Room_Type 1' are literal values present
# in the data -- verify against the workbook.
X['MarketSegment_Online'] = (X['MarketSegment'] == 'Online').astype(int)
X['RoomType_1'] = (X['RoomType'] == 'Room_Type 1').astype(int)

# 1 when the booking includes at least one weekend night, 0 otherwise
X['WeekendStay'] = (X['NumWeekendNights'] > 0).astype(int)

# Replace each categorical column with integer codes (one encoder per column)
cat_cols = ['MarketSegment', 'MealPlan', 'RoomType']
for col in cat_cols:
    encoder = LabelEncoder()
    X[col] = encoder.fit_transform(X[col])

# Standardize AvgRoomPrice and NumWeekNights.
# NOTE(review): the scaler is fit on every row of X, i.e. before any
# train/test split is made further down the script.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X[['AvgRoomPrice', 'NumWeekNights']] = scaler.fit_transform(X[['AvgRoomPrice', 'NumWeekNights']])

# Base estimator whose hyperparameters are tuned below
model = RandomForestClassifier(random_state=42)

# Candidate hyperparameter values sampled by the randomized search
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# Hold out the test set BEFORE tuning so that no test row takes part in
# hyperparameter selection (stratified to keep the class balance).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Randomized search with 5-fold cross-validation on the training partition only.
# random_state makes the sampled candidates reproducible between runs.
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

# With the default refit=True the search refits the best configuration on the
# data passed to fit(), i.e. on X_train / y_train, so no extra fit is needed.
model = random_search.best_estimator_

# Score the held-out rows and label a row positive (1) when the probability in
# column 1 of predict_proba reaches 0.4, a cut-off chosen to favour recall.
y_prob = model.predict_proba(X_test)
positive_mask = y_prob[:, 1] >= 0.4
y_pred = positive_mask.astype(int)

# Headline classification metrics, all computed with class 1 as the positive label
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=1)
recall = recall_score(y_test, y_pred, pos_label=1)
f1 = f1_score(y_test, y_pred, pos_label=1)

# Report the metrics, one per line
print('Updated Model Performance Metrics:')
for metric_label, metric_value in (
    ('Accuracy:', accuracy),
    ('Precision:', precision),
    ('Recall:', recall),
    ('F1 Score:', f1),
):
    print(metric_label, metric_value)

# Build the confusion matrix for the thresholded predictions
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Unpack the four cells (flattened row by row) and derive the per-class rates
tn, fp, fn, tp = conf_matrix.ravel()
actual_positives = tp + fn
actual_negatives = tn + fp
tpr = tp / actual_positives
tnr = tn / actual_negatives
fpr = fp / actual_negatives
fnr = fn / actual_positives

# Report the matrix followed by the four rates
print('Confusion Matrix:')
print(conf_matrix)
for rate_label, rate_value in (
    ('True Positive Rate:', tpr),
    ('True Negative Rate:', tnr),
    ('False Positive Rate:', fpr),
    ('False Negative Rate:', fnr),
):
    print(rate_label, rate_value)

# Persist the fitted estimator to disk with joblib.
# Only `model` is written; nothing else from this script goes into the file.
model_path = 'hotel_cancellation_model.pkl'
joblib.dump(model, model_path)

Training: 100%|██████████| 1/1 [00:06<00:00,  6.36s/it]


Updated Model Performance Metrics:
Accuracy: 0.8800827015851137
Precision: 0.8103802672147996
Recall: 0.8282563025210085
F1 Score: 0.8192207792207793
Confusion Matrix:
[[3531  369]
 [ 327 1577]]
True Positive Rate: 0.8282563025210085
True Negative Rate: 0.9053846153846153
False Positive Rate: 0.09461538461538462
False Negative Rate: 0.1717436974789916


['hotel_cancellation_model.pkl']