In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import pickle
import os

# ================= LOAD DATA =================
# `df` keeps the raw CSV contents; every derived column is added to the `data` copy.
df = pd.read_csv('hotel.csv')
data = df.copy()

# ================= DATA PREPROCESSING =================
categorical_cols = ['type_of_meal_plan', 'room_type_reserved', 'market_segment_type']

# One LabelEncoder per categorical column. The fitted encoders are kept in
# `le_dict` so they can be pickled further down and reused at prediction time.
le_dict = {col: LabelEncoder() for col in categorical_cols}
for col, le in le_dict.items():
    data[f'{col}_encoded'] = le.fit_transform(data[col])

# Binary target: 1 when booking_status is 'Canceled', 0 for any other value
data['cancellation'] = data['booking_status'].eq('Canceled').astype(int)

# Engineered features: total stay length and total party size
data['total_nights'] = data['no_of_weekend_nights'].add(data['no_of_week_nights'])
data['total_guests'] = data['no_of_adults'].add(data['no_of_children'])

# ================= FEATURE SELECTION =================
# The order of this list is the column order the model is trained on; it is
# pickled below so the same order can be rebuilt when predicting.
feature_cols = [
    # raw booking attributes
    'no_of_adults',
    'no_of_children',
    'no_of_weekend_nights',
    'no_of_week_nights',
    'required_car_parking_space',
    'lead_time',
    'arrival_year',
    'arrival_month',
    'arrival_date',
    'repeated_guest',
    'no_of_previous_cancellations',
    'no_of_previous_bookings_not_canceled',
    'avg_price_per_room',
    'no_of_special_requests',
    # label-encoded categoricals, in the same order as categorical_cols
    *[f'{name}_encoded' for name in categorical_cols],
    # engineered features
    'total_nights',
    'total_guests',
]

X = data[feature_cols]
y = data['cancellation']

# ================= TRAIN-TEST SPLIT =================
# 80/20 split, stratified on the target, with a fixed seed
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ================= MODEL TRAINING =================
rf_params = dict(
    n_estimators=100,
    max_depth=15,
    random_state=42,
    n_jobs=-1,
)
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

# ================= EVALUATION =================
rf_pred = rf_model.predict(X_test)
# Second predict_proba column = probability of class 1 (canceled)
rf_pred_proba = rf_model.predict_proba(X_test)[:, 1]

roc_auc = roc_auc_score(y_test, rf_pred_proba)

banner = "=" * 50
print(banner)
print("Random Forest Model Evaluation")
print(banner)
print(f"ROC-AUC Score: {roc_auc:.4f}")
report = classification_report(
    y_test,
    rf_pred,
    target_names=['Not Canceled', 'Canceled'],
)
print(report)

# ================= SAVE MODEL & ENCODERS =================
os.makedirs("model_files", exist_ok=True)

# Destination path -> object to pickle
artifacts = {
    "model_files/random_forest_model.pkl": rf_model,
    "model_files/encoders.pkl": le_dict,
    "model_files/feature_cols.pkl": feature_cols,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

print("\nModel, encoders, and feature columns saved successfully.")

# ================= PREDICTION FUNCTION =================
def predict_cancellation(booking_data):
    """
    Predict whether a single booking will be canceled using the saved Random Forest model.

    The model, the fitted label encoders and the training feature order are
    re-read from the pickle files under ``model_files/`` on every call.

    Parameters
    ----------
    booking_data : dict
        Column name -> value for one booking. It must contain
        ``no_of_weekend_nights``, ``no_of_week_nights``, ``no_of_adults`` and
        ``no_of_children``, plus every other column named in the saved feature
        list. A categorical column may be supplied raw (it is label-encoded
        here; a value the encoder has not seen is encoded as ``-1``) or already
        encoded under ``<column>_encoded``. ``total_nights`` and
        ``total_guests`` are always recomputed from the raw counts, so any
        values supplied for them are ignored.

    Returns
    -------
    dict
        ``cancellation_probability`` : float, second column of ``predict_proba``
        (class 1, i.e. canceled, for the model trained in this notebook);
        ``prediction`` : ``'Likely to Cancel'`` when the probability is above
        0.5, otherwise ``'Likely to NOT Cancel'``;
        ``risk_level`` : ``'High'`` above 0.7, ``'Medium'`` above 0.4,
        otherwise ``'Low'``.

    Raises
    ------
    FileNotFoundError
        If one of the pickle files under ``model_files/`` does not exist.
    KeyError
        If ``booking_data`` lacks a column needed for feature engineering or
        for the saved feature list.
    """

    def _load_artifact(path):
        # Context manager guarantees the handle is closed (the bare
        # pickle.load(open(...)) form leaks it).
        # NOTE: pickle.load can execute arbitrary code - only load artifacts
        # written by this notebook, never files from an untrusted source.
        with open(path, "rb") as fh:
            return pickle.load(fh)

    # Load model and encoders
    rf_model = _load_artifact("model_files/random_forest_model.pkl")
    encoders = _load_artifact("model_files/encoders.pkl")
    feature_cols = _load_artifact("model_files/feature_cols.pkl")

    df_input = pd.DataFrame([booking_data])

    # Encode every categorical column that has a saved encoder. Driving the
    # loop from the encoders keeps this in step with whatever was trained.
    for col, encoder in encoders.items():
        if col not in df_input.columns:
            # Raw value not supplied; the caller is expected to have passed
            # the '<col>_encoded' value directly.
            continue
        value = df_input[col].iloc[0]
        if value in encoder.classes_:
            df_input[col + '_encoded'] = encoder.transform(df_input[col])
        else:
            df_input[col + '_encoded'] = -1  # unknown category fallback

    # Feature engineering (overrides any caller-supplied derived values)
    df_input['total_nights'] = df_input['no_of_weekend_nights'] + df_input['no_of_week_nights']
    df_input['total_guests'] = df_input['no_of_adults'] + df_input['no_of_children']

    # Arrange columns in the exact order used at training time
    X_input = df_input[feature_cols]

    # Cast to a builtin float so the result is a plain Python number rather
    # than a NumPy scalar (cleaner repr, no NumPy type leaking to callers).
    probability = float(rf_model.predict_proba(X_input)[0, 1])

    if probability > 0.7:
        risk_level = 'High'
    elif probability > 0.4:
        risk_level = 'Medium'
    else:
        risk_level = 'Low'

    return {
        'cancellation_probability': probability,
        'prediction': 'Likely to Cancel' if probability > 0.5 else 'Likely to NOT Cancel',
        'risk_level': risk_level,
    }


# ================= SAMPLE TEST =================
# One hand-written booking used as a smoke test for predict_cancellation.
sample_booking = {
    'no_of_adults': 2,
    'no_of_children': 0,
    'no_of_weekend_nights': 3,
    'no_of_week_nights': 4,
    'required_car_parking_space': 1,
    'lead_time': 700,
    'arrival_year': 2024,
    'arrival_month': 4,
    'arrival_date': 9,
    'repeated_guest': 0,
    'no_of_previous_cancellations': 0,
    'no_of_previous_bookings_not_canceled': 0,
    'avg_price_per_room': 120,
    'no_of_special_requests': 0,

    # Categorical features passed already label-encoded. Because the raw
    # columns are absent, predict_cancellation skips its encoder lookup and
    # uses these integers as-is.
    'type_of_meal_plan_encoded': 2,
    'room_type_reserved_encoded': 2,
    'market_segment_type_encoded': 4,

    # Derived values. predict_cancellation recomputes both from the raw counts
    # above, so they are kept consistent with them here
    # (3 weekend nights + 4 week nights = 7; 2 adults + 0 children = 2).
    'total_nights': 7,
    'total_guests': 2
}

result = predict_cancellation(sample_booking)
print("\nSample Booking Prediction:")
print(result)



Random Forest Model Evaluation
ROC-AUC Score: 0.9539
              precision    recall  f1-score   support

Not Canceled       0.91      0.95      0.93      4878
    Canceled       0.88      0.80      0.84      2377

    accuracy                           0.90      7255
   macro avg       0.89      0.87      0.88      7255
weighted avg       0.90      0.90      0.90      7255


Model, encoders, and feature columns saved successfully.

Sample Booking Prediction:
{'cancellation_probability': np.float64(0.8301488095238095), 'prediction': 'Likely to Cancel', 'risk_level': 'High'}


In [2]:
# Frequency of each distinct value in the prior non-canceled bookings column
# of the raw frame; the bare last expression is what the cell displays.
prior_not_canceled_counts = df['no_of_previous_bookings_not_canceled'].value_counts()
prior_not_canceled_counts

no_of_previous_bookings_not_canceled
0     35463
1       228
2       112
3        80
4        65
5        60
6        36
7        24
8        23
9        19
10       19
11       15
12       12
14        9
15        8
16        7
13        7
17        6
19        6
20        6
18        6
21        6
22        6
25        3
24        3
23        3
27        3
30        2
32        2
44        2
29        2
48        2
28        2
31        2
26        2
53        1
47        1
49        1
34        1
50        1
39        1
33        1
52        1
35        1
37        1
42        1
51        1
38        1
56        1
45        1
55        1
57        1
46        1
43        1
54        1
58        1
41        1
40        1
36        1
Name: count, dtype: int64