# Import Libraris

In [385]:
# !pip install evidently
# !pip install ydata_profiling
# !pip install facets-overview
# !pip install holidays

In [2]:
import pandas as pd
import holidays
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier  # пример модели
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score


from IPython.core.display import HTML, display_html
import base64
from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator

from ydata_profiling import ProfileReport

from evidently import Dataset, DataDefinition, Report
from evidently.presets import DataDriftPreset, DataSummaryPreset

# Load data and EDA

In [3]:
df = pd.read_csv('Hotel Reservations.csv')
df.head(3)

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled


In [4]:
profile = ProfileReport(df, title="Hotel Bookings Profiling", minimal=True)

profile.to_file("report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|██████████| 19/19 [00:00<00:00, 4991.97it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
df[["avg_price_per_room", "no_of_weekend_nights", "no_of_week_nights"]].corr()

Unnamed: 0,avg_price_per_room,no_of_weekend_nights,no_of_week_nights
avg_price_per_room,1.0,-0.004525,0.022753
no_of_weekend_nights,-0.004525,1.0,0.179577
no_of_week_nights,0.022753,0.179577,1.0


# Feature Genering

In [6]:
ru_holidays = holidays.RU()

meal_codes = {
    'Not Selected': 0,
    'Meal Plan 1': 1,
    'Meal Plan 2': 2,
    'Meal Plan 3': 3
}

def target_to_int(df):
    df['booking_status'] = (df['booking_status'] == 'Canceled').astype('int64')
    return df

def market_to_int(df):
    df['market_segment_type'] = (df['market_segment_type'] == 'Offline').astype('int64')
    return df

def set_meal_codes(x):
    return meal_codes.get(x, 0)

In [7]:
df = target_to_int(df)

df['date'] = pd.to_datetime({
    'year': df['arrival_year'].astype(int),
    'month': df['arrival_month'].astype(int),
    'day': df['arrival_date'].astype(int)
}, errors='coerce')

df = df[df['date'].notna()]

df['is_holiday'] = df['date'].isin(ru_holidays).astype(int)
df['is_weekend'] = (df['date'].dt.weekday >= 5).astype(int)

df["type_of_meal_plan"] = df["type_of_meal_plan"].apply(set_meal_codes)
df["room_type_reserved"] = df["room_type_reserved"].apply(lambda x: int(x.split()[-1]))


In [8]:
df.groupby('market_segment_type')['booking_status'].mean()

market_segment_type
Aviation         0.296000
Complementary    0.000000
Corporate        0.109398
Offline          0.299677
Online           0.365138
Name: booking_status, dtype: float64

In [9]:
df.groupby('market_segment_type')['booking_status'].count()

market_segment_type
Aviation           125
Complementary      390
Corporate         2011
Offline          10518
Online           23194
Name: booking_status, dtype: int64

In [10]:
df = df.sort_values(by='date')


# ML

There are ~2500 corporate, aviation and complementary orders. We will not use overbooking among them

In [19]:
df_not_commercial = df[(df["market_segment_type"]== "Offline") | (df["market_segment_type"]== "Online")]

df_ml = df_not_commercial.drop(columns=["date", "Booking_ID"], axis=1)
df_ml = market_to_int(df_ml)

X = df_ml.drop(columns=["booking_status"], axis=1)
y = df_ml["booking_status"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=239, shuffle=False
)

In [20]:

model = RandomForestClassifier(random_state=239)

In [21]:
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [22]:
import catboost

def objective(trial):
    model = catboost.CatBoostClassifier(
        iterations=trial.suggest_int("iterations", 100, 3000),
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        depth=trial.suggest_int("depth", 4, 15),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        bootstrap_type=trial.suggest_categorical("bootstrap_type", ["Bayesian"]),
        random_strength=trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
        bagging_temperature=trial.suggest_float("bagging_temperature", 0.0, 10.0),
        od_type=trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        od_wait=trial.suggest_int("od_wait", 10, 50),
        verbose=False
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [23]:
from optuna.samplers import TPESampler
import optuna
import catboost

sampler = TPESampler(seed=1)

In [24]:
study = optuna.create_study(study_name="catboost", direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=100)

[I 2025-07-31 22:23:28,147] A new study created in memory with name: catboost
[I 2025-07-31 22:23:34,065] Trial 0 finished with value: 0.8220376686934598 and parameters: {'iterations': 1309, 'learning_rate': 0.027583475549166746, 'depth': 4, 'l2_leaf_reg': 1.0551779964424746e-05, 'bootstrap_type': 'Bayesian', 'random_strength': 2.0931628460945333e-07, 'bagging_temperature': 0.923385947687978, 'od_type': 'Iter', 'od_wait': 26}. Best is trial 0 with value: 0.8220376686934598.
[I 2025-07-31 22:24:13,746] Trial 1 finished with value: 0.8202580453803945 and parameters: {'iterations': 1663, 'learning_rate': 0.006892694481137703, 'depth': 12, 'l2_leaf_reg': 1.10795595820296e-06, 'bootstrap_type': 'Bayesian', 'random_strength': 0.79993910451721, 'bagging_temperature': 0.27387593197926163, 'od_type': 'IncToDec', 'od_wait': 32}. Best is trial 0 with value: 0.8220376686934598.
[W 2025-07-31 22:24:31,436] Trial 2 failed with parameters: {'iterations': 507, 'learning_rate': 0.002490020818620744, 'd

KeyboardInterrupt: 

# Scoring model

In [14]:
def calculate_profit(TP, TN, FP, FN, AR):
    CO = 1.2 * AR  # стоимость избыточного бронирования
    PP = 0.4  # доля предоплаты
    
    profit = TP * PP * AR + TN * AR - FP * CO
    return profit


In [15]:
def score(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy:.2f}')

    cm = confusion_matrix(y_test, y_pred)
    TN, FP, FN, TP = cm.ravel()
    AR = X_train['avg_price_per_room'].mean()

    profit = calculate_profit(TP, TN, FP, FN, AR)
    print(f"Profit: {profit:.0f}")
    return profit
    

print("Model evaluation:")
ml_profit = score(y_test, y_pred)

print("\nZero predictor baseline evaluation:")
y_pred[:] = 0
score(y_test, y_pred)

print("\nOne predictor baseline evaluation:")
y_pred[:] = 1
score(y_test, y_pred)

print("\nRandom predictor evaluation:")
y_pred[:] = np.random.randint(0, 2, size=len(y_test))
score(y_test, y_pred)


Model evaluation:
Accuracy: 0.91
Profit: 510987

Zero predictor baseline evaluation:
Accuracy: 0.66
Profit: 543491

One predictor baseline evaluation:
Accuracy: 0.34
Profit: -466896

Random predictor evaluation:
Accuracy: 0.50
Profit: 28405


np.float64(28405.234640884002)

# Calculate historical data

In [413]:
def score_losses(df):
    return int(df['avg_price_per_room'].mean() * len(df) * df["booking_status"].mean())

losses = score_losses(pd.concat([X_test, y_test], axis=1))

usual_profit = int(X_test['avg_price_per_room'].sum()) - losses
print(f"Прибыль без овербукинга на test: {usual_profit}")
print(f"Прибыль с овербукингом на test: {ml_profit}")
print(f"Мы увеличили прибыль на {ml_profit - usual_profit}, те на {round((ml_profit / usual_profit - 1) * 100, 2)}% ")

Прибыль без овербукинга на test: 468566
Прибыль с овербукингом на test: 510987.08266917575
Мы увеличили прибыль на 42421.082669175754, те на 9.05% 


In [401]:
T = 10000
his_df = df[:T]

In [402]:
P_his = his_df['booking_status'].sum() / T
P_his

np.float64(0.1615)

In [403]:
Avg_rev_his = his_df['avg_price_per_room'].sum() / T
Avg_rev_his

np.float64(86.25557)

In [404]:
Loss = P_his * T * Avg_rev_his
Loss

np.float64(139302.74555000002)