# Import Libraris

In [333]:
# !pip install evidently
# !pip install ydata_profiling
# !pip install facets-overview
# !pip install holidays

In [316]:
import pandas as pd
import holidays
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier  # пример модели
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score


from IPython.core.display import HTML, display_html
import base64
from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator

from ydata_profiling import ProfileReport

from evidently import Dataset, DataDefinition, Report
from evidently.presets import DataDriftPreset, DataSummaryPreset

# Load data and EDA

In [317]:
df = pd.read_csv('Hotel Reservations.csv')
df.head(3)

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled


In [318]:
profile = ProfileReport(df, title="Hotel Bookings Profiling", minimal=True)

profile.to_file("report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 19/19 [00:00<00:00, 8038.31it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [319]:
df[["avg_price_per_room", "no_of_weekend_nights", "no_of_week_nights"]].corr()

Unnamed: 0,avg_price_per_room,no_of_weekend_nights,no_of_week_nights
avg_price_per_room,1.0,-0.004525,0.022753
no_of_weekend_nights,-0.004525,1.0,0.179577
no_of_week_nights,0.022753,0.179577,1.0


# Feature Genering

In [320]:
ru_holidays = holidays.RU()

meal_codes = {
    'Not Selected': 0,
    'Meal Plan 1': 1,
    'Meal Plan 2': 2,
    'Meal Plan 3': 3
}

def target_to_int(df):
    df['booking_status'] = (df['booking_status'] == 'Canceled').astype('int64')
    return df

def market_to_int(df):
    df['market_segment_type'] = (df['market_segment_type'] == 'Offline').astype('int64')
    return df

def set_meal_codes(x):
    return meal_codes.get(x, 0)


In [321]:
df = target_to_int(df)

df['date'] = pd.to_datetime({
    'year': df['arrival_year'].astype(int),
    'month': df['arrival_month'].astype(int),
    'day': df['arrival_date'].astype(int)
}, errors='coerce')

df = df[df['date'].notna()]

df['is_holiday'] = df['date'].isin(ru_holidays).astype(int)
df['is_weekend'] = (df['date'].dt.weekday >= 5).astype(int)

df["type_of_meal_plan"] = df["type_of_meal_plan"].apply(set_meal_codes)
df["room_type_reserved"] = df["room_type_reserved"].apply(lambda x: int(x.split()[-1]))


In [322]:
df.groupby('market_segment_type')['booking_status'].mean()

market_segment_type
Aviation         0.296000
Complementary    0.000000
Corporate        0.109398
Offline          0.299677
Online           0.365138
Name: booking_status, dtype: float64

In [323]:
df.groupby('market_segment_type')['booking_status'].count()

market_segment_type
Aviation           125
Complementary      390
Corporate         2011
Offline          10518
Online           23194
Name: booking_status, dtype: int64

In [324]:
df = df.sort_values(by='date')


# ML

There are ~2500 corporate, aviation and complementary orders. We will not use overbooking among them

In [325]:
df_not_commercial = df[(df["market_segment_type"]== "Offline") | (df["market_segment_type"]== "Online")]

df_ml = df_not_commercial.drop(columns=["date", "Booking_ID"], axis=1)
df_ml = market_to_int(df_ml)

X = df_ml.drop(columns=["booking_status"], axis=1)
y = df_ml["booking_status"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=239
)

In [None]:

model = RandomForestClassifier(random_state=239)

In [326]:
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Scoring model

In [327]:
def calculate_profit(TP, TN, FP, FN, AR):
    CO = 0.8 * AR  # стоимость избыточного бронирования
    PC = 1.2 * AR  # штрафные издержки
    PP = 0.4  # доля предоплаты
    
    profit = TP * PP * AR + TN * AR - FP * CO - FN * PC
    return profit


In [328]:
def score(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy:.2f}')

    cm = confusion_matrix(y_test, y_pred)
    TN, FP, FN, TP = cm.ravel()
    AR = X_train['avg_price_per_room'].mean()

    profit = calculate_profit(TP, TN, FP, FN, AR)
    print(f"Profit: {profit:.0f}")
    

print("Model evaluation:")
score(y_test, y_pred)

print("\nZero predictor baseline evaluation:")
y_pred[:] = 0
score(y_test, y_pred)

print("\nOne predictor baseline evaluation:")
y_pred[:] = 1
score(y_test, y_pred)

print("\nRandom predictor evaluation:")
y_pred[:] = np.random.randint(0, 2, size=len(y_test))
score(y_test, y_pred)


Model evaluation:
Accuracy: 0.91
Profit: 457174

Zero predictor baseline evaluation:
Accuracy: 0.66
Profit: 177782

One predictor baseline evaluation:
Accuracy: 0.34
Profit: -278757

Random predictor evaluation:
Accuracy: 0.49
Profit: -59246


# Calculate historical data

In [329]:
T = 10000
his_df = df[:T]

In [330]:
P_his = his_df['booking_status'].sum() / T
P_his

np.float64(0.1615)

In [331]:
Avg_rev_his = his_df['avg_price_per_room'].sum() / T
Avg_rev_his

np.float64(86.25557)

In [332]:
Loss = P_his * T * Avg_rev_his
Loss

np.float64(139302.74555000002)