# Import Libraris

In [185]:
!pip install evidently
!pip install ydata_profiling
!pip install facets-overview
!pip install holidays



In [186]:
import pandas as pd
import holidays
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier  # пример модели
from sklearn.metrics import accuracy_score

from IPython.core.display import HTML, display_html
import base64
from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator

from ydata_profiling import ProfileReport

from evidently import Dataset, DataDefinition, Report
from evidently.presets import DataDriftPreset, DataSummaryPreset

# Load data and EDA

In [187]:
df = pd.read_csv('Hotel Reservations.csv')
df.head(3)

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled


In [188]:
profile = ProfileReport(df, title="Hotel Bookings Profiling", minimal=True)

profile.to_file("report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 19/19 [00:00<00:00, 276.14it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [189]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36275 entries, 0 to 36274
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Booking_ID                            36275 non-null  object 
 1   no_of_adults                          36275 non-null  int64  
 2   no_of_children                        36275 non-null  int64  
 3   no_of_weekend_nights                  36275 non-null  int64  
 4   no_of_week_nights                     36275 non-null  int64  
 5   type_of_meal_plan                     36275 non-null  object 
 6   required_car_parking_space            36275 non-null  int64  
 7   room_type_reserved                    36275 non-null  object 
 8   lead_time                             36275 non-null  int64  
 9   arrival_year                          36275 non-null  int64  
 10  arrival_month                         36275 non-null  int64  
 11  arrival_date   

# Feature Genering

In [190]:
ru_holidays = holidays.RU()

meal_codes = {
    'Not Selected': 0,
    'Meal Plan 1': 1,
    'Meal Plan 2': 2,
    'Meal Plan 3': 3
}

def target_to_int(df):
    df['booking_status'] = (df['booking_status'] == 'Canceled').astype('int64')
    return df

def market_to_int(df):
    df['market_segment_type'] = (df['market_segment_type'] == 'Offline').astype('int64')
    return df

def set_meal_codes(x):
    return meal_codes.get(x, 0)


In [192]:
df = target_to_int(df)

df['date'] = pd.to_datetime({
    'year': df['arrival_year'].astype(int),
    'month': df['arrival_month'].astype(int),
    'day': df['arrival_date'].astype(int)
}, errors='coerce')

df['is_holiday'] = df['date'].isin(ru_holidays).astype(int)
df['is_weekend'] = (df['date'].dt.weekday >= 5).astype(int)

df["type_of_meal_plan"] = df["type_of_meal_plan"].apply(set_meal_codes)
df["room_type_reserved"] = df["room_type_reserved"].apply(lambda x: int(x.split()[-1]))


In [193]:
df.groupby('market_segment_type')['booking_status'].mean()

market_segment_type
Aviation         0.0
Complementary    0.0
Corporate        0.0
Offline          0.0
Online           0.0
Name: booking_status, dtype: float64

In [194]:
df.groupby('market_segment_type')['booking_status'].count()

market_segment_type
Aviation           125
Complementary      391
Corporate         2017
Offline          10528
Online           23214
Name: booking_status, dtype: int64

In [None]:
df = df.sort_values(by='date')


Unnamed: 0,arrival_year,arrival_month,arrival_date
9388,2018,12,31
10759,2018,12,31
10345,2018,12,31
2626,2018,2,29
3677,2018,2,29
5600,2018,2,29
6343,2018,2,29
7648,2018,2,29
8000,2018,2,29
8989,2018,2,29


# prepare df for ML

There are ~2500 corporate, aviation and complementary orders. We will not use overbooking among them

In [176]:
df_not_commercial = df[(df["market_segment_type"]== "Offline") | (df["market_segment_type"]== "Online")]

df_ml = df_not_commercial.drop(columns=["date", "Booking_ID"], axis=1)
df_ml = market_to_int(df_ml)

X = df_ml.drop(columns=["booking_status"], axis=1)
y = df_ml["booking_status"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=239
)

model = RandomForestClassifier(random_state=239)

In [None]:
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Точность модели на тестовых данных: {accuracy:.2f}')

Точность модели на тестовых данных: 0.90


# Calculate historical data

In [178]:
T = 10000
his_df = df[:T]

In [179]:
P_his = his_df['booking_status'].sum() / T
P_his

np.float64(0.3245)

In [180]:
Avg_rev_his = his_df['avg_price_per_room'].sum() / T
Avg_rev_his

np.float64(103.48248799999999)

In [181]:
Loss = P_his * T * Avg_rev_his
Loss

np.float64(335800.67355999997)