# Import Libraris

In [160]:
# !pip install evidently
# !pip install ydata_profiling
# !pip install facets-overview
# !pip install holidays
!pip install catboost



In [161]:
import pandas as pd
import holidays
import numpy as np
import mlflow
from mlflow import catboost

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score


from IPython.core.display import HTML, display_html
import base64
from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator

from ydata_profiling import ProfileReport

from evidently import Dataset, DataDefinition, Report
from evidently.presets import DataDriftPreset, DataSummaryPreset

# Load data and EDA

In [162]:
df = pd.read_csv('Hotel Reservations.csv')
df.head(3)

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled


In [163]:
profile = ProfileReport(df, title="Hotel Bookings Profiling", minimal=True)

profile.to_file("report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 19/19 [00:00<00:00, 63.61it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [164]:
df[["avg_price_per_room", "no_of_weekend_nights", "no_of_week_nights"]].corr()

Unnamed: 0,avg_price_per_room,no_of_weekend_nights,no_of_week_nights
avg_price_per_room,1.0,-0.004525,0.022753
no_of_weekend_nights,-0.004525,1.0,0.179577
no_of_week_nights,0.022753,0.179577,1.0


# Feature Genering

In [165]:
ru_holidays = holidays.RU()

meal_codes = {
    'Not Selected': 0,
    'Meal Plan 1': 1,
    'Meal Plan 2': 2,
    'Meal Plan 3': 3
}

def target_to_int(df):
    df['booking_status'] = (df['booking_status'] == 'Canceled').astype('int64')
    return df

def market_to_int(df):
    df['market_segment_type'] = (df['market_segment_type'] == 'Offline').astype('int64')
    return df

def set_meal_codes(x):
    return meal_codes.get(x, 0)

In [166]:
df = target_to_int(df)

df['date'] = pd.to_datetime({
    'year': df['arrival_year'].astype(int),
    'month': df['arrival_month'].astype(int),
    'day': df['arrival_date'].astype(int)
}, errors='coerce')

df = df[df['date'].notna()]

df['is_holiday'] = df['date'].isin(ru_holidays).astype(int)
df['is_weekend'] = (df['date'].dt.weekday >= 5).astype(int)

df["type_of_meal_plan"] = df["type_of_meal_plan"].apply(set_meal_codes)
df["room_type_reserved"] = df["room_type_reserved"].apply(lambda x: int(x.split()[-1]))


In [167]:
df.groupby('market_segment_type')['booking_status'].mean()

market_segment_type
Aviation         0.296000
Complementary    0.000000
Corporate        0.109398
Offline          0.299677
Online           0.365138
Name: booking_status, dtype: float64

In [168]:
df.groupby('market_segment_type')['booking_status'].count()

market_segment_type
Aviation           125
Complementary      390
Corporate         2011
Offline          10518
Online           23194
Name: booking_status, dtype: int64

In [169]:
df = df.sort_values(by='date')

# ML

There are ~2500 corporate, aviation and complementary orders. We will not use overbooking among them

In [170]:
df_not_commercial = df[(df["market_segment_type"]== "Offline") | (df["market_segment_type"]== "Online")]

df_ml = df_not_commercial.drop(columns=["date", "Booking_ID"], axis=1)
df_ml = market_to_int(df_ml)

X = df_ml.drop(columns=["booking_status"], axis=1)
y = df_ml["booking_status"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=239, shuffle=False
)

In [171]:
from catboost import CatBoostClassifier 

params = {'iterations': 162,
 'learning_rate': 0.020659576807258582,
 'depth': 8,
 'l2_leaf_reg': 0.8679838253640907,
 'bootstrap_type': 'Bayesian',
 'random_strength': 1.5305950013403581e-06,
 'bagging_temperature': 0.5829113245739789,
 'od_type': 'IncToDec',
 'od_wait': 40}


# model = CatBoostClassifier(**params)
model = RandomForestClassifier(n_estimators=100, random_state=239)

In [172]:
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.83


# Scoring model

In [173]:
def calculate_profit(TP, TN, FP, FN, AR):
    CO = 1.2 * AR  # стоимость избыточного бронирования
    PP = 0.8  # доля предоплаты
    
    profit = TP * PP * AR + TN * AR - FP * CO
    return profit


In [174]:
def score(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy:.2f}')

    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    TN, FP, FN, TP = cm.ravel()
    AR = X_train['avg_price_per_room'].mean()

    profit = calculate_profit(TP, TN, FP, FN, AR)
    print(f"Profit: {profit:.0f}")
    return profit
    

print("Model evaluation:")
ml_profit = score(y_test, y_pred)

print("\nOne predictor baseline evaluation:")
y_pred[:] = 1
score(y_test, y_pred)

print("\nRandom predictor evaluation:")
y_pred[:] = np.random.randint(0, 2, size=len(y_test))
score(y_test, y_pred)


Model evaluation:
Accuracy: 0.83
[[4010  275]
 [ 903 1555]]
Profit: 523782

One predictor baseline evaluation:
Accuracy: 0.36
[[   0 4285]
 [   0 2458]]
Profit: -337799

Random predictor evaluation:
Accuracy: 0.50
[[2104 2181]
 [1203 1255]]
Profit: 52208


np.float64(52207.96053987916)

# Calculate historical data

In [175]:
def score_losses(df):
    return int(df['avg_price_per_room'].sum() * df["booking_status"].mean())

losses = score_losses(pd.concat([X_test, y_test], axis=1))

usual_profit = int(X_test['avg_price_per_room'].sum()) - losses
print(f"Прибыль без овербукинга на test: {int(usual_profit)}")
print(f"Прибыль с овербукингом на test: {int(ml_profit)}")
print(f"Мы увеличили прибыль на {int(ml_profit - usual_profit)}, те на {round((ml_profit / usual_profit - 1) * 100, 2)}% ")

Прибыль без овербукинга на test: 444180
Прибыль с овербукингом на test: 523781
Мы увеличили прибыль на 79601, те на 17.92% 
