# Price Timing Baseline (Hotel Bookings Dataset)

Hero feature: estimate whether prices (ADR, the average daily rate) are likely to **increase or decrease** if the user waits to book.

Dataset: https://raw.githubusercontent.com/mpolinowski/hotel-booking-dataset/refs/heads/master/datasets/hotel_bookings.csv


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import HistGradientBoostingRegressor


In [None]:
# Read the hotel bookings CSV from the relative path below.
# The same file is published at:
#   https://raw.githubusercontent.com/mpolinowski/hotel-booking-dataset/refs/heads/master/datasets/hotel_bookings.csv
DATA_PATH = "../hotel_bookings.csv"
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Sanity check: (rows, columns) and the first ten column names.
df.shape, list(df.columns[:10])

: 

In [None]:
# The regression target is `adr` (Average Daily Rate).
# The model predicts ADR from trip/context features, lead_time included, so that
# a later cell can simulate "wait N days" by lowering lead_time and comparing
# the two predictions.

# Keep only rows whose ADR is present and non-negative, working on a copy.
has_valid_adr = df['adr'].notna() & (df['adr'] >= 0)
df = df.loc[has_valid_adr].copy()

# Trim the top 1% of ADR values (computed after the cleanup above) so extreme
# outliers do not dominate the baseline.
q99 = df['adr'].quantile(0.99)
df = df.loc[df['adr'] <= q99]

df[['adr','lead_time']].describe()

In [None]:
# Feature set - chosen to keep the baseline strong and realistic.
#
# NOTE: `reservation_status_date` is intentionally NOT a feature. It records when
# the booking's final status (check-out / cancellation) was set, so it is only
# known after the fact and cannot be supplied when a user asks "should I wait?".
# As a raw date string it would also be one-hot encoded into one column per
# distinct date.
feature_cols = [
    'hotel',
    'lead_time',
    'arrival_date_year',
    'arrival_date_month',
    'arrival_date_week_number',
    'arrival_date_day_of_month',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'adults', 'children', 'babies',
    'meal',
    'country',
    'market_segment',
    'distribution_channel',
    'is_repeated_guest',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'reserved_room_type',
    'assigned_room_type',
    'booking_changes',
    'deposit_type',
    'agent',
    'company',
    'days_in_waiting_list',
    'customer_type',
    'required_car_parking_spaces',
    'total_of_special_requests',
]

target_col = 'adr'

X = df[feature_cols]
y = df[target_col]

# Fixed seed so the 80/20 split (and the reported MAE) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

In [None]:
# Split the features into categorical vs numeric columns.
# Anything that is not a numeric dtype is treated as categorical, so columns
# stored as `object`, `string` or `category` are all one-hot encoded (checking
# only for `dtype == 'object'` would silently route the latter two to the
# numeric median imputer).
categorical_cols = [
    c for c in feature_cols
    if not pd.api.types.is_numeric_dtype(X_train[c])
]
numeric_cols = [c for c in feature_cols if c not in categorical_cols]

# Numeric features: fill gaps with the training median.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are encoded as all zeros.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# OneHotEncoder produces a sparse matrix, and ColumnTransformer keeps the stacked
# output sparse whenever its density is below `sparse_threshold` (default 0.3).
# HistGradientBoostingRegressor only accepts dense input, so force a dense
# result here; otherwise `pipe.fit` fails on the sparse matrix.
# NOTE(review): the dense one-hot matrix can be large for high-cardinality
# columns - keep an eye on memory if more such features are added.
preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
    sparse_threshold=0.0,
)

model = HistGradientBoostingRegressor(random_state=42)

pipe = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', model),
])

pipe

In [None]:
# Train the whole pipeline on the training split.
pipe.fit(X=X_train, y=y_train)

# Score the held-out split; MAE is in the same units as ADR.
pred = pipe.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=pred)
mae

In [None]:
def predict_adr(sample: dict) -> float:
    """Return the fitted pipeline's ADR prediction for one booking.

    Args:
        sample: Mapping from feature name to value. It must contain every
            name in ``feature_cols``; extra keys are ignored.

    Returns:
        The predicted ADR as a plain Python float.
    """
    # Build a one-row frame and put the columns in the order used for training.
    single_row = pd.DataFrame([sample]).loc[:, feature_cols]
    prediction = pipe.predict(single_row)
    return float(prediction[0])

def price_timing_signal(sample: dict, wait_days: int = 7, threshold_pct: float = 2.0):
    """Compare the predicted ADR for booking now versus after waiting.

    Waiting is simulated by lowering ``lead_time`` by ``wait_days`` (never
    below 0) and re-scoring the otherwise identical sample with
    ``predict_adr``.

    Args:
        sample: Feature mapping accepted by ``predict_adr``. A missing,
            ``None`` or NaN ``lead_time`` is treated as 0.
        wait_days: Number of days the user would wait before booking.
        threshold_pct: Size of the dead band, in percent. Predicted changes
            within ``+/- threshold_pct`` are reported as ``'STABLE'``.

    Returns:
        A dict with the two predictions (``adr_now``, ``adr_if_wait``), their
        absolute (``delta``) and relative (``pct_change``) difference, the
        resulting ``decision`` (``'BOOK_NOW'``, ``'WAIT'`` or ``'STABLE'``),
        and the ``wait_days`` / lead-time values that were compared.
    """
    # NaN is truthy, so `value or 0` alone would pass it to int() and raise;
    # treat every "no value" marker (missing key, None, NaN) as 0 instead.
    raw_lead_time = sample.get('lead_time')
    lt = 0 if pd.isna(raw_lead_time) else int(raw_lead_time or 0)

    # Both samples carry the normalised lead_time so the two predictions (and
    # the reported `lead_time_now`) are consistent even when the key was absent.
    now = {**sample, 'lead_time': lt}
    later = {**sample, 'lead_time': max(0, lt - wait_days)}

    adr_now = predict_adr(now)
    adr_later = predict_adr(later)

    delta = adr_later - adr_now
    # Floor the denominator to avoid dividing by zero when the model predicts
    # a zero (or negative) rate for the current sample.
    pct = (delta / max(1e-6, adr_now)) * 100

    if pct > threshold_pct:
        # Price is predicted to rise if the user waits.
        decision = 'BOOK_NOW'
    elif pct < -threshold_pct:
        # Price is predicted to fall if the user waits.
        decision = 'WAIT'
    else:
        decision = 'STABLE'

    return {
        'adr_now': adr_now,
        'adr_if_wait': adr_later,
        'delta': delta,
        'pct_change': pct,
        'decision': decision,
        'wait_days': wait_days,
        'lead_time_now': lt,
        'lead_time_if_wait': later['lead_time'],
    }


In [None]:
# Example: draw one real row from the test split (fixed seed) so every feature
# value, categorical ones included, comes straight from the dataset.
example = X_test.sample(n=1, random_state=42)
example = example.iloc[0].to_dict()
price_timing_signal(sample=example, wait_days=7)