# 02 - Feature Engineering

Goal: create the clean training frame and feature set used across models.

In [None]:
import faulthandler
import sys
import time
from pathlib import Path

# Some notebook frontends replace sys.stderr with an object that has no real
# file descriptor. faulthandler writes straight to a descriptor, so it needs a
# file-like object whose fileno() works.
_debug_log = Path('..') / 'debug_faulthandler.log'

# The chosen stream stays referenced at module level: faulthandler requires
# the file to remain open for as long as the handler is enabled.
_fh_file = None
try:
    # Prefer the interpreter's original stderr. The attribute always exists
    # but may be None, in which case use whatever sys.stderr currently is.
    _fh_file = getattr(sys, '__stderr__', None)
    _fh_target = "the kernel process's original stderr (sys.__stderr__)"
    if _fh_file is None:
        _fh_file = sys.stderr
        _fh_target = "sys.stderr"
    faulthandler.enable(file=_fh_file)
except Exception:
    # Deliberately broad, best-effort fallback: whatever went wrong with the
    # stderr stream (no fileno(), stream is None, ...), log to a file instead.
    _fh_file = open(_debug_log, 'w', encoding='utf-8')
    _fh_target = str(_debug_log)
    faulthandler.enable(file=_fh_file)

# Always pass the stream that enable() accepted. dump_traceback_later() has
# taken file= since faulthandler was introduced; omitting it would send the
# periodic dumps to sys.stderr, the very stream that may lack fileno().
faulthandler.dump_traceback_later(10, repeat=True, file=_fh_file)

print("Debug watchdog enabled (stack traces every 10s).", flush=True)
# Report the real destination; the log file only exists on the fallback path.
print(f"Stack traces are written to: {_fh_target}", flush=True)


def _timed(label: str):
    """Announce a step and return a callback that reports its duration.

    Prints ``label`` immediately. Calling the returned function prints
    ``"<label> done in <seconds>s"``, followed by ``" | <extra>"`` when a
    non-empty ``extra`` string is supplied. Both prints are flushed so the
    progress is visible even if a later step hangs.

    Args:
        label: Human-readable name of the step being timed.

    Returns:
        A function ``_done(extra="")`` that prints the elapsed time measured
        from the moment ``_timed`` was called and returns ``None``.
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # (or go negative) when the system wall clock is adjusted mid-step.
    started = time.perf_counter()
    print(label, flush=True)

    def _done(extra: str = "") -> None:
        elapsed = time.perf_counter() - started
        suffix = f" | {extra}" if extra else ""
        print(f"{label} done in {elapsed:.2f}s{suffix}", flush=True)

    return _done


UnsupportedOperation: fileno

In [None]:
from pathlib import Path
import sys

sys.path.append(str(Path('..').resolve()))

import pandas as pd
from src.price_model import prepare_training_frame, DEFAULT_FEATURE_COLS

# STEP 1: load the raw bookings CSV (path is relative to the notebook's
# working directory) and report how long the read took plus the frame shape.
_done = _timed('STEP 1: read hotel_bookings.csv')
df = pd.read_csv('../data/hotel_bookings.csv')
_done(extra=f"shape={df.shape}")

# STEP 2: build the training frame with the project helper from
# src.price_model. Per the step label it filters rows and caps outliers;
# the exact rules live in that module and are not visible here.
_done = _timed('STEP 2: prepare_training_frame (filter + outlier cap)')
frame = prepare_training_frame(df)
_done(extra=f"shape={frame.shape}")

# Preview the first rows of the default feature columns together with 'adr'
# (presumably the prediction target — confirm against src.price_model).
frame[DEFAULT_FEATURE_COLS + ['adr']].head()