# 🏨 Persona-Aware Hotel Discovery
This notebook recommends Booking.com listings for a selected persona (Family / Remote Worker / Tourist).
It ranks listings using student model predictions and displays key Booking metadata.

How to use:

- Choose Trip type (mandatory).

- Optionally type Country and/or City (leave empty for broader results).

- Choose Top-K and run the next cell.

Score meaning:  
**Match score** is the predicted probability that a listing fits the selected persona, shown as a percentage (higher = better fit).

In [0]:
from pyspark.sql import functions as F
import re

# Parquet inputs: the combined prediction table and the cleaned Booking listings.
PRED_COMBINED_PATH = "dbfs:/tmp/booking_stage5/predictions_v1/pred_combined_best"
BOOKING_CLEAN_PATH = "dbfs:/tmp/booking_clean/booking_clean.parquet"

# Free-text location filters; an empty value means "do not filter on this field".
dbutils.widgets.text(name="country", defaultValue="", label="Country (optional)")
dbutils.widgets.text(name="city", defaultValue="", label="City (optional)")

# Persona selector: a dropdown always has a value, which is what makes it mandatory.
dbutils.widgets.dropdown(
    name="trip_type",
    defaultValue="remote",
    choices=["family", "tourist", "remote"],
    label="Trip type (required)",
)

# Number of recommendations to show; widget values are strings.
dbutils.widgets.dropdown(
    name="top_k",
    defaultValue="10",
    choices=["5", "10", "20", "50"],
    label="Top K",
)
def require_cols(df, required, name):
    """Fail fast when ``df`` lacks any of the ``required`` columns.

    Args:
        df: Object exposing a ``columns`` collection of column names.
        required: Column names that must all be present.
        name: Dataset label, used as a prefix in the error message.

    Raises:
        ValueError: If at least one required column is missing. ``ValueError``
            subclasses ``Exception``, so existing ``except Exception`` handlers
            still catch it.
    """
    # Read the column list once instead of once per membership test.
    available = df.columns
    missing = [c for c in required if c not in available]
    if missing:
        raise ValueError(
            f"[{name}] Missing required columns: {missing}\nAvailable: {available}"
        )

def norm_col(col):
    """Normalize a string column the same way ``norm_text`` normalizes widget input.

    Steps: replace every character that is not an ASCII letter, digit or
    whitespace with a space, collapse whitespace runs to a single space,
    trim, and lowercase.

    The collapse step matters: "St. John's" would otherwise become
    "st  john s" (two spaces) in the column, while ``norm_text`` yields
    "st john s" for the same user input, so a substring match would fail.

    Args:
        col: Spark column expression holding text.

    Returns:
        Spark column expression with the normalized text.
    """
    no_punct = F.regexp_replace(col, r"[^a-zA-Z0-9\s]", " ")
    single_spaced = F.regexp_replace(no_punct, r"\s+", " ")
    return F.lower(F.trim(single_spaced))

def norm_text(s: str) -> str:
    """Normalize free-text user input for substring matching.

    ``None`` maps to the empty string. Otherwise the text is trimmed and
    lowercased, every character other than ``a-z``, ``0-9`` or whitespace is
    turned into a space, and whitespace runs are collapsed to one space.
    """
    if s is None:
        return ""
    lowered = s.strip().lower()
    alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
    return re.sub(r"\s+", " ", alnum_only).strip()

def has_rows(df_):
    """Return True when ``df_`` contains at least one row.

    Only a single row is requested before counting, so the check does not
    need to count the full dataset.
    """
    probe_count = df_.limit(1).count()
    return probe_count > 0


# Load both parquet inputs.
pred = spark.read.parquet(PRED_COMBINED_PATH)
hotels = spark.read.parquet(BOOKING_CLEAN_PATH)

# Validate the schemas up front so a missing column fails here with a clear
# message rather than somewhere inside the filtering/ranking code.
for frame, needed, label in (
    (pred, ["hotel_id","p_family","p_remote","p_tourist"], "predictions"),
    (hotels, ["hotel_id","title","url","city","country","review_score","number_of_reviews"], "booking_clean"),
):
    require_cols(frame, needed, label)

# Expose the listing title as ``hotel_name`` and add normalized copies of
# city/country that the location filters match against.
hotels_norm = hotels.withColumnRenamed("title", "hotel_name")
hotels_norm = hotels_norm.withColumn("city_norm", norm_col(F.col("city")))
hotels_norm = hotels_norm.withColumn("country_norm", norm_col(F.col("country")))

# Inner join: keep only hotels that have both predictions and listing metadata.
df = pred.join(hotels_norm, on="hotel_id", how="inner")

# -------------------------
# Inputs
# -------------------------
# Location inputs are normalized with norm_text(); "" means "no filter".
country_in = norm_text(dbutils.widgets.get("country"))
city_in = norm_text(dbutils.widgets.get("city"))

# The dropdown always holds one of its choices, so the trip type is never empty.
trip = dbutils.widgets.get("trip_type")

# Widget values are strings; convert and cap the result size at 50.
requested_k = int(dbutils.widgets.get("top_k"))
top_k = min(requested_k, 50)

# Prediction column holding the score for the selected persona.
persona_to_column = {"family":"p_family", "tourist":"p_tourist", "remote":"p_remote"}
score_col = persona_to_column[trip]

# -------------------------
# Filtering logic (4 cases + fallbacks)
# -------------------------
# Substring predicates on the normalized columns. They are only applied in the
# branches where the corresponding input is non-empty.
country_match = F.col("country_norm").contains(F.lit(country_in))
city_match = F.col("city_norm").contains(F.lit(city_in))

# Default outcome (also Case 1: no country, no city): global results, no warning.
warning = None
filtered = df

if country_in and not city_in:
    # Case 2: country only; fall back to global results when nothing matches.
    filtered_country = df.filter(country_match)
    if has_rows(filtered_country):
        filtered = filtered_country
    else:
        warning = "No matches for country input; showing global top results."

elif city_in and not country_in:
    # Case 3: city only; fall back to global results when nothing matches.
    filtered_city = df.filter(city_match)
    if has_rows(filtered_city):
        filtered = filtered_city
    else:
        warning = "No matches for city input; showing global top results."

elif country_in and city_in:
    # Case 4: both given; try country+city, then city alone, then global.
    filtered_country = df.filter(country_match)
    filtered_both = filtered_country.filter(city_match)

    if has_rows(filtered_both):
        filtered = filtered_both
    else:
        # The country value is treated as the unreliable part, so it is dropped first.
        filtered_city = df.filter(city_match)
        if has_rows(filtered_city):
            filtered = filtered_city
            warning = "No matches for country+city; tried city-only; showing city-only results."
        else:
            warning = "No matches for country+city; tried city-only; showing global top results."

# -------------------------
# Rank and select
# -------------------------
# Boolean conditions behind the 0/1 tie-breaker flags.
name_is_usable = F.col("hotel_name").isNotNull() & (F.length(F.trim(F.col("hotel_name"))) > 0)
reviews_exist = F.col("number_of_reviews").isNotNull() & (F.col("number_of_reviews") > 0)

# Attach the persona score, its percentage form, and the two flags.
scored = (
    filtered
    .withColumn("score", F.col(score_col))
    .withColumn("score_pct", F.round(F.col("score") * 100, 1))
    .withColumn("title_present", F.when(name_is_usable, F.lit(1)).otherwise(F.lit(0)))
    .withColumn("reviews_present", F.when(reviews_exist, F.lit(1)).otherwise(F.lit(0)))
)

# Sort by score first; ties prefer listings with a title, then with reviews,
# then more reviews, then a higher review score (nulls count as 0).
ranking = [
    F.col("score").desc(),
    F.col("title_present").desc(),
    F.col("reviews_present").desc(),
    F.coalesce(F.col("number_of_reviews"), F.lit(0)).desc(),
    F.coalesce(F.col("review_score"), F.lit(0.0)).desc(),
]

result = scored.orderBy(*ranking).limit(top_k)

# -------------------------
# Display: table
# -------------------------
# Columns intended for a tabular view, restricted to those that actually exist
# in ``result``.
# NOTE(review): nothing in the visible code reads ``display_cols``; presumably
# it was meant for a display(result.select(...)) call - confirm.
table_candidates = (
    "hotel_id", "hotel_name", "url", "city", "country",
    "review_score", "number_of_reviews",
    "score",
)
display_cols = [col_name for col_name in table_candidates if col_name in result.columns]


# -------------------------
# Display: cards (user-friendly)
# -------------------------
# Bring the (at most top_k) selected listings to the driver as a list of
# plain dicts, one per card.
card_fields = [
    "hotel_id","hotel_name","url","city","country",
    "review_score","number_of_reviews","score_pct",
]
cards_pdf = result.select(*card_fields).toPandas()
rows = cards_pdf.to_dict("records")

def fmt_str(x):
    """Return ``x`` as stripped text, mapping missing values to ``""``.

    ``None`` and anything whose text form is "nan" (case-insensitive) count
    as missing.
    """
    if x is None:
        return ""
    text = str(x).strip()
    if text.lower() == "nan":
        return ""
    return text

def fmt_num(x):
    """Return ``x`` unchanged, or an em dash placeholder when it is missing.

    ``None`` and anything whose text form is "nan" (case-insensitive, e.g. a
    float NaN) count as missing.

    Returns:
        ``"—"`` for missing values, otherwise the original value unchanged.
    """
    # No try/except here: the previous bare ``except`` would also have
    # swallowed KeyboardInterrupt/SystemExit.
    if x is None or str(x).lower() == "nan":
        return "—"
    return x

# Imported under an alias because the name ``html`` is reused further down for
# the rendered page string.
from html import escape as _html_escape

# Build one HTML card per recommended listing. Every value taken from the
# dataset is escaped so that characters such as & < > " in a hotel name or URL
# cannot break the markup.
cards = []
for r in rows:
    hid   = fmt_str(r.get("hotel_id",""))
    name  = fmt_str(r.get("hotel_name",""))
    # fmt_str() turns None/NaN into "", so the "#" fallback is applied afterwards.
    url   = fmt_str(r.get("url","")) or "#"
    city_ = fmt_str(r.get("city",""))
    ctry  = fmt_str(r.get("country",""))

    if name == "":
        name = f"Listing {hid}" if hid else "Listing (unknown id)"

    # Join only the parts that exist, so a missing city or country does not
    # leave a dangling comma.
    loc = ", ".join([x for x in [city_, ctry] if x != ""])
    if loc == "":
        loc = "Location unknown"

    rs = fmt_num(r.get("review_score"))
    nr = fmt_num(r.get("number_of_reviews"))
    sc = fmt_num(r.get("score_pct"))
    # Avoid rendering "None%" / "nan%" when the score is missing.
    score_text = "—" if sc == "—" else f"{sc}%"

    cards.append(f"""
    <div style="border:1px solid #ddd;border-radius:14px;padding:14px;margin:10px 0;">
      <div style="font-size:18px;font-weight:700;margin-bottom:6px;">
        <a href="{_html_escape(url)}" target="_blank" style="text-decoration:none;">{_html_escape(name)}</a>
      </div>
      <div style="color:#444;margin-bottom:6px;">{_html_escape(loc)}</div>
      <div style="display:flex;gap:14px;flex-wrap:wrap;color:#222;">
        <div><b>Match score:</b> {_html_escape(str(score_text))}</div>
        <div><b>Review score:</b> {_html_escape(str(rs))}</div>
        <div><b>#Reviews:</b> {_html_escape(str(nr))}</div>
      </div>
    </div>
    """)

# Subtitle: persona and result size, plus whichever location filters were given.
subtitle = f"{trip.title()} • Top {top_k}"
if city_in:
    subtitle += f" • City contains: {city_in}"
if country_in:
    subtitle += f" • Country contains: {country_in}"

# Warning banner, present only when a fallback was used during filtering.
if warning:
    warn_html = f"""
<div style="background:#fff3cd;border:1px solid #ffeeba;color:#856404;padding:10px;border-radius:10px;margin-bottom:12px;">
  <b>Note:</b> {warning}
</div>
"""
else:
    warn_html = ""

# All cards concatenated, or a placeholder when there is nothing to show.
cards_html = "".join(cards) if cards else "<div>No results.</div>"

html = f"""
<div style="max-width:900px;margin:auto;font-family:Arial, sans-serif;">
  <h2>Hotel recommendations</h2>
  <div style="color:#666;margin-bottom:12px;">{subtitle}</div>
  {warn_html}
  {cards_html}
</div>
"""
displayHTML(html)
