# 🏨 Persona-Aware Hotel Discovery (Final Project)

This interactive tool recommends **Booking.com** listings tailored to specific traveler personas (**Family**, **Remote Worker**, or **Tourist**).

### 🚀 How to Use
1.  **Authentication**: ⚠️ **IMPORTANT: Don't forget to put the SAS token in the designated place!** You must paste the current SAS token provided by the course staff into the `SAS_TOKEN` variable in the code cell below.
2.  **Configure Preferences**: Use the **Widgets** at the top of the page to select your **Trip Type** (mandatory) and optional **Location** filters. 
3.  **Run All**: Click **"Run All"** at the top right of the Databricks interface. The notebook is designed to run end-to-end in a matter of minutes.
4.  **Explore Results**: The model ranks listings based on the highest **Match Score** (predicted probability) for your selected persona. 


In [0]:
from pyspark.sql import functions as F
import re

# --- 1. CONFIGURATION & AZURE CONNECTION ---
# Location of the input file inside the Azure storage account.
STORAGE_ACCOUNT = "lab94290"
CONTAINER = "submissions"
GROUP_FOLDER = "diyar_aleen_muhammad"

# Use the token provided
SAS_TOKEN = "USE THE TOKEN HERE"

# Strip once, up front, so the value that is validated is also the value that
# is handed to Spark. Previously only the placeholder check saw the stripped
# token, and a token pasted with a stray space or newline was passed on as-is.
SAS_TOKEN = SAS_TOKEN.strip()

# Refuse to continue while the placeholder (or nothing at all) is still in place.
if SAS_TOKEN in ("USE THE TOKEN HERE", "", "<INSERT_SAS_TOKEN_HERE>"):
    raise ValueError("Missing SAS token. Paste the course-provided SAS token into SAS_TOKEN.")


# Set Spark configs for SAS authentication
spark.conf.set(f"fs.azure.account.auth.type.{STORAGE_ACCOUNT}.dfs.core.windows.net", "SAS")
spark.conf.set(f"fs.azure.sas.token.provider.type.{STORAGE_ACCOUNT}.dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider")
spark.conf.set(f"fs.azure.sas.fixed.token.{STORAGE_ACCOUNT}.dfs.core.windows.net", SAS_TOKEN)

# Final path using ABFSS protocol
SAMPLE_PATH = f"abfss://{CONTAINER}@{STORAGE_ACCOUNT}.dfs.core.windows.net/{GROUP_FOLDER}/ui_sample_hotels.csv"

# --- 2. WIDGETS & HELPERS ---
# Free-text location filters; both start empty, which means "no filter".
for _widget_name, _widget_label in (
    ("country", "Country (optional)"),
    ("city", "City (optional)"),
):
    dbutils.widgets.text(_widget_name, "", _widget_label)

# Persona selector and result-count selector (fixed choice lists).
dbutils.widgets.dropdown(
    "trip_type",
    "remote",
    ["family", "tourist", "remote"],
    "Trip type (required)",
)
dbutils.widgets.dropdown(
    "top_k",
    "10",
    ["5", "10", "20", "50"],
    "Top K",
)

def require_cols(df, required, name):
    """Fail fast when *df* lacks any of the *required* columns.

    Args:
        df: Object exposing a ``columns`` collection of column names.
        required: Iterable of column names that must be present.
        name: Label for the dataset; used only in the error message.

    Raises:
        ValueError: If one or more required columns are missing. The message
            lists the missing names and the columns that are available.
    """
    available = set(df.columns)
    missing = [c for c in required if c not in available]
    if missing:
        # ValueError (rather than a bare Exception) lets callers tell a schema
        # problem apart from unrelated failures; `except Exception` still works.
        raise ValueError(f"[{name}] Missing required columns: {missing}\nAvailable: {df.columns}")

def norm_col(col):
    """Build a Spark column expression holding a normalised form of *col*.

    Applies the same steps as ``norm_text`` so that user input and data can be
    compared: every character other than ASCII letters, digits and whitespace
    becomes a space, whitespace runs are collapsed to a single space, the
    result is trimmed and lower-cased.

    Args:
        col: Spark column expression to normalise.

    Returns:
        A Spark column expression with the normalised text.
    """
    alnum_only = F.regexp_replace(col, r"[^a-zA-Z0-9\s]", " ")
    # Collapse whitespace runs. Without this, a value such as "St. Louis"
    # becomes "st  louis" (two spaces) and never matches the user input
    # "st louis" that norm_text produces.
    single_spaced = F.regexp_replace(alnum_only, r"\s+", " ")
    return F.lower(F.trim(single_spaced))

def norm_text(s: str) -> str:
    """Return a normalised, comparison-friendly version of *s*.

    ``None`` yields an empty string. Otherwise the text is stripped and
    lower-cased, every character that is not ``a-z``, ``0-9`` or whitespace is
    replaced by a space, whitespace runs are collapsed to one space, and the
    result is stripped again.
    """
    if s is None:
        return ""
    lowered = s.strip().lower()
    alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
    single_spaced = re.sub(r"\s+", " ", alnum_only)
    return single_spaced.strip()

def has_rows(df_):
    """Return True when *df_* contains at least one row.

    Only a single row is requested (``limit(1)``) before counting, so the
    check does not count the whole dataset.
    """
    probe_count = df_.limit(1).count()
    return probe_count > 0

# --- 3. DATA LOADING ---
# Read the CSV with its header row as column names and let Spark infer the
# column types, so that names and score columns load correctly.
df0 = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(SAMPLE_PATH)
)

# Stop early if the file does not carry every column the UI relies on.
require_cols(
    df0,
    [
        "hotel_id",
        "hotel_name",
        "url",
        "city",
        "country",
        "review_score",
        "number_of_reviews",
        "p_family",
        "p_remote",
        "p_tourist",
    ],
    "ui_sample",
)

# Add normalised copies of the location columns (city_norm, country_norm)
# that the filters below match against.
df = df0
for _location_col in ("city", "country"):
    df = df.withColumn(f"{_location_col}_norm", norm_col(F.col(_location_col)))

# --- 4. FILTERING LOGIC ---
# Read the widget values. The free-text inputs are normalised with norm_text;
# an empty string means "no filter".
country_in = norm_text(dbutils.widgets.get("country"))
city_in    = norm_text(dbutils.widgets.get("city"))
trip       = dbutils.widgets.get("trip_type")
top_k      = min(int(dbutils.widgets.get("top_k")), 50)  # never more than 50 rows

# Column holding the predicted probability for the selected trip type.
score_col = {"family":"p_family", "tourist":"p_tourist", "remote":"p_remote"}[trip]
warning = None

# Fallback order: the requested filter first, then (when both inputs are
# given) city-only, and finally the unfiltered data. `warning` records every
# fallback so the UI can tell the user. Each candidate is probed with
# has_rows() exactly once, because every call triggers a count on the data.
if country_in == "" and city_in == "":
    filtered = df
elif country_in != "" and city_in == "":
    filtered_country = df.filter(F.col("country_norm").contains(F.lit(country_in)))
    if has_rows(filtered_country):
        filtered = filtered_country
    else:
        filtered = df
        warning = "No matches for country input; showing global top results."
elif country_in == "" and city_in != "":
    filtered_city = df.filter(F.col("city_norm").contains(F.lit(city_in)))
    if has_rows(filtered_city):
        filtered = filtered_city
    else:
        filtered = df
        warning = "No matches for city input; showing global top results."
else:
    filtered_country = df.filter(F.col("country_norm").contains(F.lit(country_in)))
    filtered_both = filtered_country.filter(F.col("city_norm").contains(F.lit(city_in)))
    if has_rows(filtered_both):
        filtered = filtered_both
    else:
        # The country may be misspelled or mismatched; retry with the city alone.
        filtered_city = df.filter(F.col("city_norm").contains(F.lit(city_in)))
        if has_rows(filtered_city):
            filtered = filtered_city
            warning = "No matches for country+city; tried city-only; showing city-only results."
        else:
            filtered = df
            warning = "No matches for country+city; showing global top results."

# --- 5. RANKING & RENDERING ---
# Sort keys, highest first: the persona score, then the number of reviews,
# then the review score. Missing review values are treated as 0 for sorting.
_ranking_keys = [
    F.desc("score"),
    F.desc(F.coalesce(F.col("number_of_reviews"), F.lit(0))),
    F.desc(F.coalesce(F.col("review_score"), F.lit(0.0))),
]

# `score` copies the selected persona column; `score_pct` is that score
# multiplied by 100 and rounded to one decimal.
_scored = filtered.withColumn("score", F.col(score_col))
_scored = _scored.withColumn("score_pct", F.round(F.col("score") * 100, 1))

result = _scored.orderBy(*_ranking_keys).limit(top_k)

# Collect the (at most top_k) rows as a list of plain dicts for rendering.
_display_cols = [
    "hotel_id",
    "hotel_name",
    "url",
    "city",
    "country",
    "review_score",
    "number_of_reviews",
    "score_pct",
]
rows = result.select(*_display_cols).toPandas().to_dict("records")

def fmt_str(x):
    """Return *x* as stripped display text, or "" when it is missing.

    ``None`` and any value whose text form equals "nan" (in any letter case)
    are treated as missing.
    """
    if x is None:
        return ""
    text = str(x).strip()
    if text.lower() == "nan":
        return ""
    return text

def fmt_num(x):
    """Return *x* unchanged, or an em dash when the value is missing.

    ``None`` and any value whose text form equals "nan" (in any letter case)
    are treated as missing and rendered as "—".

    Args:
        x: Value to display, typically a number or ``None``.

    Returns:
        "—" for missing values, otherwise *x* itself (not converted to str).
    """
    if x is None:
        return "—"
    try:
        is_nan = str(x).lower() == "nan"
    except Exception:
        # Best effort: a value that cannot be rendered as text is passed
        # through untouched. Only Exception is caught, so KeyboardInterrupt
        # and SystemExit are no longer swallowed as they were by a bare except.
        return x
    return "—" if is_nan else x

# Imported under an alias because the name `html` is used below for the page markup.
from html import escape as _html_escape

# Build one HTML card per result row. Every value that comes from the data
# file is HTML-escaped before it is placed into the markup, so characters
# such as "&", "<" or quotes in a hotel name or URL cannot break the page.
cards = []
for r in rows:
    hid   = fmt_str(r.get("hotel_id",""))
    name  = fmt_str(r.get("hotel_name",""))
    # Fall back to "#" when the URL is missing or blank; a default passed to
    # .get() would only cover a missing key, not a null value.
    url   = fmt_str(r.get("url")) or "#"
    city_ = fmt_str(r.get("city",""))
    ctry  = fmt_str(r.get("country",""))

    if name == "":
        name = f"Listing {hid}" if hid else "Listing (unknown id)"

    # Skip empty parts so a missing city or country leaves no dangling comma.
    location = ", ".join(part for part in (city_, ctry) if part)

    rs = fmt_num(r.get("review_score"))
    nr = fmt_num(r.get("number_of_reviews"))
    sc = fmt_num(r.get("score_pct"))
    # A missing score is shown as a plain dash instead of "nan%".
    sc_txt = "—" if isinstance(sc, str) and sc == "—" else f"{sc}%"

    cards.append(f"""
    <div style="border:1px solid #ddd;border-radius:14px;padding:14px;margin:10px 0;">
      <div style="font-size:18px;font-weight:700;margin-bottom:6px;">
        <a href="{_html_escape(url, quote=True)}" target="_blank" style="text-decoration:none;">{_html_escape(name)}</a>
      </div>
      <div style="color:#444;margin-bottom:6px;">{_html_escape(location)}</div>
      <div style="display:flex;gap:14px;flex-wrap:wrap;color:#222;">
        <div><b>Match score:</b> {_html_escape(str(sc_txt))}</div>
        <div><b>Review score:</b> {_html_escape(str(rs))}</div>
        <div><b>#Reviews:</b> {_html_escape(str(nr))}</div>
      </div>
    </div>
    """)

# Header line summarising the active selection. city_in / country_in are
# already reduced to lower-case letters, digits and spaces by norm_text.
subtitle = f"{trip.title()} • Top {top_k}"
if city_in: subtitle += f" • City: {city_in}"
if country_in: subtitle += f" • Country: {country_in}"

# Yellow note shown only when a filter fell back to a wider result set.
warn_html = f'<div style="background:#fff3cd;border:1px solid #ffeeba;color:#856404;padding:10px;border-radius:10px;margin-bottom:12px;"><b>Note:</b> {warning}</div>' if warning else ""

cards_html = "".join(cards) if cards else "<div>No results.</div>"

html = f"""
<div style="max-width:900px;margin:auto;font-family:Arial, sans-serif;">
  <h2>Hotel Recommendations</h2>
  <div style="color:#666;margin-bottom:12px;">{subtitle}</div>
  {warn_html}
  {cards_html}
</div>
"""
displayHTML(html)