# 03 - Recommendation (Baseline)

Goal: a simple baseline recommender we can later evolve into personalization.

In [None]:
from pathlib import Path
import sys
import ast

sys.path.append(str(Path('..').resolve()))

import numpy as np
import pandas as pd

from src.recommender import RecommendationRequest, recommend_explainable

# --- Step 1: derive a global price baseline from hotel_bookings.csv (no live pricing) ---
# NOTE: hotel_bookings.csv has no 'city' column, so the only baseline available here is the
# global median ADR; it is refined later with city + hotel_class multipliers from offerings.
bookings = (
    pd.read_csv('../data/hotel_bookings.csv')
    # Keep only rows with a known, non-negative ADR.
    .loc[lambda df: df['adr'].notna() & df['adr'].ge(0)]
    .copy()
)
base_adr_global = float(bookings['adr'].median())

# --- Step 2: load offerings and keep the hotel rows (offering_id + address/locality) ---
# 'id' and 'name' are renamed so the frame exposes 'offering_id' and 'hotel' columns.
offerings = pd.read_csv('../data/offerings.csv')
hotels = (
    offerings.loc[offerings['type'] == 'hotel']
    .copy()
    .rename(columns={'id': 'offering_id', 'name': 'hotel'})
)


def parse_locality(address_str: str) -> str | None:
    if pd.isna(address_str):
        return None
    try:
        d = ast.literal_eval(address_str)
        if isinstance(d, dict):
            return d.get('locality')
    except Exception:
        return None
    return None


# Attach the parsed locality as 'city' and coerce hotel_class to numeric (bad values -> NaN).
hotels['city'] = hotels['address'].apply(parse_locality)
hotels['hotel_class'] = pd.to_numeric(hotels['hotel_class'], errors='coerce')

# --- Step 3: City + Class price bands (estimated) ---
# Class multiplier: hotel_class relative to a 3.5 reference class (a missing class counts
# as the reference), bounded to [0.6, 2.0].
class_base = 3.5
hotels['_class_mult'] = (
    hotels['hotel_class'].fillna(class_base).div(class_base).clip(lower=0.6, upper=2.0)
)

# City multiplier: the city's mean hotel_class relative to the overall mean, bounded to
# [0.7, 1.6]; rows whose city has no usable mean fall back to a neutral 1.0.
city_class_mean = hotels.groupby('city')['hotel_class'].mean()
if hotels['hotel_class'].notna().any():
    global_class_mean = float(hotels['hotel_class'].mean())
else:
    global_class_mean = class_base
hotels['_city_mult'] = (
    hotels['city']
    .map(city_class_mean.div(max(1e-6, global_class_mean)))  # floor guards against /0
    .fillna(1.0)
    .clip(lower=0.7, upper=1.6)
)

# Point estimate: the global baseline ADR scaled by both multipliers.
hotels['adr_est'] = (
    (base_adr_global * hotels['_class_mult']).mul(hotels['_city_mult']).astype(float)
)

# Conservative band around the estimate (-15% / +20%) so filtering is safer than a single point.
hotels['adr_low'] = hotels['adr_est'].mul(0.85).round(2)
hotels['adr_high'] = hotels['adr_est'].mul(1.20).round(2)

# Confidence flag: 'medium' when both city and hotel_class are known, otherwise 'low'.
# No row is labelled 'high' here.
hotels['price_confidence'] = np.where(
    hotels[['city', 'hotel_class']].notna().all(axis=1),
    'medium',
    'low',
)

# Single display price to accompany the band.
hotels['adr'] = hotels['adr_est'].round(2)

# Per-hotel review summaries are exported by 05_review_nlp.ipynb.
review_summary = pd.read_csv('../data/hotel_review_summaries.csv')

# User payload carrying the hard filters (city, budget, min_rating).
# IMPORTANT: city has to match a 'locality' value from offerings.csv (e.g., 'New York City').
req = RecommendationRequest(
    min_rating=4.0,
    budget=200,
    city='New York City',
)

# recommend_explainable applies the hard filters to the candidate pool FIRST.
ranked = recommend_explainable(
    req=req,
    review_summary=review_summary,
    candidates=hotels[
        [
            'offering_id', 'hotel', 'city', 'hotel_class', 'region_id', 'url',
            'adr', 'adr_low', 'adr_high', 'price_confidence',
        ]
    ],
    limit=10,
)

# Display the top rows with the pricing, review and explanation columns.
ranked[
    [
        'offering_id', 'hotel', 'city', 'hotel_class',
        'adr', 'adr_low', 'adr_high', 'price_confidence',
        'sentiment_score', 'avg_rating', 'n_reviews', 'score',
        'reason', 'pros', 'cons',
    ]
].head(10)

Unnamed: 0,offering_id,hotel,city,hotel_class,adr,adr_low,adr_high,price_confidence,sentiment_score,avg_rating,n_reviews,score,reason,pros,cons
0,93338,Hotel Beacon,New York City,3.0,98.0,83.3,117.6,medium,0.872416,4.5024,1250.0,0.941821,Recommended because 87% positive reviews • rat...,"['clean rooms', 'location', 'friendly staff', ...","['room', 'slow wifi', 'noisy rooms', 'cleanlin..."
1,1456560,Eventi - a Kimpton Hotel,New York City,4.0,130.67,111.07,156.8,medium,0.846388,4.553327,1097.0,0.933306,Recommended because 85% positive reviews • rat...,"['clean rooms', 'friendly staff', 'location', ...","['room', 'slow wifi', 'noisy rooms', 'expensiv..."
2,208454,Sofitel New York,New York City,4.5,147.0,124.95,176.4,medium,0.897708,4.541463,410.0,0.927559,Recommended because 90% positive reviews • rat...,"['clean rooms', 'friendly staff', 'location', ...","['room', 'expensive', 'noisy rooms', 'slow wif..."
3,93396,The Iroquois,New York City,4.0,130.67,111.07,156.8,medium,0.83407,4.473983,1057.0,0.925656,Recommended because 83% positive reviews • rat...,"['friendly staff', 'clean rooms', 'location', ...","['room', 'slow wifi', 'noisy rooms', 'expensiv..."
4,93467,Affinia Gardens,New York City,3.5,114.33,97.18,137.2,medium,0.872796,4.46,400.0,0.916309,Recommended because 87% positive reviews • rat...,"['clean rooms', 'friendly staff', 'location', ...","['slow wifi', 'room', 'noisy rooms', 'cleanlin..."
5,1762573,Andaz 5th Avenue,New York City,4.0,130.67,111.07,156.8,medium,0.80544,4.583673,735.0,0.913822,Recommended because 81% positive reviews • rat...,"['clean rooms', 'friendly staff', 'location', ...","['slow wifi', 'room', 'noisy rooms', 'expensiv..."
6,578305,Residence Inn by Marriott Times Square New York,New York City,3.0,98.0,83.3,117.6,medium,0.883983,4.390909,330.0,0.912866,Recommended because 88% positive reviews • rat...,"['clean rooms', 'location', 'food', 'friendly ...","['room', 'slow wifi', 'noisy rooms', 'cleanlin..."
7,93559,The Sherry-Netherland Hotel,New York City,4.5,147.0,124.95,176.4,medium,0.901024,4.652893,121.0,0.907465,Recommended because 90% positive reviews • rat...,"['friendly staff', 'clean rooms', 'location', ...","['expensive', 'slow wifi']"
8,1025779,Candlewood Suites New York City Times Square,New York City,3.0,98.0,83.3,117.6,medium,0.853289,4.32,400.0,0.904857,Recommended because 85% positive reviews • rat...,"['clean rooms', 'friendly staff', 'location', ...","['room', 'slow wifi', 'noisy rooms', 'cleanlin..."
9,99352,Hilton Garden Inn Times Square,New York City,3.5,114.33,97.18,137.2,medium,0.969643,4.625,40.0,0.904,Recommended because 97% positive reviews • rat...,"['friendly staff', 'location', 'clean rooms', ...",[]
