# 05 - Review NLP (Hotel-Level Summaries)

Pipeline:
1) Load `reviews.csv` + `offerings.csv`
2) Merge (CRITICAL) so each review maps to a hotel
3) Clean review text
4) Sentiment analysis
5) Pros & Cons extraction
6) Aggregate at hotel level (API-ready output)

In [1]:
from __future__ import annotations

import ast
import re
from collections import Counter, defaultdict
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
DATA_DIR = Path('..') / 'data'
REVIEWS_CSV = DATA_DIR / 'reviews.csv'
OFFERINGS_CSV = DATA_DIR / 'offerings.csv'

REVIEWS_CSV.exists(), OFFERINGS_CSV.exists()

(True, True)

## Step 1: Load BOTH files (safe for large reviews.csv)
`reviews.csv` is large, so we stream it in chunks and aggregate incrementally.

In [3]:
offerings = pd.read_csv(OFFERINGS_CSV)
offerings.shape, offerings.columns.tolist()

((4333, 9),
 ['hotel_class',
  'region_id',
  'url',
  'phone',
  'details',
  'address',
  'type',
  'id',
  'name'])

In [4]:
# Minimal hotel info frame for merging
hotels = offerings[offerings['type'].eq('hotel')].copy()
hotels = hotels.rename(columns={'id': 'offering_id', 'name': 'hotel'})

def parse_locality(address_str: str) -> str | None:
    if pd.isna(address_str):
        return None
    try:
        d = ast.literal_eval(address_str)
        if isinstance(d, dict):
            return d.get('locality')
    except Exception:
        return None
    return None

hotels['city'] = hotels['address'].apply(parse_locality)
hotels = hotels[['offering_id', 'hotel', 'city', 'hotel_class', 'region_id', 'url']]
hotels.head()

Unnamed: 0,offering_id,hotel,city,hotel_class,region_id,url
0,113317,Casablanca Hotel Times Square,New York City,4.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...
1,76049,Four Seasons Hotel Los Angeles at Beverly Hills,Los Angeles,5.0,32655,http://www.tripadvisor.com/Hotel_Review-g32655...
2,99352,Hilton Garden Inn Times Square,New York City,3.5,60763,http://www.tripadvisor.com/Hotel_Review-g60763...
3,93589,The Michelangelo Hotel,New York City,4.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...
4,217616,The Muse Hotel New York,New York City,4.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...


## Step 2: Merge Reviews with Hotel Info (CRITICAL)
We merge on `reviews.offering_id == hotels.offering_id`.

## Step 3: Clean Review Text
We normalize whitespace, strip junk, and keep human-readable text for downstream sentiment + phrase extraction.

In [5]:
_WS_RE = re.compile(r'\s+')
_BAD_CHARS_RE = re.compile(r'[^\w\s\.,;:!\?"\'\-\(\)\/]+', re.UNICODE)


def clean_text(text: str | float) -> str:
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return ''
    t = str(text)
    t = _BAD_CHARS_RE.sub(' ', t)
    t = _WS_RE.sub(' ', t).strip()
    return t


def tokenize(text: str) -> list[str]:
    t = clean_text(text).lower()
    toks = re.findall(r"[a-zA-Z]{2,}", t)
    return toks

## Step 4: Sentiment Analysis (Production-Grade)
Primary path: HuggingFace `transformers` sentiment pipeline (if installed).
Fallback path: lexicon-based scoring (so the notebook runs even without extra packages).

In [6]:
def try_load_transformer_sentiment():
    try:
        from transformers import pipeline

        return pipeline(
            'sentiment-analysis',
            model='distilbert-base-uncased-finetuned-sst-2-english',
            truncation=True,
        )
    except Exception:
        return None


_sent_pipe = try_load_transformer_sentiment()
_sent_pipe is not None





Device set to use cpu


True

In [7]:
_POS = {
    'clean','great','excellent','amazing','awesome','friendly','helpful','comfortable','perfect','nice','love',
    'convenient','recommend','best','wonderful','spacious','quiet','beautiful','fantastic','delightful',
}
_NEG = {
    'dirty','bad','terrible','awful','rude','noisy','slow','worst','poor','expensive','smell','broken',
    'uncomfortable','disappointed','horrible','problem','bugs','small','crowded','overpriced',
}


def _fallback_sentiment_score(text: str) -> float:
    t = clean_text(text)
    if not t:
        return 0.5

    toks = re.findall(r'[a-zA-Z]{2,}', t.lower())
    if not toks:
        return 0.5

    pos = sum(1 for w in toks if w in _POS)
    neg = sum(1 for w in toks if w in _NEG)
    raw = (pos - neg) / max(1, pos + neg)
    return float(np.clip(0.5 + 0.5 * raw, 0.0, 1.0))


def batch_sentiment_scores(texts: list[str], batch_size: int = 64) -> list[float]:
    if not texts:
        return []

    if _sent_pipe is None:
        return [_fallback_sentiment_score(t) for t in texts]

    scores: list[float] = []
    for i in range(0, len(texts), batch_size):
        batch = [t[:512] for t in texts[i : i + batch_size]]
        outs = _sent_pipe(batch)
        for out in outs:
            label = str(out.get('label', ''))
            s = float(out.get('score', 0.5))
            if label.upper().startswith('NEG'):
                s = 1.0 - s
            scores.append(float(s))

    return scores


batch_sentiment_scores(['Great clean room and friendly staff', 'Slow wifi and dirty bathroom'])


[0.9998711347579956, 0.0015205740928649902]

## Step 5: Extract Pros & Cons (Differentiator)
We split reviews into sentences, score each sentence, then extract frequent bi/tri-gram phrases from positive vs negative sentences.

In [8]:
_STOP = {
    'the','a','an','and','or','but','to','of','in','on','for','with','is','it','this','that','was','were',
    'are','be','as','at','by','from','we','i','you','they','our','my','your','their','me','us','him','her',
    'very','really','so','too','just','also','not','no','yes','can','could','would','should','will','have','had',
}

# Keyword dictionary for fast pros/cons without sentence-level BERT
PRO_KEYWORDS = {
    'clean': ['clean', 'spotless'],
    'staff': ['friendly', 'helpful', 'staff', 'service'],
    'location': ['location', 'close', 'convenient', 'walk'],
    'room': ['room', 'rooms', 'spacious', 'comfortable', 'bed', 'beds'],
    'view': ['view'],
    'food': ['breakfast', 'food', 'restaurant'],
    'wifi': ['wifi', 'wi-fi', 'internet'],
}
CON_KEYWORDS = {
    'wifi': ['wifi', 'wi-fi', 'internet', 'slow'],
    'noise': ['noisy', 'noise', 'loud'],
    'cleanliness': ['dirty', 'smell', 'bugs'],
    'price': ['expensive', 'overpriced'],
    'room': ['small', 'broken'],
}


def _contains_any(tokens_set: set[str], keywords: list[str]) -> bool:
    return any(k.replace('-', '').lower() in tokens_set or k.lower() in tokens_set for k in keywords)


def extract_keyword_phrases(text: str, sentiment: float, pos_th: float = 0.65, neg_th: float = 0.35) -> tuple[list[str], list[str]]:
    toks = tokenize(text)
    toks_set = set(toks)

    pros: list[str] = []
    cons: list[str] = []

    if sentiment >= pos_th:
        for label, kws in PRO_KEYWORDS.items():
            if _contains_any(toks_set, kws):
                # output a readable phrase (API-friendly)
                if label == 'staff':
                    pros.append('friendly staff')
                elif label == 'room':
                    pros.append('clean rooms')
                elif label == 'wifi':
                    pros.append('good wifi')
                else:
                    pros.append(label)

    if sentiment <= neg_th:
        for label, kws in CON_KEYWORDS.items():
            if _contains_any(toks_set, kws):
                if label == 'wifi':
                    cons.append('slow wifi')
                elif label == 'noise':
                    cons.append('noisy rooms')
                elif label == 'price':
                    cons.append('expensive')
                else:
                    cons.append(label)

    return pros, cons


## Step 6: Aggregate at HOTEL Level (API-ready)
We stream reviews in chunks and build per-hotel aggregates: avg rating, avg sentiment, pros/cons.

In [9]:
def parse_overall_rating(ratings_str: str | float) -> float | None:
    if ratings_str is None or (isinstance(ratings_str, float) and np.isnan(ratings_str)):
        return None
    try:
        d = ast.literal_eval(ratings_str)
        if isinstance(d, dict) and 'overall' in d:
            return float(d['overall'])
    except Exception:
        return None
    return None


@dataclass
class HotelAgg:
    rating_sum: float = 0.0
    rating_n: int = 0
    sent_sum: float = 0.0
    sent_n: int = 0
    pros: Counter = None
    cons: Counter = None

    def __post_init__(self):
        if self.pros is None:
            self.pros = Counter()
        if self.cons is None:
            self.cons = Counter()


def update_agg(agg: HotelAgg, review_text: str, overall_rating: float | None, sentiment: float):
    agg.sent_sum += float(sentiment)
    agg.sent_n += 1

    if overall_rating is not None:
        agg.rating_sum += float(overall_rating)
        agg.rating_n += 1

    pros, cons = extract_keyword_phrases(review_text, sentiment)
    agg.pros.update(pros)
    agg.cons.update(cons)


def top_k(counter: Counter, k: int = 6) -> list[str]:
    items = [p for p, _ in counter.most_common(k)]
    return items


In [10]:
import time

CHUNKSIZE = 25_000
MAX_REVIEWS = 50_000  # smaller sample for iteration; increase only for final offline run

CACHE_PATH = DATA_DIR / 'reviews_enriched_cached.csv.gz'

hotel_index = hotels.set_index('offering_id')[['hotel','city']].to_dict(orient='index')

use_cols = ['offering_id', 'ratings', 'title', 'text']

if not CACHE_PATH.exists():
    print(f'Cache not found. Building cache at: {CACHE_PATH}', flush=True)
    print(f'Using CHUNKSIZE={CHUNKSIZE:,} MAX_REVIEWS={MAX_REVIEWS:,}', flush=True)

    processed = 0
    out_parts = []
    chunk_i = 0
    t_start = time.time()

    for chunk in pd.read_csv(REVIEWS_CSV, usecols=use_cols, chunksize=CHUNKSIZE):
        chunk_i += 1
        t_chunk = time.time()
        raw_n = len(chunk)

        chunk = chunk[chunk['offering_id'].isin(hotel_index.keys())]
        kept_n = len(chunk)
        if kept_n == 0:
            print(f'chunk {chunk_i}: raw={raw_n:,} kept=0 (filtered) | processed={processed:,}', flush=True)
            continue

        chunk = chunk.merge(hotels, on='offering_id', how='left')
        chunk['review_text'] = (chunk['title'].fillna('') + ' ' + chunk['text'].fillna('')).map(clean_text)
        chunk['overall_rating'] = chunk['ratings'].map(parse_overall_rating)

        texts = chunk['review_text'].tolist()
        chunk['sentiment_score'] = batch_sentiment_scores(texts, batch_size=64)

        out_parts.append(chunk[['offering_id','hotel','city','overall_rating','sentiment_score','review_text']])

        processed += kept_n

        elapsed = time.time() - t_start
        rate = processed / max(1e-6, elapsed)
        remaining = max(0, MAX_REVIEWS - processed)
        eta_sec = remaining / max(1e-6, rate)
        chunk_sec = time.time() - t_chunk

        print(
            f"chunk {chunk_i}: raw={raw_n:,} kept={kept_n:,} | processed={processed:,}/{MAX_REVIEWS:,} "
            f"| chunk_time={chunk_sec:.1f}s | rate={rate:,.0f} reviews/s | ETA={eta_sec/60:.1f} min",
            flush=True,
        )

        if processed >= MAX_REVIEWS:
            print('Reached MAX_REVIEWS cap. Stopping early.', flush=True)
            break

    print('Concatenating cached dataset...', flush=True)
    cached = pd.concat(out_parts, ignore_index=True)
    print(f'Writing cache to {CACHE_PATH} (gzip)...', flush=True)
    cached.to_csv(CACHE_PATH, index=False, compression='gzip')
    print(f'Cache saved. cached.shape={cached.shape}', flush=True)

    cached.shape
else:
    print(f'Cache found: {CACHE_PATH}', flush=True)
    t0 = time.time()
    cached = pd.read_csv(CACHE_PATH)
    print(f'Loaded cache in {time.time()-t0:.2f}s | cached.shape={cached.shape}', flush=True)
    cached.shape


Cache not found. Building cache at: ..\data\reviews_enriched_cached.csv.gz
Using CHUNKSIZE=25,000 MAX_REVIEWS=50,000


chunk 1: raw=25,000 kept=25,000 | processed=25,000/50,000 | chunk_time=2352.6s | rate=11 reviews/s | ETA=39.2 min
chunk 2: raw=25,000 kept=25,000 | processed=50,000/50,000 | chunk_time=2176.2s | rate=11 reviews/s | ETA=0.0 min
Reached MAX_REVIEWS cap. Stopping early.
Concatenating cached dataset...
Writing cache to ..\data\reviews_enriched_cached.csv.gz (gzip)...
Cache saved. cached.shape=(50000, 6)


In [11]:
aggs: dict[int, HotelAgg] = {}

for row in cached.itertuples(index=False):
    oid = int(row.offering_id)
    agg = aggs.get(oid)
    if agg is None:
        agg = HotelAgg()
        aggs[oid] = agg

    update_agg(
        agg,
        review_text=str(row.review_text),
        overall_rating=None if pd.isna(row.overall_rating) else float(row.overall_rating),
        sentiment=float(row.sentiment_score),
    )

rows = []
for oid, agg in aggs.items():
    meta = hotel_index.get(oid, {})
    avg_rating = (agg.rating_sum / agg.rating_n) if agg.rating_n else None
    sent = (agg.sent_sum / agg.sent_n) if agg.sent_n else None
    rows.append({
        'offering_id': oid,
        'hotel': meta.get('hotel'),
        'city': meta.get('city'),
        'avg_rating': None if avg_rating is None else float(avg_rating),
        'sentiment_score': None if sent is None else float(sent),
        'pros': top_k(agg.pros, k=6),
        'cons': top_k(agg.cons, k=6),
        'n_reviews': int(agg.sent_n),
    })

hotel_summary = pd.DataFrame(rows)
hotel_summary = hotel_summary.sort_values(['n_reviews','sentiment_score'], ascending=[False, False])
hotel_summary.head(10)


Unnamed: 0,offering_id,hotel,city,avg_rating,sentiment_score,pros,cons,n_reviews
0,93338,Hotel Beacon,New York City,4.5024,0.872416,"[clean rooms, location, friendly staff, clean,...","[room, slow wifi, noisy rooms, cleanliness, ex...",1250
4,1456560,Eventi - a Kimpton Hotel,New York City,4.553327,0.846388,"[clean rooms, friendly staff, location, clean,...","[room, slow wifi, noisy rooms, expensive, clea...",1097
5,93396,The Iroquois,New York City,4.473983,0.83407,"[friendly staff, clean rooms, location, clean,...","[room, slow wifi, noisy rooms, expensive, clea...",1057
418,290978,Hotel St. James,New York City,3.474747,0.534504,"[clean rooms, location, clean, friendly staff,...","[slow wifi, room, noisy rooms, cleanliness, ex...",990
412,112066,W New York,New York City,3.760204,0.67985,"[clean rooms, friendly staff, location, clean,...","[room, noisy rooms, expensive, slow wifi, clea...",980
417,93517,Paramount Hotel Times Square New York,New York City,3.668041,0.676281,"[clean rooms, location, friendly staff, clean,...","[room, noisy rooms, slow wifi, cleanliness, ex...",970
415,93345,Skyline Hotel,New York City,3.603333,0.672377,"[clean rooms, location, friendly staff, clean,...","[noisy rooms, slow wifi, room, cleanliness, ex...",900
416,93333,Wolcott Hotel,New York City,3.584029,0.586469,"[clean rooms, location, friendly staff, clean,...","[slow wifi, room, noisy rooms, cleanliness, ex...",839
1,1762573,Andaz 5th Avenue,New York City,4.583673,0.80544,"[clean rooms, friendly staff, location, food, ...","[slow wifi, room, noisy rooms, expensive, clea...",735
432,93526,the Lexington,New York City,3.381974,0.607839,"[clean rooms, location, friendly staff, clean,...","[room, noisy rooms, cleanliness, slow wifi, ex...",699


In [12]:
example = hotel_summary.dropna(subset=['hotel']).iloc[0].to_dict()
example

{'offering_id': 93338,
 'hotel': 'Hotel Beacon',
 'city': 'New York City',
 'avg_rating': 4.5024,
 'sentiment_score': 0.8724156901836395,
 'pros': ['clean rooms',
  'location',
  'friendly staff',
  'clean',
  'food',
  'view'],
 'cons': ['room', 'slow wifi', 'noisy rooms', 'cleanliness', 'expensive'],
 'n_reviews': 1250}

In [None]:
# Persist hotel-level summaries for reuse in recommendation / API
SUMMARY_CSV = DATA_DIR / 'hotel_review_summaries.csv'
SUMMARY_JSON = DATA_DIR / 'hotel_review_summaries.json'

hotel_summary.to_csv(SUMMARY_CSV, index=False)
hotel_summary.to_json(SUMMARY_JSON, orient='records', indent=2)

str(SUMMARY_CSV), str(SUMMARY_JSON)