In [8]:
# =========================================
# Part 2: spaCy NLP
# =========================================

# !pip install -q spacy pandas
# If running the first time, also download a model:
# !python -m spacy download en_core_web_sm

import spacy
import pandas as pd
from spacy.matcher import Matcher
from spacy.tokens import Span

# -------------------------
# 1) Load spaCy pipeline
# -------------------------
# Small English model is fine for tutorial. If you have vectors, you can try en_core_web_md.
nlp = spacy.load("en_core_web_sm")

In [9]:
# -------------------------
# 2) Sample travel/hostel data
# -------------------------
data = [
    {
        "id": 1,
        "text": "I'm selling lemons for $5 today. Also, I need a dorm bed in Tokyo for 3 nights near Shibuya."
    },
    {
        "id": 2,
        "text": "Please book a private room at Sakura Hostel Asakusa from Aug 21–24, budget under 8000 JPY per night."
    },
    {
        "id": 3,
        "text": "Looking for a cheap hostel in Barcelona, close to La Rambla. Arrival on 10/02, leaving on 10/06."
    },
    {
        "id": 4,
        "text": "Can you recommend a capsule hotel in Osaka? Ideally near Namba station and under $40."
    },
    {
        "id": 5,
        "text": "We want to stay in Kyoto during Golden Week. 2 adults, 1 child, total budget about 60,000 JPY."
    }
]
df = pd.DataFrame(data)


In [10]:
# -------------------------
# 3) Helpers: rule-based extensions
# -------------------------
matcher = Matcher(nlp.vocab)

# Price pattern: money symbol/amount + currency (optional)
price_pattern = [
    [{"LIKE_NUM": True}, {"LOWER": {"IN": ["jpy", "yen"]}}],
    [{"ORTH": "$"}, {"LIKE_NUM": True}],
    [{"LIKE_NUM": True}, {"ORTH": "$"}],              # rare format
    [{"LIKE_NUM": True}, {"LOWER": "usd"}],
    [{"LOWER": {"IN": ["under", "below", "<"]}}, {"ORTH": "$"}, {"LIKE_NUM": True}],
    [{"LOWER": {"IN": ["under", "below"]}}, {"LIKE_NUM": True}, {"LOWER": {"IN": ["jpy", "yen"]}}]
]
matcher.add("PRICE", price_pattern)

# Nights length pattern: number + nights/night
nights_pattern = [[{"LIKE_NUM": True}, {"LOWER": {"IN": ["night", "nights"]}}]]
matcher.add("NIGHTS", nights_pattern)

# Room type pattern: private room/dorm/capsule
room_pattern = [
    [{"LOWER": "private"}, {"LOWER": "room"}],
    [{"LOWER": "dorm"}, {"LOWER": {"IN": ["bed", "room"]}}],
    [{"LOWER": "capsule"}, {"LOWER": "hotel"}]
]
matcher.add("ROOMTYPE", room_pattern)

In [5]:
# -------------------------
# 4) Pipeline: process one doc
# -------------------------
def analyze_request(text: str) -> dict:
    doc = nlp(text)

    # Tokens & lemmas (with stopword flags)
    tokens = [t.text for t in doc]
    lemmas = [t.lemma_ for t in doc]
    stop_flags = [t.is_stop for t in doc]
    pos_tags = [f"{t.text}/{t.pos_}" for t in doc]

    # Sentence segmentation
    sents = [s.text for s in doc.sents]

    # Noun chunks (good for quick subject/object)
    chunks = [chunk.text for chunk in doc.noun_chunks]

    # Named Entities (LOC, GPE, MONEY, DATE, ORG, etc.)
    ents = [f"{ent.text}<{ent.label_}>" for ent in doc.ents]

    # Rule-based matches (price/nights/roomtype)
    matches = matcher(doc)
    prices = []
    nights = []
    roomtypes = []
    for mid, start, end in matches:
        label = nlp.vocab.strings[mid]
        span = doc[start:end].text
        if label == "PRICE":
            prices.append(span)
        elif label == "NIGHTS":
            nights.append(span)
        elif label == "ROOMTYPE":
            roomtypes.append(span)

    # Simple “extracted intent”
    intent = []
    if any(w.lower_ in {"book", "reserve"} for w in doc):
        intent.append("booking_intent")
    if any(w.lower_ in {"recommend"} for w in doc):
        intent.append("recommendation_intent")
    if "budget" in [t.lemma_ for t in doc] or prices:
        intent.append("budget_constraint")

    return {
        "text": text,
        "sentences": sents,
        "tokens": tokens,
        "lemmas": lemmas,
        "stopword_flags": stop_flags,
        "pos_tags": pos_tags,
        "noun_chunks": chunks,
        "entities": ents,
        "prices": list(dict.fromkeys(prices)),
        "nights": list(dict.fromkeys(nights)),
        "room_types": list(dict.fromkeys(roomtypes)),
        "intent_flags": list(dict.fromkeys(intent))
    }

In [11]:
# -------------------------
# 5) Apply to DataFrame
# -------------------------
results = df["text"].apply(analyze_request)
out = pd.DataFrame(results.tolist()).join(df[["id"]])
# Reorder columns for readability
out = out[["id", "text", "sentences", "tokens", "lemmas", "stopword_flags",
           "pos_tags", "noun_chunks", "entities", "prices", "nights",
           "room_types", "intent_flags"]]

# Preview
pd.set_option("display.max_colwidth", 150)
print(out.to_string(index=False))

 id                                                                                                 text                                                                                              sentences                                                                                                                      tokens                                                                                                                      lemmas                                                                                                                                                   stopword_flags                                                                                                                                                                                                                                      pos_tags                                                                 noun_chunks                                                                      

In [12]:
# -------------------------
# 6) (Optional) Rule-based label for destinations
# -------------------------
# If you want a custom entity label for Destinations using EntityRuler
from spacy.pipeline import EntityRuler
ruler = nlp.add_pipe("entity_ruler", before="ner")
patterns = [
    {"label": "DESTINATION", "pattern": "Tokyo"},
    {"label": "DESTINATION", "pattern": "Osaka"},
    {"label": "DESTINATION", "pattern": "Kyoto"},
    {"label": "DESTINATION", "pattern": "Barcelona"},
    {"label": "DESTINATION", "pattern": "Shibuya"},
    {"label": "DESTINATION", "pattern": "Namba"},
    {"label": "DESTINATION", "pattern": "La Rambla"},
]
ruler.add_patterns(patterns)

# Re-run on one sample to show DESTINATION tags appear in ents
sample_doc = nlp(df.loc[1, "text"])
print("With EntityRuler:", [(ent.text, ent.label_) for ent in sample_doc.ents])

With EntityRuler: [('Sakura Hostel Asakusa', 'FAC'), ('under 8000', 'DATE')]
