In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from ollama import Client
import re
import langdetect

df = pd.read_csv("Hotel_Data\Hotel_Tripadvisor_Data_SEA_cleaned.csv")
df['Price'] = pd.to_numeric(df['Price'], errors='coerce')
df['Hotel Class'] = pd.to_numeric(df['Hotel Class'], errors='coerce')
df['airport_distance_miles'] = pd.to_numeric(df['airport_distance_miles'], errors='coerce')
df['Overall Review Count'] = pd.to_numeric(df['Overall Review Count'], errors='coerce')

In [2]:
def safe(value, default="Kh√¥ng r√µ"):
    if pd.isna(value) or value in [-1, -1.0, "[]", ""]:
        return default
    return value

def hotel_to_text(row):
    return f"""
Kh√°ch s·∫°n: {safe(row['Name'])} ({safe(row['Hotel Class'], 'Kh√¥ng x√°c ƒë·ªãnh')} sao) - {safe(row['Price'])} USD
ƒê·ªãa ƒëi·ªÉm: {safe(row['Tourist Attraction'])}, {safe(row['Country'])}
ƒê·ªãa ch·ªâ: {safe(row['Address'])}
G·∫ßn s√¢n bay: {safe(row['Airport Names'])} - C√°ch {safe(row['airport_distance_miles'])} d·∫∑m
ƒê√°nh gi√°: {safe(row['Overall Rating Value'])} - {safe(row['Overall Rating Text'])} ({safe(row['Overall Review Count'])} l∆∞·ª£t)
Phong c√°ch: {safe(row['Hotel Style List'])} | Lo·∫°i ph√≤ng: {safe(row['Room Types List'])}
Ti·ªán nghi: {safe(row['Room Features List'])}
Website: {row['have_website']} | M√¥ t·∫£: {row['have_description']} | SƒêT: {row['have_phone_number']}
Nh√† h√†ng g·∫ßn: {row['Near Restaurants']} | ƒêi·ªÉm tham quan g·∫ßn: {row['Near Attractions']}
"""

In [3]:
def parse_query_to_filters(query):
    filters = {}
    lang = langdetect.detect(query)

    if lang == 'vi':
        if match := re.search(r"(?:d∆∞·ªõi|t·ªëi ƒëa|kh√¥ng qu√°)\s*(\d+[\.\d]*)", query):
            filters["price_max"] = float(match.group(1).replace(".", "").replace(",", ""))
        if match := re.search(r"(?:tr√™n|h∆°n|√≠t nh·∫•t)\s*(\d+[\.\d]*)", query):
            filters["price_min"] = float(match.group(1).replace(".", "").replace(",", ""))
        if match := re.search(r"(\d)\s*sao", query):
            filters["stars"] = int(match.group(1))
        if match := re.search(r"(?:tr√™n|h∆°n|√≠t nh·∫•t)\s*(\d+)\s*(?:ƒë√°nh gi√°|reviews)", query):
            filters["review_min"] = int(match.group(1))
        if match := re.search(r"s√¢n bay.*?(\d+(\.\d+)?)\s*(km|d·∫∑m|miles)?", query):
            filters["airport_max"] = float(match.group(1))
    else:
        if match := re.search(r"(?:under|max)\s*(\d+[\.\d]*)", query):
            filters["price_max"] = float(match.group(1).replace(",", ""))
        if match := re.search(r"(?:over|min|at least)\s*(\d+[\.\d]*)", query):
            filters["price_min"] = float(match.group(1).replace(",", ""))
        if match := re.search(r"(\d)\s*star", query):
            filters["stars"] = int(match.group(1))
        if match := re.search(r"(?:over|min|at least)\s*(\d+)\s*(?:reviews|ratings)", query):
            filters["review_min"] = int(match.group(1))
        if match := re.search(r"airport.*?(\d+(\.\d+)?)\s*(km|mile|miles)?", query):
            filters["airport_max"] = float(match.group(1))

    known_places = (
        df['Country'].dropna().unique().tolist() +
        df['Tourist Attraction'].dropna().unique().tolist()
    )
    for place in known_places:
        if place and place.lower() in query.lower():
            filters["location"] = place
            break


    if "website" in query or "c√≥ website" in query:
        filters["have_website"] = True
    if "m√¥ t·∫£" in query or "description" in query:
        filters["have_description"] = True
    if "sƒët" in query or "phone" in query:
        filters["have_phone"] = True

    return filters

In [None]:
import unicodedata

def normalize_text(text):
    if pd.isna(text):
        return ""
    text = unicodedata.normalize("NFKD", str(text)).encode("ascii", "ignore").decode("utf-8")
    return text.lower()

def apply_filters(df, filters):
    if filters.get("stars"):
        df = df[df["Hotel Class"] == filters["stars"]]
    if filters.get("price_max"):
        df = df[df["Price"] <= filters["price_max"]]
    if filters.get("price_min"):
        df = df[df["Price"] >= filters["price_min"]]
    if filters.get("location"):
        norm_loc = normalize_text(filters["location"])

        df = df[
            df["Tourist Attraction"].fillna("").apply(normalize_text).str.contains(norm_loc) |
            df["Country"].fillna("").apply(normalize_text).str.contains(norm_loc)
        ]
    if filters.get("review_min"):
        df = df[df["Overall Review Count"] >= filters["review_min"]]
    if filters.get("airport_max"):
        df = df[df["airport_distance_miles"] <= filters["airport_max"]]
    if filters.get("have_website"):
        df = df[df["have_website"] == True]
    if filters.get("have_description"):
        df = df[df["have_description"] == True]
    if filters.get("have_phone"):
        df = df[df["have_phone_number"] == True]
    return df


In [5]:
def generate_answer(query, context):
    prompt = f"""You are a smart travel assistant. Based on the hotel information below, list the relevant hotels that match the query.

Hotel information:
{context}

Question: {query}
Answer:
"""
    client = Client(host='http://localhost:11434')
    response = client.chat(
        model='mistral',
        messages=[{"role": "user", "content": prompt}]
    )
    return response['message']['content']

def get_all_answers(query):
    filters = parse_query_to_filters(query)
    filtered_df = apply_filters(df.copy(), filters)

    if filtered_df.empty:
        return "No matching hotels found."

    documents = filtered_df.apply(hotel_to_text, axis=1).tolist()
    chunks = [documents[i:i+10] for i in range(0, len(documents), 10)]

    results = []
    client = Client(host='http://localhost:11434')

    for idx, chunk in enumerate(chunks):
        context = "\n".join(chunk)
        prompt = f"""You are a travel assistant. Given the list of hotels below, summarize all relevant hotels that match the user query.

User query: {query}

Hotel list:
{context}

Answer in bullet points:
"""
        print(f"üåÄ Generating for chunk {idx + 1}/{len(chunks)}...")
        response = client.chat(model='mistral', messages=[{"role": "user", "content": prompt}])
        results.append(response["message"]["content"])

    return "\n\n".join(results)


In [None]:
def RAG_pipeline(query):
    print("üßæ C√¢u h·ªèi:", query)
    print("\nüìò Tr·∫£ l·ªùi:")
    print(get_all_answers(query))

In [7]:
RAG_pipeline("Hotels in Ha Giang")


üßæ C√¢u h·ªèi: Hotels in Ha Giang

üìò Tr·∫£ l·ªùi:
üåÄ Generating for chunk 1/58...
üåÄ Generating for chunk 2/58...
üåÄ Generating for chunk 3/58...
üåÄ Generating for chunk 4/58...
üåÄ Generating for chunk 5/58...
üåÄ Generating for chunk 6/58...
üåÄ Generating for chunk 7/58...
üåÄ Generating for chunk 8/58...
üåÄ Generating for chunk 9/58...
üåÄ Generating for chunk 10/58...
üåÄ Generating for chunk 11/58...
üåÄ Generating for chunk 12/58...
üåÄ Generating for chunk 13/58...
üåÄ Generating for chunk 14/58...
üåÄ Generating for chunk 15/58...
üåÄ Generating for chunk 16/58...
üåÄ Generating for chunk 17/58...


KeyboardInterrupt: 

In [1]:
# RAG + QA Pipeline for Hotel Data using HuggingFace Embedding

import pandas as pd
import ast
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import Ollama

# === STEP 1: Load and preprocess data ===
df = pd.read_csv("Hotel_Data/Hotel_Tripadvisor_Data_SEA_cleaned.csv")
df = df.fillna("")

# === STEP 2: Convert each hotel row into a Document ===
def row_to_doc(row):
    content = f"""
    Hotel Name: {row['Name']}
    Location: {row['Tourist Attraction']}, {row['Country']}
    Price: ${row['Price']}
    Rating: {row['Overall Rating Value']} ({row['Overall Rating Text']})
    Total Reviews: {row['Overall Review Count']}
    Excellent: {row['Excellent Reviews']}, Very Good: {row['Very Good Reviews']}, Average: {row['Average Reviews']}, Poor: {row['Poor Reviews']}, Terrible: {row['Terrible Reviews']}
    Hotel Class: {row['Hotel Class']}
    Languages: {row['Languages Spoken']}
    Style: {row['Hotel Style List']}
    Room Features: {row['Room Features List']}
    Room Types: {row['Room Types List']}
    Category: {row['category']}
    Airport Distance (miles): {row['airport_distance_miles']}
    """
    return Document(page_content=content.strip(), metadata={"hotel_id": row["Hotel ID"]})

hotel_docs = [row_to_doc(row) for _, row in df.iterrows()]

# === STEP 3: Embedding & vector store ===
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_db = FAISS.from_documents(hotel_docs, embedding)

# === STEP 4: Question Answering with Ollama ===
llm = Ollama(model="llama3")
retriever = vector_db.as_retriever(search_type="mmr", search_kwargs={"k": 10})
qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type="stuff")

# === STEP 5: Ask questions ===
query = "Kh√°ch s·∫°n n√†o ·ªü H√† Giang c√≥ gi√° d∆∞·ªõi 20 ƒë√¥ v√† ƒë√°nh gi√° tr√™n 4.5?"
response = qa.run(query)
print("\n[Answer]\n", response)


  embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  llm = Ollama(model="llama3")
  response = qa.run(query)


OllamaEndpointNotFoundError: Ollama call failed with status code 404. Maybe your model is not found and you should pull the model with `ollama pull llama3`.