In [16]:
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from pymilvus import connections, Collection
import pandas as pd
from underthesea import ner
import re
import unicodedata

# Kết nối Milvus

In [3]:
# Open the default connection to the local Milvus server and bind the
# collection that stores the hotel embeddings.
connections.connect("default", host="localhost", port="19530")
collection = Collection("hotels_collection_mpnet_base_v2")

# Milvus only serves searches on collections that are loaded into memory.
# Loading here makes the notebook work after "Restart Kernel -> Run All"
# instead of relying on the collection having been loaded in a previous session.
collection.load()

# Load dữ liệu

In [4]:
# Hotel catalogue; each row is one hotel.
df = pd.read_csv("dataset.csv")

# Fail fast if the columns used further down (BM25 corpus, score merge,
# result printing) are not present, rather than deep inside a search call.
missing_columns = {"HotelID", "Name Hotel", "Descriptions"} - set(df.columns)
assert not missing_columns, f"dataset.csv is missing columns: {sorted(missing_columns)}"

# BM25 corpus: one description per row, in the same order as `df`.
# Missing descriptions become "" and everything is coerced to str so that the
# later `.lower().split()` tokenisation cannot hit a non-string value.
docs = df["Descriptions"].fillna("").astype(str).tolist()

# BM25

In [5]:
# Lower-case every description and split it on whitespace; the query is
# tokenised the same way inside hybrid_search, so both sides share a vocabulary.
tokenized_docs = list(map(lambda text: text.lower().split(), docs))

# Lexical index over the whole corpus (row order matches `docs`).
bm25 = BM25Okapi(tokenized_docs)

# SBERT

In [6]:
# Sentence encoder used to embed queries at search time.
# NOTE(review): presumably the same model that produced the vectors stored in
# the Milvus collection (its name ends in "mpnet_base_v2") - confirm.
model = SentenceTransformer(model_name_or_path="paraphrase-multilingual-mpnet-base-v2")

In [37]:
def detect_city(query):
    """Return the city mentioned in ``query``, or ``None`` if none is found.

    Detection runs in two stages:

    1. Rule-based: the query is NFC-normalised and lower-cased, then scanned
       for a known alias that appears as a whole word. The alias maps to a
       canonical city name, which is returned.
    2. NER-based: if no alias matches, the query is passed to ``ner`` and the
       first token whose tag ends in ``LOC`` is returned title-cased (a
       diagnostic line is printed in that case).

    Args:
        query: Free-text search query. Non-string or blank input yields ``None``.

    Returns:
        The canonical city name (rule-based hit), the title-cased NER token
        (NER hit), or ``None``.
    """
    cities = {
        "hồ chí minh": "Hồ Chí Minh",
        "tp hồ chí minh": "Hồ Chí Minh",
        "tp.hcm": "Hồ Chí Minh",
        "sài gòn": "Hồ Chí Minh",
        "hà nội": "Hà Nội",
        "đà nẵng": "Đà Nẵng",
        "phú quốc": "Phú Quốc",
        "nha trang": "Nha Trang",
        "hội an": "Hội An",
        "đà lạt": "Đà Lạt",
        "sa pa": "Sa Pa",
        "sapa": "Sa Pa",
        "huế": "Huế",
        "vũng tàu": "Vũng Tàu"
    }

    if not isinstance(query, str) or not query.strip():
        return None

    # Vietnamese text can arrive decomposed (NFD); the alias keys are
    # precomposed, so compose the query before comparing.
    query_nfc = unicodedata.normalize("NFC", query)
    query_lower = query_nfc.lower()

    # Rule-based. Aliases must match as whole words so that a short alias such
    # as "huế" does not fire inside an unrelated word such as "thuế".
    for alias, canonical in cities.items():
        if re.search(rf"(?<!\w){re.escape(alias)}(?!\w)", query_lower):
            return canonical

    # NER-based fallback; keep the original casing because it is the text the
    # tagger sees.
    for word, _, _, tag in ner(query_nfc):
        if tag.endswith("LOC"):
            print(f'Địa danh nhận dạng: {word}')
            return word.title()

    return None

def prepare_query(query):
    """Split ``query`` into a semantic part and a Milvus location filter.

    Args:
        query: Free-text search query.

    Returns:
        A ``(semantic_query, expr)`` tuple.

        * When ``detect_city`` finds a city, ``semantic_query`` is the
          lower-cased, NFC-normalised query with the city name removed and
          whitespace collapsed, and ``expr`` is ``Location like "%<city>%"``.
          If nothing is left after removing the city, the original query is
          used as the semantic part so the encoder never receives an empty
          string.
        * Otherwise the query is returned unchanged with an empty ``expr``.

    Note:
        Only the name returned by ``detect_city`` is removed. When the query
        used an alias (for example "sapa"), the alias stays in the semantic
        query because the matched alias is not available here.
    """
    city = detect_city(query)
    if not city:
        return query, ""

    # Compare in one Unicode form so a decomposed query still matches.
    query_lower = unicodedata.normalize("NFC", query).lower()
    city_lower = unicodedata.normalize("NFC", city).lower()

    # Replace with a space (not "") so neighbouring words are not glued
    # together, then collapse the resulting runs of whitespace.
    remainder = " ".join(query_lower.replace(city_lower, " ").split())
    semantic_query = remainder if remainder else query

    # The city can originate from user text (NER path), so escape the
    # characters that would terminate the string literal in the filter.
    safe_city = city.replace("\\", "\\\\").replace('"', '\\"')
    expr = f'Location like "%{safe_city}%"'

    return semantic_query, expr

def clean_text_for_query(text):
    """Normalise free text before it is embedded.

    The text is NFC-normalised and lower-cased; every character that is not a
    word character, whitespace, ``/`` or ``-`` becomes a space; runs of
    whitespace are collapsed and the ends are trimmed.

    Args:
        text: Value to clean.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a ``str``.
    """
    if isinstance(text, str):
        lowered = unicodedata.normalize('NFC', text).lower()
        # Punctuation is replaced by a space rather than deleted so that the
        # words on either side stay separate.
        depunctuated = re.sub(r"[^\w\s/\-]", " ", lowered)
        return re.sub(r"\s+", " ", depunctuated).strip()

    return ""

In [27]:
def hybrid_search(query, alpha=0.6, top_k=10):
    """Print the best hotels for ``query`` using a BM25 + vector hybrid score.

    Steps:
      1. Score every document with BM25 (query tokenised by lower-casing and
         whitespace splitting) and scale the scores to [0, 1] by the best
         BM25 score in the corpus.
      2. Derive the semantic query and the location filter with
         ``prepare_query``, clean the semantic query, embed it and search
         Milvus for the ``top_k`` nearest vectors.
      3. Inner-join both score sets on ``HotelID`` (so only the Milvus hits
         are ranked), blend them as
         ``alpha * semantic + (1 - alpha) * bm25`` and print the result.

    Args:
        query: Free-text search query.
        alpha: Weight of the semantic score; ``1 - alpha`` goes to BM25.
        top_k: Number of vector hits to request and rows to print.

    Returns:
        None. Results are printed.
    """
    # --- lexical side -----------------------------------------------------
    bm25_raw = bm25.get_scores(query.lower().split())
    # Raw BM25 scores are unbounded while the COSINE score is bounded, so an
    # unscaled blend is dominated by BM25 and `alpha` loses its meaning.
    bm25_peak = bm25_raw.max() if len(bm25_raw) else 0.0
    bm25_scaled = bm25_raw / bm25_peak if bm25_peak > 0 else bm25_raw * 0.0

    # --- semantic side ----------------------------------------------------
    semantic_query, expr = prepare_query(query)
    # Fall back to the raw query if cleaning leaves nothing to embed.
    semantic_query = clean_text_for_query(semantic_query) or query

    # Embed the cleaned, city-stripped text (not the raw query): the city is
    # already enforced by `expr`.
    query_emb = model.encode([semantic_query], normalize_embeddings=True)

    # `ef` is the HNSW search-time parameter and must be at least `limit`;
    # M / efConstruction are index-build parameters and do not belong here.
    # NOTE(review): assumes the collection's index is HNSW - confirm.
    search_params = {"metric_type": "COSINE", "params": {"ef": max(64, top_k)}}

    results = collection.search(
        data=query_emb,
        anns_field="TextForEmbedding",
        param=search_params,
        limit=top_k,
        expr=expr or None,  # no filter when no city was detected
        output_fields=["HotelID"],
    )

    milvus_hits = [
        (hit.entity.get("HotelID"), hit.distance)
        for hits in results
        for hit in hits
    ]
    if not milvus_hits:
        # Typically an over-restrictive location filter.
        print("Không tìm thấy khách sạn phù hợp.")
        return

    milvus_df = pd.DataFrame(milvus_hits, columns=["HotelID", "semantic_score"])

    # --- fusion -----------------------------------------------------------
    # Carry the display columns through the merge so no per-row lookup in
    # `df` is needed when printing.
    lexical_df = df[["HotelID", "Name Hotel", "Descriptions"]].assign(bm25_score=bm25_scaled)
    merged = lexical_df.merge(milvus_df, on="HotelID", how="inner")
    merged["final_score"] = alpha * merged["semantic_score"] + (1 - alpha) * merged["bm25_score"]
    merged = merged.sort_values(by="final_score", ascending=False).head(top_k)

    for _, row in merged.iterrows():
        print(f"🏨 {row['Name Hotel']} (Score: {row['final_score']:.4f})")
        print(f"   {row['Descriptions']}\n")

# Test

In [36]:
# Smoke test. "Đạt" is not one of the aliases known to detect_city, so this
# query exercises the NER fallback (see the diagnostic line in the output).
query = "homestay có hồ bơi gần biển ở Đạt"
hybrid_search(query)

Địa danh nhận dạng: Đạt
