In [1]:
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from pymilvus import connections, Collection
import pandas as pd
from underthesea import ner
import re
import unicodedata

# Kết nối Milvus

In [3]:
# Register the "default" connection to the Milvus server on localhost:19530,
# then bind a handle to the hotel collection used by the query/search cells below.
connections.connect(alias="default", host="localhost", port="19530")
collection = Collection(name="hotels_collection_mpnet_base_v2")

# SBERT - BM25

In [4]:
# Sentence-embedding model; hybrid_search() calls model.encode() on the cleaned query.
model = SentenceTransformer(model_name_or_path="paraphrase-multilingual-mpnet-base-v2")

In [5]:
def detect_city(query):
    """Return the canonical city name mentioned in ``query``, or ``None``.

    Detection runs in two stages:

    1. Rule-based: the lowercased query is scanned for each alias in the
       ``cities`` table (plain substring match, first alias in table order
       wins) and the alias's canonical name is returned.
    2. NER-based: if no alias matches, ``underthesea.ner`` tags the query and
       the first entity whose tag ends with ``"LOC"`` is returned in title
       case, provided that value occurs in ``df["Location"]``.

    Args:
        query: Free-text search string.

    Returns:
        The canonical city name as a string, or ``None`` when nothing is found.
    """
    cities = {
        "h·ªì ch√≠ minh": "H·ªì Ch√≠ Minh",
        "tp h·ªì ch√≠ minh": "H·ªì Ch√≠ Minh",
        "tp.hcm": "H·ªì Ch√≠ Minh",
        "s√†i g√≤n": "H·ªì Ch√≠ Minh",
        "h√† n·ªôi": "H√† N·ªôi",
        "ƒë√† n·∫µng": "ƒê√† N·∫µng",
        "ph√∫ qu·ªëc": "Ph√∫ Qu·ªëc",
        "nha trang": "Nha Trang",
        "h·ªôi an": "H·ªôi An",
        "ƒë√† l·∫°t": "ƒê√† L·∫°t",
        "sa pa": "Sa Pa",
        "sapa": "Sa Pa",
        "hu·∫ø": "Hu·∫ø",
        "v≈©ng t√†u": "V≈©ng T√†u"
    }
    # Compose to NFC before matching: the same text typed in decomposed form
    # (base letter + combining mark) is a different code-point sequence and
    # would otherwise never equal the alias literals above.
    query_lower = unicodedata.normalize("NFC", query).lower()

    # Rule-based
    for k, v in cities.items():
        if k in query_lower:
            return v

    # NER-based
    # NOTE(review): `df` is a module-level DataFrame with a "Location" column
    # that is not defined in the visible cells — confirm it is loaded before
    # this function can reach the NER branch on a fresh kernel.
    known_locations = None
    for word, _, _, tag in ner(query):
        if not tag.endswith("LOC"):
            continue
        if known_locations is None:
            # Built lazily and only once, instead of once per LOC entity.
            known_locations = set(df["Location"].unique())
        candidate = word.title()
        if candidate in known_locations:
            print(f'ƒê·ªãa danh nh·∫≠n d·∫°ng: {word}')
            return candidate

    return None

def clean_text_for_query(text):
    """Turn free text into a compact, lowercase query string.

    The text is NFC-normalised and lowercased; every character that is not a
    word character, whitespace, ``/`` or ``-`` is replaced by a space; runs of
    whitespace are then collapsed to one space and the ends are trimmed.

    Args:
        text: Value to clean. Anything that is not a ``str`` yields ``""``.

    Returns:
        The cleaned string (possibly empty).
    """
    if isinstance(text, str):
        lowered = unicodedata.normalize('NFC', text).lower()
        # Blank out punctuation/symbols but keep "/" and "-" inside tokens.
        without_symbols = re.sub(r"[^\w\s/\-]", " ", lowered)
        return re.sub(r"\s+", " ", without_symbols).strip()
    return ""

In [6]:
def prepare_query(query):
    city = detect_city(query)
    tokenized_query = query.lower().split()
    expr = ""
    bm25_scores = []
    
    if city:
        expr = f'Location like "%{city}%"'
        
        results = collection.query(
            expr=expr,
            output_fields=["HotelID", "Description"]
        )

        if results:
            filtered_docs = [r['Description'] for r in results]
            tokenized_filtered = [d.lower().split() for d in filtered_docs]
            bm25_local = BM25Okapi(tokenized_filtered)
            bm25_scores = bm25_local.get_scores(tokenized_query)

             # Normalize an to√†n
            if bm25_scores.max() != bm25_scores.min():
                bm25_scores = (bm25_scores - bm25_scores.min()) / (bm25_scores.max() - bm25_scores.min())
            else:
                bm25_scores = [0] * len(bm25_scores)
        
    semantic_query = query.lower().replace(city.lower(), "").strip() if city else query
    return semantic_query, expr, bm25_scores

In [22]:
def hybrid_search(query, alpha=1, top_k=20):
    
    semantic_query, expr, bm25_scores = prepare_query(query)
    semantic_query = clean_text_for_query(semantic_query)
    
    query_emb = model.encode([semantic_query], normalize_embeddings=True)
    search_params = {"metric_type": "COSINE", "params": {"M": 8, "efConstruction": 64}}

    results = collection.search(
        data=query_emb,
        anns_field="TextForEmbedding",
        param=search_params,
        limit=top_k,
        expr=expr,
        output_fields=["HotelID", "Description", "NameHotel"]
    )

    milvus_hits = []
    for hits in results:
        for idx, hit in enumerate(hits):
            bm25_score = bm25_scores[idx] if idx < len(bm25_scores) else 0
            milvus_hits.append({
                "HotelID": hit.entity.get("HotelID"),
                "Name Hotel": hit.entity.get("NameHotel"),
                "Descriptions": hit.entity.get("Description"),
                "semantic_score": hit.distance,
                "bm25_score": bm25_score
            })
            

    for h in milvus_hits:
        h["final_score"] = alpha * h["semantic_score"] + (1 - alpha) * h["bm25_score"]

    # S·∫Øp x·∫øp theo final score
    milvus_hits = sorted(milvus_hits, key=lambda x: x["final_score"], reverse=True)[:top_k]
        
    ids = [str(h['HotelID']) for h in milvus_hits]
    print("üìã HotelID list:",",".join(ids))
    print("=" * 80)

    # In k·∫øt qu·∫£
    for h in milvus_hits:
        print(f"üè® {h['Name Hotel']} (HotelId: {h['HotelID']}) (Score: {h['final_score']:.4f}) (Semantic: {h['semantic_score']:.4f}) (BM25: {h['bm25_score']:.4f})")
        print(f"   {h['Descriptions']}\n")

# Test

In [23]:
# Example queries; uncomment one of the alternatives to try a different search.
# query = "Resort l√£ng m·∫°n cho c·∫∑p ƒë√¥i ·ªü ƒê√† N·∫µng"
# query = "Resort g·∫ßn bi·ªÉn y√™n tƒ©nh"
# query = "Kh√°ch s·∫°n cho gia ƒë√¨nh c√≥ tr·∫ª em"
query ="kh√°ch s·∫°n g·∫ßn bi·ªÉn ƒë√† n·∫µng"
# hybrid_search prints the ranked hotels itself and has no return value.
hybrid_search(query)

üìã HotelID list: 342,1074,3496,3937,2969,570,549,3038,131,1151,474,1062,2735,1222,2559,1494,1362,1059,276,129
üè® CODI SEA Hotel & Travel (HotelId: 342) (Score: 0.8688) (Semantic: 0.8688) (BM25: 0.3825)
   T·ªça l·∫°c t·∫°i th√†nh ph·ªë ƒê√† N·∫µng, c√°ch B√£i bi·ªÉn M·ªπ Kh√™ 300 m, CODI SEA Hotel & Travel c√≥ d·ªãch v·ª• ti·ªÅn s·∫£nh, ph√≤ng ngh·ªâ kh√¥ng g√¢y d·ªã ·ª©ng, s·∫£nh kh√°ch chung, WiFi mi·ªÖn ph√≠ trong to√†n b·ªô khu√¥n vi√™n v√† khu v∆∞·ªùn. Kh√°ch s·∫°n 2 sao n√†y c√≥ d·ªãch v·ª• ph√≤ng v√† b√†n ƒë·∫∑t tour. N∆°i ƒë√¢y cung c·∫•p d·ªãch v·ª• l·ªÖ t√¢n 24 gi·ªù, b·∫øp chung v√† d·ªãch v·ª• thu ƒë·ªïi ngo·∫°i t·ªá cho kh√°ch. T·∫•t c·∫£ ph√≤ng ngh·ªâ t·∫°i CODI SEA Hotel & Travel rooms ƒë·ªÅu ƒë∆∞·ª£c trang b·ªã m√°y ƒëi·ªÅu h√≤a, TV truy·ªÅn h√¨nh v·ªá tinh m√†n h√¨nh ph·∫≥ng, t·ªß l·∫°nh, ·∫•m ƒëun n∆∞·ªõc, v√≤i x·ªãt/ch·∫≠u r·ª≠a v·ªá sinh, d√©p, b√†n l√†m vi·ªác, ph√≤ng t·∫Øm ri√™ng v√† t·ªß ƒë·ªÉ qu·∫ßn √°o. M·ªôt s·ªë ph√≤ng c√≥ s√¢n trong v√† t·∫ßm nh√¨n ra qu