In [53]:
pip install rank-bm25

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2
Note: you may need to restart the kernel to use updated packages.


In [9]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize

In [22]:
df = pd.read_csv('dataset.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3896 entries, 0 to 3895
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HotelID       3896 non-null   int64  
 1   Name Hotel    3896 non-null   object 
 2   Address       3896 non-null   object 
 3   Location      3896 non-null   object 
 4   Descriptions  3896 non-null   object 
 5   Rating        3896 non-null   float64
 6   IsEnglish     3896 non-null   bool   
dtypes: bool(1), float64(1), int64(1), object(4)
memory usage: 186.6+ KB


In [4]:
df['Location'].unique()

array(['Ph√∫ Qu·ªëc', 'TP. H·ªì Ch√≠ Minh', 'ƒê√† N·∫µng', 'H√† N·ªôi', 'Nha Trang',
       'H·ªôi An', 'ƒê√† L·∫°t', 'Sa Pa', 'Hu·∫ø', 'V≈©ng T√†u'], dtype=object)

In [5]:
import re

def get_special_chars(df):
    # G·ªôp t·∫•t c·∫£ text trong dataframe (ch·ªâ c√°c c·ªôt ki·ªÉu object ho·∫∑c string)
    text_data = ' '.join(df.select_dtypes(include=['object']).astype(str).values.flatten())
    
    # T√¨m t·∫•t c·∫£ k√Ω t·ª± kh√¥ng ph·∫£i ch·ªØ, s·ªë ho·∫∑c kho·∫£ng tr·∫Øng
    special_chars = re.findall(r'[^\w\s]', text_data)
    
    # L·∫•y unique v√† s·∫Øp x·∫øp
    unique_chars = sorted(set(special_chars))
    
    print(f"üîé C√≥ {len(unique_chars)} k√Ω t·ª± ƒë·∫∑c bi·ªát kh√°c nhau trong d·ªØ li·ªáu:\n")
    print(' '.join(unique_chars))
    return unique_chars


In [6]:
specials = get_special_chars(df)

üîé C√≥ 37 k√Ω t·ª± ƒë·∫∑c bi·ªát kh√°c nhau trong d·ªØ li·ªáu:

! " # % & ' ( ) * + , - . / : ; @ [ ] ` | ~ ¬≠ ÃÄ ÃÅ ÃÇ ÃÉ Ãâ Ãõ Ã£ ‚Äã ‚Äì ‚Äô ‚Äú ‚Äù ‚òÜ ‚ú©


In [10]:
import re
import unicodedata

In [8]:
def clean_vietnamese_text(text):
    if not isinstance(text, str):
        return ""
        
    text = unicodedata.normalize('NFC', text)
    
    text = text.lower()

    text = text.replace('%', ' ph·∫ßn trƒÉm')
    
    text = re.sub(r'/\s*ƒë√™m', ' m·ªói ƒë√™m', text, flags=re.IGNORECASE)
    text = re.sub(r"[^\w\s\-,.]", " ", text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [9]:
text = "xin ch√†O"
print(clean_vietnamese_text(text))

xin ch√†o


# Embedding v·ªõi Sentence-BERT

# Load Sentence-BERT

In [12]:
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
# sbert_model = SentenceTransformer('keepitreal/vietnamese-sbert')

# Gh√©p c√°c tr∆∞·ªùng l·∫°i th√†nh 1 text ƒë·ªÉ embedding

In [11]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3896 entries, 0 to 3895
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HotelID       3896 non-null   int64  
 1   Name Hotel    3896 non-null   object 
 2   Address       3896 non-null   object 
 3   Location      3896 non-null   object 
 4   Descriptions  3896 non-null   object 
 5   Rating        3896 non-null   float64
 6   IsEnglish     3896 non-null   bool   
dtypes: bool(1), float64(1), int64(1), object(4)
memory usage: 186.6+ KB
None


In [12]:
df.isnull().sum()

HotelID         0
Name Hotel      0
Address         0
Location        0
Descriptions    0
Rating          0
IsEnglish       0
dtype: int64

In [13]:
df["TextForEmbedding"] = df["Descriptions"].apply(clean_vietnamese_text)

# Sinh embedding cho to√†n b·ªô dataset

In [14]:
embeddings = sbert_model.encode(df["TextForEmbedding"].tolist(), batch_size=32, show_progress_bar=True)
embeddings = normalize(embeddings, axis=1)  # chu·∫©n h√≥a theo h√†ng

Batches:   0%|          | 0/122 [00:00<?, ?it/s]

In [15]:
print("Embedding shape:", embeddings.shape)

Embedding shape: (3896, 768)


In [16]:
print("‚úÖ Embedding done!")
print("Ki·ªÉu d·ªØ li·ªáu:", type(embeddings))     # numpy.ndarray
print("Shape:", embeddings.shape)            # (s·ªë_samples, s·ªë_chi·ªÅu) v√≠ d·ª• (1000, 384)
print("Dtype:", embeddings.dtype)            # float32 ho·∫∑c float64

‚úÖ Embedding done!
Ki·ªÉu d·ªØ li·ªáu: <class 'numpy.ndarray'>
Shape: (3896, 768)
Dtype: float32


In [17]:
norms = np.linalg.norm(embeddings, axis=1)
print(norms[:10])

[1.0000001  1.         0.99999994 1.0000001  1.0000001  1.
 1.         0.99999994 1.         1.        ]


# L∆∞u v√†o Milvus

In [3]:
from pymilvus import FieldSchema, CollectionSchema, DataType, Collection, utility, connections

# K·∫øt n·ªëi ƒë·∫øn Milvus server

In [14]:
connections.connect(alias="default",host="localhost",port="19530")
collection_name = "hotels_collection_mpnet_base_v2"

# T·∫°o collection schema

In [20]:
# Ki·ªÉm tra xem collection ƒë√£ t·ªìn t·∫°i ch∆∞a
if utility.has_collection(collection_name):
    print(f"‚ö†Ô∏è Collection '{collection_name}' ƒë√£ t·ªìn t·∫°i, x√≥a v√† t·∫°o m·ªõi l·∫°i...")
    utility.drop_collection(collection_name)

# T·∫°o m·ªõi collection
print(f"üöÄ T·∫°o m·ªõi collection '{collection_name}'...")

fields = [
    FieldSchema(name="HotelID", dtype=DataType.INT64, is_primary=True, auto_id=False),
    FieldSchema(name="TextForEmbedding", dtype=DataType.FLOAT_VECTOR, dim=768),  # Sentence-BERT = 384, PhoBERT = 768
    FieldSchema(name="Location", dtype=DataType.VARCHAR, max_length=100), 
]

schema = CollectionSchema(fields=fields, description="Hotel dataset embeddings")

collection = Collection(name=collection_name, schema=schema)
print(f"‚úÖ Collection '{collection_name}' ƒë√£ ƒë∆∞·ª£c t·∫°o m·ªõi th√†nh c√¥ng!")

üöÄ T·∫°o m·ªõi collection 'hotels_collection_mpnet_base_v2'...
‚úÖ Collection 'hotels_collection_mpnet_base_v2' ƒë√£ ƒë∆∞·ª£c t·∫°o m·ªõi th√†nh c√¥ng!


# Chu·∫©n b·ªã d·ªØ li·ªáu ƒë·ªÉ insert

In [21]:
hotel_ids = df["HotelID"].astype(int).tolist()
locations = df["Location"].astype(str).str.strip().tolist()

In [22]:
print(type(embeddings))
print(type(hotel_ids))
print(type(locations))

<class 'numpy.ndarray'>
<class 'list'>
<class 'list'>


# Insert

In [23]:
collection.insert([
        hotel_ids,
        embeddings.tolist(),
        locations,
    ])

collection.flush()
print(f"üìä S·ªë l∆∞·ª£ng b·∫£n ghi: {collection.num_entities}")

üìä S·ªë l∆∞·ª£ng b·∫£n ghi: 3896


In [24]:
index_params = {
    "index_type": "HNSW",
    "metric_type": "COSINE",
    "params": {"M": 8, "efConstruction": 64}
}
# T·∫°o index cho field vector
collection.release()
collection.drop_index()
collection.create_index(field_name="TextForEmbedding", index_params=index_params)

Status(code=0, message=)

In [15]:
collection = Collection(collection_name)
collection.load()

# demo

In [16]:
def clean_text_for_query(text):
    if not isinstance(text, str):
        return ""

    text = unicodedata.normalize('NFC', text)
    text = text.lower()

    text = re.sub(r"[^\w\s/\-]", " ", text)

    text = re.sub(r"\s+", " ", text).strip()

    return text

In [17]:
from underthesea import ner

def detect_city(query):
    cities = {
        "h·ªì ch√≠ minh": "H·ªì Ch√≠ Minh",
        "tp h·ªì ch√≠ minh": "H·ªì Ch√≠ Minh",
        "tp.hcm": "H·ªì Ch√≠ Minh",
        "s√†i g√≤n": "H·ªì Ch√≠ Minh",
        "h√† n·ªôi": "H√† N·ªôi",
        "ƒë√† n·∫µng": "ƒê√† N·∫µng",
        "ph√∫ qu·ªëc": "Ph√∫ Qu·ªëc",
        "nha trang": "Nha Trang",
        "h·ªôi an": "H·ªôi An",
        "ƒë√† l·∫°t": "ƒê√† L·∫°t",
        "sa pa": "Sa Pa",
        "sapa": "Sa Pa",
        "hu·∫ø": "Hu·∫ø",
        "v≈©ng t√†u": "V≈©ng T√†u"
    }
    query_lower = query.lower()
    print(ner(query))
    # Rule-based
    for k, v in cities.items():
        if k in query_lower:
            return v
    
    # NER-based
    for word, _, _, tag in ner(query):
        if tag.endswith("LOC"):
            print(f'ƒê·ªãa danh nh·∫≠n d·∫°ng: {word}')
            return word.title()
    
    return None

def prepare_query(query):
    city = detect_city(query)
    if city:
        semantic_query = query.lower().replace(city.lower(), "").strip()
        expr = f'Location like "%{city}%"'
    else:
        semantic_query = query
        expr = ""
    
    return semantic_query, expr

In [21]:
query = "homestay c√≥ h·ªì b∆°i g·∫ßn bi·ªÉn C√† Mau"

semantic_query, expr = prepare_query(query)

semantic_query = clean_text_for_query(semantic_query)

query_emb = sbert_model.encode([query], normalize_embeddings=True)

search_params = {
    "metric_type": "COSINE",   # d√πng ƒë·ªô t∆∞∆°ng ƒë·ªìng cosine
    "params": {"M": 8, "efConstruction": 64}
}

results = collection.search(
    data=query_emb,          # vector c·ªßa query
    anns_field="TextForEmbedding",     # t√™n field ch·ª©a vector trong Milvus
    param=search_params,
    limit=10,                    # tr·∫£ v·ªÅ top 5 k·∫øt qu·∫£ g·∫ßn nh·∫•t
    expr=expr,
    output_fields=["HotelID"]
)

result_data = []

for i, hits in enumerate(results):
    for hit in hits:
        result_data.append({
            "HotelID": hit.entity.get("HotelID"),
            "Similarity": hit.distance
        })

print("\nüìÑ Th√¥ng tin kh√°ch s·∫°n t∆∞∆°ng ·ª©ng:\n")
print(f"Query: {query}\n")
for item in result_data:
    hotel_id = item["HotelID"]
    similarity = item["Similarity"]

    row = df[df["HotelID"] == hotel_id]

    if not row.empty:
        name = row.iloc[0].get("Name Hotel", "Kh√¥ng c√≥ t√™n")
        description = row.iloc[0].get("Descriptions", "Kh√¥ng c√≥ m√¥ t·∫£")
        print(f"üè® {name} (ID: {hotel_id}, Similarity: {similarity:.4f})")
        print(f"   üëâ {description}\n")
    else:
        print(f"‚ö†Ô∏è Kh√¥ng t√¨m th·∫•y HotelID {hotel_id} trong CSV\n")

[('homestay', 'N', 'B-NP', 'O'), ('c√≥', 'V', 'B-VP', 'O'), ('h·ªì', 'N', 'B-NP', 'O'), ('b∆°i', 'V', 'B-VP', 'O'), ('g·∫ßn', 'A', 'B-AP', 'O'), ('bi·ªÉn', 'N', 'B-NP', 'O'), ('C√† Mau', 'Np', 'B-NP', 'B-LOC')]
ƒê·ªãa danh nh·∫≠n d·∫°ng: C√† Mau

üìÑ Th√¥ng tin kh√°ch s·∫°n t∆∞∆°ng ·ª©ng:

Query: homestay c√≥ h·ªì b∆°i g·∫ßn bi·ªÉn C√† Mau



In [9]:
üìÑ Th√¥ng tin kh√°ch s·∫°n t∆∞∆°ng ·ª©ng:

Query: homestay c√≥ h·ªì b∆°i g·∫ßn bi·ªÉn ·ªü Ph√∫ Qu·ªëc

üè® Homestead Seaview Ph√∫ Qu·ªëc Hotel (ID: 251, Similarity: 0.7658)
   üëâ N·∫±m tr√™n ƒë·∫£o Ph√∫ Qu·ªëc, Homestead Seaview Ph√∫ Qu·ªëc Hotel c√≥ nh√† h√†ng, h·ªì b∆°i ngo√†i tr·ªùi, s·∫£nh kh√°ch chung v√† khu v∆∞·ªùn. Ch·ªó ngh·ªâ n√†y c≈©ng c√≥ c√°c ph√≤ng gia ƒë√¨nh v√† s√¢n hi√™n. Ch·ªó ngh·ªâ cung c·∫•p d·ªãch v·ª• l·ªÖ t√¢n 24 gi·ªù, m√°y ATM v√† d·ªãch v·ª• thu ƒë·ªïi ngo·∫°i t·ªá cho kh√°ch. Ph√≤ng ngh·ªâ c·ªßa kh√°ch s·∫°n ƒë∆∞·ª£c trang b·ªã m√°y ƒëi·ªÅu h√≤a, TV m√†n h√¨nh ph·∫≥ng v·ªõi truy·ªÅn h√¨nh v·ªá tinh, t·ªß l·∫°nh, ·∫•m ƒëun n∆∞·ªõc, v√≤i sen, m√°y s·∫•y t√≥c v√† b√†n l√†m vi·ªác. CaÃÅc phoÃÄng coÃÄn coÃÅ t·ªß ƒë·ªÉ qu·∫ßn √°o vaÃÄ phoÃÄng tƒÉÃÅm ri√™ng. Homestead Seaview Ph√∫ Qu·ªëc Hotel ph·ª•c v·ª• b·ªØa s√°ng t·ª± ch·ªçn ho·∫∑c b·ªØa s√°ng √† la carte. ƒêi xe ƒë·∫°p l√† ho·∫°t ƒë·ªông ƒë∆∞·ª£c ∆∞a chu·ªông trong khu v·ª±c v√† du kh√°ch c≈©ng c√≥ th·ªÉ thu√™ xe h∆°i t·∫°i ch·ªó ngh·ªâ. Homestead Seaview Ph√∫ Qu·ªëc Hotel n·∫±m c√°ch Ch√πa S√πng H∆∞ng 4 km v√† S√≤ng b·∫°c Corona 26 km.

üè® HS Beach House Phu Quoc Island (ID: 3128, Similarity: 0.7649)
   üëâ T·ªça l·∫°c t·∫°i ƒë·∫£o Ph√∫ Qu·ªëc, c√°ch B√£i D√†i 1,8 km, HS Beach House Phu Quoc Island cung c·∫•p ch·ªó ngh·ªâ b√™n b·ªù bi·ªÉn v·ªõi nhi·ªÅu ti·ªán nghi nh∆∞ v∆∞·ªùn, khu v·ª±c b√£i bi·ªÉn ri√™ng v√† s√≤ng b·∫°c. Ch·ªó ngh·ªâ n√†y cung c·∫•p c√°c ph√≤ng gia ƒë√¨nh v√† ti·ªán nghi BBQ. Ch·ªó ngh·ªâ c√≥ b·∫øp chung, d·ªãch v·ª• ph√≤ng v√† t·ªï ch·ª©c c√°c tour du l·ªãch cho kh√°ch. M·ªói ph√≤ng ngh·ªâ t·∫°i nh√† kh√°ch ƒë·ªÅu c√≥ maÃÅy ƒëi·ªÅu h√≤a, b√†n l√†m vi·ªác, TV m√†n h√¨nh ph·∫≥ng, ph√≤ng t·∫Øm ri√™ng, ga tr·∫£i gi∆∞·ªùng, khƒÉn t·∫Øm v√† s√¢n trong nh√¨n ra bi·ªÉn. T·∫•t c·∫£ c√°c cƒÉn ƒë∆∞·ª£c b·ªë tr√≠ t·ªß ƒë·ªÉ qu·∫ßn √°o v√† ·∫•m ƒëun n∆∞·ªõc. HS Beach House Phu Quoc Island c√≥ s√¢n hi√™n. B√£i bi·ªÉn √îng Lang n·∫±m trong b√°n k√≠nh 3 km t·ª´ ch·ªó ngh·ªâ trong khi Ch√πa S√πng H∆∞ng c√°ch ƒë√≥ 2,6 km.

üè® Phu Nam House - Resort (ID: 1847, Similarity: 0.7600)
   üëâ "Phu Nam House - Resort" cung c·∫•p ch·ªó ngh·ªâ t·∫°i m·ªôt v·ªã tr√≠ tuy·ªát ƒë·∫πp ·ªü ƒë·∫£o Ph√∫ Qu·ªëc, c√°ch B√£i Sao v√† B√£i Khem m·ªôt qu√£ng ng·∫Øn. Ch·ªó ngh·ªâ n√†y c√≥ v∆∞·ªùn, t·∫ßm nh√¨n ra v∆∞·ªùn v√† WiFi mi·ªÖn ph√≠. Lodge n√†y n·∫±m c√°ch Ch√πa S√πng H∆∞ng 26 km v√† S√≤ng b·∫°c Corona 44 km.

üè® Mai Binh Phuong Bungalow (ID: 2100, Similarity: 0.7596)
   üëâ T·ªça l·∫°c t·∫°i ƒë·∫£o Ph√∫ Qu·ªëc, c√°ch B√£i D√†i 400 m, Mai Binh Phuong Bungalow cung c·∫•p ch·ªó ngh·ªâ v·ªõi h·ªì b∆°i ngo√†i tr·ªùi, ch·ªó ƒë·ªó xe ri√™ng mi·ªÖn ph√≠, v∆∞·ªùn v√† s√¢n hi√™n. Trong s·ªë c√°c ti·ªán nghi c·ªßa ch·ªó ngh·ªâ n√†y c√≥ d·ªãch v·ª• ti·ªÅn s·∫£nh, b√†n ƒë·∫∑t tour v√† WiFi mi·ªÖn ph√≠ trong to√†n b·ªô khu√¥n vi√™n. N∆°i ƒë√¢y cung c·∫•p d·ªãch v·ª• l·ªÖ t√¢n 24 gi·ªù, d·ªãch v·ª• ph√≤ng v√† d·ªãch v·ª• thu ƒë·ªïi ngo·∫°i t·ªá cho kh√°ch. T·∫•t c·∫£ ph√≤ng ngh·ªâ t·∫°i Mai Binh Phuong Bungalow ƒë∆∞·ª£c b·ªë tr√≠ b√†n l√†m vi·ªác, TV, maÃÅy ƒëi·ªÅu h√≤a v√† ph√≤ng t·∫Øm ri√™ng v·ªõi v√≤i sen c√πng d√©p. M·ªôt s·ªë ph√≤ng c√≥ s√¢n trong. M·ªói ph√≤ng ƒë·ªÅu ƒë∆∞·ª£c trang b·ªã ga tr·∫£i gi∆∞·ªùng v√† khƒÉn t·∫Øm. Mai Binh Phuong Bungalow ph·ª•c v·ª• b·ªØa s√°ng buffet h·∫±ng ng√†y. Du kh√°ch c√≥ th·ªÉ ch∆°i bida c≈©ng nh∆∞ thu√™ xe h∆°i t·∫°i nh√† kh√°ch. C√°c ƒëi·ªÉm tham quan n·ªïi ti·∫øng g·∫ßn Mai Binh Phuong Bungalow bao g·ªìm B√£i bi·ªÉn Dinh C·∫≠u, Ch√πa S√πng H∆∞ng v√† Ch·ª£ ƒë√™m Ph√∫ Qu·ªëc. S√¢n bay g·∫ßn nh·∫•t l√† s√¢n bay qu·ªëc t·∫ø Ph√∫ Qu·ªëc, c√°ch ƒë√≥ 9 km, v√† ch·ªó ngh·ªâ cung c·∫•p d·ªãch v·ª• ƒë∆∞a ƒë√≥n s√¢n bay v·ªõi m·ªôt kho·∫£n ph·ª• ph√≠.

üè® Mai Phuong Resort Phu Quoc (ID: 344, Similarity: 0.7584)
   üëâ Ch·ªó ngh·ªâ n√†y n·∫±m ngay tr√™n b√£i bi·ªÉn ri√™ng B√£i D√†i v√† c√≥ nh√† h√†ng Vi·ªát Nam c≈©ng nh∆∞ Wi-Fi mi·ªÖn ph√≠ trong to√†n b·ªô khu√¥n vi√™n. Mai Phuong Resort Phu Quoc n·∫±m c√°ch trung t√¢m th√†nh ph·ªë v√† s√¢n bay qu·ªëc t·∫ø Ph√∫ Qu·ªëc 25 km. ƒê∆∞·ª£c l√†m m√°t b·∫±ng m√°y ƒëi·ªÅu h√≤a v√† qu·∫°t m√°y, c√°c bungalow t·∫°i ƒë√¢y c√≥ TV truy·ªÅn h√¨nh c√°p, khu v·ª±c gh·∫ø ng·ªìi c≈©ng nh∆∞ ph√≤ng t·∫Øm ri√™ng ƒëi k√®m ti√™Ã£n nghi v√≤i sen, b·ªìn t·∫Øm, ƒë·ªì v·ªá sinh c√° nh√¢n vaÃÄ m√°y s·∫•y t√≥c. D·ªãch v·ª• ph√≤ng cuÃÉng ƒë∆∞·ª£c cung c·∫•p cho kh√°ch. Kh√°ch c√≥ th·ªÉ ƒë·∫øn qu·∫ßy l·ªÖ t√¢n 24 gi·ªù ƒë·ªÉ ƒë∆∞·ª£c h·ªó tr·ª£ v·ªõi c√°c d·ªãch v·ª• gi·∫∑t l√†, cho thu√™ ph∆∞∆°ng ti·ªán ƒëi l·∫°i v√† ƒë∆∞a ƒë√≥n s√¢n bay ho·∫∑c t·ªõi b√†n ƒë·∫∑t tour ƒë·ªÉ s·∫Øp x·∫øp vi·ªác ƒëi l·∫°i.


SyntaxError: invalid character 'üìÑ' (U+1F4C4) (1384534573.py, line 1)