In [9]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize

In [22]:
# Load the dataset from a CSV file in the current working directory.
df = pd.read_csv('dataset.csv')

In [3]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3896 entries, 0 to 3895
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HotelID       3896 non-null   int64  
 1   Name Hotel    3896 non-null   object 
 2   Address       3896 non-null   object 
 3   Location      3896 non-null   object 
 4   Descriptions  3896 non-null   object 
 5   Rating        3896 non-null   float64
 6   IsEnglish     3896 non-null   bool   
dtypes: bool(1), float64(1), int64(1), object(4)
memory usage: 186.6+ KB


In [4]:
# Distinct values present in the Location column.
df['Location'].unique()

array(['Ph√∫ Qu·ªëc', 'TP. H·ªì Ch√≠ Minh', 'ƒê√† N·∫µng', 'H√† N·ªôi', 'Nha Trang',
       'H·ªôi An', 'ƒê√† L·∫°t', 'Sa Pa', 'Hu·∫ø', 'V≈©ng T√†u'], dtype=object)

In [5]:
import re

def get_special_chars(df):
    """Report the distinct special characters found in a DataFrame's text columns.

    A "special character" is any single character matched by the regex
    ``[^\\w\\s]``, i.e. neither a word character nor whitespace.

    Args:
        df: DataFrame to scan. Only object-dtype columns are inspected; every
            cell in them is converted with ``str`` first.

    Returns:
        Sorted list of the unique special characters. The count and the
        characters themselves are also printed.
    """
    # Restrict the scan to object-dtype columns and stringify every cell.
    object_columns = df.select_dtypes(include=['object'])
    cells = object_columns.astype(str).values.flatten()

    # One space-joined corpus lets a single regex pass cover the whole frame.
    corpus = ' '.join(cells)

    # De-duplicate the matches with a set, then sort for a stable order.
    unique_chars = sorted({char for char in re.findall(r'[^\w\s]', corpus)})

    print(f"üîé C√≥ {len(unique_chars)} k√Ω t·ª± ƒë·∫∑c bi·ªát kh√°c nhau trong d·ªØ li·ªáu:\n")
    print(' '.join(unique_chars))
    return unique_chars


In [6]:
# Scan the frame's object columns and keep the sorted list of special characters.
specials = get_special_chars(df)

üîé C√≥ 37 k√Ω t·ª± ƒë·∫∑c bi·ªát kh√°c nhau trong d·ªØ li·ªáu:

! " # % & ' ( ) * + , - . / : ; @ [ ] ` | ~ ¬≠ ÃÄ ÃÅ ÃÇ ÃÉ Ãâ Ãõ Ã£ ‚Äã ‚Äì ‚Äô ‚Äú ‚Äù ‚òÜ ‚ú©


In [10]:
import re
import unicodedata

In [8]:
def clean_vietnamese_text(text):
    """Normalize a raw text value before it is embedded.

    Steps, in order: NFC-normalize, remove soft hyphens, lowercase, spell out
    the percent sign, rewrite the "/<night>" suffix, replace every character
    that is not a word character, whitespace, ``-``, ``,`` or ``.`` with a
    space, then collapse whitespace runs and strip both ends.

    Args:
        text: Value to clean. Anything that is not a ``str`` is treated as
            missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    # Compose base letters with their combining marks so accented letters are
    # single code points that \w matches in the filter below.
    text = unicodedata.normalize('NFC', text)

    # U+00AD (soft hyphen) is an invisible in-word hyphenation hint. It is
    # neither \w nor \s, so without this step the punctuation filter would
    # turn it into a space and split the word it sits in.
    text = text.replace('\u00ad', '')

    text = text.lower()

    # Spell out the percent sign as words.
    text = text.replace('%', ' ph·∫ßn trƒÉm')

    # Rewrite the "/ <night>" suffix (optional whitespace after the slash) as words.
    text = re.sub(r'/\s*ƒë√™m', ' m·ªói ƒë√™m', text, flags=re.IGNORECASE)
    # Keep word characters, whitespace, hyphen, comma and period; blank out the rest.
    text = re.sub(r"[^\w\s\-,.]", " ", text)
    # Collapse whitespace runs produced by the substitutions above.
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [9]:
# Smoke test: a mixed-case input should come back lowercased.
text = "xin ch√†O"
print(clean_vietnamese_text(text))

xin ch√†o


# Embedding với Sentence-BERT

# Load Sentence-BERT

In [12]:
from sentence_transformers import SentenceTransformer
# Load the multilingual MPNet paraphrase checkpoint as the sentence encoder.
sbert_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
# Alternative checkpoint, currently disabled:
# sbert_model = SentenceTransformer('keepitreal/vietnamese-sbert')

# Ghép các trường lại thành 1 text để embedding

In [11]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called bare; wrapping it in print() only appended a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3896 entries, 0 to 3895
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HotelID       3896 non-null   int64  
 1   Name Hotel    3896 non-null   object 
 2   Address       3896 non-null   object 
 3   Location      3896 non-null   object 
 4   Descriptions  3896 non-null   object 
 5   Rating        3896 non-null   float64
 6   IsEnglish     3896 non-null   bool   
dtypes: bool(1), float64(1), int64(1), object(4)
memory usage: 186.6+ KB
None


In [12]:
# Number of missing values per column.
df.isnull().sum()

HotelID         0
Name Hotel      0
Address         0
Location        0
Descriptions    0
Rating          0
IsEnglish       0
dtype: int64

In [13]:
# Store the cleaned description of each row in a new column used as encoder input.
df["TextForEmbedding"] = df["Descriptions"].apply(clean_vietnamese_text)

# Sinh embedding cho toàn bộ dataset

In [14]:
# Encode every cleaned description in batches of 32, then normalize each row
# (axis=1) using sklearn's default norm.
embeddings = normalize(
    sbert_model.encode(
        df["TextForEmbedding"].tolist(),
        batch_size=32,
        show_progress_bar=True,
    ),
    axis=1,
)

Batches:   0%|          | 0/122 [00:00<?, ?it/s]

In [15]:
# Shape of the embedding matrix: (number of rows, embedding dimension).
print("Embedding shape:", embeddings.shape)

Embedding shape: (3896, 768)


In [16]:
# Quick diagnostics on the embedding matrix.
print("‚úÖ Embedding done!")
print("Ki·ªÉu d·ªØ li·ªáu:", type(embeddings))     # container type (numpy.ndarray in the recorded run)
print("Shape:", embeddings.shape)            # (n_samples, n_dims)
print("Dtype:", embeddings.dtype)            # element type (float32 in the recorded run)

‚úÖ Embedding done!
Ki·ªÉu d·ªØ li·ªáu: <class 'numpy.ndarray'>
Shape: (3896, 768)
Dtype: float32


In [17]:
# Sanity check: vector norm of each embedding row; print the first ten.
norms = np.linalg.norm(embeddings, axis=1)
print(norms[:10])

[1.0000001  1.         0.99999994 1.0000001  1.0000001  1.
 1.         0.99999994 1.         1.        ]


# Lưu vào Milvus

In [3]:
from pymilvus import FieldSchema, CollectionSchema, DataType, Collection, utility, connections

# Kết nối đến Milvus server

In [14]:
# Register the "default" connection against the Milvus server on localhost.
connections.connect(
    alias="default",
    host="localhost",
    port="19530",
)

# Name of the collection used by the cells below.
collection_name = "hotels_collection_mpnet_base_v2"

# Tạo collection schema

In [20]:
# Start from a clean slate: if a collection with this name already exists,
# drop it before creating the new one. This deletes its data.
if utility.has_collection(collection_name):
    print(f"‚ö†Ô∏è Collection '{collection_name}' ƒë√£ t·ªìn t·∫°i, x√≥a v√† t·∫°o m·ªõi l·∫°i...")
    utility.drop_collection(collection_name)

# Create the new collection.
print(f"üöÄ T·∫°o m·ªõi collection '{collection_name}'...")

# Take the vector dimension from the embeddings that will be inserted, so the
# schema cannot drift from the encoder if the model is swapped.
embedding_dim = int(embeddings.shape[1])

fields = [
    # Primary key supplied by the dataset (no auto-generated ids).
    FieldSchema(name="HotelID", dtype=DataType.INT64, is_primary=True, auto_id=False),
    # Dense sentence embedding of the cleaned description.
    FieldSchema(name="TextForEmbedding", dtype=DataType.FLOAT_VECTOR, dim=embedding_dim),
    # Location label stored as a scalar field next to the vector.
    FieldSchema(name="Location", dtype=DataType.VARCHAR, max_length=100),
]

schema = CollectionSchema(fields=fields, description="Hotel dataset embeddings")

collection = Collection(name=collection_name, schema=schema)
print(f"‚úÖ Collection '{collection_name}' ƒë√£ ƒë∆∞·ª£c t·∫°o m·ªõi th√†nh c√¥ng!")

üöÄ T·∫°o m·ªõi collection 'hotels_collection_mpnet_base_v2'...
‚úÖ Collection 'hotels_collection_mpnet_base_v2' ƒë√£ ƒë∆∞·ª£c t·∫°o m·ªõi th√†nh c√¥ng!


# Chuẩn bị dữ liệu để insert

In [21]:
# Column-wise payload for the insert: primary keys as a list of ints ...
hotel_ids = df["HotelID"].astype(int).tolist()
# ... and location labels as whitespace-stripped strings, in the same row order.
locations = df["Location"].astype(str).str.strip().tolist()

In [22]:
# Confirm the container type of each payload column before inserting.
print(type(embeddings))
print(type(hotel_ids))
print(type(locations))

<class 'numpy.ndarray'>
<class 'list'>
<class 'list'>


# Insert

In [23]:
# Column-wise insert: one list per field, passed as ids, vectors, locations.
# The numpy matrix is converted to nested Python lists first.
collection.insert([hotel_ids, embeddings.tolist(), locations])

# Flush, then report the entity count.
collection.flush()
print(f"üìä S·ªë l∆∞·ª£ng b·∫£n ghi: {collection.num_entities}")

üìä S·ªë l∆∞·ª£ng b·∫£n ghi: 3896


# Tạo index

In [24]:
# HNSW graph index searched with cosine similarity.
index_params = {
    "index_type": "HNSW",
    "metric_type": "COSINE",
    "params": {"M": 8, "efConstruction": 64}
}

# Release the collection before touching its index.
collection.release()
# Drop a previous index only if one exists: some pymilvus releases raise when
# asked to drop an index from a collection that has none, e.g. one that was
# just recreated.
if collection.has_index():
    collection.drop_index()

# Create the index on the vector field.
collection.create_index(field_name="TextForEmbedding", index_params=index_params)

Status(code=0, message=)