In [1]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize

In [2]:
# Load the hotel dataset from 'dataset.csv' (a path relative to the working directory) into a DataFrame.
df = pd.read_csv('dataset.csv')

In [3]:
# Print column names, dtypes, non-null counts and memory usage for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3889 entries, 0 to 3888
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HotelID       3889 non-null   int64  
 1   Name Hotel    3889 non-null   object 
 2   Location      3889 non-null   object 
 3   Descriptions  3889 non-null   object 
 4   Rating        3889 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 152.0+ KB


In [4]:
# List the distinct values found in the Location column.
df['Location'].unique()

array(['Ph√∫ Qu·ªëc', 'TP. H·ªì Ch√≠ Minh', 'ƒê√† N·∫µng', 'H√† N·ªôi', 'Nha Trang',
       'H·ªôi An', 'ƒê√† L·∫°t', 'Sa Pa', 'Hu·∫ø', 'V≈©ng T√†u'], dtype=object)

In [5]:
# Collect every row whose HotelID occurs more than once.
# keep=False flags all occurrences of a repeated id, not only the second and later ones.
duplicates = df[df.duplicated(subset="HotelID", keep=False)]
print(duplicates)

      HotelID         Name Hotel         Location  \
18         21         Ruby Hotel        Nha Trang   
19         21         Ruby Hotel          ƒê√† N·∫µng   
123       140   The Sun Homestay              Hu·∫ø   
124       140   The Sun Homestay           H·ªôi An   
391       448        Ohana Hotel           H√† N·ªôi   
392       448        Ohana Hotel  TP. H·ªì Ch√≠ Minh   
640       736           An Hotel           H√† N·ªôi   
641       736           An Hotel        Nha Trang   
714       824          Mai Hotel           H√† N·ªôi   
715       824          Mai Hotel         V≈©ng T√†u   
1465     1706     My Dream Hotel            Sa Pa   
1466     1706     My Dream Hotel           ƒê√† L·∫°t   
1511     1759   Hoang Linh Hotel         V≈©ng T√†u   
1512     1759   Hoang Linh Hotel          ƒê√† N·∫µng   
2608     3033   Phuong Nam Hotel            Sa Pa   
2609     3033   Phuong Nam Hotel         V≈©ng T√†u   
3140     3632       Prague Hotel  TP. H·ªì Ch√≠ Minh   
3141     

In [6]:
import re

def get_special_chars(df):
    """Report the distinct non-word, non-whitespace characters in a frame's text columns.

    Only columns of dtype ``object`` are inspected. Their cells are converted
    to ``str`` and joined into one long string, which is then scanned with the
    regular expression ``[^\\w\\s]``.

    Args:
        df: DataFrame to scan.

    Returns:
        Sorted list of the unique matching characters. The count and the
        space-separated characters are also printed.
    """
    # Restrict the scan to object-dtype columns and flatten every cell into one sequence of strings.
    object_columns = df.select_dtypes(include=['object'])
    cell_strings = object_columns.astype(str).values.flatten()
    text_data = ' '.join(cell_strings)

    # Anything that is neither a word character nor whitespace counts as "special";
    # the set removes repeats and sorted() gives a stable order.
    unique_chars = sorted({match.group() for match in re.finditer(r'[^\w\s]', text_data)})

    print(f"üîé C√≥ {len(unique_chars)} k√Ω t·ª± ƒë·∫∑c bi·ªát kh√°c nhau trong d·ªØ li·ªáu:\n")
    print(' '.join(unique_chars))
    return unique_chars


In [7]:
# Scan the loaded frame for special characters; the sorted list is kept in `specials`.
specials = get_special_chars(df)

üîé C√≥ 33 k√Ω t·ª± ƒë·∫∑c bi·ªát kh√°c nhau trong d·ªØ li·ªáu:

! " # % & ' ( ) * + , - . / : ; @ [ ] | ~ ÃÄ ÃÅ ÃÉ Ãâ Ã£ ‚Äã ‚Äì ‚Äô ‚Äú ‚Äù ‚òÜ ‚ú©


In [8]:
import re
import unicodedata

In [9]:
def clean_vietnamese_text(text):
    """Normalise a Vietnamese text snippet before it is embedded.

    Steps, in order:
      1. Non-string input yields an empty string.
      2. Unicode is composed to NFC and the text is lower-cased.
      3. ``%`` is spelled out as the words "phần trăm".
      4. ``/đêm`` (optionally with whitespace after the slash) becomes "mỗi đêm".
      5. Every character that is not a word character, whitespace, ``-``, ``,``
         or ``.`` is replaced by a space.
      6. Runs of whitespace collapse to one space and the ends are trimmed.

    Args:
        text: Value to clean; anything that is not a ``str`` is treated as missing.

    Returns:
        The cleaned, lower-cased string ("" for non-string input).
    """
    if not isinstance(text, str):
        return ""

    # Compose first so that the literals below (written in NFC) match decomposed input too.
    text = unicodedata.normalize('NFC', text).lower()

    # Pad the replacement on both sides so "50%off" does not glue the next word on;
    # surplus spaces are collapsed at the end.
    text = text.replace('%', ' phần trăm ')

    # The text is already lower-case, so no case-insensitive flag is needed.
    text = re.sub(r'/\s*đêm', ' mỗi đêm', text)
    text = re.sub(r"[^\w\s\-,.]", " ", text)
    return re.sub(r'\s+', ' ', text).strip()


In [10]:
# Quick manual check of the cleaner on a short mixed-case sample string.
text = "xin ch√†O"
print(clean_vietnamese_text(text))

xin ch√†o


# Embedding với Sentence-BERT

# Load Sentence-BERT

In [11]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model used for every vector in this notebook.
sbert_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
# Alternative checkpoint, left disabled; swap it in by uncommenting the line below.
# sbert_model = SentenceTransformer('keepitreal/vietnamese-sbert')

# Ghép các trường lại thành 1 text để embedding

In [12]:
# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3889 entries, 0 to 3888
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HotelID       3889 non-null   int64  
 1   Name Hotel    3889 non-null   object 
 2   Location      3889 non-null   object 
 3   Descriptions  3889 non-null   object 
 4   Rating        3889 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 152.0+ KB
None


In [13]:
# Count missing values per column.
df.isnull().sum()

HotelID         0
Name Hotel      0
Location        0
Descriptions    0
Rating          0
dtype: int64

In [14]:
# Build the text that gets embedded by running clean_vietnamese_text over each hotel description.
# NOTE(review): only the Descriptions column feeds the embedding text — confirm that the
# hotel name and location are intentionally left out.
df["TextForEmbedding"] = df["Descriptions"].apply(clean_vietnamese_text)

# Sinh embedding cho toàn bộ dataset

In [15]:
# Encode every cleaned description with the sentence-transformer model, 32 texts per batch.
texts_to_embed = df["TextForEmbedding"].tolist()
raw_embeddings = sbert_model.encode(
    texts_to_embed,
    batch_size=32,
    show_progress_bar=True,
)

# Normalise row by row (axis=1) with scikit-learn's `normalize`.
embeddings = normalize(raw_embeddings, axis=1)

Batches:   0%|          | 0/122 [00:00<?, ?it/s]

In [17]:
# Report the embedding matrix dimensions as (rows, columns).
print("Embedding shape:", embeddings.shape)

Embedding shape: (3889, 768)


In [19]:
# Summarise the embedding result: container type, shape and element dtype.
print("‚úÖ Embedding done!")
print("Ki·ªÉu d·ªØ li·ªáu:", type(embeddings))     # container type, e.g. numpy.ndarray
print("Shape:", embeddings.shape)            # (n_samples, n_dimensions), e.g. (1000, 384)
print("Dtype:", embeddings.dtype)            # float32 or float64

‚úÖ Embedding done!
Ki·ªÉu d·ªØ li·ªáu: <class 'numpy.ndarray'>
Shape: (3889, 768)
Dtype: float32


In [20]:
# Sanity check on the normalisation: compute the norm of every embedding row (axis=1)
# and print the first ten values.
norms = np.linalg.norm(embeddings, axis=1)
print(norms[:10])

[1.0000001  1.         0.99999994 1.0000001  1.0000001  1.
 1.         0.99999994 1.         1.        ]


# Lưu vào Milvus

In [21]:
from pymilvus import FieldSchema, CollectionSchema, DataType, Collection, utility, connections

# Kết nối đến Milvus server

In [22]:
# Name of the Milvus collection used by the cells below.
collection_name = "hotels_collection_mpnet_base_v2"

# Register the "default" connection against a Milvus server at localhost:19530.
connections.connect(
    alias="default",
    host="localhost",
    port="19530",
)

# Tạo collection schema

In [23]:
# Start from a clean slate: if a collection with this name already exists it is dropped first.
collection_already_exists = utility.has_collection(collection_name)
if collection_already_exists:
    print(f"Collection '{collection_name}' ƒë√£ t·ªìn t·∫°i, x√≥a v√† t·∫°o m·ªõi l·∫°i...")
    utility.drop_collection(collection_name)

# Create the collection from scratch.
print(f"T·∫°o m·ªõi collection '{collection_name}'...")

# One entry per field: (name, data type, extra FieldSchema keyword arguments).
# HotelID is the caller-supplied primary key (auto_id is off); TextForEmbedding holds the 768-d vector.
field_specs = [
    ("HotelID", DataType.INT64, {"is_primary": True, "auto_id": False}),
    ("TextForEmbedding", DataType.FLOAT_VECTOR, {"dim": 768}),
    ("Location", DataType.VARCHAR, {"max_length": 100}),
    ("NameHotel", DataType.VARCHAR, {"max_length": 512}),
    ("Description", DataType.VARCHAR, {"max_length": 65535}),
]
fields = [
    FieldSchema(name=field_name, dtype=field_dtype, **field_options)
    for field_name, field_dtype, field_options in field_specs
]

schema = CollectionSchema(fields=fields, description="Hotel dataset embeddings")

collection = Collection(name=collection_name, schema=schema)
print(f"Collection '{collection_name}' ƒë√£ ƒë∆∞·ª£c t·∫°o m·ªõi th√†nh c√¥ng!")

T·∫°o m·ªõi collection 'hotels_collection_mpnet_base_v2'...
Collection 'hotels_collection_mpnet_base_v2' ƒë√£ ƒë∆∞·ª£c t·∫°o m·ªõi th√†nh c√¥ng!


# Chuẩn bị dữ liệu để insert

In [24]:
# Column-wise Python lists for the Milvus insert: ids as ints, text columns cast to str and trimmed.
# NOTE(review): an earlier cell lists several HotelID values that occur twice, and HotelID is the
# collection's primary key — confirm that duplicate primary keys are acceptable here.
hotel_ids = df["HotelID"].astype(int).tolist()
locations, names, descriptions = (
    df[column_name].astype(str).str.strip().tolist()
    for column_name in ("Location", "Name Hotel", "Descriptions")
)

In [25]:
# Show the container type of each object that will be handed to Milvus, in insert order.
for insert_column in (embeddings, hotel_ids, locations, names, descriptions):
    print(type(insert_column))

<class 'numpy.ndarray'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>


# Chunked insert

In [26]:
def chunk_data(data, size):
    """Yield consecutive slices of ``data`` holding at most ``size`` items each.

    Args:
        data: Any sliceable object that supports ``len()`` (list, NumPy array, ...).
        size: Maximum number of items per slice; must be a positive integer.

    Yields:
        ``data[i:i + size]`` for ``i = 0, size, 2 * size, ...``.

    Raises:
        ValueError: If ``size`` is not positive (raised when iteration starts).
    """
    if size <= 0:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    for start in range(0, len(data), size):
        yield data[start:start + size]


batch_size = 100

# Columns in the order passed to `collection.insert` below.
insert_columns = (hotel_ids, embeddings, locations, names, descriptions)

# zip() stops at the shortest input, so a length mismatch would silently drop rows;
# fail loudly instead.
row_counts = [len(column) for column in insert_columns]
if len(set(row_counts)) != 1:
    raise ValueError(f"Insert columns have different lengths: {row_counts}")

for i, (id_batch, emb_batch, loc_batch, name_batch, des_batch) in enumerate(zip(
    *(chunk_data(column, batch_size) for column in insert_columns)
)):
    collection.insert([
        id_batch,
        # Convert one batch at a time instead of turning the whole matrix into nested lists up front.
        np.asarray(emb_batch).tolist(),
        loc_batch,
        name_batch,
        des_batch
    ])
    print(f"Inserted batch {i+1}")

# Flush before reading the entity count.
collection.flush()
print(f"S·ªë l∆∞·ª£ng b·∫£n ghi: {collection.num_entities}")

Inserted batch 1
Inserted batch 2
Inserted batch 3
Inserted batch 4
Inserted batch 5
Inserted batch 6
Inserted batch 7
Inserted batch 8
Inserted batch 9
Inserted batch 10
Inserted batch 11
Inserted batch 12
Inserted batch 13
Inserted batch 14
Inserted batch 15
Inserted batch 16
Inserted batch 17
Inserted batch 18
Inserted batch 19
Inserted batch 20
Inserted batch 21
Inserted batch 22
Inserted batch 23
Inserted batch 24
Inserted batch 25
Inserted batch 26
Inserted batch 27
Inserted batch 28
Inserted batch 29
Inserted batch 30
Inserted batch 31
Inserted batch 32
Inserted batch 33
Inserted batch 34
Inserted batch 35
Inserted batch 36
Inserted batch 37
Inserted batch 38
Inserted batch 39
S·ªë l∆∞·ª£ng b·∫£n ghi: 3889


# Tạo index

In [27]:
# HNSW index with the COSINE metric; M and efConstruction are the HNSW build parameters.
index_params = {
    "index_type": "HNSW",
    "metric_type": "COSINE",
    "params": {"M": 8, "efConstruction": 64}
}

# Build the index on the vector field.
# Release first so the collection is not loaded while its index is being changed.
collection.release()
# Only drop an index when one exists: the collection may have been created just above
# without one, and some pymilvus releases raise when asked to drop a missing index.
if collection.has_index():
    collection.drop_index()
collection.create_index(field_name="TextForEmbedding", index_params=index_params)
collection.load()

Status(code=0, message=)