In [5]:
import os
import re
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from analysis.analyzer import TTTH_Analyzer
from processor.feature import FeatureProcessor
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
# from underthesea import word_tokenize, pos_tag, sent_tokenize
from pyvi.ViTokenizer import tokenize
from pyvi import ViTokenizer
from underthesea import word_tokenize
from gensim import corpora, models, similarities
from tqdm import tqdm
# Project-local helper objects, instantiated once and shared by later cells.
_analyzer = TTTH_Analyzer()
_processor = FeatureProcessor()
# Suppress all warnings for the remainder of the session.
warnings.filterwarnings("ignore")

In [6]:
# Read the Excel workbook into a DataFrame (path is relative to the working directory).
df = pd.read_excel('data_motobikes.xlsx')

In [7]:
# Work on a copy of the sheet without five columns that the visible cells
# below never read, then preview the first three rows.
data = df.drop(columns=['ƒê·ªãa ch·ªâ','T√¨nh tr·∫°ng',
       'Ch√≠nh s√°ch b·∫£o h√†nh', 'Tr·ªçng l∆∞·ª£ng', 'Href'])
data.head(3)

Unnamed: 0,id,Ti√™u ƒë·ªÅ,Gi√°,Kho·∫£ng gi√° min,Kho·∫£ng gi√° max,M√¥ t·∫£ chi ti·∫øt,Th∆∞∆°ng hi·ªáu,D√≤ng xe,NƒÉm ƒëƒÉng k√Ω,S·ªë Km ƒë√£ ƒëi,Lo·∫°i xe,Dung t√≠ch xe,Xu·∫•t x·ª©
0,1,"B√°n Vespa Sprint 125cc 2024 xanh d∆∞∆°ng, xe ƒë·∫πp...",66.000.000 ƒë,72.53 tr,85.14 tr,B√°n xe #Vespa Sprint 125cc. Mua m·ªõi t·∫°i #Topco...,Piaggio,Vespa,2024,14000,Tay ga,100 - 175 cc,ƒêang c·∫≠p nh·∫≠t
1,2,üî•üî•SH 150i Th·∫Øng ABS 2019 BSTP Ch√≠nh Ch·ªß,79.500.000 ƒë,62.76 tr,73.68 tr,"_B√°n SH 150i Th·∫Øng ABS 2019 X√°m B·∫°c, √öp Team X...",Honda,SH,2019,28000,Tay ga,100 - 175 cc,ƒêang c·∫≠p nh·∫≠t
2,3,CC Vision Th·ªÉ Thao 2023 ƒêen+b·ªô ƒë√®n Demi audi A7,37.000.000 ƒë,28 tr,32.86 tr,Ch√≠nh ch·ªß b√°n Vision phi√™n b·∫£n Th·ªÉ Thao 2023 ƒê...,Honda,Vision,2023,12000,Tay ga,100 - 175 cc,ƒêang c·∫≠p nh·∫≠t


In [8]:
# Order the rows by brand, model line and vehicle type (all ascending), then
# renumber the index so that labels follow the sorted order.
data = (
    data
    .sort_values(by=['Th∆∞∆°ng hi·ªáu', 'D√≤ng xe', 'Lo·∫°i xe'], ascending=True)
    .reset_index(drop=True)
)
###############################

# Normalise the price column: keep digits only, turn empty results into NaN,
# then convert to float and divide by 1,000,000 (price in millions).
price_digits = data['Gi√°'].astype(str).str.replace(r'[^\d]', '', regex=True)
price_digits[price_digits == ''] = np.nan
data['Gi√°'] = price_digits.astype(float) / 1_000_000

for range_col in ('Kho·∫£ng gi√° min', 'Kho·∫£ng gi√° max'):
    range_text = data[range_col].astype(str)
    # Drop the "tr" unit marker regardless of letter case.
    range_text = range_text.str.replace('tr', '', case=False, regex=False)
    # Accept a comma as decimal separator.
    range_text = range_text.str.replace(',', '.')
    # Trim surrounding whitespace.
    range_text = range_text.str.strip()

    # Empty strings become NaN so the float conversion succeeds.
    range_text[range_text == ''] = np.nan
    data[range_col] = range_text.astype(float)
###############################
data_clean = data.copy()
# 1. Drop rows that have no title or no price.
data_clean = data_clean.dropna(subset=['Ti√™u ƒë·ªÅ', 'Gi√°'])

for bound_col in ('Kho·∫£ng gi√° min', 'Kho·∫£ng gi√° max'):
    # 2. A missing price-range bound falls back to the listing's own price.
    data_clean[bound_col] = data_clean[bound_col].fillna(data_clean['Gi√°'])

    # 3. Anything still missing falls back to the per-brand median.
    #    The median is used only to fill gaps: rows whose brand is missing get
    #    NaN from the grouped transform, and assigning the transform result
    #    directly would overwrite their already-filled bounds with NaN.
    brand_median = data_clean.groupby('Th∆∞∆°ng hi·ªáu')[bound_col].transform('median')
    data_clean[bound_col] = data_clean[bound_col].fillna(brand_median)
#############################################################

def price_segment(price):
    """
    Classify a vehicle by price segment.

    The thresholds are compared against `price` directly:
    - below 70           -> "Ph·ªï th√¥ng"
    - from 70, below 200 -> "C·∫≠n cao c·∫•p"
    - 200 and above      -> "Cao c·∫•p"
    """
    # Upper bounds are exclusive; the first bound the price stays under wins.
    for upper_bound, label in ((70, "Ph·ªï th√¥ng"), (200, "C·∫≠n cao c·∫•p")):
        if price < upper_bound:
            return label
    return "Cao c·∫•p"

data_clean["Ph√¢n kh√∫c gi√°"] = data_clean["Gi√°"].map(price_segment)
##############################################################

# Make sure the three price columns are floats.
price_cols = ['Gi√°', 'Kho·∫£ng gi√° min', 'Kho·∫£ng gi√° max']
data_clean[price_cols] = data_clean[price_cols].astype(float)
# Discard abnormal prices: keep only values strictly between 1 and 5000.
price = data_clean['Gi√°']
data_clean = data_clean[(price > 1) & (price < 5000)]
################################################################

# Cap the kilometres-travelled column at 99999.
km_col = 'S·ªë Km ƒë√£ ƒëi'
data_clean.loc[data_clean[km_col] > 99999, km_col] = 99999
################################################################


# Clean the categorical text columns: trim whitespace and title-case.
text_cols = ['Th∆∞∆°ng hi·ªáu', 'D√≤ng xe', 'Lo·∫°i xe', 'Dung t√≠ch xe', 'Xu·∫•t x·ª©', 'Ph√¢n kh√∫c gi√°']
for text_col in text_cols:
    data_clean[text_col] = data_clean[text_col].str.strip().str.title()

# Dung t√≠ch xe: map ƒë·ªãnh l∆∞·ª£ng
def parse_cc(val):
    """
    Map an engine-displacement label to a representative numeric value.

    Args:
        val: Displacement label; buckets are recognised by substring.

    Returns:
        40, 75, 137 or 200 for a recognised bucket; NaN when `val` is not a
        string (for example a missing cell) or matches no bucket.
    """
    # A missing cell arrives as a float NaN (or None); the `in` operator would
    # raise TypeError on it, so anything that is not a string is "unknown".
    if not isinstance(val, str):
        return np.nan
    buckets = (
        ('D∆∞·ªõi', 40),
        ('50 - 100', 75),
        ('100 - 175', 137),
        ('Tr√™n 175', 200),
    )
    for marker, cc_value in buckets:
        if marker in val:
            return cc_value
    return np.nan
data_clean['cc_numeric'] = data_clean['Dung t√≠ch xe'].map(parse_cc)
######################################################################

# Price segment: ordinal encoding of the title-cased segment labels.
price_segment_map = {
    'Ph·ªï Th√¥ng': 1,
    'C·∫≠n Cao C·∫•p': 2,
    'Cao C·∫•p': 3,
}
data_clean['price_segment_code'] = data_clean['Ph√¢n kh√∫c gi√°'].map(price_segment_map)
#######################################################################

# Replace the special textual values found in the registration-year column.
year_col = 'NƒÉm ƒëƒÉng k√Ω'
data_clean[year_col] = data_clean[year_col].replace({
    'tr∆∞·ªõc nƒÉm 1980': '1979',
    'ƒêang c·∫≠p nh·∫≠t': np.nan,
    'Kh√¥ng r√µ': np.nan
})
# Convert to numbers. The plain int cast is applied only when every year is
# known: casting a column that contains NaN to int raises, so in that case the
# column stays float and unknown years remain NaN.
registration_year = pd.to_numeric(data_clean[year_col], errors='coerce')
if registration_year.notna().all():
    registration_year = registration_year.astype(int)
data_clean[year_col] = registration_year

min_age = 0.5  # floor for the age, rounded to 6 months
# Age is stored as float so that the fractional floor can be written into it.
data_clean['age'] = (2025 - data_clean[year_col]).astype(float)

# Ages of zero or below are replaced with min_age.
data_clean.loc[data_clean['age'] <= 0, 'age'] = min_age
#######################################################################

# Handle the missing values of cc_numeric through the project's FeatureProcessor.
# NOTE(review): presumably fills with the column median and mutates data_clean
# in place, going by the method name and the unused return value — confirm in
# processor.feature.
_processor.handle_missing_values_by_median('cc_numeric', data_clean)

cc_numeric before fill missing values: 65
cc_numeric after fill missing values: 0


In [9]:
numeric_cols = [
    "Gi√°", "Kho·∫£ng gi√° min", "Kho·∫£ng gi√° max",
    "S·ªë Km ƒë√£ ƒëi", "age", "cc_numeric"]


# Premium motorcycle brands, exempt from the price cap below.
premium_brands = ['BMW', 'Harley Davidson', 'Ducati', 'Triumph', 'Kawasaki', 'Benelli']

# The brand column is title-cased during cleaning ("BMW" becomes "Bmw"), so an
# exact match against premium_brands would never recognise BMW. Compare
# case-insensitively instead; a missing brand counts as non-premium.
premium_lookup = {brand.lower() for brand in premium_brands}
is_premium = data_clean['Th∆∞∆°ng hi·ªáu'].astype(str).str.lower().isin(premium_lookup)

# Cap the price of non-premium brands at 300.
data_clean.loc[~is_premium & (data_clean['Gi√°'] > 300), 'Gi√°'] = 300

# Detect outliers per numeric column with the 1.5 * IQR rule and count them,
# most affected column first.
Q1 = data_clean[numeric_cols].quantile(0.25)
Q3 = data_clean[numeric_cols].quantile(0.75)
IQR = Q3 - Q1
outlier_mask = (data_clean[numeric_cols] < (Q1 - 1.5 * IQR)) | (data_clean[numeric_cols] > (Q3 + 1.5 * IQR))
outlier_counts = outlier_mask.sum().sort_values(ascending=False)

In [10]:
# Keyword groups used to derive binary features from the listing description.
# Each list holds lower-case phrases; a group "fires" when any phrase matches.

# Group 1: NEW / VEHICLE CONDITION
kw_moi = [
    "m·ªõi", "c√≤n m·ªõi", "nh∆∞ m·ªõi", "m·ªõi 95", "m·ªõi 99", "m·ªõi tinh",
    "xe l∆∞·ªõt", "xe √≠t ƒëi", "√≠t s·ª≠ d·ª•ng", "xe ƒë·ªÉ kh√¥ng", "ƒë·ªÉ kho",
    "keng", "leng keng", "nguy√™n zin", "zin 100%", "zin nguy√™n b·∫£n",
    "d√°n keo", "d√°n ppf", "ngo·∫°i h√¨nh ƒë·∫πp", "d√†n √°o li·ªÅn l·∫°c", "ƒë·∫πp nh∆∞ h√¨nh"
]
# Group 2: MODIFICATIONS / ACCESSORIES / UPGRADES
kw_do_xe = [
    "ƒë·ªô", "ƒë·ªì ch∆°i", "full ƒë·ªì", "p√¥ ƒë·ªô", "p√¥ m√≥c", "phu·ªôc rcb", "tay th·∫Øng",
    "l√™n ƒë·ªì", "tem ƒë·ªô", "l√™n full ƒë·ªì", "ƒë·ªì zin c√≤n ƒë·ªß", "k√≠nh gi√≥", "th√πng givi",
    "·ªëc titan", "m√£o gi√≥", "bao tay", "tr·ª£ l·ª±c", "ƒë·ªô m√°y"
]
# Group 3: USAGE LEVEL
kw_su_dung = [
    "√≠t ƒëi", "ƒëi l√†m", "ƒëi h·ªçc", "ƒëi ph∆∞·ª£t", "ƒëi c√† ph√™", "ƒë·ªÉ kh√¥ng",
    "√≠t s·ª≠ d·ª•ng", "xe gia ƒë√¨nh", "xe c√¥ng ty", "d∆∞ xe", "ƒëi l·∫°i nh·∫π nh√†ng",
    "xe n·ªØ d√πng", "xe n·ªØ ch·∫°y", "xe ƒë·ªÉ l√¢u", "√≠t ch·∫°y", "ƒëi g·∫ßn"
]
# Group 4: MAINTENANCE / REPAIR
kw_bao_duong = [
    "b·∫£o d∆∞·ª°ng", "b·∫£o tr√¨", "thay nh·ªõt", "v·ªá sinh", "bao test", "ƒëi b·∫£o d∆∞·ª°ng",
    "b·∫£o d∆∞·ª°ng ƒë·ªãnh k·ª≥", "m·ªõi thay b√¨nh", "m·ªõi l√†m n·ªìi", "ƒë√£ l√†m l·∫°i m√°y",
    "thay b·ªë th·∫Øng", "thay l·ªçc", "b·∫£o d∆∞·ª°ng l·ªõn", "ch·ªânh s√™n", "xe k·ªπ"
]
# Group 5: DURABILITY / ENGINE / QUALITY
kw_do_ben = [
    "m√°y √™m", "n·ªï √™m", "ch·∫°y √™m", "m√°y m·∫°nh", "m√°y b·ªëc", "ti·∫øt ki·ªám xƒÉng",
    "·ªïn ƒë·ªãnh", "ch·∫°y ngon", "kh√¥ng x√¨ nh·ªõt", "kh√¥ng r√≤ r·ªâ", "kh√¥ng l·ªói",
    "m√°y kh√¥ r√°o", "m√°y t·ªët", "ch·∫°y m∆∞·ª£t", "v·∫≠n h√†nh ·ªïn ƒë·ªãnh", "√™m √°i",
    "b·ªÅn b·ªâ", "m√°y m√≥c zin", "ch·∫°y b√¨nh th∆∞·ªùng", "ho·∫°t ƒë·ªông t·ªët"
]
# Group 6: PAPERWORK / LEGAL
kw_phap_ly = [
    "ch√≠nh ch·ªß", "·ªßy quy·ªÅn", "bao sang t√™n", "c√† v·∫πt", "gi·∫•y t·ªù ƒë·∫ßy ƒë·ªß",
    "gi·∫•y t·ªù h·ª£p l·ªá", "h·ªì s∆° g·ªëc", "bstp", "bao c√¥ng ch·ª©ng",
    "bao tranh ch·∫•p", "ra t√™n", "cavet", "h·ª£p ph√°p"
]


In [11]:
# H√†m check t·ª´ kh√≥a xu·∫•t hi·ªán trong m√¥ t·∫£
def keyword_flag(text: str, keywords: list[str]) -> int:
    """
    Tell whether `text` contains at least one keyword from `keywords`.

    The text is lower-cased before matching; keywords are used as given.
    A keyword only counts when it is not directly preceded or followed by a
    word character, so it cannot match inside a longer word.

    Returns 1 on a match and 0 otherwise (including when `text` is missing).
    """
    if pd.isna(text):
        return 0
    lowered = text.lower()
    for keyword in keywords:
        pattern = rf"(?<!\w){re.escape(keyword)}(?!\w)"
        if re.search(pattern, lowered):
            return 1
    return 0

# L√†m s·∫°ch v√† chu·∫©n h√≥a vƒÉn b·∫£n
def clean_text(text: str) -> str:
    """
    Normalise a description string.

    Steps, in order: lower-case; remove URLs (tokens starting with "http" or
    "www"); remove digit runs; replace ASCII punctuation with spaces; collapse
    whitespace and trim. A missing value yields an empty string.
    """
    if pd.isna(text):
        return ""
    cleaned = text.lower()
    # Patterns that are deleted outright, applied in this order.
    for pattern in (r"http\S+|www\S+", r"\d+"):
        cleaned = re.sub(pattern, "", cleaned)
    punctuation_class = f"[{re.escape(string.punctuation)}]"
    cleaned = re.sub(punctuation_class, " ", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()

# Lo·∫°i b·ªè stopwords ti·∫øng Vi·ªát
# Hand-picked stop words that are dropped from the normalised descriptions.
vietnamese_stopwords = {
    "xe", "m√°y", "b√°n", "c·∫ßn", "mua", "b√°o", "li√™n", "h·ªá", "anh", "ch·ªã",
    "em", "mn", "m·ªçi", "ng∆∞·ªùi", "xin", "c·∫£m", "∆°n", "ch·ª£", "t·ªët", "ƒë·∫ßy",
    "ƒë·ªß", "ƒëi·ªán", "tho·∫°i", "ƒë·ªãa", "ch·ªâ", "s·ªë", "c·ªßa", "v√†", "v·ªõi", "c√≤n",
    "th√¨", "n√™n", "r·∫•t", "ƒë√£", "ƒë∆∞·ª£c", "ko", "kg", "th·∫≠t", "l√†", "th√¥i",
    "nha", "nh√©", "·∫°", "nh∆∞ng", "b·ªüi", "v√¨", "n√†o", "v·∫≠y",
}


def remove_stopwords(text: str) -> str:
    """Drop every whitespace-separated token found in `vietnamese_stopwords`
    and re-join the remaining tokens with single spaces."""
    kept_tokens = (token for token in text.split() if token not in vietnamese_stopwords)
    return " ".join(kept_tokens)

In [12]:
# Apply the NLP clean-up: normalised description with stop words removed.
raw_description = data_clean["M√¥ t·∫£ chi ti·∫øt"]
data_clean["desc_clean"] = raw_description.apply(clean_text).apply(remove_stopwords)


# Derive the keyword features from the RAW description, not from desc_clean.
# desc_clean has digits, punctuation and stop words (e.g. "xe") stripped, so
# keywords that contain any of them (e.g. "zin 100%") could never match there.
# keyword_flag lower-cases the text itself and returns 0 for a missing value.
keyword_groups = {
    "is_moi": kw_moi,
    "is_do_xe": kw_do_xe,
    "is_su_dung_nhieu": kw_su_dung,
    "is_bao_duong": kw_bao_duong,
    "is_do_ben": kw_do_ben,
    "is_phap_ly": kw_phap_ly,
}
for flag_column, group_keywords in keyword_groups.items():
    # Bind the list as a default argument so each lambda keeps its own group.
    data_clean[flag_column] = raw_description.apply(
        lambda text, kws=group_keywords: keyword_flag(text, kws)
    )

In [14]:
stop_word_file = 'files/vietnamese-stopwords.txt'
emojicon_file = 'files/emojicon.txt'
teencode_file = 'files/teencode.txt'

# Stop words: one entry per non-blank line, stored as a set.
with open(stop_word_file, 'r', encoding='utf-8') as f:
    stopwords = {line.strip() for line in f if line.strip()}

# Emoji / emoticon strings: one entry per non-blank line, file order kept.
with open(emojicon_file, 'r', encoding='utf-8') as f:
    emojicons = [line.strip() for line in f if line.strip()]

# Teencode mapping: the first whitespace-separated token of a line is the key,
# the remaining tokens joined by single spaces are the expansion. Lines with
# fewer than two tokens are skipped; a later duplicate key wins.
with open(teencode_file, 'r', encoding='utf-8') as f:
    teencode_map = {
        fields[0]: " ".join(fields[1:])
        for fields in (line.strip().split() for line in f)
        if len(fields) >= 2
    }


special_tokens = ['', ' ', ',', '.', '...', '-', ':', ';', '?', '%', '(', ')', '+', '/', "'", '&', '#', '*', '!', '"', '_', '=', '[', ']', '{', '}', '~', '`', '|', '\\']


In [15]:
# C√°c h√†m x·ª≠ l√Ω
def remove_emojis(text):
    """Replace every occurrence of each entry of the module-level `emojicons`
    list with a single space and return the result."""
    cleaned = text
    for emoji in emojicons:
        if emoji in cleaned:
            cleaned = cleaned.replace(emoji, ' ')
    return cleaned

def normalize_teencode(text):
    """
    Expand the teencode abbreviations found in `text`.

    Each key of the module-level `teencode_map` that occurs as a standalone
    token (not directly next to a word character) is replaced by its mapped
    value. Entries are applied one after another in the map's iteration order.

    Returns:
        The text with all expansions applied.
    """
    for key, val in teencode_map.items():
        # Lookarounds rather than \b: with \b, a key that starts or ends with
        # a non-word character only matches when glued to a word character,
        # i.e. never as a standalone token.
        pattern = rf'(?<!\w){re.escape(key)}(?!\w)'
        # A callable replacement inserts the expansion literally; a plain
        # string would have its backslashes processed as escapes or group
        # references by re.sub.
        text = re.sub(pattern, lambda _match, expansion=val: expansion, text)
    return text

def remove_special_chars(text):
    """Replace every character that is neither a word character nor
    whitespace with a space, then collapse whitespace runs and trim."""
    without_symbols = re.sub(r'[^\w\s]', ' ', text)
    single_spaced = re.sub(r'\s+', ' ', without_symbols)
    return single_spaced.strip()

# -----------------------
# 4. T√ÅCH STOPWORD RI√äNG
# -----------------------
def remove_stopwords(text):
    """Segment `text` with `word_tokenize(..., format="text")`, drop every
    whitespace-separated token present in the module-level `stopwords` set,
    and re-join the remaining tokens with single spaces."""
    segmented = word_tokenize(text, format="text")
    return ' '.join(token for token in segmented.split() if token not in stopwords)

# -----------------------
# 5. CHU·∫®N H√ìA T·ªîNG H·ª¢P
# -----------------------
def clean_text(text):
    """
    Run the full normalisation pipeline on one value.

    The value is converted with `str()` and lower-cased, then passed through
    remove_emojis, normalize_teencode, remove_special_chars and
    remove_stopwords, in that order.
    """
    result = str(text).lower()
    pipeline = (remove_emojis, normalize_teencode, remove_special_chars, remove_stopwords)
    for step in pipeline:
        result = step(result)
    return result

In [16]:
# Keep only the first 200 whitespace-separated words of each description.
# A missing description becomes an empty string instead of raising on
# `.split()`: the earlier clean-up only drops rows without a title or price,
# so rows without a description can reach this point.
data_clean['Content'] = data_clean['M√¥ t·∫£ chi ti·∫øt'].apply(
    lambda x: ' '.join(str(x).split()[:200]) if pd.notna(x) else ''
)

# Normalised text that feeds the TF-IDF vectoriser.
data_clean['clean_text'] = data_clean['Content'].apply(clean_text)

In [17]:
# Display the column labels of the cleaned frame (notebook cell output).
data_clean.columns

Index(['id', 'Ti√™u ƒë·ªÅ', 'Gi√°', 'Kho·∫£ng gi√° min', 'Kho·∫£ng gi√° max',
       'M√¥ t·∫£ chi ti·∫øt', 'Th∆∞∆°ng hi·ªáu', 'D√≤ng xe', 'NƒÉm ƒëƒÉng k√Ω',
       'S·ªë Km ƒë√£ ƒëi', 'Lo·∫°i xe', 'Dung t√≠ch xe', 'Xu·∫•t x·ª©', 'Ph√¢n kh√∫c gi√°',
       'cc_numeric', 'price_segment_code', 'age', 'desc_clean', 'is_moi',
       'is_do_xe', 'is_su_dung_nhieu', 'is_bao_duong', 'is_do_ben',
       'is_phap_ly', 'Content', 'clean_text'],
      dtype='object')

In [18]:
# Word-level TF-IDF vectoriser with the vocabulary capped at 8000 features.
vectorizer = TfidfVectorizer(
    analyzer='word',
    max_features=8000
)
# One TF-IDF row per row of data_clean, in data_clean's row order.
tfidf_matrix = vectorizer.fit_transform(data_clean['clean_text'])
# Item-by-item cosine similarity: entry [i, j] compares rows i and j of
# tfidf_matrix, so it is addressed by row POSITION, not by index label.
cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

def recommend(item_id: int, top_n: int = 5):
    """
    Recommend motorbikes similar to a given listing, by cosine similarity.

    Args:
        item_id (int): Index label of the listing in `data_clean`.
        top_n (int): Maximum number of recommendations to return.

    Returns:
        DataFrame with the listing id, title, 'Content' and a 'similarity'
        column, most similar first, with a fresh 0..k-1 index.

    Raises:
        ValueError: If `item_id` is not an index label of `data_clean`.
    """
    # The similarity matrix was built from data_clean, so all lookups must go
    # through data_clean as well (it is also the only frame with 'Content').
    if item_id not in data_clean.index:
        raise ValueError(f"id {item_id} kh√¥ng t·ªìn t·∫°i trong DataFrame")

    # The matrix is addressed by row position; data_clean keeps its original
    # index labels after filtering, so translate the label to a position.
    row_pos = data_clean.index.get_loc(item_id)
    scores = cosine_sim_matrix[row_pos]

    # Stable descending order. The query row is removed by position rather
    # than by assuming it sorts first (a duplicate listing can tie at 1.0).
    ranked = np.argsort(-scores, kind="stable")
    ranked = ranked[ranked != row_pos][:max(top_n, 0)]

    # Positions map back to rows with iloc.
    recommendations = data_clean.iloc[ranked][['id', 'Ti√™u ƒë·ªÅ', 'Content']].copy()
    recommendations['similarity'] = scores[ranked]
    return recommendations.reset_index(drop=True)

def recommend_cosine_by_text(query: str, top_n: int = 5):
    """
    Recommend motorbikes whose description is similar to a free-text query.

    Args:
        query (str): Search text entered by the user.
        top_n (int): Number of recommendations to return.

    Returns:
        DataFrame with the listing id, title, 'Content' and a 'similarity'
        column, most similar first, with a fresh 0..k-1 index.
    """
    # 1. Pre-process the query with the same clean_text used for the corpus.
    clean_query = clean_text(query)

    # 2. Vectorise the query with the fitted TF-IDF vocabulary.
    query_vec = vectorizer.transform([clean_query])

    # 3. Cosine similarity between the query and every item.
    sims = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # 4. Positions of the top-N highest scores.
    top_idx = sims.argsort()[::-1][:top_n]
    top_scores = sims[top_idx]

    # 5. tfidf_matrix rows follow data_clean, so positions must be resolved
    #    against data_clean (which is also the only frame holding 'Content').
    result = data_clean.iloc[top_idx][['id', 'Ti√™u ƒë·ªÅ', 'Content']].copy()
    result["similarity"] = top_scores

    return result.reset_index(drop=True)
