In [1]:
import os
from dotenv import load_dotenv

# Copy the variables declared in a local .env file into the process environment.
load_dotenv()

# Stop right away when the Google API key is absent or empty.
api_key = os.environ.get('GOOGLE_API_KEY')
if api_key is None or api_key == '':
    raise ValueError('GOOGLE_API_KEY not found!')

In [2]:
import os
import glob
from langchain_core.documents import Document

# Root directory of the dataset
DATASET_DIR = './Dataset_economy'


def category_from_path(filepath, root=DATASET_DIR):
    """Return the top-level sub-directory of ``root`` that contains ``filepath``.

    The path is made relative to ``root`` and split on the platform's path
    separator, so the result does not depend on which separator is in use or
    on how ``root`` itself is spelled.

    Args:
        filepath: path of a file located somewhere under ``root``.
        root: dataset root directory; defaults to ``DATASET_DIR``.

    Returns:
        str: name of the first directory below ``root`` on the way to the
        file, or 'unknown' when the file sits directly in ``root``.
    """
    rel_parts = os.path.normpath(os.path.relpath(filepath, root)).split(os.sep)
    return rel_parts[0] if len(rel_parts) > 1 else 'unknown'


# Read every .txt file under DATASET_DIR (sub-directories included).
# The paths are sorted so that the document order is the same on every run.
documents = []
txt_pattern = os.path.join(DATASET_DIR, '**', '*.txt')
for filepath in sorted(glob.glob(txt_pattern, recursive=True)):
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read().strip()
    except (OSError, UnicodeError) as e:
        # Best effort: report the unreadable file and keep loading the others.
        print(f"L·ªói ƒë·ªçc file {filepath}: {e}")
        continue

    if not content:  # skip empty files
        continue

    documents.append(Document(
        page_content=content,
        metadata={
            "source": filepath,
            "category": category_from_path(filepath),
            "filename": os.path.basename(filepath)
        }
    ))

print(f"ƒê√£ load {len(documents)} vƒÉn b·∫£n t·ª´ {DATASET_DIR}")

ƒê√£ load 27682 vƒÉn b·∫£n t·ª´ ./Dataset_economy


In [3]:
from typing import List
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.embeddings import Embeddings


class E5EmbeddingsWrapper(Embeddings):
    """Adapter that prefixes texts before handing them to a wrapped embedder.

    Documents are forwarded with "passage: " in front of them and queries with
    "query: " in front of them; the vectors returned by the wrapped object are
    passed back unchanged.
    """

    def __init__(self, base_embeddings):
        # Wrapped object; embed_documents() and embed_query() are called on it.
        self._base = base_embeddings

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed ``texts`` after prepending "passage: " to each of them."""
        prefixed = []
        for passage in texts:
            prefixed.append("passage: " + passage)
        return self._base.embed_documents(prefixed)

    def embed_query(self, text: str) -> List[float]:
        """Embed ``text`` after prepending "query: " to it."""
        prefixed_query = "query: " + text
        return self._base.embed_query(prefixed_query)


# ====== Model initialisation ======
import torch

model = "intfloat/multilingual-e5-base"

# Use the GPU when CUDA is usable and fall back to the CPU otherwise, so the
# cell no longer has to be edited by hand on machines without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

base_embeddings = HuggingFaceEmbeddings(
    model_name=model,
    model_kwargs={"device": device},
    encode_kwargs={
        "normalize_embeddings": True
    }
)

# Wrap the model so that documents and queries receive their text prefixes.
embeddings = E5EmbeddingsWrapper(base_embeddings)

2026-02-27 16:34:58.427849: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-02-27 16:34:58.464027: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2026-02-27 16:34:59.223590: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [4]:
import os
import shutil
import hashlib
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma.vectorstores import Chroma


# --- Configuration ---
PERSIST_DIR = "./chroma_economy_db"
FORCE_REBUILD = True   # set to False to reuse the DB already on disk
K_RETRIEVE = 40

# --- Chunking: split on blank lines first, then on single line breaks ---
splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n'],
    chunk_size=600,
    chunk_overlap=80,
)

doc_splits = splitter.split_documents(documents)
print(f"‚úÇÔ∏è T·ªïng s·ªë chunks tr∆∞·ªõc dedup: {len(doc_splits)}")

# --- De-duplication ---
# Two chunks are duplicates when their stripped text has the same MD5 digest.
# Only the first chunk met for each digest is kept, so the order is preserved.
seen_hashes = set()
unique_splits = []
for d in doc_splits:
    content_hash = hashlib.md5(d.page_content.strip().encode("utf-8")).hexdigest()
    if content_hash in seen_hashes:
        continue
    seen_hashes.add(content_hash)
    unique_splits.append(d)

removed = len(doc_splits) - len(unique_splits)
doc_splits = unique_splits

print(f"üîÑ Dedup: {len(doc_splits) + removed} ‚Üí {len(doc_splits)} (lo·∫°i {removed})")

# --- Vector store ---
# A forced rebuild deletes the persisted directory before anything else.
if FORCE_REBUILD and os.path.exists(PERSIST_DIR):
    shutil.rmtree(PERSIST_DIR)
    print(f"üóëÔ∏è ƒê√£ x√≥a DB c≈© t·∫°i {PERSIST_DIR}")

# Build a new DB from the chunks when no directory is left, otherwise open it.
# NOTE(review): no distance metric is configured here, so the store's default
# applies - confirm it matches the "cosine" in the retriever's name.
if not os.path.exists(PERSIST_DIR):
    print("üÜï ƒêang t·∫°o DB m·ªõi...")
    chroma = Chroma.from_documents(
        documents=doc_splits,
        embedding=embeddings,
        persist_directory=PERSIST_DIR,
    )
else:
    print("üìÇ ƒêang load DB c≈©...")
    chroma = Chroma(
        persist_directory=PERSIST_DIR,
        embedding_function=embeddings,
    )

print(f"‚úÖ DB hi·ªán c√≥ {chroma._collection.count()} vectors")

# --- Retriever returning the K_RETRIEVE nearest chunks ---
retriever_cosine = chroma.as_retriever(search_kwargs={'k': K_RETRIEVE})

print(f"üîé Retriever s·∫µn s√†ng v·ªõi k={K_RETRIEVE}")

‚úÇÔ∏è T·ªïng s·ªë chunks tr∆∞·ªõc dedup: 306296
üîÑ Dedup: 306296 ‚Üí 120165 (lo·∫°i 186131)
üÜï ƒêang t·∫°o DB m·ªõi...
‚úÖ DB hi·ªán c√≥ 120165 vectors
üîé Retriever s·∫µn s√†ng v·ªõi k=40


In [5]:
import numpy as np
from scipy.spatial.distance import cdist

def energy_base_distance(X,Y):
    """Energy distance between two samples of vectors.

    Computes ``2*E|x - y| - E|x - x'| - E|y - y'|``, where each expectation is
    the mean Euclidean distance over all pairs of rows (self-pairs included).

    Args:
        X: array-like of shape (n, d). A single vector of shape (d,) is
            treated as a sample with one row.
        Y: array-like of shape (m, d), or a single vector of shape (d,).

    Returns:
        float: the energy distance, never below 0.0.

    Raises:
        ValueError: if either sample is empty. Without this check the means
            below would be NaN and the final ``max`` would silently report 0.
    """
    X = np.atleast_2d(np.asarray(X, dtype=float))
    Y = np.atleast_2d(np.asarray(Y, dtype=float))
    if X.size == 0 or Y.size == 0:
        raise ValueError('energy_base_distance requires two non-empty samples')

    # Cross term: mean distance between the rows of X and the rows of Y.
    E_xy = np.mean(cdist(X, Y, metric='euclidean'))
    # Self terms: mean pairwise distance inside each sample.
    E_xx = np.mean(cdist(X, X, metric='euclidean'))
    E_yy = np.mean(cdist(Y, Y, metric='euclidean'))

    ED = 2 * E_xy - E_xx - E_yy

    # Floor at zero and always hand back a plain Python float.
    return max(0.0, float(ED))

In [6]:
# Sanity check: a sample compared with an identical copy of itself.
X = [[1, 1], [2, 3]]
Y = [list(point) for point in X]  # same points as X, held in a separate list

ed = energy_base_distance(X, Y)
print(f'{ed}')

0


In [7]:
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

def retrieve_with_energy_clustering(query, retriever, embeddings, k_clusters=7, min_similarity=0.40):
    """Retrieve passages for ``query`` and keep only the lowest-energy cluster.

    The retrieved passages are embedded again, grouped with KMeans, and the
    cluster whose energy distance to the query vector is smallest is returned.
    The best cosine similarity and the winning energy are printed.

    Args:
        query: the question text.
        retriever: object whose ``invoke(query)`` returns documents that have
            a ``page_content`` attribute.
        embeddings: object providing ``embed_documents`` and ``embed_query``.
        k_clusters: upper bound on the number of KMeans clusters; it is
            capped by the number of retrieved passages.
        min_similarity: if the best query/passage cosine similarity is below
            this value, nothing is considered relevant and [] is returned.

    Returns:
        list[str]: texts of the passages in the winning cluster, or [] when
        nothing was retrieved or nothing is similar enough.
    """
    docs = retriever.invoke(query)
    # Nothing retrieved: there is nothing to embed or cluster.
    if not docs:
        return []

    context = [doc.page_content for doc in docs]

    doc_vector = np.array(embeddings.embed_documents(context))
    query_vector = np.array(embeddings.embed_query(query)).reshape(1, -1)

    # Relevance gate on the best cosine similarity
    sims = cosine_similarity(query_vector, doc_vector)[0]
    max_sim = np.max(sims)
    print("Max cosine:", max_sim)
    if max_sim < min_similarity:
        return []

    # Cluster the passages
    actual_k = min(k_clusters, len(doc_vector))
    kmeans = KMeans(n_clusters=actual_k, random_state=42, n_init='auto')
    labels = kmeans.fit_predict(doc_vector)

    # Find the cluster with the lowest energy distance to the query
    best_energy = float('inf')
    best_cluster_idx = -1
    for i in range(actual_k):
        indices = np.where(labels == i)[0]
        # A label with no member has no distance to measure; skip it so it
        # cannot win with a meaningless score.
        if indices.size == 0:
            continue

        energy = energy_base_distance(query_vector, doc_vector[indices])
        if energy < best_energy:
            best_energy = energy
            best_cluster_idx = i

    # Positions of the passages that belong to the winning cluster
    win_index = np.where(labels == best_cluster_idx)[0]
    print(f'{best_energy}')

    return [context[i] for i in win_index]

In [8]:
# Try the retrieval step on its own with one sample question.
query = 'M·ª•c ti√™u doanh thu v√† l·ª£i nhu·∫≠n nƒÉm 2024 c·ªßa Ph·ª•c H∆∞ng Holdings (PHC) ƒë∆∞·ª£c ƒë·∫∑t ra l√† bao nhi√™u?'
context_doc = retrieve_with_energy_clustering(
    query,
    retriever=retriever_cosine,
    embeddings=embeddings,
)
print(f'{context_doc}')

Max cosine: 0.8926264617966007
0.6915864902417015
['Nh·ªù th·ª±c hi·ªán t·ªët c√¥ng t√°c qu·∫£n tr·ªã bi·∫øn ƒë·ªông n√™n m·∫∑c d√π gi√° c√°c s·∫£n ph·∫©m ch·ªß l·ª±c c·ªßa T·∫≠p ƒëo√†n 2 th√°ng ƒë·∫ßu nƒÉm 2024 gi·∫£m t·ª´ 3 - 15% so v·ªõi c√πng k·ª≥ nƒÉm 2023 nh∆∞ng t·∫•t c·∫£ c√°c ch·ªâ ti√™u t√†i ch√≠nh c·ªßa T·∫≠p ƒëo√†n ƒë·ªÅu ho√†n th√†nh v∆∞·ª£t m·ª©c t·ª´ 18 - 48% k·∫ø ho·∫°ch, tƒÉng tr∆∞·ªüng cao so v·ªõi c√πng k·ª≥. T·ªïng doanh thu to√†n T·∫≠p ƒëo√†n ∆∞·ªõc ƒë·∫°t 149,6 ngh√¨n t·ª∑ ƒë·ªìng, v∆∞·ª£t 28%, tƒÉng 19% so v·ªõi c√πng k·ª≥ nƒÉm 2023; N·ªôp ng√¢n s√°ch to√†n T·∫≠p ƒëo√†n ∆∞·ªõc ƒë·∫°t 20,6 ngh√¨n t·ª∑ ƒë·ªìng, v∆∞·ª£t 18%, tƒÉng 5% so v·ªõi c√πng k·ª≥ nƒÉm 2023.', 'Theo ƒë√≥, t√¥Ãâng doanh thu phiÃÅ baÃâo hi√™Ãâm g√¥ÃÅc cuÃâa thiÃ£ truÃõ∆°ÃÄng phi nhaÃÇn thoÃ£ 3 thaÃÅng ƒë√¢ÃÄu naÃÜm 2024 uÃõ∆°ÃÅc ƒëaÃ£t 20.012 tyÃâ ƒë√¥ÃÄng, taÃÜng 12,25% so v∆°ÃÅi cuÃÄng kyÃÄ naÃÜm 2023.\nD√¢ÃÉn ƒë√¢ÃÄu thiÃ£ truÃõ∆°ÃÄng v√™ÃÄ doanh thu phiÃÅ g√¥ÃÅc laÃÄ PVI v∆°ÃÅi doanh thu uÃõ∆

In [9]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
def answer(question,context,embeddings):
    """Answer ``question`` with Gemini from passages picked by energy clustering.

    Args:
        question: the user question; it is also used as the retrieval query.
        context: despite its name, the retriever object that is handed to
            ``retrieve_with_energy_clustering``.
        embeddings: embedding object handed to
            ``retrieve_with_energy_clustering``.

    Returns:
        str: the model's reply, or 'Khong co thong tin' when retrieval
        returns no passage.
    """
    # Retrieve first: when nothing is found no LLM client has to be created.
    context_ed = retrieve_with_energy_clustering(question, context, embeddings)
    if not context_ed:
        return 'Khong co thong tin'

    # model='gemini-2.0-flash'
    model = 'gemini-2.5-flash'
    llm = ChatGoogleGenerativeAI(model=model)

    template = '''D·ª±a v√†o c√°c ƒëo·∫°n vƒÉn sau:

 {context}

 Tr·∫£ l·ªùi c√¢u h·ªèi: {query}

 Y√äU C·∫¶U B·∫ÆT BU·ªòC:
 - Tr·∫£ l·ªùi NG·∫ÆN G·ªåN, ƒëi th·∫≥ng v√†o v·∫•n ƒë·ªÅ
 - KH√îNG l·∫∑p l·∫°i c√¢u h·ªèi trong c√¢u tr·∫£ l·ªùi
 - KH√îNG b·∫Øt ƒë·∫ßu b·∫±ng "D·ª±a v√†o ƒëo·∫°n vƒÉn..."
 - Tr√≠ch d·∫´n ƒê·∫¶Y ƒê·ª¶ t·∫•t c·∫£ s·ªë li·ªáu li√™n quan (%, t·ª∑ ƒë·ªìng, m·ª©c tƒÉng/gi·∫£m, m·ª©c ƒë√≠ch...)
 - N·∫øu c√≥ nhi·ªÅu th√¥ng tin li√™n quan, t·ªïng h·ª£p H·∫æT
 - N·∫øu kh√¥ng c√≥ th√¥ng tin, tr·∫£ l·ªùi: "Kh√¥ng c√≥ th√¥ng tin v·ªÅ..."

 C√¢u tr·∫£ l·ªùi:'''
    prompt = ChatPromptTemplate.from_template(template)
    chain = prompt | llm | StrOutputParser()

    # Join the passages into plain text. Passing the list itself would put its
    # Python repr (brackets, quotes, escaped newlines) into the prompt.
    context_text = "\n\n".join(context_ed)

    return chain.invoke({'context': context_text,
                         'query': question})

In [11]:
# y='So s√°nh t·ªëc ƒë·ªô tƒÉng tr∆∞·ªüng gi·ªØa doanh thu v√† l·ª£i nhu·∫≠n c·ªßa H√≤a Ph√°t nƒÉm 2025?'
# tl=answer(y,retriever_cosine,embeddings)
# tl

In [None]:
# while True:
#     q=input('nhap cau hoi')
#     if q.strip().lower() == "quit":        
#         break
#     else:
#         tl=answer(q,retriever_cosine,embeddings)
#         print(f'cau tra loi: {tl}')