In [12]:
import chromadb
from langchain_community.vectorstores import Chroma 



from pathlib import Path



from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

# Client persistent

In [13]:
# ChromaDB client backed by on-disk storage at the given directory.
# NOTE(review): absolute, machine-specific path — consider deriving it from a configurable base directory.
client = chromadb.PersistentClient(path="/home/chwenjun225/projects/DeepEngine/src/DeepEngine/chromadb_storage")

# Client Http

In [None]:
# ChromaDB client that talks to a server over HTTP at 127.0.0.1:2027.
# NOTE: this rebinds `client`, so cells executed afterwards use this HTTP client rather than
# the persistent client created in the previous cell.
client = chromadb.HttpClient(host="127.0.0.1", port=2027)

# Kiểm tra kết nối bằng `heartbeat()`

In [14]:
# Ping the backend behind `client`; the value returned by heartbeat() is printed below
# (the message labels it as nanoseconds).
heartbeat_time = client.heartbeat()
print(f"ChromaDB client is active. Heartbeat: {heartbeat_time} ns")

ChromaDB client is active. Heartbeat: 1741753736993402274 ns


# Tạo (hoặc lấy) collection để lưu dữ liệu

In [16]:
def upload_data_2_chromadb(
	host: str = "127.0.0.1",
	port: int = 2027,
	chunk_size: int = 500,
	chunk_overlap: int = 50,
	name_embedd_model: str = "sentence-transformers/all-MiniLM-L6-v2",
	name_of_collection: str = "foxconn_ai_research",
	path_data: str = "/home/chwenjun225/projects/DeepEngine/docs/pdf",
	persist_directory: str = "/home/chwenjun225/projects/DeepEngine/src/DeepEngine/chromadb_storage"
) -> None:
	"""Load every ``*.pdf`` file found in ``path_data`` into ChromaDB.

	Each file is read, split into overlapping chunks and stored twice:

	* through the LangChain ``Chroma`` wrapper in ``persist_directory`` (no collection
	  name is passed, so the wrapper's default collection is used), embedded with the
	  HuggingFace model ``name_embedd_model``;
	* in the ``name_of_collection`` collection through the native client. No embedding
	  function is passed there, so embedding is left to the collection's own configuration.

	Chunk ids are ``"<file stem>_<chunk index>"`` and both writes are keyed by them, so
	running the function again overwrites existing chunks instead of duplicating them.

	Args:
		host: Currently unused; kept so existing callers keep working.
		port: Currently unused; kept so existing callers keep working.
		chunk_size: Maximum chunk size handed to the text splitter.
		chunk_overlap: Overlap between consecutive chunks.
		name_embedd_model: HuggingFace model name used for the LangChain store.
		name_of_collection: Name of the native ChromaDB collection to fill.
		path_data: Directory that is scanned (non-recursively) for ``*.pdf`` files.
		persist_directory: On-disk location of the ChromaDB store.

	Returns:
		None. Progress and errors are reported with ``print``.
	"""
	# Sorted so files are processed in a deterministic order.
	path_files = sorted(Path(path_data).glob("*.pdf"))
	if not path_files:
		print(">>> [ERROR] Không tìm thấy file nào trong thư mục.")
		return

	# Everything below is loop-invariant, and the embedding model is only loaded once we
	# know there is something to ingest.
	emb_fn = HuggingFaceEmbeddings(model_name=name_embedd_model)
	text_splitter = RecursiveCharacterTextSplitter(
		chunk_size=chunk_size,
		chunk_overlap=chunk_overlap
	)
	client = chromadb.PersistentClient(path=persist_directory)
	collection = client.get_or_create_collection(name=name_of_collection)

	for file_path in path_files:
		# NOTE(review): TextLoader reads the file as plain text; real (binary) PDFs
		# probably need a PDF-aware loader — confirm what the files in path_data contain.
		documents = TextLoader(str(file_path)).load()
		split_docs = text_splitter.split_documents(documents)
		if not split_docs:
			# Writing an empty batch would raise and abort the remaining files.
			print(f">>> [WARN] Bỏ qua {file_path.name}: không có nội dung để lưu.")
			continue

		ids = [f"{file_path.stem}_{i}" for i in range(len(split_docs))]
		texts = [doc.page_content for doc in split_docs]

		# Explicit ids keep the LangChain-managed collection free of duplicates on
		# re-runs. No persist() call: PersistentClient-era chromadb writes to disk
		# automatically.
		Chroma.from_documents(
			documents=split_docs,
			embedding=emb_fn,
			ids=ids,
			persist_directory=persist_directory
		)
		# upsert (not add) so existing ids are overwritten on re-runs.
		collection.upsert(
			ids=ids,
			documents=texts,
			metadatas=[{"source": str(file_path)} for _ in split_docs]
		)
		print(f">>> Đã load {len(split_docs)} đoạn văn bản vào ChromaDB thành công.")

# Run the ingestion with every parameter left at its default value.
upload_data_2_chromadb()

RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
partially initialized module 'torchvision' has no attribute 'extension' (most likely due to a circular import)

# Tìm kiếm tài liệu tương thích 

In [None]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings 



# Notebook-level embedding model instance for "sentence-transformers/all-MiniLM-L6-v2".
emb_fn = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")



def get_retriever(num_relate_docs, embedding_model_name, persist_directory):
	"""Build a retriever over the Chroma store located at ``persist_directory``.

	Args:
		num_relate_docs: Number of documents to return per query (search ``k``).
		embedding_model_name: HuggingFace model name used to embed queries.
		persist_directory: Directory of the persisted Chroma store to open.

	Returns:
		The retriever produced by ``Chroma.as_retriever``.
	"""
	embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

	# Use the caller's arguments: a hard-coded path and a notebook-global embedding
	# object would silently ignore both `persist_directory` and `embedding_model_name`.
	vector_db = Chroma(
		persist_directory=persist_directory,
		embedding_function=embedding_model,
	)

	return vector_db.as_retriever(search_kwargs={"k": num_relate_docs})

# Cách Lưu Output Cuộc Trò Chuyện

In [None]:
import datetime

def log_chat(user_input, assistant_response):
	"""Append one user/assistant exchange to ``chat_history.log``.

	Args:
		user_input: Text typed by the user.
		assistant_response: Text returned by the assistant.

	Both lines carry the same ``YYYY-MM-DD HH:MM:SS`` timestamp and the entry is
	followed by a blank line. The file is created in the current working directory
	when it does not exist yet.
	"""
	timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
	# Explicit UTF-8: the prefixes contain emoji, which cannot be encoded by some
	# platform-default encodings.
	with open("chat_history.log", "a", encoding="utf-8") as f:
		f.write(f"[{timestamp}] 👨‍💻 User: {user_input}\n")
		f.write(f"[{timestamp}] 🤖 Assistant: {assistant_response}\n\n")
	# No self-call here: calling log_chat from inside its own body recurses without end.

# Tích hợp mọi thứ

In [None]:
from openai import OpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
import datetime

def get_llm(port, host, openai_api_key, model_name, temperature):
	"""Create the chat model client for the endpoint ``http://{host}:{port}``.

	Args:
		port: Port of the OpenAI-compatible endpoint.
		host: Host of the OpenAI-compatible endpoint.
		openai_api_key: API key forwarded to the client.
		model_name: Name of the model to request.
		temperature: Sampling temperature.

	Returns:
		A configured ``ChatOpenAI`` instance.
	"""
	# NOTE(review): many OpenAI-compatible servers expose their API under a `/v1`
	# prefix — confirm the target server accepts this bare base URL.
	llm_kwargs = {
		"model": model_name,
		"openai_api_base": f"http://{host}:{port}",
		"openai_api_key": openai_api_key,
		"temperature": temperature,
	}
	return ChatOpenAI(**llm_kwargs)

def get_retriever(num_relate_docs, embedding_model_name, persist_directory):
	"""Open the Chroma store at ``persist_directory`` and expose it as a retriever.

	Queries are embedded with the HuggingFace model ``embedding_model_name`` and the
	retriever is configured with ``k = num_relate_docs``.
	"""
	store = Chroma(
		persist_directory=persist_directory,
		embedding_function=HuggingFaceEmbeddings(model_name=embedding_model_name),
	)
	return store.as_retriever(search_kwargs={"k": num_relate_docs})

def log_chat(user_input, assistant_response):
	"""Append one user/assistant exchange to ``chat_history.log``.

	Args:
		user_input: Text typed by the user.
		assistant_response: Text returned by the assistant.

	Both lines share one ``YYYY-MM-DD HH:MM:SS`` timestamp; a blank line separates
	entries. The file lives in the current working directory.
	"""
	timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
	# Explicit UTF-8: the emoji prefixes cannot be written with every platform-default
	# encoding, which would raise UnicodeEncodeError mid-conversation.
	with open("chat_history.log", "a", encoding="utf-8") as f:
		f.write(f"[{timestamp}] 👨‍💻 User: {user_input}\n")
		f.write(f"[{timestamp}] 🤖 Assistant: {assistant_response}\n\n")

def get_chain(port, host, openai_api_key, model_name, temperature, num_relate_docs, embedding_model_name, persist_directory):
	"""Assemble the RAG pipeline: chat model, Chroma retriever and conversation memory.

	The first five arguments are forwarded to ``get_llm`` and the last three to
	``get_retriever``.

	Returns:
		A ``(chain, memory)`` tuple: the ``ConversationalRetrievalChain`` and the
		``ConversationBufferMemory`` attached to it.
	"""
	chat_model = get_llm(port, host, openai_api_key, model_name, temperature)
	doc_retriever = get_retriever(num_relate_docs, embedding_model_name, persist_directory)

	# Conversation history is kept as message objects under the "chat_history" key.
	history = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

	rag_chain = ConversationalRetrievalChain.from_llm(
		llm=chat_model,
		retriever=doc_retriever,
		memory=history,
		verbose=True,
	)
	return rag_chain, history

if __name__ == "__main__":
	# NOTE(review): this persist_directory differs from the `chromadb_storage` directory
	# used by the ingestion cells of this notebook — confirm the retriever is pointed at
	# the store that actually holds the documents.
	chain, memory = get_chain(
		port=2026,
		host="127.0.0.1",
		openai_api_key="chwenjun225",
		model_name="1_finetuned_DeepSeek-R1-Distill-Qwen-1.5B_finetune_CoT_ReAct",
		temperature=0,
		num_relate_docs=3,
		embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
		persist_directory="/home/chwenjun225/Projects/Foxer/ai_agentic/chroma_db"
	)

	# Simple REPL: read a question, answer it through the chain, log the exchange.
	while True:
		try:
			user_input = input(">>> 👨‍💻 User: ").strip()
		except (EOFError, KeyboardInterrupt):
			# Closed stdin or an interrupted kernel ends the session without a traceback.
			break

		if not user_input:
			# Do not send blank questions to the model.
			continue
		if user_input.lower() == "exit":
			break

		# `invoke` replaces the deprecated direct call `chain({...})`.
		response = chain.invoke(
			{"question": user_input, "chat_history": memory.chat_memory.messages}
		)
		assistant_response = response["answer"]

		print(">>> 🤖 Assistant: ", assistant_response)

		# Persist the exchange to the chat log.
		log_chat(user_input, assistant_response)


# Thêm tài liệu vào `chromadb_storage`

In [None]:
# Import the module rather than `from datetime import datetime`: other cells of this
# notebook call `datetime.datetime.now()`, and rebinding the notebook-global name
# `datetime` to the class would break them once this cell has run.
import datetime
from langchain_huggingface.embeddings import HuggingFaceEmbeddings



emb_fn = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")



# get_or_create_collection keeps this cell re-runnable: create_collection raises when a
# collection with this name already exists (for example after the ingestion cell ran).
# NOTE(review): `emb_fn` is a LangChain embeddings object; confirm that the installed
# chromadb version accepts it as `embedding_function`.
collection = client.get_or_create_collection(
	name="foxconn_ai_research",
	embedding_function=emb_fn,
	metadata={
		"description": "my first Chroma collection",
		"created": str(datetime.datetime.now())
	}
)

# Xóa toàn bộ database với `reset()`

In [None]:
# Ask for explicit confirmation before wiping the database.
confirm = input("Bạn có chắc chắn muốn reset toàn bộ database? (yes/no): ")
# Anything other than "yes" (case-insensitive) aborts without touching the database.
if confirm.lower() != "yes":
	print("Hủy bỏ reset database.")
else:
	client.reset()
	print("ChromaDB database đã được reset thành công!")

# Kiểm tra database trước khi `reset()`

In [None]:
# Show what is currently stored so the user can see what a reset would delete.
collections = client.list_collections()
if collections:
	print("Collections hiện có trong database:")
	for col in collections:
		# Depending on the chromadb release, entries are Collection objects or plain
		# collection names; fall back to the entry itself when it has no `.name`.
		print(f" - {getattr(col, 'name', col)}")
else:
	print("Database hiện đang trống.")

# Only the exact answer "yes" (case-insensitive) triggers the reset.
confirm = input("⚠️ Bạn có chắc chắn muốn reset database? (yes/no): ")
if confirm.lower() == "yes":
	client.reset()
	print("Database đã được reset!")
else:
	print("Hủy bỏ reset database.")