In [1]:
# Standard library
import os
import re
from pprint import pprint

# Third-party
import pandas as pd
import yaml
# import tqdm
from tqdm.auto import tqdm

# Project-local. `add_packages` is imported before `toolkit` on purpose: the
# original order is preserved in case it prepares the import path.
import add_packages
from toolkit.langchain import (
	document_loaders, text_splitters, text_embedding_models, stores,
	prompts, utils, output_parsers, agents, documents,
	runnables, agent_tools,
	# `models.chat_openai` is used by the processing and test cells below.
	models,
)

# Folder holding the FEEE data files, and the YAML config file to load.
PATH_DATA = f"{add_packages.APP_PATH}/data/tdtu/FEEE"
FILE_CFG = "tdtu.yaml"
# Registers `progress_apply` on pandas objects (used by the CSV processing cells).
tqdm.pandas(desc="Processing")

# Open with an explicit UTF-8 encoding so parsing does not depend on the
# platform's default locale encoding.
with open(
	f"{add_packages.APP_PATH}/my_configs/{FILE_CFG}", 'r', encoding="utf-8"
) as file:
	configs = yaml.safe_load(file)

  from .autonotebook import tqdm as notebook_tqdm


# Data

## txt

### ThongTinChung

In [2]:
# Text file for the "ThongTinChung" section, located under PATH_DATA.
path_txt = f"{PATH_DATA}/ThongTinChung.txt"

In [3]:
# Read the text file into LangChain document(s).
loader_txt = document_loaders.TextLoader(path_txt)
doc_txt = loader_txt.load()

# Split on the "##" marker with no overlap between chunks.
text_splitter = text_splitters.RecursiveCharacterTextSplitter(
	chunk_size=1000,
	chunk_overlap=0,
	separators=["##"],
)
# The first chunk is discarded (presumably the text before the first "##" — TODO confirm).
docs_txt = text_splitter.split_documents(doc_txt)[1:]

# Replace the per-file "source" metadata with a section label.
metadatas = {"data": "general information"}
utils.remove_metadata(docs_txt, "source")
utils.update_metadata(docs_txt, metadatas)

In [4]:
# Keep a dedicated name for these chunks: `docs_txt` is rebound by the next
# txt section further down.
docs_txt_thongtinchung = docs_txt

### File 2

In [None]:
# Rebinds `path_txt` to the FAQ text file under PATH_DATA.
path_txt = f"{PATH_DATA}/faq.txt"

In [None]:
# Read the FAQ text file into LangChain document(s).
loader_txt = document_loaders.TextLoader(path_txt)
doc_txt = loader_txt.load()

# Split on the "##" marker; smaller chunks than the general-information file.
text_splitter = text_splitters.RecursiveCharacterTextSplitter(
	chunk_size=150,
	chunk_overlap=0,
	separators=["##"],
)
# The first chunk is discarded (presumably the text before the first "##" — TODO confirm).
docs_txt = text_splitter.split_documents(doc_txt)[1:]

# Replace the per-file "source" metadata with a section label.
metadatas = {"data": "frequently asked questions"}
utils.remove_metadata(docs_txt, "source")
utils.update_metadata(docs_txt, metadatas)

## csv

### NhanSu

In [5]:
# Raw CSV for this section and the path of its post-processed copy.
file_csv = "NhanSu.csv"
path_csv = f"{PATH_DATA}/{file_csv}"
# Same stem with a "1" suffix. Note: this value already includes PATH_DATA.
path_csv_processed = f"{PATH_DATA}/{file_csv.split('.')[0]}1.csv"

#### Process

In [None]:
# NOTE: this cell (including `process_csv_col`) is repeated in every CSV
# section of the notebook; whichever copy ran last is the one in effect.
model = models.chat_openai

template1 = """\
...
{text}"""

template2 = """\
...
{text}
"""

prompt_template1 = prompts.PromptTemplate.from_template(template1)
prompt_template2 = prompts.PromptTemplate.from_template(template2)

# Each pass: prompt -> chat model -> plain string.
chain1 = prompt_template1 | model | output_parsers.StrOutputParser()
chain2 = prompt_template2 | model | output_parsers.StrOutputParser()

# Two passes in sequence: the output of `chain1` overwrites "text" and is then
# fed to `chain2`, whose output overwrites "text" again.
chain = (
  runnables.RunnablePassthrough
  .assign(text=chain1)
  .assign(text=chain2)
)

def process_csv_col(text: str) -> str:
  """Run `text` through both prompt passes and return the final "text" value."""
  return chain.invoke({"text": text})["text"]

# Smoke test on a single input.
query = '...'
result = process_csv_col(query)

pprint(result)

In [None]:
# The NhanSu CSV is ";"-delimited (the loader cell reads it with
# delimiter=";"), so pandas must be told the same or every row collapses
# into a single column.
df = pd.read_csv(path_csv, delimiter=";")

# Placeholder: set to the name of the column to rewrite.
col_to_process = "..."

# Run each value of the column through the LLM chain, with a progress bar.
df[col_to_process] = df[col_to_process].progress_apply(process_csv_col)

# `path_csv_processed` already contains PATH_DATA for this section, so it is
# used as-is (prefixing PATH_DATA again produced a non-existent nested path).
# Keep ";" as separator so the loader cell can read the processed file too.
df.to_csv(path_csv_processed, sep=";", index=False)


In [None]:
# Point the loader at the processed CSV instead of the raw one (only useful
# once the processing cell has written that file).
path_csv = path_csv_processed

#### Load

In [6]:
# Column labels applied to every row; they become the field names in each
# document's text.
csv_cols = ["Nhân sự", "Chức vụ", "Bộ môn", 'Email', "Phòng làm việc", "Nhóm"]

loader_csv = document_loaders.CSVLoader(
	path_csv,
	csv_args={"delimiter": ";", "fieldnames": csv_cols},
)
# Skip the first loaded row (presumably the file's own header line, since
# explicit fieldnames are supplied — TODO confirm).
docs_csv = loader_csv.load()[1:]

metadatas = {"data": "Nhân sự, Giảng viên"}

# Strip loader bookkeeping, then tag every document with the section label.
for stale_key in ("source", "row"):
	utils.remove_metadata(docs_csv, stale_key)
utils.update_metadata(docs_csv, metadatas)

In [7]:
# Keep a dedicated name for the NhanSu documents: `docs_csv` is rebound by
# the following CSV sections.
docs_csv_nhansu = docs_csv

### ChuongTrinhDaoTao

In [8]:
# Raw CSV for this section and the file name of its post-processed copy.
file_csv = "ChuongTrinhDaoTao.csv"
path_csv = f"{PATH_DATA}/{file_csv}"
# Bare file name only (no PATH_DATA prefix); the save cell prepends PATH_DATA
# when writing.
path_csv_processed = f"{file_csv.split('.')[0]}1.csv"

In [None]:
# Preview the raw ";"-separated file.
df = pd.read_csv(path_csv, sep=";")

df.head()

#### Process

In [None]:
# NOTE: this cell (including `process_csv_col`) is repeated in every CSV
# section of the notebook; whichever copy ran last is the one in effect.
model = models.chat_openai

template1 = """\
...
{text}"""

template2 = """\
...
{text}
"""

prompt_template1 = prompts.PromptTemplate.from_template(template1)
prompt_template2 = prompts.PromptTemplate.from_template(template2)

# Each pass: prompt -> chat model -> plain string.
chain1 = prompt_template1 | model | output_parsers.StrOutputParser()
chain2 = prompt_template2 | model | output_parsers.StrOutputParser()

# Two passes in sequence: the output of `chain1` overwrites "text" and is then
# fed to `chain2`, whose output overwrites "text" again.
chain = (
  runnables.RunnablePassthrough
  .assign(text=chain1)
  .assign(text=chain2)
)

def process_csv_col(text: str) -> str:
  """Run `text` through both prompt passes and return the final "text" value."""
  return chain.invoke({"text": text})["text"]

# Smoke test on a single input.
query = '...'
result = process_csv_col(query)

pprint(result)

In [None]:
# This file is ";"-delimited (both the preview cell and the loader cell read
# it with ";"), so pandas must use the same delimiter here.
df = pd.read_csv(path_csv, delimiter=";")

# Placeholder: set to the name of the column to rewrite.
col_to_process = "..."

# Run each value of the column through the LLM chain, with a progress bar.
df[col_to_process] = df[col_to_process].progress_apply(process_csv_col)

# `path_csv_processed` is a bare file name in this section, so PATH_DATA is
# prepended. Keep ";" as separator so the loader cell can read the result.
df.to_csv(f"{PATH_DATA}/{path_csv_processed}", sep=";", index=False)


In [None]:
# `path_csv_processed` is only a file name in this section and the save cell
# writes it under PATH_DATA, so the loader must be given that same full path
# (the bare name would be resolved against the working directory instead).
path_csv = f"{PATH_DATA}/{path_csv_processed}"

#### Load

In [9]:
# Column labels applied to every row; they become the field names in each
# document's text.
# NOTE(review): "Liên kiết" looks like a typo for "Liên kết" — confirm before
# changing, since the label ends up in the indexed text.
csv_cols = [
	"Ngành",
	"Chương trình",
	"Hệ đào tạo",
	"Link",
	"Liên kiết",
	"Giới thiệu",
	"Chuẩn đầu ra",
	"Cơ hội việc làm",
	"Hướng nghiên cứu",
	"Tuyển sinh",
]

loader_csv = document_loaders.CSVLoader(
	path_csv,
	csv_args={"delimiter": ";", "fieldnames": csv_cols},
)
# Skip the first loaded row (presumably the file's own header line, since
# explicit fieldnames are supplied — TODO confirm).
docs_csv = loader_csv.load()[1:]

metadatas = {"data": "..."}

# Strip loader bookkeeping, then tag every document with the section label.
for stale_key in ("source", "row"):
	utils.remove_metadata(docs_csv, stale_key)
utils.update_metadata(docs_csv, metadatas)

In [10]:
# Keep a dedicated name for the ChuongTrinhDaoTao documents: `docs_csv` is
# rebound by the next CSV section.
# NOTE(review): the name ends in "daodao" (probably meant "daotao"); it is
# referenced under this spelling by the vector-store cell, so rename both together.
docs_csv_chuongtrinhdaodao = docs_csv

### File 3

In [None]:
# Placeholder file name for an additional CSV; fill in before running.
file_csv = "..."
path_csv = f"{PATH_DATA}/{file_csv}"
# Single quotes inside the f-string: reusing double quotes there is a
# SyntaxError on Python versions before 3.12, and the sibling sections
# already use this form.
path_csv_processed = f"{file_csv.split('.')[0]}1.csv"

#### Process

In [None]:
# NOTE: this cell (including `process_csv_col`) is repeated in every CSV
# section of the notebook; whichever copy ran last is the one in effect.
model = models.chat_openai

template1 = """\
...
{text}"""

template2 = """\
...
{text}
"""

prompt_template1 = prompts.PromptTemplate.from_template(template1)
prompt_template2 = prompts.PromptTemplate.from_template(template2)

# Each pass: prompt -> chat model -> plain string.
chain1 = prompt_template1 | model | output_parsers.StrOutputParser()
chain2 = prompt_template2 | model | output_parsers.StrOutputParser()

# Two passes in sequence: the output of `chain1` overwrites "text" and is then
# fed to `chain2`, whose output overwrites "text" again.
chain = (
  runnables.RunnablePassthrough
  .assign(text=chain1)
  .assign(text=chain2)
)

def process_csv_col(text: str) -> str:
  """Run `text` through both prompt passes and return the final "text" value."""
  return chain.invoke({"text": text})["text"]

# Smoke test on a single input.
query = '...'
result = process_csv_col(query)

pprint(result)

In [None]:
# Read the raw CSV with pandas' default comma separator (the loader cell of
# this section also uses ",").
df = pd.read_csv(path_csv)

# Looks like a placeholder: set to the name of the column to rewrite.
col_to_process = "..."

# Run each value of the column through `process_csv_col`; `progress_apply`
# exists because of the `tqdm.pandas(...)` call in the setup cell.
df[col_to_process] = df[col_to_process].progress_apply(process_csv_col)

# Write the result under PATH_DATA, without the index column.
df.to_csv(f"{PATH_DATA}/{path_csv_processed}", index=False)


In [None]:
# `path_csv_processed` is only a file name in this section and the save cell
# writes it under PATH_DATA, so the loader must be given that same full path
# (the bare name would be resolved against the working directory instead).
path_csv = f"{PATH_DATA}/{path_csv_processed}"

#### Load

In [None]:
# Column names are read from the file itself rather than hard-coded.
csv_cols = utils.get_csv_column_names(path_csv)

loader_csv = document_loaders.CSVLoader(
	path_csv,
	# source_column="No",
	csv_args={
		"delimiter": ",", # ",", ";"
		# "quotechar": "''",
		"fieldnames": csv_cols,
	},
)
# Explicit fieldnames make the CSV reader treat the file's header line as a
# data row, so drop it — same as the other CSV load cells in this notebook.
docs_csv = loader_csv.load()[1:]

metadatas = {
	"data": "..."
}

# Strip loader bookkeeping, then tag every document with the section label.
utils.remove_metadata(docs_csv, "source")
utils.remove_metadata(docs_csv, "row")
# This call had been cut in two by a stale copy of the vector-store template
# pasted into the middle of the identifier; the call is restored and the
# pasted text removed (the real vector-store section follows below).
utils.update_metadata(docs_csv, metadatas)

# Vector store 

## csv

### NhanSu

In [18]:
# Store wrapper for the personnel collection; connection details come from
# the environment, collection settings from the "personnel" config entry.
personnel_cfg = configs["vector_db"]["qdrant"]["personnel"]
qdrant_csv_personnel = stores.QdrantWrapper(
	configs=configs,
	qdrant_host=os.getenv("QDRANT_HOST"),
	qdrant_api_key=os.getenv("QDRANT_API_KEY"),
	**personnel_cfg,
)

[32m2024-05-28 13:53:17.810[0m | [1mINFO    [0m | [36mtoolkit.langchain.stores[0m:[36m__init__[0m:[36m94[0m - [1mFound collection: `tdtu-personnel`.[0m
[32m2024-05-28 13:53:17.811[0m | [1mINFO    [0m | [36mtoolkit.langchain.stores[0m:[36m__init__[0m:[36m107[0m - [1m`tdtu-personnel` - Embeddings: openai - {'model': 'text-embedding-3-large'}, 3072[0m
[32m2024-05-28 13:53:17.852[0m | [1mINFO    [0m | [36mtoolkit.langchain.stores[0m:[36m__init__[0m:[36m126[0m - [1m`tdtu-personnel` - Retriever: Vectorstore[0m


In [12]:
# Add the NhanSu documents to the store behind `qdrant_csv_personnel`.
qdrant_csv_personnel.add_documents(docs_csv_nhansu)

100%|██████████| 41/41 [00:46<00:00,  1.13s/it]


### ChuongTrinhDaoTao

In [13]:
# Store wrapper for the admission-program collection; connection details come
# from the environment, collection settings from the
# "university_admission_program" config entry.
admission_cfg = configs["vector_db"]["qdrant"]["university_admission_program"]
qdrant_csv_admission = stores.QdrantWrapper(
	configs=configs,
	qdrant_host=os.getenv("QDRANT_HOST"),
	qdrant_api_key=os.getenv("QDRANT_API_KEY"),
	**admission_cfg,
)

[32m2024-05-28 13:43:35.398[0m | [1mINFO    [0m | [36mtoolkit.langchain.stores[0m:[36m__init__[0m:[36m103[0m - [1mCollection: `tdtu-university_admission_program` created.[0m
[32m2024-05-28 13:43:35.399[0m | [1mINFO    [0m | [36mtoolkit.langchain.stores[0m:[36m__init__[0m:[36m107[0m - [1m`tdtu-university_admission_program` - Embeddings: openai - {'model': 'text-embedding-3-large'}, 3072[0m
[32m2024-05-28 13:43:35.458[0m | [1mINFO    [0m | [36mtoolkit.langchain.stores[0m:[36m__init__[0m:[36m126[0m - [1m`tdtu-university_admission_program` - Retriever: Vectorstore[0m


In [14]:
# Add the ChuongTrinhDaoTao documents to the store behind `qdrant_csv_admission`.
qdrant_csv_admission.add_documents(docs_csv_chuongtrinhdaodao)

100%|██████████| 18/18 [00:24<00:00,  1.36s/it]


### File 2

In [None]:
# Template cell for a further CSV collection. "..." looks like a placeholder:
# it must be replaced by a key that exists under configs["vector_db"]["qdrant"],
# otherwise the lookup below fails.
collection_csv = "..."

qdrant_csv = stores.QdrantWrapper(
	qdrant_host=os.getenv("QDRANT_HOST"),
	qdrant_api_key=os.getenv("QDRANT_API_KEY"),
	configs=configs,
	**configs["vector_db"]["qdrant"][collection_csv]
)

## txt

### ThongTinChung

In [15]:
# Store wrapper for the general-information collection; connection details
# come from the environment, collection settings from the
# "general_information" config entry.
general_info_cfg = configs["vector_db"]["qdrant"]["general_information"]
qdrant_txt_info = stores.QdrantWrapper(
  configs=configs,
  qdrant_host=os.getenv("QDRANT_HOST"),
  qdrant_api_key=os.getenv("QDRANT_API_KEY"),
  **general_info_cfg,
)

[32m2024-05-28 13:45:19.847[0m | [1mINFO    [0m | [36mtoolkit.langchain.stores[0m:[36m__init__[0m:[36m103[0m - [1mCollection: `tdtu-general_information` created.[0m
[32m2024-05-28 13:45:19.848[0m | [1mINFO    [0m | [36mtoolkit.langchain.stores[0m:[36m__init__[0m:[36m107[0m - [1m`tdtu-general_information` - Embeddings: openai - {'model': 'text-embedding-3-large'}, 3072[0m
[32m2024-05-28 13:45:19.895[0m | [1mINFO    [0m | [36mtoolkit.langchain.stores[0m:[36m__init__[0m:[36m126[0m - [1m`tdtu-general_information` - Retriever: Vectorstore[0m


In [16]:
# Add the ThongTinChung chunks to the store behind `qdrant_txt_info`.
qdrant_txt_info.add_documents(docs_txt_thongtinchung)

100%|██████████| 11/11 [00:13<00:00,  1.19s/it]


### File 2

In [None]:
# Store wrapper intended for the second txt file (FAQ).
# NOTE(review): this reuses the "general_information" config entry, i.e. the
# same settings as `qdrant_txt_info` — confirm whether a dedicated FAQ entry
# was intended before adding documents through it.
collection_txt = "general_information"

qdrant_faq = stores.QdrantWrapper(
  qdrant_host=os.getenv("QDRANT_HOST"),
  qdrant_api_key=os.getenv("QDRANT_API_KEY"),
  configs=configs,
  **configs["vector_db"]["qdrant"][collection_txt]
)

In [None]:
# qdrant_faq.add_documents(docs_txt)

# Test

In [22]:
# Chat model driving the agent.
llm = models.chat_openai

# System prompt text comes from the YAML config.
system_message_custom = configs["prompts"]["system_message_tdtu"]
prompt = prompts.create_prompt_tool_calling_agent(system_message_custom)

# One retriever tool per store built above, in the same order as before:
# admission programs, personnel, general information.
tools = [
	store.retriever_tool
	for store in (qdrant_csv_admission, qdrant_csv_personnel, qdrant_txt_info)
]

agent = agents.MyStatelessAgent(
	llm=llm,
	tools=tools,
	prompt=prompt,
	agent_type=configs["agents"]["type"],
	agent_verbose=True,
)

[32m2024-05-28 13:54:04.365[0m | [1mINFO    [0m | [36mtoolkit.langchain.agents[0m:[36m_create_agent[0m:[36m155[0m - [1mAgent type: tool_calling[0m


In [35]:
# Sample Vietnamese queries used to exercise the agent (rough English glosses
# given alongside).
questions = [
	"Thành viên ban chủ nhiệm khoa điện",  # members of the electrical faculty's board
	"Các ngành đào tạo chương trình tiêu chuẩn",  # majors offered in the standard program
	"Thông điệp trưởng khoa",  # the dean's message
	"Đồng Sĩ Thiên Châu",  # a person's name
]


In [36]:
# Pick one sample question by index (3 is the last entry of `questions`).
input_message = questions[3]
# await agent.stream_conversable_agent(questions[2])
# Top-level `await` works here because the cell runs inside the notebook's
# event loop; this would not be valid in a plain script.
result = await agent.invoke_agent(
	input_message, user_id="admin", session_id="default",
)
# await agent.stream_agent(input_message)
pprint(result)

[32m2024-05-28 14:02:56.552[0m | [1mINFO    [0m | [36mtoolkit.langchain.agents[0m:[36m__init__[0m:[36m88[0m - [1mUser Id: admin[0m
[32m2024-05-28 14:02:56.552[0m | [1mINFO    [0m | [36mtoolkit.langchain.agents[0m:[36m__init__[0m:[36m89[0m - [1mSession Id: Timothy Singh-5ee5dfae[0m
[32m2024-05-28 14:02:56.553[0m | [1mINFO    [0m | [36mtoolkit.langchain.agents[0m:[36m__init__[0m:[36m90[0m - [1mHistory Type: dynamodb[0m




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `personnel` with `{'query': 'Đồng Sĩ Thiên Châu'}`


[0m[33;1m[1;3mNhân sự: TS. Đồng Sĩ Thiên Châu
Chức vụ: Phó Hiệu trưởng trường Đại học Tôn Đức Thắng
Bộ môn: Điều khiển tự động
Email: dongsythienchau@tdt.edu.vn
Phòng làm việc: Không
Nhóm: Bộ môn Điều khiển tự động

Nhân sự: Nguyễn Thị Thu Quyên
Chức vụ: Viên chức hành chính
Bộ môn: Hành Chính
Email: nguyenthithuquyen@tdtu.edu.vn
Phòng làm việc: C008
Nhóm: Bộ phận Hành Chính

Nhân sự: TS. Nguyễn Nhật Tân
Chức vụ: Giảng viên bộ môn Điện tử - Viễn thông
Bộ môn: Điện tử - Viễn thông
Email: nguyennhattan@tdtu.edu.vn
Phòng làm việc: C117
Nhóm: Bộ môn Điện tử - Viễn thông

Nhân sự: TS. Đinh Hoàng Bách
Chức vụ: Viện trưởng Viện hợp tác, nghiên cứu và đào tạo quốc tế, Trưởng Bộ môn Kĩ thuật Điện
Bộ môn: Kỹ Thuật Điện
Email: dinhhoangbach@tdtu.edu.vn
Phòng làm việc: C118
Nhóm: Bộ môn Kỹ thuật Điện

Nhân sự: ThS. Đặng Ngọc Khoa
Chức vụ: Giảng viên bộ môn Điện tử - Viễn 