In [2]:
import yaml
import add_packages
from pprint import pprint
import os
import pandas as pd
from tqdm.auto import tqdm

from toolkit.langchain import (
	document_loaders, text_splitters, text_embedding_models, stores, 
	prompts, utils, output_parsers, agents, documents, models,
	runnables, tools, chains
)

from toolkit import sql

# Data directory used by this notebook and the name of the YAML config to load.
PATH_DATA = f"{add_packages.APP_PATH}/data/vtc"
FILE_CFG = "vtc.yaml"

# Enables `Series.progress_apply`, which the "Process" cells use for a progress bar.
tqdm.pandas(desc="Processing")

# Decode the config explicitly as UTF-8: without `encoding=` Python falls back to
# the platform locale, which can mis-decode or fail on non-ASCII config text.
with open(f"{add_packages.APP_PATH}/my_configs/{FILE_CFG}", "r", encoding="utf-8") as file:
    configs = yaml.safe_load(file)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Shared model handles consumed by the chains built later in the notebook.
llm = models.create_llm(
	provider="openai",
	version="gpt-4o-mini",
)
embeddings = text_embedding_models.OpenAIEmbeddings(
	model="text-embedding-3-large",
)
# The FAISS attribute is stored as-is (it is not called here).
vectorstore = stores.faiss.FAISS

In [7]:
# my_sql_db = sql.MySQLDatabase()
# Connection parameters are read from the SQL_*_NEON environment variables;
# `os.getenv` yields None for any variable that is not set.
my_sql_db = sql.MySQLDatabase(
	**{
		"dbname": os.getenv("SQL_DB_NEON"),
		"host": os.getenv("SQL_HOST_NEON"),
		"port": os.getenv("SQL_PORT_NEON"),
		"user": os.getenv("SQL_USER_NEON"),
		"password": os.getenv("SQL_PASSWORD_NEON"),
	}
)

# Data

## txt - FAQ

### vtc_faq

In [None]:
# Text file read by the loader cell of this FAQ section.
path_txt = f"{PATH_DATA}/vtc_faq.txt"

In [None]:
# Read the file at `path_txt` into a list of documents.
loader_txt = document_loaders.TextLoader(path_txt)
doc_txt = loader_txt.load()

# "##" is the only separator handed to the splitter; the first chunk it
# produces is discarded by the `[1:]` slice.
text_splitter = text_splitters.RecursiveCharacterTextSplitter(
	separators=["##"],
	chunk_size=150,
	chunk_overlap=0,
)
docs_txt = text_splitter.split_documents(doc_txt)[1:]

# Metadata clean-up: the helpers are called for the "source" key and the FAQ
# tag (their exact effect is defined in toolkit.langchain.utils).
metadatas = {"data": "frequently asked questions"}
utils.remove_metadata(docs_txt, "source")
utils.update_metadata(docs_txt, metadatas)

In [None]:
# Keep a dedicated name for these chunks: `docs_txt` is rebound by the next FAQ section.
docs_txt_vtc_faq = docs_txt

### onli_faq

In [None]:
# Text file read by the loader cell of this FAQ section (rebinds `path_txt`).
path_txt = f"{PATH_DATA}/onli_faq.txt"

In [None]:
# Load whatever `path_txt` currently points at.
loader_txt = document_loaders.TextLoader(path_txt)
doc_txt = loader_txt.load()

# Split on "##" only, then drop the first resulting chunk.
text_splitter = text_splitters.RecursiveCharacterTextSplitter(
	separators=["##"],
	chunk_size=150,
	chunk_overlap=0,
)
docs_txt = text_splitter.split_documents(doc_txt)[1:]

# Same metadata treatment as the other FAQ sections: "source" is passed to
# `remove_metadata`, the FAQ tag to `update_metadata`.
metadatas = {"data": "frequently asked questions"}
utils.remove_metadata(docs_txt, "source")
utils.update_metadata(docs_txt, metadatas)

In [None]:
# Keep a dedicated name for these chunks: `docs_txt` is rebound by the next FAQ section.
docs_txt_onli_faq = docs_txt

### File 3

In [None]:
# Text file read by the loader cell of this FAQ section (rebinds `path_txt`).
path_txt = f"{PATH_DATA}/faq.txt"

In [None]:
# Load the file named by `path_txt`.
loader_txt = document_loaders.TextLoader(path_txt)
doc_txt = loader_txt.load()

# Split on "##" only; the leading chunk is skipped.
text_splitter = text_splitters.RecursiveCharacterTextSplitter(
	separators=["##"],
	chunk_size=150,
	chunk_overlap=0,
)
docs_txt = text_splitter.split_documents(doc_txt)[1:]

# NOTE(review): unlike the two sections above, these chunks are not copied to
# a dedicated `docs_txt_*` name afterwards.
metadatas = {"data": "frequently asked questions"}
utils.remove_metadata(docs_txt, "source")
utils.update_metadata(docs_txt, metadatas)

## table

### onli_course_list

In [None]:
# Raw course workbook (the variable names say "csv", but the file is an .xlsx workbook).
file_csv = "onli_course_list.xlsx"
path_csv = f"{PATH_DATA}/{file_csv}"
# Path of the processed copy: the text before the first "." in the file name, plus "2.xlsx".
path_csv_processed = f"{PATH_DATA}/{file_csv.split('.')[0]}2.xlsx"

In [None]:
# Read the workbook at `path_csv` with pandas' default options.
df = pd.read_excel(path_csv)

# Preview the first rows.
df.head()

#### Process

In [None]:
model = models.chat_openai

# The course text reaches the prompt through the single `{text}` variable, so it
# must sit inside the <course_description> tags. Doubled braces such as
# `{{COURSE_DESCRIPTION}}` are an escape in f-string style templates and would be
# sent to the model as literal text rather than being substituted.
template1 = """\
Here is a description of a learning course:

<course_description>
{text}
</course_description>

Please read the course description carefully. Your task is to identify and extract ONLY the core learning objectives and outcomes that a student should expect from taking this course. Ignore any extraneous information beyond the key things the student will learn and take away.

Summarize the core information you have extracted in a single short paragraph. 

Provide your output in Vietnamese.

Again, please ONLY include information directly related to the main things the student will learn and take away from this course based on the description. Do not include any other details. Focus exclusively on the core learning objectives and outcomes and nothing else."""

# Placeholder second-stage prompt; `chain2` is built but not wired into `chain`.
template2 = """\
...
{text}
"""

prompt_template1 = prompts.PromptTemplate.from_template(template1)
prompt_template2 = prompts.PromptTemplate.from_template(template2)

chain1 = prompt_template1 | model | output_parsers.StrOutputParser()
chain2 = prompt_template2 | model | output_parsers.StrOutputParser()

# Two-stage variant, kept for reference:
# chain = runnables.RunnablePassthrough.assign(
#   text=chain1
# ).assign(
#   text=chain2
# )

# Only the first stage runs; its string output is stored under the `text` key.
chain = runnables.RunnablePassthrough.assign(
  text=chain1
)

def process_csv_col(text: str) -> str:
  """Run one cell value through the module-level `chain`.

  Args:
    text: Cell content to pass to `chain` under the `text` key.

  Returns:
    The `text` entry of the chain result. Values that are not strings (for
    example the float NaN pandas uses for an empty spreadsheet cell) and
    blank strings are returned unchanged without invoking the chain.
  """
  # Nothing to summarise: avoid a model call that would only turn "nan" or
  # whitespace into invented text.
  if not isinstance(text, str) or not text.strip():
    return text
  return chain.invoke({"text": text})['text']

def capitalize_first_letter(s):
	"""Upper-case the first letter of every whitespace-separated word in `s`.

	Words are re-joined with single spaces, so leading, trailing and repeated
	whitespace is collapsed. All characters after the first one of each word
	are left exactly as given.
	"""
	# `str.capitalize()` would also lower-case the rest of each word and mangle
	# acronyms or mixed-case names; only the first character is changed here.
	return ' '.join(word[:1].upper() + word[1:] for word in s.split())

def change_col_value(df: pd.DataFrame, column_name: str, value, new_value):
	"""Swap `value` for `new_value` in one column of `df`.

	The column is reassigned on the frame that was passed in, and that same
	frame object is returned.
	"""
	updated_column = df[column_name].replace(value, new_value)
	df[column_name] = updated_column
	return df

def replace_col_value_if_contains(df, column_name, substring, new_substring):
	"""Replace every occurrence of `substring` inside the string values of a column.

	Args:
		df: Frame to update; the column is reassigned on this object.
		column_name: Name of a string column in `df`.
		substring: Text to look for, matched literally.
		new_substring: Replacement text.

	Returns:
		The same `df` object that was passed in.
	"""
	# `regex=False` pins literal matching. The default of `regex` differs
	# between pandas versions, and a substring such as "C++" or "(beta)" is not
	# a usable regular expression.
	df[column_name] = df[column_name].str.replace(substring, new_substring, regex=False)
	return df

# query = '...'
# result = process_csv_col(query)
# pprint(result)

In [None]:
# Column whose values are passed through `process_csv_col`.
col_to_process = "course_description"

# The column is overwritten with the chain output, so re-running this cell
# feeds already-processed text through the chain again.
df[col_to_process] = df[col_to_process].progress_apply(process_csv_col)

# df.to_excel(f"{path_csv_processed}", index=False)


In [None]:
# Write the current frame to the processed-workbook path, without the index column.
df.to_excel(path_csv_processed, index=False)


In [None]:
# From here on `path_csv` refers to the processed workbook instead of the raw one.
path_csv = path_csv_processed

#### Load

In [None]:
# Display the path that the loader cell below will read.
path_csv

In [None]:
# Column labels of the frame currently bound to `df`.
csv_cols = [*df.columns]

# Loader over the workbook at `path_csv`, configured with mode="elements".
loader_csv = document_loaders.UnstructuredExcelLoader(path_csv, mode="elements")
docs_csv = loader_csv.load_and_split()

In [None]:
# Load again (replacing the previous `docs_csv`) and skip the first element.
docs_csv = loader_csv.load()
docs_csv = docs_csv[1:]

# Placeholder tag passed to `update_metadata` below.
metadatas = {"data": "..."}

# "source" and "row" are handed to `remove_metadata` one key at a time.
utils.remove_metadata(docs_csv, "source")
utils.remove_metadata(docs_csv, "row")
utils.update_metadata(docs_csv, metadatas)

#### Load to sql

In [None]:
# Schema entries for the course table: the id column first, the table-level
# PRIMARY KEY constraint last, data columns in between.
my_table_schema = [
	"course_id SERIAL",
	"course_name VARCHAR(255) NOT NULL UNIQUE",
	"course_category VARCHAR(255) NOT NULL",
	"instructor_name VARCHAR(100) NOT NULL",
	"course_link VARCHAR(2048) NOT NULL UNIQUE",
	"course_description TEXT NOT NULL",
	"PRIMARY KEY (course_id)",
]

my_table = sql.MySQLTable(name="onli_course_list", schema=my_table_schema, db=my_sql_db)
my_table.create()

db = stores.SQLDatabase.from_uri(my_sql_db.get_uri())

# NOTE(review): these three assignments rebind `llm`, `embeddings` and
# `vectorstore`, which the setup cell created with explicit model names; later
# cells see whichever assignment ran last.
llm = models.chat_openai
embeddings = text_embedding_models.OpenAIEmbeddings()
vectorstore = stores.faiss.FAISS

# First word of every schema entry except the first (id) and last (constraint).
table_cols = [entry.split(" ")[0] for entry in my_table_schema[1:-1]]


In [None]:
# Re-read the workbook at `path_csv` and rename its columns positionally to the
# names in `table_cols`; pandas raises if the two column counts differ.
df = pd.read_excel(path_csv)
df.columns = table_cols

# my_table.insert_from_dataframe(df)

In [None]:
# Hand the renamed frame to the table helper (presumably inserts one row per
# frame row — confirm against toolkit.sql).
my_table.insert_from_dataframe(df)


In [None]:
# Columns whose distinct values are collected, in this order, into one flat list.
cols = ["course_name", "course_category", "instructor_name"]
proper_nouns_onli_course_list = [
	value
	for col in cols
	for value in my_table.get_discrete_values_col(col)
]

In [None]:
# Sample natural-language questions about the course table.
questions = [
	"Which courses are available in the Design category?",
	"Who are the instructors for the Personal Development courses?",
	"Can you provide a summary of the course descriptions for the Digital Marketing category?",
	"Which courses have the longest descriptions?",
	"How many courses are offered by each instructor?",
	"Which course categories have the most number of courses?",
	"Can you tell me about the course 'Kỹ năng quản lý thời gian'?",
	"Which courses have the shortest descriptions?",
	"Which instructors have the most number of courses?",
	"Can you list all the courses that are related to Personal Development?",
]

# Question -> SQL pairs for the course table. The queries name the table exactly
# as it is created in this notebook (`onli_course_list`), so the example SQL
# matches the real schema.
examples_questions_to_sql_onli_course_list = [
    {
        "input": "Which courses are available in the Design category?",
        "query": "SELECT course_name FROM onli_course_list WHERE course_category = 'Design';"
    },
    {
        "input": "Who are the instructors for the Personal Development courses?",
        "query": "SELECT DISTINCT instructor_name FROM onli_course_list WHERE course_category = 'Personal Development';"
    },
    {
        "input": "Can you provide a summary of the course descriptions for the Digital Marketing category?",
        "query": "SELECT course_name, SUBSTRING(course_description, 1, 100) AS summary FROM onli_course_list WHERE course_category = 'Digital Marketing';"
    },
    {
        "input": "Which courses have the longest descriptions?",
        "query": "SELECT course_name, LENGTH(course_description) AS description_length FROM onli_course_list ORDER BY description_length DESC LIMIT 5;"
    },
    {
        "input": "How many courses are offered by each instructor?",
        "query": "SELECT instructor_name, COUNT(course_id) AS num_courses FROM onli_course_list GROUP BY instructor_name;"
    },
    {
        "input": "Which course categories have the most number of courses?",
        "query": "SELECT course_category, COUNT(course_id) AS num_courses FROM onli_course_list GROUP BY course_category ORDER BY num_courses DESC LIMIT 5;"
    },
    {
        "input": "Can you tell me about the course 'Kỹ năng quản lý thời gian'?",
        "query": "SELECT * FROM onli_course_list WHERE course_name = 'Kỹ năng quản lý thời gian';"
    },
    {
        "input": "Which courses have the shortest descriptions?",
        "query": "SELECT course_name, LENGTH(course_description) AS description_length FROM onli_course_list ORDER BY description_length ASC LIMIT 5;"
    },
    {
        "input": "Which instructors have the most number of courses?",
        "query": "SELECT instructor_name, COUNT(course_id) AS num_courses FROM onli_course_list GROUP BY instructor_name ORDER BY num_courses DESC LIMIT 5;"
    },
    {
        "input": "Can you list all the courses that are related to Personal Development?",
        "query": "SELECT course_name FROM onli_course_list WHERE course_category = 'Personal Development';"
    }
]

### vtc_course_list

In [None]:
# Raw workbook for this section and the path of its processed copy.
file_xlsx = "vtc_course_list.xlsx"
path_xlsx = f"{PATH_DATA}/{file_xlsx}"
# Processed copy: the text before the first "." in the file name, plus "-prod.xlsx".
path_xlsx_processed = f"{PATH_DATA}/{file_xlsx.split('.')[0]}-prod.xlsx"

In [None]:
# Read the workbook at `path_xlsx` with pandas' default options.
df = pd.read_excel(path_xlsx)

# Preview the first rows.
df.head()

#### Process

In [None]:
model = models.chat_openai

# Placeholder prompts; each one takes the single `text` variable.
template1 = """\
...
{text}"""

template2 = """\
...
{text}
"""

prompt_template1 = prompts.PromptTemplate.from_template(template1)
prompt_template2 = prompts.PromptTemplate.from_template(template2)

chain1 = prompt_template1 | model | output_parsers.StrOutputParser()
chain2 = prompt_template2 | model | output_parsers.StrOutputParser()

# Two stages in sequence: each stage writes its string output to the `text` key,
# so the second stage receives the first stage's result.
chain = (
  runnables.RunnablePassthrough
  .assign(text=chain1)
  .assign(text=chain2)
)

def process_xlsx_col(text: str) -> str:
  """Run one cell value through the module-level `chain`.

  Args:
    text: Cell content to pass to `chain` under the `text` key.

  Returns:
    The `text` entry of the chain result. Values that are not strings (for
    example the float NaN pandas uses for an empty spreadsheet cell) and
    blank strings are returned unchanged without invoking the chain.
  """
  # Nothing to rewrite: avoid a model call that would only turn "nan" or
  # whitespace into invented text.
  if not isinstance(text, str) or not text.strip():
    return text
  return chain.invoke({"text": text})['text']

# Smoke test: run the chain once on a placeholder string and pretty-print the result.
query = '...'
result = process_xlsx_col(query)

pprint(result)

In [None]:
# Start again from the workbook on disk.
df = pd.read_excel(path_xlsx)

# Placeholder column name; the column is replaced by the chain output.
col_to_process = "..."
df[col_to_process] = df[col_to_process].progress_apply(process_xlsx_col)

# Save the result to the processed-workbook path, without the index column.
df.to_excel(path_xlsx_processed, index=False)


In [None]:
# From here on `path_xlsx` refers to the processed workbook instead of the raw one.
path_xlsx = path_xlsx_processed

#### Load

In [None]:
# Placeholder (Ellipsis) — no column list is defined for this workbook yet.
xlsx_cols = ...

# Loader over the workbook at `path_xlsx`, configured with mode="elements".
loader_xlsx = document_loaders.UnstructuredExcelLoader(path_xlsx, mode="elements")
docs_xlsx = loader_xlsx.load()

# Placeholder tag passed to `update_metadata` below.
metadatas = {"data": "..."}

# "source" and "row" are handed to `remove_metadata` one key at a time.
utils.remove_metadata(docs_xlsx, "source")
utils.remove_metadata(docs_xlsx, "row")
utils.update_metadata(docs_xlsx, metadatas)

#### Load to sql

In [None]:
# Khối ngành	Chuyên ngành	Chương trình	Link tham khảo	Hình thức học	Độ dài	Thời gian	Chỉ tiêu học viên	Đối tượng tuyển sinh	Hình thức tuyển sinh	Lịch khai giảng	Giới thiệu khoá học	Đầu ra khoá học	Yêu cầu	Học phí

In [None]:
# Schema entries for the program table. The table-level PRIMARY KEY constraint
# is kept as the final entry so that everything before it is a column definition.
my_table_schema = [
	"id SERIAL",
	"category VARCHAR(255) NOT NULL",
	"academic_program VARCHAR(255) NOT NULL",
	"program_type VARCHAR(255) NOT NULL",
	"reference_link VARCHAR(2048) NOT NULL",
	"program_format VARCHAR(255) NOT NULL",
	"program_duration VARCHAR(255) NOT NULL",
	"program_time VARCHAR(128) NOT NULL",
	"no_enrollments TEXT",
	"admission_targets TEXT",
	"admission_form TEXT",
	"program_schedule TEXT",
	"program_introduction TEXT",
	"program_outcomes TEXT",
	"requirements TEXT",
	"tuition TEXT",
	"training_roadmap TEXT",
	"PRIMARY KEY (id)",
]

my_table = sql.MySQLTable(
	name="vtc_course_list", 
	schema=my_table_schema,
	db=my_sql_db,
)
my_table.create()

db = stores.SQLDatabase.from_uri(my_sql_db.get_uri())

# NOTE(review): these three assignments rebind `llm`, `embeddings` and
# `vectorstore`, which the setup cell created with explicit model names; later
# cells see whichever assignment ran last.
llm = models.chat_openai

embeddings = text_embedding_models.OpenAIEmbeddings()
vectorstore = stores.faiss.FAISS

# Names of the data columns, derived from the schema by content rather than by
# position: table-level constraints and the SERIAL id column are skipped
# wherever they appear. Slicing with [1:-1] silently produced a column called
# "PRIMARY" and dropped the last real column when the constraint was not the
# final entry.
table_cols = [
	col_description.split(" ")[0]
	for col_description in my_table_schema
	if not col_description.upper().startswith("PRIMARY KEY")
	and "SERIAL" not in col_description.upper().split()
]


In [None]:
# Re-read the workbook at `path_xlsx` and rename its columns positionally to the
# names in `table_cols`; pandas raises if the two column counts differ.
df = pd.read_excel(path_xlsx)
df.columns = table_cols

In [None]:
# my_table.insert_from_dataframe(df)

In [None]:
# Columns whose distinct values are collected, in this order, into one flat list.
cols = [
	"category",
	"academic_program",
	"program_type",
	"program_format",
	"program_duration",
	"program_time",
]
proper_nouns_vtc_course_list = [
	value
	for col in cols
	for value in my_table.get_discrete_values_col(col)
]

### FINAL SQL TABLE

In [None]:
# NOTE(review): the combined few-shot list is left empty in this cell.
examples_questions_to_sql = []

# Proper nouns of both tables in one new list: onli values first, then vtc values.
proper_nouns = [
	*proper_nouns_onli_course_list,
	*proper_nouns_vtc_course_list,
]

# Vector store 

Note:
- Requires `tiktoken` >= 0.6.0

## txt

### vtc_faq

In [8]:
# Qdrant-backed store for the VTC FAQ; the collection-specific keyword
# arguments are unpacked from the `vtc_faq` entry of the YAML config.
qdrant_txt_vtc_faq = stores.QdrantStore(
	embeddings_provider="openai",
	embeddings_model="text-embedding-3-large",
	llm=models.chat_openai,
	search_type="mmr",
	configs=configs,
	distance="Cosine",
	**configs["vector_db"]["qdrant"]["vtc_faq"],
)

[32m2024-08-06 09:03:37.606[0m | [1mINFO    [0m | [36mtoolkit.langchain.stores[0m:[36mcreate_embeddings[0m:[36m80[0m - [1m[Embeddings] openai. Model: text-embedding-3-large[0m
[32m2024-08-06 09:03:39.155[0m | [1mINFO    [0m | [36mtoolkit.langchain.stores[0m:[36msetup_collection[0m:[36m310[0m - [1m[Qdrant] Collection `vtc_faq` created.[0m
[32m2024-08-06 09:03:39.158[0m | [1mINFO    [0m | [36mtoolkit.langchain.stores[0m:[36mcreate_retriever[0m:[36m174[0m - [1m[Retrievers] ['base', 'MultiQueryRetriever'][0m


In [None]:
# qdrant_txt_vtc_faq.add_documents(docs_txt_vtc_faq)

### onli_faq

In [None]:
# Qdrant-backed store for the onli FAQ; the collection-specific keyword
# arguments are unpacked from the `onli_faq` entry of the YAML config.
qdrant_txt_onli_faq = stores.QdrantStore(
	embeddings_provider="openai",
	embeddings_model="text-embedding-3-large",
	llm=models.chat_openai,
	search_type="mmr",
	configs=configs,
	distance="Cosine",
	**configs["vector_db"]["qdrant"]["onli_faq"],
)

In [None]:
# qdrant_txt_onli_faq.add_documents(docs_txt_onli_faq)

### File 3

In [None]:
# qdrant_txt = stores.QdrantStore(
#   qdrant_host=os.getenv("QDRANT_HOST"),
#   qdrant_api_key=os.getenv("QDRANT_API_KEY"),
#   configs=configs,
#   **configs["vector_db"]["qdrant"]["..."]
# )

In [None]:
# qdrant_txt.add_documents(docs_txt)

# Test

In [None]:
# RAG chain + tool over the VTC FAQ retriever; with `just_return_ctx=True`
# the chain is configured to hand back retrieved context (flag name only —
# confirm the exact behaviour in toolkit.langchain.chains).
my_chain_rag_vtc_faq = chains.MyRagChain(
	llm=llm,
	retriever=qdrant_txt_vtc_faq.retriever,
	is_debug=False,
	just_return_ctx=True,
	**configs["vector_db"]["qdrant"]["vtc_faq"],
)
tool_chain_rag_vtc_faq = my_chain_rag_vtc_faq.create_tool_chain_rag()

# Same construction for the onli FAQ retriever.
my_chain_rag_onli_faq = chains.MyRagChain(
	llm=llm,
	retriever=qdrant_txt_onli_faq.retriever,
	is_debug=False,
	just_return_ctx=True,
	**configs["vector_db"]["qdrant"]["onli_faq"],
)
tool_chain_rag_onli_faq = my_chain_rag_onli_faq.create_tool_chain_rag()

In [None]:
# SQL section of the YAML config and its tool sub-section.
cfg_sql = configs["sql"]
cfg_sql_tool = cfg_sql["tool"]

# The config groups few-shot examples under several keys; flatten the groups
# into one list, keeping group order and in-group order.
examples_fewshot_tmp = dict(cfg_sql["examples_questions_to_sql"]).values()
examples_questions_to_sql = [
	example
	for sublist in examples_fewshot_tmp
	for example in sublist
]

proper_nouns = cfg_sql["proper_nouns"]

# Fresh database handle built from the SQL_*_NEON environment variables.
# my_sql_db = sql.MySQLDatabase()
my_sql_db = sql.MySQLDatabase(
	dbname=os.getenv("SQL_DB_NEON"),
	host=os.getenv("SQL_HOST_NEON"),
	port=os.getenv("SQL_PORT_NEON"),
	user=os.getenv("SQL_USER_NEON"),
	password=os.getenv("SQL_PASSWORD_NEON"),
)

# `llm`, `embeddings` and `vectorstore` are whatever those names are bound to
# when this cell runs.
my_sql_chain = chains.MySqlChain(
	my_sql_db=my_sql_db,
	llm=llm,
	embeddings=embeddings,
	vectorstore=vectorstore,
	proper_nouns=proper_nouns,
	k_retriever_proper_nouns=4,
	examples_questions_to_sql=examples_questions_to_sql,
	k_few_shot_examples=5,
	sql_max_out_length=2000,
	is_sql_get_all=True,
	is_debug=False,
	tool_name=cfg_sql_tool["name"],
	tool_description=cfg_sql_tool["description"],
	tool_metadata=cfg_sql_tool["metadata"],
	tool_tags=cfg_sql_tool["tags"],
)

tool_chain_sql = my_sql_chain.create_tool_chain_sql()

In [None]:
llm = models.chat_openai

# NOTE(review): this list rebinds `tools`, which the import cell bound to the
# `toolkit.langchain.tools` module; the module is not reachable under that
# name after this cell has run.
tools = [tool_chain_rag_vtc_faq, tool_chain_rag_onli_faq, tool_chain_sql]

# Tool-calling prompt built around the system message stored in the config.
system_message_custom = configs["prompts"]["system_message_vtc"]
prompt = prompts.create_prompt_tool_calling_agent(system_message_custom)

agent = agents.MyStatelessAgent(
	llm=llm,
	tools=tools,
	prompt=prompt,
	agent_type="tool_calling",
	agent_verbose=False,
)

In [None]:
# Reset the question list; no test questions are defined for the agent yet.
questions = []

In [None]:
# Stream one agent answer: every chunk is echoed as it arrives and also kept in `res`.
# Top-level `async for` relies on the notebook's support for top-level async code.
res = []
async for chunk in agent.astream_events_basic(
	# Alternative test queries, kept commented out: a greeting ("Hello") and a
	# question about whether VTCA helps with finding a job.
	# "Xin chào",
	# "VTCA có hỗ trợ tìm kiếm việc làm không",
	"Các giảng viên dạy về lập trình ở Onlninica",
  show_tool_call=True,
  history_type="mongodb",
  user_id="test",
	session_id="test",
):
	# No newline and an explicit flush, so the chunks print as one continuous stream.
	print(chunk, end="", flush=True)

	res.append(chunk)

In [None]:
# result = my_sql_chain.chain_sql.invoke({
# 	"question": 
#    	"Các chương trình học tiêu chuẩn tại VTCA",
# })

# result

# Todos

- [ ] Deep dive into chain