In [1]:
# สร้าง AI Database Environment
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions



# Open the on-disk Chroma database used by this notebook.
db = chromadb.Client(settings=Settings(
    persist_directory='ai_db_testcase',
    chroma_db_impl='duckdb+parquet',
    allow_reset=False
))

# Model names are listed at https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models/
embbed = embedding_functions.SentenceTransformerEmbeddingFunction(model_name='all-mpnet-base-v2')
collection_name = 'freewill'

# Fetch the collection if it already exists, otherwise create it. The embedding
# function is passed in both cases: looking an existing collection up by name
# only would not bind `embbed` to it, so later add/query calls would embed text
# with a different model than the one the stored vectors were built with.
collection = db.get_or_create_collection(
    name=collection_name,
    embedding_function=embbed,
    # Distance metric of the HNSW index. Alternatives are "l2" and "cosine";
    # see https://github.com/nmslib/hnswlib/tree/master#python-bindings
    metadata={"hnsw:space": "ip"},
)

print(collection)
print(db.list_collections())

  from .autonotebook import tqdm as notebook_tqdm


name='freewill' id=UUID('9afcb2e7-66fb-45a6-a2b1-5c1d20349122') metadata={'hnsw:space': 'ip'}
[Collection(name=freewill)]


In [2]:
from helper.pdfHelper import customPdfLoader

# Raw string: the Windows path separator must stay a literal backslash instead
# of being parsed as the start of an escape sequence ("\C" is not a valid one
# and triggers an invalid-escape warning on recent Python versions).
pdf = customPdfLoader(file_path=r'source_documents-single\Cash Balance Deposit_vn.pdf')
docs = pdf.load()
print(docs)

[Document(page_content='Cash Balance Deposit : BRE039A  \nProgram Description    Program for adding deposit cash transactions  of customers . System will \nautomatically increase credit available on Trading System (iFIS ), after supervisors approve deposit \ntransactions  by using Confirm Transaction for Cash Movement Screen.  \n \n       \n \nAdd data  \n Click on”Add” button or and input the following details  \nField detail and description:  \n Field Name  Field Description  \n1. Receipt No.  Reference no. of company receive which is created by program  \n2. Account No.  Customer account no.  \n3. XChgMkt  Stock exchange market  ( 1 = STC,  9 = OTC)  \n4. Deposit Amount  Deposit amount   \n5. Transaction Type  Deposit type :  there are 3 types of deposit  \nCash :  the system will increase customer cash balance after supervisor approve \nthis transaction  \nCheque : the system will increase customer cash balance on the ‘payment date’ or \n‘clearing date’ (depend on company policy), 

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re

# Only the first loaded document is processed in this cell.
doc = docs[0]

# Break the text into lines, trim each one, drop the empty ones and squeeze
# runs of whitespace down to a single space.
split_texts = re.split(pattern="\n{1,}|\r\n", string=doc.page_content)
split_texts = [
    # Raw string so that "\s" reaches the regex engine untouched instead of
    # being treated as an (invalid) string escape.
    re.sub(r'\s{2,}', ' ', text.strip())
    for text in split_texts
    if len(text.strip()) > 0
]

# Mark the end of each sentence with a newline so the LLM can recognise an
# end-of-sequence / stop point.
split_texts = [f'{text}\n' if text.endswith('.') else text for text in split_texts]
cleaned_texts = ' '.join(split_texts)

# Split the cleaned text into separate chunks.
# Note: split_text drops a "\n" that happens to be the last character of a
# chunk, so the end-of-sentence marker is re-applied afterwards.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
chunks = text_splitter.split_text(text=cleaned_texts)
chunks = [f'{chunk}\n' if chunk.endswith('.') else chunk for chunk in chunks]
print(chunks)

['Cash Balance Deposit : BRE039A Program Description Program for adding deposit cash transactions of customers . System will automatically increase credit available on Trading System (iFIS ), after', 'credit available on Trading System (iFIS ), after supervisors approve deposit transactions by using Confirm Transaction for Cash Movement Screen.\n', 'Add data Click on”Add” button or and input the following details Field detail and description: Field Name Field Description 1. Receipt No. Reference no. of company receive which is created by program', 'no. of company receive which is created by program 2. Account No. Customer account no.\n', '3. XChgMkt Stock exchange market ( 1 = STC, 9 = OTC) 4. Deposit Amount Deposit amount 5. Transaction Type Deposit type : there are 3 types of deposit Cash : the system will increase customer cash', 'Cash : the system will increase customer cash balance after supervisor approve this transaction Cheque : the system will increase customer cash balance on

In [4]:
source_path = doc.metadata['source']
print(source_path)

# Ids are 1-based ('idx_1', 'idx_2', ...) and follow the order of `chunks`.
chunk_ids = [f'idx_{position}' for position in range(1, len(chunks) + 1)]

# Each record carries its own id, the chunk text and the originating file.
chunk_metadatas = [
    {'id': chunk_id, 'chunk_context': chunk, 'source': source_path}
    for chunk_id, chunk in zip(chunk_ids, chunks)
]

# Add everything in one batched call instead of one call per chunk, so the
# embedding function receives all texts at once. Nothing is sent when there
# are no chunks, matching the behaviour of a per-chunk loop over an empty list.
if chunks:
    collection.add(
        ids=chunk_ids,
        documents=list(chunks),
        metadatas=chunk_metadatas,
    )

source_documents-single\Cash Balance Deposit_vn.pdf


In [5]:
# Ask the client to persist its state before inspecting the collection.
db.persist()

# Collections known to this client.
known_collections = db.list_collections()
print(known_collections)

# Stored record count next to the number of chunks produced by the splitter.
stored_count = collection.count()
print(stored_count)
chunk_total = len(chunks)
print(chunk_total)

# Collection properties, followed by the metadata of every stored record.
print(collection)
stored_records = collection.get()
print(stored_records['metadatas'])

[Collection(name=freewill)]
26
26
name='freewill' id=UUID('9afcb2e7-66fb-45a6-a2b1-5c1d20349122') metadata={'hnsw:space': 'ip'}
[{'id': 'idx_1', 'chunk_context': 'Cash Balance Deposit : BRE039A Program Description Program for adding deposit cash transactions of customers . System will automatically increase credit available on Trading System (iFIS ), after', 'source': 'source_documents-single\\Cash Balance Deposit_vn.pdf'}, {'id': 'idx_21', 'chunk_context': 'After user saved the transa ction, this transaction will send to supervisor for approve.\n', 'source': 'source_documents-single\\Cash Balance Deposit_vn.pdf'}, {'id': 'idx_3', 'chunk_context': 'Add data Click on”Add” button or and input the following details Field detail and description: Field Name Field Description 1. Receipt No. Reference no. of company receive which is created by program', 'source': 'source_documents-single\\Cash Balance Deposit_vn.pdf'}, {'id': 'idx_4', 'chunk_context': 'no. of company receive which is created 

In [7]:
# Open a second client on the same persist directory as `db`
# (presumably to check that the stored data can be read back - confirm).
chroma_db = chromadb.Client(settings=Settings(
    persist_directory='ai_db_testcase',
    chroma_db_impl='duckdb+parquet',
    allow_reset=False
))

# Show every collection in the AI DB.
print(chroma_db.list_collections())

# Look the collection up once and reuse the handle for both printouts.
reloaded_collection = chroma_db.get_collection(name=collection_name)

# Show the properties of the collection.
print(reloaded_collection)
# Show the data stored in the collection.
print(reloaded_collection.get())


[Collection(name=freewill)]
name='freewill' id=UUID('48eb6f08-9154-438a-b4b8-81217942a7da') metadata={'hnsw:space': 'ip'}


In [27]:
query_str = 'What is the transaction type field used for?'
# Other questions tried against this collection:
#   'What is the effective date field used for?'
#   'What is the cheque number field used for?'
#   'What is the cheque date field used for?'
#   'the branch field used for'

# Upper bound on the number of hits to retrieve.
max_results = 15
# Never request more hits than the collection holds: some Chroma releases
# raise an error when n_results exceeds the number of stored records.
n_results = min(max_results, collection.count())

if n_results == 0:
    # Nothing stored yet, so there is no best hit to index into below.
    print('Collection is empty - nothing to query.')
else:
    res = collection.query(
        query_texts=[query_str],
        n_results=n_results,
        # Optional filters that were tried here:
        #   where={'id': 'idx_11'}
        #   where={'chunk_context': {'$eq': query_str}}
        #   where_document={'$contains': query_str}
    )
    print(res)
    # Results are nested per query text; index 0 is the single query above.
    print(f"idx 0: {res['ids'][0][0]}")
    print('\n\n'.join(res['documents'][0]))

# To fetch one stored chunk directly by id: collection.get(ids='idx_12')
