In [1]:
from langchain_community.document_loaders import DirectoryLoader, UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain.schema import Document
from langchain.prompts import ChatPromptTemplate
from langchain.vectorstores.chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from unstructured.partition.pdf import partition_pdf
# from llm_client_chat import AlpacaLLM
from llm_client import AlpacaLLM
import time, os, shutil

# Redirect the Hugging Face cache to a project-local directory.
# NOTE(review): this runs after the imports above; presumably HF_HOME has to be set
# before huggingface_hub/transformers are first imported to take effect — confirm.
os.environ['HF_HOME'] = './cache/'

In [2]:
# Directory of the persisted Chroma index. Swap the active line to choose which
# index the cells below build and query.
CHROMA_PATH = "final_test/unstructured"
# CHROMA_PATH = "final_test/langchain"

# Folder scanned for source PDFs by load_documents().
DATA_PATH = "data/pdfs"

In [3]:
def load_embedding_model():
    """Build the HuggingFace embedding model and report how long loading took.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for ``intfloat/multilingual-e5-large``
        created with ``normalize_embeddings=True`` and ``cache_folder="./models"``.
    """
    started = time.time()
    embedder = HuggingFaceEmbeddings(
        model_name="intfloat/multilingual-e5-large",
        cache_folder="./models",
        encode_kwargs={"normalize_embeddings": True},
    )
    elapsed = time.time() - started
    print(f'model load time {round(elapsed, 0)} second')
    return embedder

# Instantiate once at notebook scope; later cells read `embedding` as a global.
embedding = load_embedding_model()

  from .autonotebook import tqdm as notebook_tqdm


model load time 126.0 second


In [2]:
# Two-stage retrieval: Chroma returns 20 candidates, a cross-encoder keeps the top 3.
# Depends on Chroma, CHROMA_PATH and embedding from earlier cells, so those cells
# must have been executed first.
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder

retriever = Chroma(
    persist_directory=CHROMA_PATH,
    embedding_function=embedding,
).as_retriever(search_kwargs={"k": 20})

model = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-v2-m3")
compressor = CrossEncoderReranker(model=model, top_n=3)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor,
)

compressed_docs = compression_retriever.invoke(
    "Apa saja parameter penilaian status nutrisi dan pertumbuhan pada anak dengan PGK?"
)

# Print each reranked passage followed by a 20-dash separator line.
answer_docs = [doc.page_content for doc in compressed_docs]
for ans in answer_docs:
    print(ans, "-"*20, sep="\n")

NameError: name 'Chroma' is not defined

In [3]:
# Same two-stage retrieval, but reranking with Flashrank and keeping the top 5.
# Depends on Chroma, CHROMA_PATH and embedding from earlier cells, so those cells
# must have been executed first.
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import FlashrankRerank

retriever = Chroma(
    persist_directory=CHROMA_PATH,
    embedding_function=embedding,
).as_retriever(search_kwargs={"k": 20})

# NOTE(review): the query below is Indonesian; confirm this reranker model copes
# with non-English text.
compressor = FlashrankRerank(model="ms-marco-TinyBERT-L-2-v2", top_n=5)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor,
)

compressed_docs = compression_retriever.invoke(
    "Apa saja parameter penilaian status nutrisi dan pertumbuhan pada anak dengan PGK?"
)

# Print each reranked passage followed by a 20-dash separator line.
answer_docs = [doc.page_content for doc in compressed_docs]
for ans in answer_docs:
    print(ans, "-"*20, sep="\n")

NameError: name 'Chroma' is not defined

In [4]:
def load_documents(path=DATA_PATH):
    """Load the PDFs matching ``*.pdf`` in ``path`` using ``UnstructuredPDFLoader``.

    Args:
        path: Directory handed to ``DirectoryLoader``; defaults to ``DATA_PATH``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory.
    """
    return DirectoryLoader(
        path,
        glob="*.pdf",
        loader_cls=UnstructuredPDFLoader,
    ).load()

def split_text(documents: list[Document], chunk_size: int = 500, chunk_overlap: int = 100):
    """Split documents into overlapping chunks with ``RecursiveCharacterTextSplitter``.

    Prints the document/chunk counts, one sample chunk (content and metadata) and
    the elapsed time.

    Args:
        documents: Documents to split.
        chunk_size: Value passed as the splitter's ``chunk_size`` (measured with ``len``).
        chunk_overlap: Value passed as the splitter's ``chunk_overlap``.

    Returns:
        The list of chunks produced by ``split_documents``.
    """
    print("Starting chunking")
    start_time = time.time()

    # Alternative splitters tried during experimentation (kept for reference):
    #
    # Character splitter
    # text_splitter = CharacterTextSplitter(
    #     separator="\n\n",
    #     chunk_size=1000,
    #     chunk_overlap=300,
    #     length_function=len,
    #     strip_whitespace=True
    # )
    #
    # Semantic splitter
    # text_splitter = SemanticChunker(embedding)

    # Recursive splitter (active)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)

    end_time = time.time()

    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
    print(f"len docs {len(chunks)}")
    if chunks:
        # Preview the 11th chunk when available; fall back to the last chunk so
        # small inputs no longer raise IndexError.
        document = chunks[min(10, len(chunks) - 1)]
        print("page content \n", document.page_content)
        print("doc metadata \n", document.metadata)
    print(f'chunking time {round(end_time - start_time, 0)} second')
    return chunks

def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma index at ``CHROMA_PATH`` from ``chunks``.

    An existing directory at ``CHROMA_PATH`` is removed first. The chunks are then
    embedded with the notebook-level ``embedding`` model into a collection created
    with ``{"hnsw:space": "cosine"}`` metadata and persisted. Prints the elapsed
    time and the number of chunks saved.

    Args:
        chunks: Documents to embed and store.
    """
    print("Starting Embedding")

    # Drop any previous index so the new one contains only these chunks.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Time index construction together with the persist call.
    started = time.time()
    store = Chroma.from_documents(
        chunks,
        embedding,
        collection_metadata={"hnsw:space": "cosine"},
        persist_directory=CHROMA_PATH,
    )
    store.persist()
    elapsed = time.time() - started

    print(f'embedding time {round(elapsed, 0)} second')
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [6]:
# PDF parsed by the partition_pdf cells below (plain string; no interpolation needed).
filename = "data/pdfs/PNPK 2023 GGK.pdf"

In [5]:
# Partition the PDF into elements with the hi_res strategy, writing extracted
# image blocks to a folder next to the notebook.
elements = partition_pdf(
    filename=filename,
    # Layout detection settings
    strategy="hi_res",
    hi_res_model_name="yolox",
    # Table handling
    infer_table_structure=True,
    # Image extraction and its output directory
    extract_images_in_pdf=True,
    extract_image_block_output_dir="PNPK 2023 GGK-Images",
)

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Second pass over the same PDF: partition and chunk by title in a single call.
# NOTE(review): no image-extraction option is passed here, and `raw_pdf_elements`
# is not referenced by any other visible cell — confirm it is still needed.
raw_pdf_elements = partition_pdf(
    filename=filename,
    # Ask for table structure in addition to plain text.
    infer_table_structure=True,
    # Aggregate text into chunks that follow the document's titles/sub-sections.
    chunking_strategy="by_title",
    # Chunk sizing, in characters: merge text blocks shorter than 2000, try to
    # start a new chunk after 3800, and never exceed 4000.
    combine_text_under_n_chars=2000,
    new_after_n_chars=3800,
    max_characters=4000,
)

This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name


In [6]:
import re

# Drop elements whose text starts with "<digits> -" or is exactly the
# "jdih.kemkes.go.id" line; everything else is kept in its original order.
pattern = r'\d+ -'

clean_elements = [
    element
    for element in elements
    if not (re.match(pattern, str(element)) or str(element) == "jdih.kemkes.go.id")
]
    

In [14]:
from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title

# Alternative: size-based chunking. Disabled because running it before
# chunk_by_title only produced a result that was overwritten before use.
# chunks = chunk_elements(clean_elements,
#                         max_characters=1500,
#                         new_after_n_chars=1000,
#                         overlap=300,
#                         overlap_all=True)

# Active strategy: title-aware chunking of the cleaned elements.
chunks = chunk_by_title(clean_elements,
                        combine_text_under_n_chars=500,
                        overlap=300,
                        overlap_all=True)

# Chroma.from_texts takes plain strings, so render each chunk to text.
clean_elements_str_lst = [str(element) for element in chunks]

# Rebuild the index from scratch: remove any previous store at CHROMA_PATH.
if os.path.exists(CHROMA_PATH):
    shutil.rmtree(CHROMA_PATH)

db = Chroma.from_texts(
    clean_elements_str_lst,
    embedding,
    persist_directory=CHROMA_PATH,
    collection_metadata={"hnsw:space": "cosine"},
)
db.persist()

In [5]:
# Load the PDFs from the default DATA_PATH folder.
documents = load_documents()

In [6]:
# Inspect the metadata of the first loaded document.
print(documents[0].metadata)

{'source': 'data\\pdfs\\nutrisi_pada_anak_dengan_penyakit_ginjal.pdf'}


In [None]:
# Split the loaded documents and rebuild the index at CHROMA_PATH.
# save_to_chroma() deletes any existing directory at CHROMA_PATH before writing.
chunks = split_text(documents)
save_to_chroma(chunks)

In [15]:
# Indonesian prompt: instructs the model to answer using only the passages
# substituted for {context} (separated by '---'), then restates {question}.
PROMPT_TEMPLATE = """
Jawablah pertanyaan di bawah hanya menggunakan informasi di bawah ini, masing-masing informasi dipisahkan oleh '---':

---

{context}

---

Jawab pertanyaan menggunakan informasi di atas: {question}
"""

In [27]:
# Interactive retrieval loop: enter a question to see the top-3 matching chunks
# with their relevance scores; enter "exit" to stop.

# Results whose best relevance score is below this are treated as "no match".
MIN_RELEVANCE = 0.7

# Loop-invariant setup: open the persisted index and build the prompt template once
# instead of on every question.
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding)
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

while True:
    # Accept user input
    user_prompt = input("User > ")
    if user_prompt == "exit":
        break

    # Search the DB.
    results = db.similarity_search_with_relevance_scores(user_prompt, k=3)
    if len(results) == 0 or results[0][1] < MIN_RELEVANCE:
        # Nothing relevant enough: skip this question rather than building a
        # prompt from unrelated context.
        print("Similarity too low.", end="\n")
        continue

    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt = prompt_template.format(context=context_text, question=user_prompt)

    # print(prompt)
    print("-----------------------------------------------------")
    print(user_prompt)
    for counter, (doc, _score) in enumerate(results, start=1):
        print(f"{counter}. doc: \n{doc.page_content}, \nmetadata: {doc.metadata}, \nscore: {_score}\n")
    print("-----------------------------------------------------")
    print()

    # LLM call is currently disabled; `prompt` above is what it would receive.
    # model = AlpacaLLM()
    # response_text = model.invoke(prompt)

    # # model = ChatOpenAI()
    # # response_text = model.predict(prompt)

    # sources = [doc.metadata.get("source", None) for doc, _score in results]
    # print()
    # formatted_response = f"AI > Response: {response_text}\nSources: {sources}"
    # # formatted_response = f"Response: {response_text}"
    # print(formatted_response)

-----------------------------------------------------
Apa komplikasi yang dapat terjadi pada anak dengan PGK?
1. doc: 
lah kelainan ginjal akibat infeksi atau kelainan yang didapat, di mana anak dirujuk saat sudah pada stadium lanjut dari PGK. Data epidemiologi anak PGK di negara berkembang tidak pasti dan tidak mencerminkan kondisi sesungguhnya karena biasanya pendataan baru dimulai saat anak sudah mulai dialisis. PGK pada anak mempunyai karakteristik dan tantangan yang khas dan unik yang tidak dijumpai pada pasien PGK dewasa. Masalah khusus yang dijumpai pada PGK anak antara lain gangguan pertumbuhan dan nutrisi serta masalah psikososial yang mempengaruhi kualitas hidup anak secara bermakna, termasuk fungsi ginjalnya. Penanganan pasien PGK anak harus memikirkan masa depannya, sedangkan pada dewasa justru harus memikirkan masa lalu riwayat hidup sebelumnya. Oleh karena itu berbagai organisasi internasional seperti The Centers for Disease Control and Prevention (CDC) dan World Health O